From 290d5d4a0f6f68f228f6abfe758c8ac2760c97fc Mon Sep 17 00:00:00 2001 From: jakirkham Date: Fri, 22 Nov 2024 04:29:36 -0800 Subject: [PATCH 1/4] Relax PyTorch upper bound (allowing 2.4) (#4703) As the issue around PyTorch being built without NumPy was fixed in conda-forge, we can now relax these upper bounds to allow PyTorch 2.4. xref: https://github.com/conda-forge/pytorch-cpu-feedstock/issues/254 xref: https://github.com/conda-forge/pytorch-cpu-feedstock/issues/266 xref: https://github.com/rapidsai/cugraph/pull/4615 Authors: - https://github.com/jakirkham - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cugraph/pull/4703 --- ci/build_docs.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cugraph-dgl/meta.yaml | 2 +- conda/recipes/cugraph-pyg/meta.yaml | 2 +- dependencies.yaml | 6 ++---- python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml | 2 +- python/cugraph-dgl/pyproject.toml | 2 +- python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml | 2 +- python/cugraph-pyg/pyproject.toml | 2 +- 10 files changed, 11 insertions(+), 13 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 01c573c96ca..2d7e90da8d0 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -48,7 +48,7 @@ rapids-mamba-retry install \ "libcugraph_etl=${RAPIDS_VERSION_MAJOR_MINOR}.*" \ "pylibcugraphops=${RAPIDS_VERSION_MAJOR_MINOR}.*" \ "pylibwholegraph=${RAPIDS_VERSION_MAJOR_MINOR}.*" \ - "pytorch>=2.3,<2.4" \ + 'pytorch>=2.3' \ "cuda-version=${CONDA_CUDA_VERSION}" export RAPIDS_DOCS_DIR="$(mktemp -d)" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e4269707168..ec4c23541f9 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - pytest-cov - pytest-xdist - python-louvain -- pytorch>=2.3,<2.4.0a0 +- pytorch>=2.3 - raft-dask==24.12.*,>=0.0.0a0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 - rapids-dask-dependency==24.12.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index eb2625b9d50..de0507c7c22 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -59,7 +59,7 @@ dependencies: - pytest-cov - pytest-xdist - python-louvain -- pytorch>=2.3,<2.4.0a0 +- pytorch>=2.3 - raft-dask==24.12.*,>=0.0.0a0 - rapids-build-backend>=0.3.1,<0.4.0.dev0 - rapids-dask-dependency==24.12.*,>=0.0.0a0 diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index 0383fc8adf8..ed449b5982a 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -31,7 +31,7 @@ requirements: - pylibcugraphops ={{ minor_version }} - tensordict >=0.1.2 - python - - pytorch >=2.3,<2.4.0a0 + - pytorch >=2.3 - cupy >=12.0.0 tests: diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 7d3e503e23a..b44614baa9f 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -31,7 +31,7 @@ requirements: - numba >=0.57 - numpy >=1.23,<3.0a0 - python - - pytorch >=2.3,<2.4.0a0 + - pytorch >=2.3 - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} diff --git a/dependencies.yaml 
b/dependencies.yaml index 6aa1b86aa60..f959f4089f5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -630,9 +630,7 @@ dependencies: - output_types: [conda] packages: - *cugraph_unsuffixed - # ceiling could be removed when this is fixed: - # https://github.com/conda-forge/pytorch-cpu-feedstock/issues/254 - - &pytorch_conda pytorch>=2.3,<2.4.0a0 + - &pytorch_conda pytorch>=2.3 - pytorch-cuda==11.8 - &tensordict tensordict>=0.1.2 - dgl>=2.4.0.cu* @@ -670,7 +668,7 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - &pytorch_pip torch>=2.3,<2.4.0a0 + - &pytorch_pip torch>=2.3 - *tensordict - matrix: {cuda: "11.*"} packages: diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index 174012b8f8c..85c85c2043a 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -18,7 +18,7 @@ dependencies: - pytest-cov - pytest-xdist - pytorch-cuda==11.8 -- pytorch>=2.3,<2.4.0a0 +- pytorch>=2.3 - scipy - tensordict>=0.1.2 name: cugraph_dgl_dev_cuda-118 diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml index e3e12216ac7..af9e91a988e 100644 --- a/python/cugraph-dgl/pyproject.toml +++ b/python/cugraph-dgl/pyproject.toml @@ -40,7 +40,7 @@ test = [ "pytest-xdist", "scipy", "tensordict>=0.1.2", - "torch>=2.3,<2.4.0a0", + "torch>=2.3", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index 4778ff0eaf6..5fbd947965f 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -17,7 +17,7 @@ dependencies: - pytest-cov - pytest-xdist - pytorch-cuda==11.8 -- pytorch>=2.3,<2.4.0a0 +- pytorch>=2.3 - pytorch_geometric>=2.5,<2.6 - scipy - tensordict>=0.1.2 diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index e157f36f8f6..a30cd375635 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -49,7 +49,7 @@ test = [ "pytest-xdist", "scipy", "tensordict>=0.1.2", - "torch>=2.3,<2.4.0a0", + "torch>=2.3", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.setuptools] From d3b80a2d5baf4bd910d211ef2b9825fb29101302 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:40:21 +0100 Subject: [PATCH 2/4] extract the edgelist from the graph (#4750) This PR exposes the C++ function decompress_to_edgelist to the C, PLC and Python API. This will enable the extraction of the edgelist from a graph which is currently not supported. 
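For reference, a minimal sketch of how the newly exposed Python API is exercised, assembled from the docstring examples added later in this patch (the karate dataset helper, the `return_unrenumbered_edgelist=True` default, and the column names come from those docstrings and the new `decompress_to_edgelist` implementations; which optional columns appear depends on the edge properties attached to the graph):

    >>> from cugraph.datasets import karate
    >>> G = karate.get_graph(download=True)
    >>> # cudf.DataFrame with 'src' and 'dst' columns plus, when present,
    >>> # 'weight', 'edge_ids' and 'edge_type_ids'
    >>> edgelist = G.decompress_to_edgelist(return_unrenumbered_edgelist=True)

At the pylibcugraph level, `decompress_to_edgelist(resource_handle, graph, do_expensive_check)` returns a tuple of device arrays `(sources, destinations, edge_weights, edge_ids, edge_type_ids)`, as shown in the new `decompress_to_edgelist.pyx` docstring.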
It also removes the deprecated parameter `legacy_renum_only` Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Chuck Hastings (https://github.com/ChuckHastings) - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/4750 --- cpp/CMakeLists.txt | 2 + cpp/include/cugraph_c/graph_functions.h | 102 +++++++++++ cpp/src/c_api/decompress_to_edgelist.cpp | 137 ++++++++++++++ cpp/src/c_api/edgelist.cpp | 83 +++++++++ cpp/src/c_api/edgelist.hpp | 34 ++++ .../cugraph/structure/graph_classes.py | 33 ---- .../simpleDistributedGraph.py | 99 ++++++++-- .../graph_implementation/simpleGraph.py | 75 ++++++-- .../cugraph/cugraph/structure/number_map.py | 27 +-- .../cugraph/tests/structure/test_graph.py | 52 ++++++ .../cugraph/tests/structure/test_graph_mg.py | 54 ++++++ .../pylibcugraph/pylibcugraph/CMakeLists.txt | 1 + python/pylibcugraph/pylibcugraph/__init__.py | 2 + .../_cugraph_c/graph_functions.pxd | 65 ++++++- .../pylibcugraph/decompress_to_edgelist.pyx | 169 ++++++++++++++++++ 15 files changed, 840 insertions(+), 95 deletions(-) create mode 100644 cpp/src/c_api/decompress_to_edgelist.cpp create mode 100644 cpp/src/c_api/edgelist.cpp create mode 100644 cpp/src/c_api/edgelist.hpp create mode 100644 python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 27e1999cb75..65772b4f5dd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -538,6 +538,8 @@ add_library(cugraph_c src/c_api/weakly_connected_components.cpp src/c_api/strongly_connected_components.cpp src/c_api/allgather.cpp + src/c_api/decompress_to_edgelist.cpp + src/c_api/edgelist.cpp ) add_library(cugraph::cugraph_c ALIAS cugraph_c) diff --git a/cpp/include/cugraph_c/graph_functions.h b/cpp/include/cugraph_c/graph_functions.h index ff7e439232a..964b2f2c8d6 100644 --- a/cpp/include/cugraph_c/graph_functions.h +++ b/cpp/include/cugraph_c/graph_functions.h @@ -104,6 +104,8 @@ cugraph_error_code_t cugraph_two_hop_neighbors( /** * @brief Opaque induced subgraph type + * + * @deprecated This API will be deleted, use cugraph_edgelist_t */ typedef struct { int32_t align_; @@ -112,6 +114,8 @@ typedef struct { /** * @brief Get the source vertex ids * + * @deprecated This API will be deleted, use cugraph_edgelist_get_sources + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of source vertex ids */ @@ -121,6 +125,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_sources( /** * @brief Get the destination vertex ids * + * @deprecated This API will be deleted, use cugraph_edgelist_get_destinations + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of destination vertex ids */ @@ -130,6 +136,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_destinatio /** * @brief Get the edge weights * + * @deprecated This API will be deleted, use cugraph_edgelist_get_edge_weights + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of edge weights */ @@ -139,6 +147,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_edge_weigh /** * @brief Get the edge ids * + * @deprecated This API will be deleted, use cugraph_edgelist_get_edge_ids + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of edge ids */ @@ -148,6 +158,8 @@ 
cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_edge_ids( /** * @brief Get the edge types * + * @deprecated This API will be deleted, use cugraph_edgelist_get_edge_type_ids + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of edge types */ @@ -157,6 +169,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_edge_type_ /** * @brief Get the subgraph offsets * + * @deprecated This API will be deleted, use cugraph_edgelist_get_edge_offsets + * * @param [in] induced_subgraph Opaque pointer to induced subgraph * @return type erased array view of subgraph identifiers */ @@ -166,6 +180,8 @@ cugraph_type_erased_device_array_view_t* cugraph_induced_subgraph_get_subgraph_o /** * @brief Free induced subgraph * + * @deprecated This API will be deleted, use cugraph_edgelist_free + * * @param [in] induced subgraph Opaque pointer to induced subgraph */ void cugraph_induced_subgraph_result_free(cugraph_induced_subgraph_result_t* induced_subgraph); @@ -361,6 +377,92 @@ cugraph_type_erased_device_array_view_t* cugraph_degrees_result_get_out_degrees( */ void cugraph_degrees_result_free(cugraph_degrees_result_t* degrees_result); +/** + * @brief Opaque edgelist type + * + */ +typedef struct { + int32_t align_; +} cugraph_edgelist_t; + +/** + * @brief Get the source vertex ids + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of source vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_sources(cugraph_edgelist_t* edgelist); + +/** + * @brief Get the destination vertex ids + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of destination vertex ids + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_destinations( + cugraph_edgelist_t* edgelist); + +/** + * @brief Get the edge weights + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of edge weights + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_weights( + cugraph_edgelist_t* edgelist); + +/** + * @brief Get the edge ids + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of edge ids + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_ids( + cugraph_edgelist_t* edgelist); + +/** + * @brief Get the edge types + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of edge types + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_type_ids( + cugraph_edgelist_t* edgelist); + +/** + * @brief Get the edge offsets + * + * @param [in] edgelist Opaque pointer to edgelist + * @return type erased array view of subgraph identifiers + */ +cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_offsets( + cugraph_edgelist_t* edgelist); + +/** + * @brief Free edgelist + * + * @param [in] edgelist Opaque pointer to edgelist + */ +void cugraph_edgelist_free(cugraph_edgelist_t* edgelist); + +/** + * @brief Construct the edge list from the graph view object. + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Graph to operate on + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * true) + * @param [out] result Opaque pointer to edgelist + * @param [out] error Pointer to an error object storing details of any error. 
Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_decompress_to_edgelist(const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + bool_t do_expensive_check, + cugraph_edgelist_t** result, + cugraph_error_t** error); + #ifdef __cplusplus } #endif diff --git a/cpp/src/c_api/decompress_to_edgelist.cpp b/cpp/src/c_api/decompress_to_edgelist.cpp new file mode 100644 index 00000000000..75bf0c0fd60 --- /dev/null +++ b/cpp/src/c_api/decompress_to_edgelist.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_api/abstract_functor.hpp" +#include "c_api/core_result.hpp" +#include "c_api/edgelist.hpp" +#include "c_api/graph.hpp" +#include "c_api/resource_handle.hpp" +#include "c_api/utils.hpp" + +#include + +#include +#include +#include +#include + +#include + +namespace { + +struct decompress_to_edgelist_functor : public cugraph::c_api::abstract_functor { + raft::handle_t const& handle_; + cugraph::c_api::cugraph_graph_t* graph_{}; + + cugraph::c_api::cugraph_core_result_t const* core_result_{}; + bool do_expensive_check_{}; + cugraph::c_api::cugraph_edgelist_t* result_{}; + + decompress_to_edgelist_functor(cugraph_resource_handle_t const* handle, + cugraph_graph_t* graph, + bool do_expensive_check) + : abstract_functor(), + handle_(*reinterpret_cast(handle)->handle_), + graph_(reinterpret_cast(graph)), + do_expensive_check_(do_expensive_check) + { + } + + template + void operator()() + { + if constexpr (!cugraph::is_candidate::value) { + unsupported(); + } else { + if constexpr (store_transposed) { + error_code_ = cugraph::c_api:: + transpose_storage( + handle_, graph_, error_.get()); + if (error_code_ != CUGRAPH_SUCCESS) return; + } + + auto graph = + reinterpret_cast*>( + graph_->graph_); + + auto graph_view = graph->view(); + + auto edge_weights = reinterpret_cast, + weight_t>*>(graph_->edge_weights_); + + auto edge_ids = reinterpret_cast, + edge_t>*>(graph_->edge_ids_); + + auto edge_types = reinterpret_cast, + edge_type_type_t>*>(graph_->edge_types_); + + auto number_map = reinterpret_cast*>(graph_->number_map_); + + auto [result_src, result_dst, result_wgt, result_edge_id, result_edge_type] = + cugraph::decompress_to_edgelist( + handle_, + graph_view, + (edge_weights != nullptr) ? std::make_optional(edge_weights->view()) : std::nullopt, + (edge_ids != nullptr) ? std::make_optional(edge_ids->view()) : std::nullopt, + (edge_types != nullptr) ? std::make_optional(edge_types->view()) : std::nullopt, + (number_map != nullptr) ? std::make_optional>( + number_map->data(), number_map->size()) + : std::nullopt, + do_expensive_check_); + + result_ = new cugraph::c_api::cugraph_edgelist_t{ + new cugraph::c_api::cugraph_type_erased_device_array_t(result_src, graph_->vertex_type_), + new cugraph::c_api::cugraph_type_erased_device_array_t(result_dst, graph_->vertex_type_), + result_wgt ? 
new cugraph::c_api::cugraph_type_erased_device_array_t(*result_wgt, + graph_->weight_type_) + : NULL, + result_edge_id ? new cugraph::c_api::cugraph_type_erased_device_array_t(*result_edge_id, + graph_->edge_type_) + : NULL, + result_edge_type ? new cugraph::c_api::cugraph_type_erased_device_array_t( + *result_edge_type, graph_->edge_type_id_type_) + : NULL, + NULL}; + } + } +}; + +} // namespace + +extern "C" cugraph_error_code_t cugraph_decompress_to_edgelist( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + bool_t do_expensive_check, + cugraph_edgelist_t** result, + cugraph_error_t** error) +{ + decompress_to_edgelist_functor functor(handle, graph, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/src/c_api/edgelist.cpp b/cpp/src/c_api/edgelist.cpp new file mode 100644 index 00000000000..640b2bf2853 --- /dev/null +++ b/cpp/src/c_api/edgelist.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_api/edgelist.hpp" + +#include + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_sources( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return reinterpret_cast(internal_pointer->src_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_destinations( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return reinterpret_cast(internal_pointer->dst_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_weights( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return (internal_pointer->wgt_ == nullptr) + ? NULL + : reinterpret_cast( + internal_pointer->wgt_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_ids( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return (internal_pointer->edge_ids_ == nullptr) + ? NULL + : reinterpret_cast( + internal_pointer->edge_ids_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_type_ids( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return (internal_pointer->edge_type_ids_ == nullptr) + ? 
NULL + : reinterpret_cast( + internal_pointer->edge_type_ids_->view()); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_edgelist_get_edge_offsets( + cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + return reinterpret_cast( + internal_pointer->subgraph_offsets_->view()); +} + +extern "C" void cugraph_edgelist_free(cugraph_edgelist_t* edgelist) +{ + auto internal_pointer = reinterpret_cast(edgelist); + delete internal_pointer->src_; + delete internal_pointer->dst_; + delete internal_pointer->wgt_; + delete internal_pointer->edge_ids_; + delete internal_pointer->edge_type_ids_; + delete internal_pointer->subgraph_offsets_; + delete internal_pointer; +} diff --git a/cpp/src/c_api/edgelist.hpp b/cpp/src/c_api/edgelist.hpp new file mode 100644 index 00000000000..bc0f2d337f1 --- /dev/null +++ b/cpp/src/c_api/edgelist.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "c_api/array.hpp" + +namespace cugraph { +namespace c_api { + +struct cugraph_edgelist_t { + cugraph_type_erased_device_array_t* src_{}; + cugraph_type_erased_device_array_t* dst_{}; + cugraph_type_erased_device_array_t* wgt_{}; + cugraph_type_erased_device_array_t* edge_ids_{}; + cugraph_type_erased_device_array_t* edge_type_ids_{}; + cugraph_type_erased_device_array_t* subgraph_offsets_{}; +}; + +} // namespace c_api +} // namespace cugraph diff --git a/python/cugraph/cugraph/structure/graph_classes.py b/python/cugraph/cugraph/structure/graph_classes.py index 84234f7e904..90f809fa6c1 100644 --- a/python/cugraph/cugraph/structure/graph_classes.py +++ b/python/cugraph/cugraph/structure/graph_classes.py @@ -115,7 +115,6 @@ def from_cudf_edgelist( edge_type=None, renumber=True, store_transposed=False, - legacy_renum_only=False, symmetrize=None, ): """ @@ -168,13 +167,6 @@ def from_cudf_edgelist( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. - legacy_renum_only : bool, optional (default=False) - If True, skips the C++ renumbering step. Must be true for - pylibcugraph algorithms. Must be false for algorithms - not yet converted to the pylibcugraph C API. - - This parameter is deprecated and will be removed. - symmetrize: bool, optional (default=None) If True, symmetrize the edge list for an undirected graph. Setting this flag to True for a directed graph returns an error. The default @@ -210,7 +202,6 @@ def from_cudf_edgelist( edge_type=edge_type, renumber=renumber, store_transposed=store_transposed, - legacy_renum_only=legacy_renum_only, symmetrize=symmetrize, ) @@ -306,7 +297,6 @@ def from_dask_cudf_edgelist( edge_type=None, renumber=True, store_transposed=False, - legacy_renum_only=False, ): """ Initializes the distributed graph from the dask_cudf.DataFrame @@ -353,13 +343,6 @@ def from_dask_cudf_edgelist( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. 
- legacy_renum_only : bool, optional (default=False) - If True, skips the C++ renumbering step. Must be true for - pylibcugraph algorithms. Must be false for algorithms - not yet converted to the pylibcugraph C API. - - This parameter is deprecated and will be removed. - """ if self._Impl is None: @@ -378,7 +361,6 @@ def from_dask_cudf_edgelist( edge_type=edge_type, renumber=renumber, store_transposed=store_transposed, - legacy_renum_only=legacy_renum_only, ) # Move to Compat Module @@ -869,7 +851,6 @@ def from_cudf_edgelist( edge_attr=None, renumber=True, store_transposed=False, - legacy_renum_only=False, ): """ Initialize a graph from the edge list. It is an error to call this @@ -909,13 +890,6 @@ def from_cudf_edgelist( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. - legacy_renum_only : bool, optional (default=False) - If True, skips the C++ renumbering step. Must be true for - pylibcugraph algorithms. Must be false for algorithms - not yet converted to the pylibcugraph C API. - - This parameter is deprecated and will be removed. - Examples -------- >>> df = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ', @@ -945,7 +919,6 @@ def from_dask_cudf_edgelist( edge_attr=None, renumber=True, store_transposed=False, - legacy_renum_only=False, ): """ Initializes the distributed graph from the dask_cudf.DataFrame @@ -980,12 +953,6 @@ def from_dask_cudf_edgelist( If True, stores the transpose of the adjacency matrix. Required for certain algorithms. - legacy_renum_only : bool, optional (default=False) - If True, skips the C++ renumbering step. Must be true for - pylibcugraph algorithms. Must be false for algorithms - not yet converted to the pylibcugraph C API. - - This parameter is deprecated and will be removed. """ raise TypeError("Distributed N-partite graph not supported") diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 83dad234287..ced72a6bbe2 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -14,6 +14,7 @@ import gc from typing import Union, Iterable import warnings +from typing import Tuple import cudf import cupy as cp @@ -31,6 +32,7 @@ degrees as pylibcugraph_degrees, in_degrees as pylibcugraph_in_degrees, out_degrees as pylibcugraph_out_degrees, + decompress_to_edgelist as pylibcugraph_decompress_to_edgelist, ) from cugraph.structure.number_map import NumberMap @@ -172,7 +174,6 @@ def __from_edgelist( edge_type=None, renumber=True, store_transposed=False, - legacy_renum_only=False, symmetrize=None, ): if not isinstance(input_ddf, dask_cudf.DataFrame): @@ -333,9 +334,7 @@ def __from_edgelist( # the edgelist_df and not do any renumbering. # C++ renumbering is enabled by default for algorithms that # support it (but only called if renumbering is on) - self.compute_renumber_edge_list( - transposed=store_transposed, legacy_renum_only=legacy_renum_only - ) + self.compute_renumber_edge_list(transposed=store_transposed) if renumber is False: self.properties.renumbered = False @@ -979,6 +978,84 @@ def convert_to_cudf(cp_arrays): return ddf + def decompress_to_edgelist( + self, return_unrenumbered_edgelist: bool = True + ) -> dask_cudf.DataFrame: + """ + Extract a the edgelist from a graph. 
+ + Parameters + ---------- + return_unrenumbered_edgelist : bool (default=True) + Flag determining whether to return the original + input edgelist if 'True' or the renumbered one + of 'False' and the edgelist was renumbered. + + Returns + ------- + df : dask_cudf.cudf.DataFrame + Distributed GPU data frame containing all induced sources identifiers, + destination identifiers, and if applicable edge weights, edge ids and + edge types + """ + + # Initialize dask client + client = default_client() + + do_expensive_check = False + + def _call_decompress_to_edgelist( + sID: bytes, + mg_graph_x, + do_expensive_check: bool, + ) -> Tuple[cp.ndarray, cp.ndarray, cp.ndarray, cp.ndarray]: + return pylibcugraph_decompress_to_edgelist( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + do_expensive_check=do_expensive_check, + ) + + result = [ + client.submit( + _call_decompress_to_edgelist, + Comms.get_session_id(), + self._plc_graph[w], + do_expensive_check, + ) + for w in Comms.get_workers() + ] + wait(result) + + def convert_to_cudf(cp_arrays: cp.ndarray) -> cudf.DataFrame: + cp_src, cp_dst, cp_weight, cp_edge_ids, cp_edge_type_ids = cp_arrays + + df = cudf.DataFrame() + df["src"] = cp_src + df["dst"] = cp_dst + if cp_weight is not None: + df["weight"] = cp_weight + if cp_edge_ids is not None: + df["edge_ids"] = cp_edge_ids + if cp_edge_type_ids is not None: + df["edge_type_ids"] = cp_edge_type_ids + + return df + + cudf_result = [ + client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result + ] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + if self.properties.renumbered and return_unrenumbered_edgelist: + ddf = self.renumber_map.unrenumber(ddf, "src") + ddf = self.renumber_map.unrenumber(ddf, "dst") + + return ddf + def select_random_vertices( self, random_state: int = None, num_vertices: int = None ) -> Union[dask_cudf.Series, dask_cudf.DataFrame]: @@ -1214,7 +1291,7 @@ def neighbors(self, n): ddf = self.edgelist.edgelist_df return ddf[ddf["src"] == n]["dst"].reset_index(drop=True) - def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False): + def compute_renumber_edge_list(self, transposed=False): """ Compute a renumbered edge list This function works in the MNMG pipeline and will transform @@ -1237,20 +1314,9 @@ def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False): structure. If False, renumber with the intent to make a CSR-like structure. Defaults to False. - legacy_renum_only : (optional) bool - if True, The C++ renumbering will not be triggered. - This parameter is added for new algos following the - C/Pylibcugraph path - This parameter is deprecated and will be removed. """ - if legacy_renum_only: - warning_msg = ( - "The parameter 'legacy_renum_only' is deprecated and will be removed." 
- ) - warnings.warn(warning_msg, DeprecationWarning) - if not self.properties.renumber: self.edgelist = self.EdgeList(self.input_df) self.renumber_map = None @@ -1269,7 +1335,6 @@ def compute_renumber_edge_list(self, transposed=False, legacy_renum_only=False): self.source_columns, self.destination_columns, store_transposed=transposed, - legacy_renum_only=legacy_renum_only, ) self.edgelist = self.EdgeList(renumbered_ddf) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 858b114ebdc..4523b7f13b8 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -14,6 +14,7 @@ from cugraph.structure import graph_primtypes_wrapper from cugraph.structure.replicate_edgelist import replicate_cudf_dataframe from cugraph.structure.symmetrize import symmetrize as symmetrize_df +from pylibcugraph import decompress_to_edgelist as pylibcugraph_decompress_to_edgelist from cugraph.structure.number_map import NumberMap import cugraph.dask.common.mg_utils as mg_utils import cudf @@ -132,17 +133,9 @@ def __from_edgelist( edge_id=None, edge_type=None, renumber=True, - legacy_renum_only=False, store_transposed=False, symmetrize=None, ): - if legacy_renum_only: - warning_msg = ( - "The parameter 'legacy_renum_only' is deprecated and will be removed." - ) - warnings.warn( - warning_msg, - ) if self.properties.directed and symmetrize: raise ValueError( @@ -266,11 +259,7 @@ def __from_edgelist( if renumber: # FIXME: Should SG do lazy evaluation like MG? elist, renumber_map = NumberMap.renumber( - elist, - source, - destination, - store_transposed=False, - legacy_renum_only=legacy_renum_only, + elist, source, destination, store_transposed=False ) source = renumber_map.renumbered_src_col_name destination = renumber_map.renumbered_dst_col_name @@ -312,6 +301,8 @@ def __from_edgelist( # FIXME: if the user calls self.edgelist.edgelist_df after creating a # symmetric graph, return the symmetric edgelist? + # FIXME: For better memory footprint, avoid storing this edgelist and instead + # call decompress_to_edgelist to extract the edgelist from the graph self.edgelist = simpleGraphImpl.EdgeList( elist[source], elist[destination], value_col ) @@ -804,6 +795,64 @@ def get_two_hop_neighbors(self, start_vertices=None): return df + def decompress_to_edgelist( + self, return_unrenumbered_edgelist: bool = True + ) -> cudf.DataFrame: + """ + Extract a the edgelist from a graph. + + Parameters + ---------- + return_unrenumbered_edgelist : bool (default=True) + Flag determining whether to return the original input edgelist + if 'True' or the renumbered one of 'False' and the edgelist was + renumbered. 
+ + Returns + ------- + + df : cudf.DataFrame + GPU data frame containing all sources identifiers, + destination identifiers and if applicable edge weights, edge ids and + edge types + + Examples + -------- + >>> from cugraph.datasets import karate + >>> G = karate.get_graph(download=True) + >>> edgelist = G.decompress_to_edgelist() + + """ + + do_expensive_check = False + ( + source, + destination, + weight, + edge_ids, + edge_type_ids, + ) = pylibcugraph_decompress_to_edgelist( + resource_handle=ResourceHandle(), + graph=self._plc_graph, + do_expensive_check=do_expensive_check, + ) + + df = cudf.DataFrame() + df["src"] = source + df["dst"] = destination + if weight is not None: + df["weight"] = weight + if edge_ids is not None: + df["edge_ids"] = edge_ids + if edge_type_ids is not None: + df["edge_type_ids"] = edge_type_ids + + if self.properties.renumbered and return_unrenumbered_edgelist: + df, _ = self.renumber_map.unrenumber(df, "src", get_column_names=True) + df, _ = self.renumber_map.unrenumber(df, "dst", get_column_names=True) + + return df + def select_random_vertices( self, random_state: int = None, diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index b0118fee960..39738daff36 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -18,7 +18,6 @@ import dask_cudf import numpy as np import cudf -import warnings class NumberMap: @@ -462,12 +461,7 @@ def from_internal_vertex_id( @staticmethod def renumber_and_segment( - df, - src_col_names, - dst_col_names, - preserve_order=False, - store_transposed=False, - legacy_renum_only=False, + df, src_col_names, dst_col_names, preserve_order=False, store_transposed=False ): """ Given an input dataframe with its column names, this function returns the @@ -475,11 +469,6 @@ def renumber_and_segment( to external vertex IDs. the parameter 'preserve_order' ensures that the order of the edges is preserved during renumbering. """ - if legacy_renum_only: - warning_msg = ( - "The parameter 'legacy_renum_only' is deprecated and will be removed." 
- ) - warnings.warn(warning_msg, DeprecationWarning) renumbered = False @@ -584,20 +573,10 @@ def renumber_and_segment( @staticmethod def renumber( - df, - src_col_names, - dst_col_names, - preserve_order=False, - store_transposed=False, - legacy_renum_only=False, + df, src_col_names, dst_col_names, preserve_order=False, store_transposed=False ): return NumberMap.renumber_and_segment( - df, - src_col_names, - dst_col_names, - preserve_order, - store_transposed, - legacy_renum_only, + df, src_col_names, dst_col_names, preserve_order, store_transposed )[0:2] def unrenumber(self, df, column_name, preserve_order=False, get_column_names=False): diff --git a/python/cugraph/cugraph/tests/structure/test_graph.py b/python/cugraph/cugraph/tests/structure/test_graph.py index b3e517100e1..6fcfef726b1 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph.py +++ b/python/cugraph/cugraph/tests/structure/test_graph.py @@ -179,6 +179,58 @@ def test_add_edge_list_to_adj_list(graph_file): assert values_cu is None +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("is_directed", [True, False]) +@pytest.mark.parametrize("renumber", [True, False]) +def test_decompress_to_edgelist(graph_file, is_directed, renumber): + input_df = utils.read_csv_file(graph_file) + input_df = input_df.rename(columns={"0": "src", "1": "dst", "2": "weight"}) + + G = cugraph.Graph(directed=is_directed) + input_df_ = cudf.DataFrame() + if renumber: + input_df_["src_0"] = cudf.Series(input_df["src"]) + input_df_["dst_0"] = cudf.Series(input_df["dst"]) + input_df_["weight"] = cudf.Series(input_df["weight"]) + input_df_["src_1"] = input_df_["src_0"] + 1000 + input_df_["dst_1"] = input_df_["dst_0"] + 1000 + + input_df = input_df_ + source = ["src_0", "src_1"] + destination = ["dst_0", "dst_1"] + else: + source = "src" + destination = "dst" + + G.from_cudf_edgelist( + input_df, source=source, destination=destination, weight="weight", renumber=True + ) + + extracted_df = G.decompress_to_edgelist(return_unrenumbered_edgelist=True) + + if renumber: + extracted_df = extracted_df.rename( + columns={ + "0_src": "src_0", + "1_src": "src_1", + "0_dst": "dst_0", + "1_dst": "dst_1", + } + ) + extracted_df = extracted_df.sort_values( + ["src_0", "src_1", "dst_0", "dst_1"] + ).reset_index(drop=True) + input_df = input_df.sort_values( + ["src_0", "src_1", "dst_0", "dst_1"] + ).reset_index(drop=True) + else: + extracted_df = extracted_df.sort_values(["src", "dst"]).reset_index(drop=True) + input_df = input_df.sort_values(["src", "dst"]).reset_index(drop=True) + + assert_frame_equal(input_df, extracted_df, check_dtype=False, check_like=True) + + # Test @pytest.mark.sg @pytest.mark.parametrize("graph_file", utils.DATASETS) diff --git a/python/cugraph/cugraph/tests/structure/test_graph_mg.py b/python/cugraph/cugraph/tests/structure/test_graph_mg.py index f2cc1583f93..e5eeb0f653b 100644 --- a/python/cugraph/cugraph/tests/structure/test_graph_mg.py +++ b/python/cugraph/cugraph/tests/structure/test_graph_mg.py @@ -420,3 +420,57 @@ def test_graph_creation_properties(dask_client, graph_file, directed, renumber): assert sG.number_of_nodes() == mG.number_of_nodes() assert sG.number_of_edges() == mG.number_of_edges() assert_frame_equal(sG_edgelist_view, mG_edgelist_view, check_dtype=False) + + +@pytest.mark.parametrize("directed", [True, False]) +@pytest.mark.parametrize("renumber", [True, False]) +@pytest.mark.parametrize("graph_file", datasets) +def test_decompress_to_edgelist(dask_client, graph_file, 
directed, renumber): + input_df = utils.read_csv_file(graph_file) + input_df = input_df.rename(columns={"0": "src", "1": "dst", "2": "weight"}) + + G = cugraph.Graph(directed=directed) + input_df_ = cudf.DataFrame() + if renumber: + input_df_["src_0"] = cudf.Series(input_df["src"]) + input_df_["dst_0"] = cudf.Series(input_df["dst"]) + input_df_["weight"] = cudf.Series(input_df["weight"]) + input_df_["src_1"] = input_df_["src_0"] + 1000 + input_df_["dst_1"] = input_df_["dst_0"] + 1000 + + input_df = input_df_ + source = ["src_0", "src_1"] + destination = ["dst_0", "dst_1"] + else: + source = "src" + destination = "dst" + num_workers = len(Comms.get_workers()) + + input_ddf = dask_cudf.from_cudf(input_df, npartitions=num_workers) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist( + input_ddf, source=source, destination=destination, weight="weight" + ) + + extracted_df = ( + G.decompress_to_edgelist(return_unrenumbered_edgelist=True) + .compute() + .reset_index(drop=True) + ) + + if renumber: + extracted_df = extracted_df.rename( + columns={ + "0_src": "src_0", + "1_src": "src_1", + "0_dst": "dst_0", + "1_dst": "dst_1", + } + ) + extracted_df = extracted_df.sort_values( + ["src_0", "src_1", "dst_0", "dst_1"] + ).reset_index(drop=True) + input_df = input_df.sort_values( + ["src_0", "src_1", "dst_0", "dst_1"] + ).reset_index(drop=True) diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index fb46030bc56..fe7c4b64aa5 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -70,6 +70,7 @@ set(cython_sources homogeneous_biased_neighbor_sample.pyx homogeneous_uniform_neighbor_sample.pyx edge_id_lookup_table.pyx + decompress_to_edgelist.pyx ) set(linked_libraries cugraph::cugraph;cugraph::cugraph_c) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 5aa351f9ce1..9047144c13a 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -126,6 +126,8 @@ from pylibcugraph.degrees import in_degrees, out_degrees, degrees +from pylibcugraph.decompress_to_edgelist import decompress_to_edgelist + from pylibcugraph import exceptions diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd index b8f16cb94c8..b27a7230a13 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/graph_functions.pxd @@ -122,41 +122,41 @@ cdef extern from "cugraph_c/graph_functions.h": ########################################################################### # induced_subgraph - ctypedef struct cugraph_induced_subgraph_result_t: + ctypedef struct cugraph_induced_subgraph_result_t: # Deprecated pass cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_sources( + cugraph_induced_subgraph_get_sources( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_destinations( + cugraph_induced_subgraph_get_destinations( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_edge_weights( + cugraph_induced_subgraph_get_edge_weights( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef 
cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_edge_ids( + cugraph_induced_subgraph_get_edge_ids( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_edge_type_ids( + cugraph_induced_subgraph_get_edge_type_ids( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef cugraph_type_erased_device_array_view_t* \ - cugraph_induced_subgraph_get_subgraph_offsets( + cugraph_induced_subgraph_get_subgraph_offsets( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) cdef void \ - cugraph_induced_subgraph_result_free( + cugraph_induced_subgraph_result_free( # Deprecated cugraph_induced_subgraph_result_t* induced_subgraph ) @@ -250,3 +250,52 @@ cdef extern from "cugraph_c/graph_functions.h": cugraph_degrees_result_free( cugraph_degrees_result_t* degrees_result ) + + ########################################################################### + # decompress to edgelist + ctypedef struct cugraph_edgelist_t: + pass + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_sources( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_destinations( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_edge_weights( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_edge_ids( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_edge_type_ids( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_edgelist_get_edge_offsets( + cugraph_edgelist_t* edgelist + ) + + cdef void \ + cugraph_edgelist_free( + cugraph_edgelist_t* edgelist + ) + + cdef cugraph_error_code_t \ + cugraph_decompress_to_edgelist( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + bool_t do_expensive_check, + cugraph_edgelist_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx b/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx new file mode 100644 index 00000000000..58c29940aba --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/decompress_to_edgelist.pyx @@ -0,0 +1,169 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + + +from pylibcugraph._cugraph_c.types cimport ( + bool_t, +) +from pylibcugraph._cugraph_c.resource_handle cimport ( + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_edgelist_t, + cugraph_decompress_to_edgelist, + cugraph_edgelist_get_sources, + cugraph_edgelist_get_destinations, + cugraph_edgelist_get_edge_weights, + cugraph_edgelist_get_edge_ids, + cugraph_edgelist_get_edge_type_ids, + cugraph_edgelist_get_edge_offsets, + cugraph_edgelist_free, +) + +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, +) + + +def decompress_to_edgelist(ResourceHandle resource_handle, + _GPUGraph graph, + bool_t do_expensive_check): + """ + Extract a the edgelist from a graph + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph. + + do_expensive_check : bool_t + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the sources, destinations and if applicable + edge_weights, edge_ids and/or edge_type_ids. + + Examples + -------- + >>> import pylibcugraph, cupy, numpy + >>> srcs = cupy.asarray([0, 1, 1, 2, 2, 2, 3, 4], dtype=numpy.int32) + >>> dsts = cupy.asarray([1, 3, 4, 0, 1, 3, 5, 5], dtype=numpy.int32) + >>> weights = cupy.asarray( + ... [0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], dtype=numpy.float32) + >>> resource_handle = pylibcugraph.ResourceHandle() + >>> graph_props = pylibcugraph.GraphProperties( + ... is_symmetric=False, is_multigraph=False) + >>> G = pylibcugraph.SGGraph( + ... resource_handle, graph_props, srcs, dsts, weight_array=weights, + ... store_transposed=False, renumber=False, do_expensive_check=False) + >>> (sources, destinations, edge_weights, _, _) = + ... pylibcugraph.decompress_to_edgelist( + ... resource_handle, G, False) + >>> sources + [0, 1, 1, 2, 2, 2, 3, 4] + >>> destinations + [1, 3, 4, 0, 1, 3, 5, 5] + >>> edge_weights + [0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2] + """ + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + cdef cugraph_edgelist_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + error_code = cugraph_decompress_to_edgelist(c_resource_handle_ptr, + c_graph_ptr, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_decompress_to_edgelist") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* sources_ptr = \ + cugraph_edgelist_get_sources(result_ptr) + cdef cugraph_type_erased_device_array_view_t* destinations_ptr = \ + cugraph_edgelist_get_destinations(result_ptr) + cdef cugraph_type_erased_device_array_view_t* edge_weights_ptr = \ + cugraph_edgelist_get_edge_weights(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* edge_ids_ptr = \ + cugraph_edgelist_get_edge_ids(result_ptr) + cdef cugraph_type_erased_device_array_view_t* edge_type_ids_ptr = \ + cugraph_edgelist_get_edge_type_ids(result_ptr) + + + """ + cdef cugraph_type_erased_device_array_view_t* subgraph_offsets_ptr = \ + cugraph_edgelist_get_edge_offsets(result_ptr) + """ + + # FIXME: Get ownership of the result data instead of performing a copy + # for perfomance improvement + cupy_edge_weights = None + cupy_edge_ids = None + cupy_edge_type_ids = None + cupy_sources = copy_to_cupy_array( + c_resource_handle_ptr, sources_ptr) + cupy_destinations = copy_to_cupy_array( + c_resource_handle_ptr, destinations_ptr) + if edge_weights_ptr != NULL: + cupy_edge_weights = copy_to_cupy_array( + c_resource_handle_ptr, edge_weights_ptr) + if edge_ids_ptr != NULL: + cupy_edge_ids = copy_to_cupy_array( + c_resource_handle_ptr, edge_ids_ptr) + if edge_type_ids_ptr != NULL: + cupy_edge_type_ids = copy_to_cupy_array( + c_resource_handle_ptr, edge_type_ids_ptr) + + """ + cupy_subgraph_offsets = copy_to_cupy_array( + c_resource_handle_ptr, subgraph_offsets_ptr) + """ + + # Free pointer + cugraph_edgelist_free(result_ptr) + + return (cupy_sources, cupy_destinations, + cupy_edge_weights, cupy_edge_ids, cupy_edge_type_ids) From 5e19f4ad74dd9cfb01fbf9e4b2c07f86f2c72c49 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Fri, 22 Nov 2024 12:09:34 -0500 Subject: [PATCH 3/4] Remove GNN Packages (#4765) Removes the GNN packages, which are now part of [cugraph-gnn](https://github.com/rapidsai/cugraph-gnn). 
Authors: - Alex Barghi (https://github.com/alexbarghi-nv) - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Chuck Hastings (https://github.com/ChuckHastings) - Bradley Dice (https://github.com/bdice) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4765 --- .devcontainer/Dockerfile | 3 - README.md | 1 - build.sh | 24 - ci/release/update-version.sh | 4 +- ci/run_cugraph_dgl_pytests.sh | 9 - ci/run_cugraph_pyg_pytests.sh | 18 - ci/test.sh | 7 - .../all_cuda-118_arch-x86_64.yaml | 1 - .../all_cuda-125_arch-x86_64.yaml | 1 - conda/recipes/cugraph-dgl/build.sh | 7 - conda/recipes/cugraph-dgl/meta.yaml | 46 - conda/recipes/cugraph-pyg/build.sh | 5 - .../cugraph-pyg/conda_build_config.yaml | 19 - conda/recipes/cugraph-pyg/meta.yaml | 50 - dependencies.yaml | 151 +- python/.coveragerc | 2 - python/cugraph-dgl/LICENSE | 1 - python/cugraph-dgl/README.md | 57 - .../conda/cugraph_dgl_dev_cuda-118.yaml | 24 - python/cugraph-dgl/cugraph_dgl/VERSION | 1 - python/cugraph-dgl/cugraph_dgl/__init__.py | 27 - python/cugraph-dgl/cugraph_dgl/_version.py | 31 - python/cugraph-dgl/cugraph_dgl/convert.py | 93 -- .../cugraph_dgl/cugraph_storage.py | 714 --------- .../cugraph_dgl/dataloading/__init__.py | 35 - .../dataloading/dask_dataloader.py | 321 ---- .../cugraph_dgl/dataloading/dataloader.py | 162 -- .../cugraph_dgl/dataloading/dataset.py | 218 --- .../dataloading/neighbor_sampler.py | 206 --- .../cugraph_dgl/dataloading/sampler.py | 196 --- .../cugraph_dgl/dataloading/utils/__init__.py | 12 - .../utils/extract_graph_helpers.py | 73 - .../dataloading/utils/sampling_helpers.py | 692 --------- python/cugraph-dgl/cugraph_dgl/features.py | 121 -- python/cugraph-dgl/cugraph_dgl/graph.py | 931 ------------ python/cugraph-dgl/cugraph_dgl/nn/__init__.py | 13 - .../cugraph_dgl/nn/conv/__init__.py | 28 - .../cugraph-dgl/cugraph_dgl/nn/conv/base.py | 376 ----- .../cugraph_dgl/nn/conv/gatconv.py | 314 ---- .../cugraph_dgl/nn/conv/gatv2conv.py | 254 ---- .../cugraph_dgl/nn/conv/relgraphconv.py | 192 --- .../cugraph_dgl/nn/conv/sageconv.py | 166 --- .../cugraph_dgl/nn/conv/transformerconv.py | 173 --- .../cugraph-dgl/cugraph_dgl/tests/__init__.py | 12 - .../cugraph-dgl/cugraph_dgl/tests/conftest.py | 68 - .../tests/dataloading/test_dask_dataloader.py | 153 -- .../dataloading/test_dask_dataloader_mg.py | 121 -- .../tests/dataloading/test_dataloader.py | 175 --- .../tests/dataloading/test_dataloader_mg.py | 254 ---- .../tests/dataloading/test_dataset.py | 128 -- .../cugraph_dgl/tests/nn/test_gatconv.py | 185 --- .../cugraph_dgl/tests/nn/test_gatv2conv.py | 182 --- .../cugraph_dgl/tests/nn/test_relgraphconv.py | 112 -- .../cugraph_dgl/tests/nn/test_sageconv.py | 100 -- .../cugraph_dgl/tests/nn/test_sparsegraph.py | 60 - .../tests/nn/test_transformerconv.py | 93 -- .../cugraph_dgl/tests/test_cugraph_storage.py | 150 -- .../tests/test_from_dgl_heterograph.py | 201 --- .../cugraph_dgl/tests/test_graph.py | 217 --- .../cugraph_dgl/tests/test_graph_mg.py | 310 ---- .../cugraph_dgl/tests/test_utils.py | 206 --- python/cugraph-dgl/cugraph_dgl/tests/utils.py | 154 -- python/cugraph-dgl/cugraph_dgl/typing.py | 40 - .../cugraph-dgl/cugraph_dgl/utils/__init__.py | 13 - .../utils/cugraph_conversion_utils.py | 130 -- .../utils/cugraph_storage_utils.py | 85 -- .../cugraph_dgl/utils/feature_storage.py | 58 - python/cugraph-dgl/cugraph_dgl/view.py | 346 ----- .../examples/dataset_from_disk_cudf.ipynb | 269 ---- 
 .../cugraph-dgl/examples/graphsage/README.MD | 26 -
 .../graphsage/node-classification-dask.py | 272 ----
 .../examples/graphsage/node-classification.py | 283 ----
 .../multi_trainer_MG_example/model.py | 142 --
 .../multi_trainer_MG_example/workflow_mnmg.py | 311 ----
 .../multi_trainer_MG_example/workflow_snmg.py | 242 ---
 python/cugraph-dgl/pyproject.toml | 64 -
 python/cugraph-dgl/tests/test_version.py | 12 -
 python/cugraph-pyg/LICENSE | 1 -
 .../conda/cugraph_pyg_dev_cuda-118.yaml | 24 -
 python/cugraph-pyg/cugraph_pyg/VERSION | 1 -
 python/cugraph-pyg/cugraph_pyg/__init__.py | 19 -
 python/cugraph-pyg/cugraph_pyg/_version.py | 36 -
 .../cugraph-pyg/cugraph_pyg/data/__init__.py | 26 -
 .../cugraph_pyg/data/dask_graph_store.py | 1321 -----------------
 .../cugraph_pyg/data/feature_store.py | 288 ----
 .../cugraph_pyg/data/graph_store.py | 365 -----
 .../examples/cugraph_dist_sampling_mg.py | 112 --
 .../examples/cugraph_dist_sampling_sg.py | 82 -
 .../cugraph_pyg/examples/gcn_dist_mnmg.py | 446 ------
 .../cugraph_pyg/examples/gcn_dist_sg.py | 236 ---
 .../cugraph_pyg/examples/gcn_dist_snmg.py | 339 -----
 .../cugraph_pyg/examples/graph_sage_mg.py | 446 ------
 .../cugraph_pyg/examples/graph_sage_sg.py | 217 ---
 .../cugraph_pyg/examples/pylibcugraph_mg.py | 100 --
 .../cugraph_pyg/examples/pylibcugraph_sg.py | 66 -
 .../examples/rgcn_link_class_mnmg.py | 418 ------
 .../examples/rgcn_link_class_sg.py | 219 ---
 .../examples/rgcn_link_class_snmg.py | 320 ----
 .../cugraph_pyg/examples/start_dask.sh | 21 -
 .../cugraph_pyg/loader/__init__.py | 31 -
 .../cugraph_pyg/loader/dask_node_loader.py | 558 -------
 .../cugraph_pyg/loader/link_loader.py | 205 ---
 .../loader/link_neighbor_loader.py | 243 ---
 .../cugraph_pyg/loader/neighbor_loader.py | 233 ---
 .../cugraph_pyg/loader/node_loader.py | 151 --
 python/cugraph-pyg/cugraph_pyg/nn/__init__.py | 14 -
 .../cugraph_pyg/nn/conv/__init__.py | 28 -
 .../cugraph-pyg/cugraph_pyg/nn/conv/base.py | 190 ---
 .../cugraph_pyg/nn/conv/gat_conv.py | 259 ----
 .../cugraph_pyg/nn/conv/gatv2_conv.py | 241 ---
 .../cugraph_pyg/nn/conv/hetero_gat_conv.py | 266 ----
 .../cugraph_pyg/nn/conv/rgcn_conv.py | 144 --
 .../cugraph_pyg/nn/conv/sage_conv.py | 151 --
 .../cugraph_pyg/nn/conv/transformer_conv.py | 214 ---
 .../cugraph_pyg/sampler/__init__.py | 14 -
 .../cugraph_pyg/sampler/sampler.py | 540 -------
 .../cugraph_pyg/sampler/sampler_utils.py | 531 -------
 .../cugraph-pyg/cugraph_pyg/tests/conftest.py | 315 ----
 .../tests/data/test_dask_graph_store.py | 413 ------
 .../tests/data/test_dask_graph_store_mg.py | 424 ------
 .../tests/data/test_feature_store.py | 44 -
 .../tests/data/test_feature_store_mg.py | 85 --
 .../tests/data/test_graph_store.py | 45 -
 .../tests/data/test_graph_store_mg.py | 45 -
 .../tests/loader/test_dask_neighbor_loader.py | 543 -------
 .../loader/test_dask_neighbor_loader_mg.py | 77 -
 .../tests/loader/test_neighbor_loader.py | 196 ---
 .../tests/loader/test_neighbor_loader_mg.py | 364 -----
 .../cugraph_pyg/tests/nn/test_gat_conv.py | 141 --
 .../cugraph_pyg/tests/nn/test_gatv2_conv.py | 101 --
 .../tests/nn/test_hetero_gat_conv.py | 132 --
 .../cugraph_pyg/tests/nn/test_rgcn_conv.py | 89 --
 .../cugraph_pyg/tests/nn/test_sage_conv.py | 105 --
 .../tests/nn/test_transformer_conv.py | 115 --
 .../cugraph-pyg/cugraph_pyg/tests/pytest.ini | 4 -
 .../tests/sampler/test_sampler_utils.py | 196 ---
 .../tests/sampler/test_sampler_utils_mg.py | 235 ---
 .../cugraph_pyg/tests/test_version.py | 12 -
 .../cugraph-pyg/cugraph_pyg/utils/__init__.py | 12 -
 .../cugraph-pyg/cugraph_pyg/utils/imports.py | 32 -
 python/cugraph-pyg/pyproject.toml | 70 -
 python/cugraph-pyg/pytest.ini | 40 -
 readme_pages/cugraph_dgl.md | 27 -
 readme_pages/cugraph_pyg.md | 22 -
 readme_pages/gnn_support.md | 2 +-
 145 files changed, 7 insertions(+), 23948 deletions(-)
 delete mode 100755 ci/run_cugraph_dgl_pytests.sh
 delete mode 100755 ci/run_cugraph_pyg_pytests.sh
 delete mode 100644 conda/recipes/cugraph-dgl/build.sh
 delete mode 100644 conda/recipes/cugraph-dgl/meta.yaml
 delete mode 100644 conda/recipes/cugraph-pyg/build.sh
 delete mode 100644 conda/recipes/cugraph-pyg/conda_build_config.yaml
 delete mode 100644 conda/recipes/cugraph-pyg/meta.yaml
 delete mode 120000 python/cugraph-dgl/LICENSE
 delete mode 100644 python/cugraph-dgl/README.md
 delete mode 100644 python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
 delete mode 120000 python/cugraph-dgl/cugraph_dgl/VERSION
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/__init__.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/_version.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/convert.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/cugraph_storage.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/utils/__init__.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/features.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/graph.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/nn/__init__.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/base.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/__init__.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/conftest.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py
 delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py
 delete mode 100644
python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/test_graph.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/test_utils.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/tests/utils.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/typing.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/utils/__init__.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/utils/cugraph_storage_utils.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/utils/feature_storage.py delete mode 100644 python/cugraph-dgl/cugraph_dgl/view.py delete mode 100644 python/cugraph-dgl/examples/dataset_from_disk_cudf.ipynb delete mode 100644 python/cugraph-dgl/examples/graphsage/README.MD delete mode 100644 python/cugraph-dgl/examples/graphsage/node-classification-dask.py delete mode 100644 python/cugraph-dgl/examples/graphsage/node-classification.py delete mode 100644 python/cugraph-dgl/examples/multi_trainer_MG_example/model.py delete mode 100644 python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py delete mode 100644 python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py delete mode 100644 python/cugraph-dgl/pyproject.toml delete mode 100644 python/cugraph-dgl/tests/test_version.py delete mode 120000 python/cugraph-pyg/LICENSE delete mode 100644 python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml delete mode 120000 python/cugraph-pyg/cugraph_pyg/VERSION delete mode 100644 python/cugraph-pyg/cugraph_pyg/__init__.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/_version.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/data/__init__.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/data/feature_store.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/data/graph_store.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_sg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py delete mode 100755 python/cugraph-pyg/cugraph_pyg/examples/start_dask.sh delete mode 100644 python/cugraph-pyg/cugraph_pyg/loader/__init__.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/loader/link_loader.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py delete mode 100644 
python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/loader/node_loader.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/__init__.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/base.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/sampler/__init__.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/sampler/sampler.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/conftest.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/pytest.ini delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/tests/test_version.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/utils/__init__.py delete mode 100644 python/cugraph-pyg/cugraph_pyg/utils/imports.py delete mode 100644 python/cugraph-pyg/pyproject.toml delete mode 100644 python/cugraph-pyg/pytest.ini delete mode 100644 readme_pages/cugraph_dgl.md delete mode 100644 readme_pages/cugraph_pyg.md diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9ff575865e3..96e491f4cb4 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -33,6 +33,3 @@ ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" - -# cugraph_pyg's setup.py needs this defined when building in a conda env -ENV 
CUDA_HOME="${CUDA_HOME:-/home/coder/.conda/envs/$DEFAULT_CONDA_ENV}" diff --git a/README.md b/README.md index e41caec17b0..857406075e0 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,6 @@ That's it. NetworkX now leverages cuGraph for accelerated graph algorithms. - [libcugraph (C/C++/CUDA)](./readme_pages/libcugraph.md) - [nx-cugraph](https://rapids.ai/nx-cugraph/) - [cugraph-service](./readme_pages/cugraph_service.md) - - [cugraph-dgl](./readme_pages/cugraph_dgl.md) - [cugraph-ops](./readme_pages/cugraph_ops.md) - API Docs - Python diff --git a/build.sh b/build.sh index 398582c74c0..756045461dd 100755 --- a/build.sh +++ b/build.sh @@ -29,8 +29,6 @@ VALIDARGS=" pylibcugraph cugraph cugraph-service - cugraph-pyg - cugraph-dgl cpp-mgtests cpp-mtmgtests docs @@ -56,8 +54,6 @@ HELP="$0 [ ...] [ ...] pylibcugraph - build the pylibcugraph Python package cugraph - build the cugraph Python package cugraph-service - build the cugraph-service_client and cugraph-service_server Python package - cugraph-pyg - build the cugraph-pyg Python package - cugraph-dgl - build the cugraph-dgl extensions for DGL cpp-mgtests - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency. cpp-mtmgtests - build libcugraph MTMG tests. Adds UCX as a dependency (temporary). docs - build the docs @@ -84,12 +80,10 @@ LIBCUGRAPH_ETL_BUILD_DIR=${LIBCUGRAPH_ETL_BUILD_DIR:=${REPODIR}/cpp/libcugraph_e CUGRAPH_SERVICE_BUILD_DIRS="${REPODIR}/python/cugraph-service/server/build ${REPODIR}/python/cugraph-service/client/build " -CUGRAPH_DGL_BUILD_DIR=${REPODIR}/python/cugraph-dgl/build BUILD_DIRS="${LIBCUGRAPH_BUILD_DIR} ${LIBCUGRAPH_ETL_BUILD_DIR} ${CUGRAPH_SERVICE_BUILD_DIRS} - ${CUGRAPH_DGL_BUILD_DIR} " # Set defaults for vars modified by flags to this script @@ -325,24 +319,6 @@ if hasArg cugraph-service || hasArg all; then fi fi -# Build and install the cugraph-pyg Python package -if hasArg cugraph-pyg || hasArg all; then - if hasArg --clean; then - cleanPythonDir ${REPODIR}/python/cugraph-pyg - else - python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-pyg - fi -fi - -# Install the cugraph-dgl extensions for DGL -if hasArg cugraph-dgl || hasArg all; then - if hasArg --clean; then - cleanPythonDir ${REPODIR}/python/cugraph-dgl - else - python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-dgl - fi -fi - # Build the docs if hasArg docs || hasArg all; then if [ ! -d ${LIBCUGRAPH_BUILD_DIR} ]; then diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index be1988e31dd..961f7816caa 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -51,8 +51,6 @@ NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; p DEPENDENCIES=( cudf cugraph - cugraph-dgl - cugraph-pyg cugraph-service-server cugraph-service-client cuxfilter @@ -75,7 +73,7 @@ DEPENDENCIES=( UCXX_DEPENDENCIES=( ucx-py ) -for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do +for FILE in dependencies.yaml conda/environments/*.yaml; do for DEP in "${DEPENDENCIES[@]}"; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done diff --git a/ci/run_cugraph_dgl_pytests.sh b/ci/run_cugraph_dgl_pytests.sh deleted file mode 100755 index 83c26a57dc0..00000000000 --- a/ci/run_cugraph_dgl_pytests.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -set -euo pipefail - -# Support invoking run_cugraph_dgl_pytests.sh outside the script directory -cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-dgl/tests - -pytest --cache-clear --ignore=mg "$@" . diff --git a/ci/run_cugraph_pyg_pytests.sh b/ci/run_cugraph_pyg_pytests.sh deleted file mode 100755 index fb27f16d79e..00000000000 --- a/ci/run_cugraph_pyg_pytests.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, NVIDIA CORPORATION. - -set -euo pipefail - -# Support invoking run_cugraph_pyg_pytests.sh outside the script directory -cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg - -pytest --cache-clear --benchmark-disable "$@" . - -# Used to skip certain examples in CI due to memory limitations -export CI_RUN=1 - -# Test examples -for e in "$(pwd)"/examples/*.py; do - rapids-logger "running example $e" - (yes || true) | python $e -done diff --git a/ci/test.sh b/ci/test.sh index 884ed7ac881..8e19b6c8c18 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -99,13 +99,6 @@ if hasArg "--run-python-tests"; then pytest -sv -m sg -m "managedmem_on and poolallocator_on and tiny" --benchmark-disable echo "Ran Python benchmarks for cuGraph (running as tests) : return code was: $?, test script exit code is now: $EXITCODE" - echo "Python pytest for cugraph_pyg (single-GPU only)..." - conda list - cd ${CUGRAPH_ROOT}/python/cugraph-pyg/cugraph_pyg - # rmat is not tested because of MG testing - pytest -sv -m sg --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph_pyg --cov-report=xml:${WORKSPACE}/python/cugraph_pyg/cugraph-coverage.xml --cov-report term --ignore=raft --benchmark-disable - echo "Ran Python pytest for cugraph_pyg : return code was: $?, test script exit code is now: $EXITCODE" - echo "Python pytest for cugraph-service (single-GPU only)..." cd ${CUGRAPH_ROOT}/python/cugraph-service pytest -sv --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-service-pytests.xml --benchmark-disable -k "not mg" ./tests diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index ec4c23541f9..e7fa6d4ee42 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- dglteam/label/th23_cu118 - conda-forge - nvidia dependencies: diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index de0507c7c22..7b7ac92b59b 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- dglteam/label/th23_cu118 - conda-forge - nvidia dependencies: diff --git a/conda/recipes/cugraph-dgl/build.sh b/conda/recipes/cugraph-dgl/build.sh deleted file mode 100644 index 14d29b7eab9..00000000000 --- a/conda/recipes/cugraph-dgl/build.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -# Copyright (c) 2022, NVIDIA CORPORATION. - -# This assumes the script is executed from the root of the repo directory - -./build.sh cugraph-dgl diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml deleted file mode 100644 index ed449b5982a..00000000000 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: cugraph-dgl - version: {{ version }} - -source: - path: ../../.. - -build: - number: {{ GIT_DESCRIBE_NUMBER }} - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - -requirements: - host: - - python - - rapids-build-backend>=0.3.1,<0.4.0.dev0 - - setuptools>=61.0.0 - run: - - cugraph ={{ version }} - - dgl >=2.4.0.th23.cu* - - numba >=0.57 - - numpy >=1.23,<3.0a0 - - pylibcugraphops ={{ minor_version }} - - tensordict >=0.1.2 - - python - - pytorch >=2.3 - - cupy >=12.0.0 - -tests: - imports: - - cugraph_dgl - -about: - home: https://rapids.ai/ - dev_url: https://github.com/rapidsai/cugraph - license: Apache-2.0 - license_file: ../../../LICENSE - summary: cuGraph library diff --git a/conda/recipes/cugraph-pyg/build.sh b/conda/recipes/cugraph-pyg/build.sh deleted file mode 100644 index ad2502985e5..00000000000 --- a/conda/recipes/cugraph-pyg/build.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -# Copyright (c) 2022, NVIDIA CORPORATION. - -# This assumes the script is executed from the root of the repo directory -./build.sh cugraph-pyg --allgpuarch diff --git a/conda/recipes/cugraph-pyg/conda_build_config.yaml b/conda/recipes/cugraph-pyg/conda_build_config.yaml deleted file mode 100644 index 47d98b4800b..00000000000 --- a/conda/recipes/cugraph-pyg/conda_build_config.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -c_compiler_version: - - 11 - -cxx_compiler_version: - - 11 - -cuda_compiler: - - nvcc - -cmake_version: - - ">=3.26.4,!=3.30.0" - -c_stdlib: - - sysroot - -c_stdlib_version: - - "2.17" diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml deleted file mode 100644 index b44614baa9f..00000000000 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: cugraph-pyg - version: {{ version }} - -source: - path: ../../.. 
- -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - PARALLEL_LEVEL - -requirements: - build: - - {{ stdlib("c") }} - host: - - cython >=3.0.0 - - python - - rapids-build-backend>=0.3.1,<0.4.0.dev0 - - setuptools>=61.0.0 - run: - - rapids-dask-dependency ={{ minor_version }} - - numba >=0.57 - - numpy >=1.23,<3.0a0 - - python - - pytorch >=2.3 - - cupy >=12.0.0 - - cugraph ={{ version }} - - pylibcugraphops ={{ minor_version }} - - tensordict >=0.1.2 - - pytorch_geometric >=2.5,<2.6 - -tests: - imports: - - cugraph_pyg - -about: - home: https://rapids.ai/ - dev_url: https://github.com/rapidsai/cugraph - license: Apache-2.0 - license_file: ../../../LICENSE - summary: cuGraph-pyg library diff --git a/dependencies.yaml b/dependencies.yaml index f959f4089f5..7b25ac05d62 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -26,8 +26,6 @@ files: - depends_on_cupy - depends_on_pytorch - python_run_cugraph - - python_run_cugraph_dgl - - python_run_cugraph_pyg - test_notebook - test_python_common - test_python_cugraph @@ -148,60 +146,6 @@ files: - depends_on_cudf - test_python_common - test_python_pylibcugraph - py_build_cugraph_dgl: - output: pyproject - pyproject_dir: python/cugraph-dgl - extras: - table: build-system - includes: - - python_build_rapids - - python_build_wheel - py_run_cugraph_dgl: - output: pyproject - pyproject_dir: python/cugraph-dgl - extras: - table: project - includes: - - python_run_cugraph_dgl - # Deprecate pylibcugraphops - - depends_on_pylibcugraphops - py_test_cugraph_dgl: - output: pyproject - pyproject_dir: python/cugraph-dgl - extras: - table: project.optional-dependencies - key: test - includes: - - test_python_common - - depends_on_pylibwholegraph - - depends_on_pytorch - py_build_cugraph_pyg: - output: pyproject - pyproject_dir: python/cugraph-pyg - extras: - table: build-system - includes: - - python_build_rapids - - python_build_wheel - py_run_cugraph_pyg: - output: pyproject - pyproject_dir: python/cugraph-pyg - extras: - table: project - includes: - - python_run_cugraph_pyg - # Deprecate pylibcugraphops - - depends_on_pylibcugraphops - py_test_cugraph_pyg: - output: pyproject - pyproject_dir: python/cugraph-pyg - extras: - table: project.optional-dependencies - key: test - includes: - - test_python_common - - depends_on_pylibwholegraph - - depends_on_pytorch py_build_cugraph_service_client: output: pyproject pyproject_dir: python/cugraph-service/client @@ -245,33 +189,10 @@ files: includes: - test_python_common - test_python_cugraph - cugraph_dgl_dev: - matrix: - cuda: ["11.8"] - output: conda - conda_dir: python/cugraph-dgl/conda - includes: - - checks - # Deprecate pylibcugraphops - - depends_on_pylibcugraphops - - cugraph_dgl_dev - - test_python_common - cugraph_pyg_dev: - matrix: - cuda: ["11.8"] - output: conda - conda_dir: python/cugraph-pyg/conda - includes: - - checks - # Deprecate pylibcugraphops - - depends_on_pylibcugraphops - - cugraph_pyg_dev - - test_python_common channels: - rapidsai - rapidsai-nightly - dask/label/dev - - dglteam/label/th23_cu118 - conda-forge - nvidia dependencies: @@ -501,50 +422,6 @@ dependencies: - matrix: packages: - *ucx_py_unsuffixed - python_run_cugraph_dgl: - common: - - output_types: [conda, pyproject] - packages: - - *numba - - *numpy - specific: - - output_types: [pyproject] - matrices: - - matrix: - cuda: "11.*" - cuda_suffixed: "true" - packages: - - &cugraph_cu11 cugraph-cu11==24.12.*,>=0.0.0a0 - - 
matrix: - cuda: "12.*" - cuda_suffixed: "true" - packages: - - &cugraph_cu12 cugraph-cu12==24.12.*,>=0.0.0a0 - - matrix: - packages: - - &cugraph_unsuffixed cugraph==24.12.*,>=0.0.0a0 - python_run_cugraph_pyg: - common: - - output_types: [conda, pyproject] - packages: - - *numba - - *numpy - specific: - - output_types: [pyproject] - matrices: - - matrix: - cuda: "11.*" - cuda_suffixed: "true" - packages: - - *cugraph_cu11 - - matrix: - cuda: "12.*" - cuda_suffixed: "true" - packages: - - *cugraph_cu12 - - matrix: - packages: - - *cugraph_unsuffixed python_run_cugraph_service_client: common: - output_types: [conda, pyproject] @@ -569,19 +446,19 @@ dependencies: cuda: "11.*" cuda_suffixed: "true" packages: - - *cugraph_cu11 + - &cugraph_cu11 cugraph-cu11==24.12.*,>=0.0.0a0 - cugraph-service-client-cu11==24.12.*,>=0.0.0a0 - *ucx_py_cu11 - matrix: cuda: "12.*" cuda_suffixed: "true" packages: - - *cugraph_cu12 + - &cugraph_cu12 cugraph-cu12==24.12.*,>=0.0.0a0 - cugraph-service-client-cu12==24.12.*,>=0.0.0a0 - *ucx_py_cu12 - matrix: packages: - - *cugraph_unsuffixed + - &cugraph_unsuffixed cugraph==24.12.*,>=0.0.0a0 - cugraph-service-client==24.12.*,>=0.0.0a0 - *ucx_py_unsuffixed test_cpp: @@ -625,30 +502,12 @@ dependencies: - output_types: [conda, pyproject] packages: - *numpy - cugraph_dgl_dev: - common: - - output_types: [conda] - packages: - - *cugraph_unsuffixed - - &pytorch_conda pytorch>=2.3 - - pytorch-cuda==11.8 - - &tensordict tensordict>=0.1.2 - - dgl>=2.4.0.cu* - cugraph_pyg_dev: - common: - - output_types: [conda] - packages: - - *cugraph_unsuffixed - - *pytorch_conda - - pytorch-cuda==11.8 - - *tensordict - - pytorch_geometric>=2.5,<2.6 depends_on_pytorch: common: - output_types: [conda] packages: - - *pytorch_conda + - &pytorch_conda pytorch>=2.3 - torchdata - pydantic - ogb @@ -669,7 +528,7 @@ dependencies: - matrix: {cuda: "12.*"} packages: - &pytorch_pip torch>=2.3 - - *tensordict + - &tensordict tensordict>=0.1.2 - matrix: {cuda: "11.*"} packages: - *pytorch_pip diff --git a/python/.coveragerc b/python/.coveragerc index 1c33570c05c..9e15f7d1acc 100644 --- a/python/.coveragerc +++ b/python/.coveragerc @@ -1,10 +1,8 @@ # Configuration file for Python coverage tests [run] include = cugraph/cugraph/* - cugraph-pyg/cugraph_pyg/* cugraph-service/* pylibcugraph/pylibcugraph/* omit = cugraph/cugraph/tests/* - cugraph-pyg/cugraph_pyg/tests/* cugraph-service/tests/* pylibcugraph/pylibcugraph/tests/* diff --git a/python/cugraph-dgl/LICENSE b/python/cugraph-dgl/LICENSE deleted file mode 120000 index 30cff7403da..00000000000 --- a/python/cugraph-dgl/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../../LICENSE \ No newline at end of file diff --git a/python/cugraph-dgl/README.md b/python/cugraph-dgl/README.md deleted file mode 100644 index 013d4fe5e2e..00000000000 --- a/python/cugraph-dgl/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# cugraph_dgl - -## Description - -[RAPIDS](https://rapids.ai) cugraph_dgl provides a duck-typed version of the [DGLGraph](https://docs.dgl.ai/api/python/dgl.DGLGraph.html#dgl.DGLGraph) class, which uses cugraph for storing graph structure and node/edge feature data. Using cugraph as the backend allows DGL users to access a collection of GPU accelerated algorithms for graph analytics, such as centrality computation and community detection. 
- -## Conda - -Install and update cugraph-dgl and the required dependencies using the command: - -```shell -# CUDA 11 -conda install -c rapidsai -c pytorch -c conda-forge -c nvidia -c dglteam/label/th23_cu118 cugraph-dgl - -# CUDA 12 -conda install -c rapidsai -c pytorch -c conda-forge -c nvidia -c dglteam/label/th23_cu121 cugraph-dgl -``` - -## Build from Source - -### Create the conda development environment -``` -mamba env create -n cugraph_dgl_dev --file conda/cugraph_dgl_dev_11.6.yml -``` - -### Install in editable mode -``` -pip install -e . -``` - -### Run tests - -``` -pytest tests/* -``` - - -## Usage -```diff - -+from cugraph_dgl.convert import cugraph_storage_from_heterograph -+cugraph_g = cugraph_storage_from_heterograph(dgl_g) - -sampler = dgl.dataloading.NeighborSampler( - [15, 10, 5], prefetch_node_feats=['feat'], prefetch_labels=['label']) - -train_dataloader = dgl.dataloading.DataLoader( -- dgl_g, -+ cugraph_g, -train_idx, -sampler, -device=device, -batch_size=1024, -shuffle=True, -drop_last=False, -num_workers=0) -``` diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml deleted file mode 100644 index 85c85c2043a..00000000000 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# This file is generated by `rapids-dependency-file-generator`. -# To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. -channels: -- rapidsai -- rapidsai-nightly -- dask/label/dev -- dglteam/label/th23_cu118 -- conda-forge -- nvidia -dependencies: -- cugraph==24.12.*,>=0.0.0a0 -- dgl>=2.4.0.cu* -- pandas -- pre-commit -- pylibcugraphops==24.12.*,>=0.0.0a0 -- pytest -- pytest-benchmark -- pytest-cov -- pytest-xdist -- pytorch-cuda==11.8 -- pytorch>=2.3 -- scipy -- tensordict>=0.1.2 -name: cugraph_dgl_dev_cuda-118 diff --git a/python/cugraph-dgl/cugraph_dgl/VERSION b/python/cugraph-dgl/cugraph_dgl/VERSION deleted file mode 120000 index d62dc733efd..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/VERSION +++ /dev/null @@ -1 +0,0 @@ -../../../VERSION \ No newline at end of file diff --git a/python/cugraph-dgl/cugraph_dgl/__init__.py b/python/cugraph-dgl/cugraph_dgl/__init__.py deleted file mode 100644 index 58850d47fba..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -# to prevent rapids context being created when importing cugraph_dgl -os.environ["RAPIDS_NO_INITIALIZE"] = "1" -from cugraph_dgl.graph import Graph -from cugraph_dgl.cugraph_storage import CuGraphStorage -from cugraph_dgl.convert import ( - cugraph_storage_from_heterograph, - cugraph_dgl_graph_from_heterograph, -) -import cugraph_dgl.dataloading -import cugraph_dgl.nn - -from cugraph_dgl._version import __git_commit__, __version__ diff --git a/python/cugraph-dgl/cugraph_dgl/_version.py b/python/cugraph-dgl/cugraph_dgl/_version.py deleted file mode 100644 index e8adcc31430..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/_version.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import importlib.resources - -__version__ = ( - importlib.resources.files(__package__).joinpath("VERSION").read_text().strip() -) -try: - __git_commit__ = ( - importlib.resources.files(__package__) - .joinpath("GIT_COMMIT") - .read_text() - .strip() - ) -except FileNotFoundError: - __git_commit__ = "" - -__all__ = ["__git_commit__", "__version__"] diff --git a/python/cugraph-dgl/cugraph_dgl/convert.py b/python/cugraph-dgl/cugraph_dgl/convert.py deleted file mode 100644 index ae4b96dd391..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/convert.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import annotations -from cugraph.utilities.utils import import_optional - -import cugraph_dgl -from cugraph_dgl import CuGraphStorage -from cugraph_dgl.utils.cugraph_conversion_utils import ( - get_edges_dict_from_dgl_HeteroGraph, - add_ndata_from_dgl_HeteroGraph, - add_edata_from_dgl_HeteroGraph, -) - -dgl = import_optional("dgl") - - -def cugraph_storage_from_heterograph( - g: dgl.DGLGraph, single_gpu: bool = True -) -> CuGraphStorage: - """ - Convert DGL Graph to CuGraphStorage graph - """ - num_nodes_dict = {ntype: g.num_nodes(ntype) for ntype in g.ntypes} - edges_dict = get_edges_dict_from_dgl_HeteroGraph(g, single_gpu) - gs = CuGraphStorage( - data_dict=edges_dict, - num_nodes_dict=num_nodes_dict, - single_gpu=single_gpu, - idtype=g.idtype, - ) - add_ndata_from_dgl_HeteroGraph(gs, g) - add_edata_from_dgl_HeteroGraph(gs, g) - return gs - - -def cugraph_dgl_graph_from_heterograph( - input_graph: dgl.DGLGraph, - single_gpu: bool = True, - ndata_storage: str = "torch", - edata_storage: str = "torch", - **kwargs, -) -> cugraph_dgl.Graph: - """ - Converts a DGL Graph to a cuGraph-DGL Graph. - """ - - output_graph = cugraph_dgl.Graph( - is_multi_gpu=(not single_gpu), - ndata_storage=ndata_storage, - edata_storage=edata_storage, - **kwargs, - ) - - # Calling is_homogeneous does not work here - if len(input_graph.ntypes) <= 1: - output_graph.add_nodes( - input_graph.num_nodes(), data=input_graph.ndata, ntype=input_graph.ntypes[0] - ) - else: - for ntype in input_graph.ntypes: - data = { - k: v_dict[ntype] - for k, v_dict in input_graph.ndata.items() - if ntype in v_dict - } - output_graph.add_nodes(input_graph.num_nodes(ntype), data=data, ntype=ntype) - - if len(input_graph.canonical_etypes) <= 1: - can_etype = input_graph.canonical_etypes[0] - src_t, dst_t = input_graph.edges(form="uv", etype=can_etype) - output_graph.add_edges(src_t, dst_t, input_graph.edata, etype=can_etype) - else: - for can_etype in input_graph.canonical_etypes: - data = { - k: v_dict[can_etype] - for k, v_dict in input_graph.edata.items() - if can_etype in v_dict - } - - src_t, dst_t = input_graph.edges(form="uv", etype=can_etype) - output_graph.add_edges(src_t, dst_t, data=data, etype=can_etype) - - return output_graph diff --git a/python/cugraph-dgl/cugraph_dgl/cugraph_storage.py b/python/cugraph-dgl/cugraph_dgl/cugraph_storage.py deleted file mode 100644 index 6a1b6ee32b8..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/cugraph_storage.py +++ /dev/null @@ -1,714 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import annotations -from typing import Optional, Sequence, Tuple, Dict, Union -from functools import cached_property -from cugraph.utilities.utils import import_optional, MissingModule -from cugraph.gnn import FeatureStore -from cugraph.gnn.dgl_extensions.dgl_uniform_sampler import DGLUniformSampler -import cudf -import dask_cudf -import cupy as cp -from cugraph_dgl.utils.cugraph_storage_utils import ( - _assert_valid_canonical_etype, - backend_dtype_to_np_dtype_dict, - add_edge_ids_to_edges_dict, - add_node_offset_to_edges_dict, -) -from cugraph_dgl.utils.feature_storage import dgl_FeatureStorage - -dgl = import_optional("dgl") -F = import_optional("dgl.backend") -torch = import_optional("torch") - - -class CuGraphStorage: - """ - Duck-typed version of the DGLHeteroGraph class made for cuGraph - for storing graph structure and node/edge feature data. - - This object is wrapper around cugraph's Multi GPU MultiGraph and returns samples - that conform with `DGLHeteroGraph` - See: https://docs.rapids.ai/api/cugraph/nightly/api_docs/cugraph_dgl.html - """ - - def __init__( - self, - data_dict: Dict[ - Tuple[str, str, str], Union[cudf.DataFrame, dask_cudf.DataFrame] - ], - num_nodes_dict: Dict[str, int], - single_gpu: bool = True, - device_id: int = 0, - idtype=None if isinstance(F, MissingModule) else F.int64, - ): - """ - Constructor for creating a object of instance CuGraphStorage - - See also ``cugraph_dgl.cugraph_storage_from_heterograph`` - to convert from DGLHeteroGraph to CuGraphStorage - - Parameters - ---------- - data_dict: - The dictionary data for constructing a heterogeneous graph. - The keys are in the form of string triplets (src_type, edge_type, dst_type), - specifying the source node, edge, and destination node types. - The values are graph data is a dataframe with 2 columns form of (𝑈,𝑉), - where (𝑈[𝑖],𝑉[𝑖]) forms the edge with ID 𝑖. - - num_nodes_dict: dict[str, int] - The number of nodes for some node types, which is a - dictionary mapping a node type T to the number of T-typed nodes. - - single_gpu: bool - Whether to create the cugraph Property Graph - on a single GPU or multiple GPUs - single GPU = True - single GPU = False - - device_id: int - If specified, must be the integer ID of the GPU device to have the - results being created on - - idtype: Framework-specific device object, - The data type for storing the structure-related graph - information this can be ``torch.int32`` or ``torch.int64`` - for PyTorch. 
- Defaults to ``torch.int64`` if pytorch is installed - - - Examples - -------- - The following example uses `CuGraphStorage` : - >>> from cugraph_dgl.cugraph_storage import CuGraphStorage - >>> import cudf - >>> import torch - >>> num_nodes_dict={"drug": 3, "gene": 2, "disease": 1} - >>> drug_interacts_drug_df = cudf.DataFrame({"src": [0, 1], "dst": [1, 2]}) - >>> drug_interacts_gene = cudf.DataFrame({"src": [0, 1], "dst": [0, 1]}) - >>> drug_treats_disease = cudf.DataFrame({"src": [1], "dst": [0]}) - >>> data_dict = {("drug", "interacts", "drug"):drug_interacts_drug_df, - ("drug", "interacts", "gene"):drug_interacts_gene, - ("drug", "treats", "disease"):drug_treats_disease } - >>> gs = CuGraphStorage(data_dict=data_dict, num_nodes_dict=num_nodes_dict) - >>> gs.add_node_data(ntype='drug', feat_name='node_feat', - feat_obj=torch.as_tensor([0.1, 0.2, 0.3])) - >>> gs.add_edge_data(canonical_etype=("drug", "interacts", "drug"), - feat_name='edge_feat', - feat_obj=torch.as_tensor([0.2, 0.4])) - >>> gs.ntypes - ['disease', 'drug', 'gene'] - >>> gs.etypes - ['interacts', 'interacts', 'treats'] - >>> gs.canonical_etypes - [('drug', 'interacts', 'drug'), - ('drug', 'interacts', 'gene'), - ('drug', 'treats', 'disease')] - - >>> gs.sample_neighbors({'disease':[0]}, - 1) - Graph(num_nodes={'disease': 1, 'drug': 3, 'gene': 2}, - num_edges={('drug', 'interacts', 'drug'): 0, - ('drug', 'interacts', 'gene'): 0, - ('drug', 'treats', 'disease'): 1}, - metagraph=[('drug', 'drug', 'interacts'), - ('drug', 'gene', 'interacts'), - ('drug', 'disease', 'treats')]) - - >>> gs.get_node_storage(key='node_feat', - ntype='drug').fetch([0,1,2]) - tensor([0.1000, 0.2000, 0.3000], device='cuda:0', - dtype=torch.float64) - - >>> es = gs.get_edge_storage(key='edge_feat', - etype=('drug', 'interacts', 'drug')) - >>> es.fetch([0,1]) - tensor([0.2000, 0.4000], device='cuda:0', dtype=torch.float64) - """ - # Order is very important - # do this first before cuda work - # Create cuda context on the right gpu, - # defaults to gpu-0 - import numba.cuda as cuda - - cuda.select_device(device_id) - - self.idtype = idtype - self.id_np_type = backend_dtype_to_np_dtype_dict[idtype] - self.num_nodes_dict = num_nodes_dict - self._ntype_offset_d = self.__get_ntype_offset_d(self.num_nodes_dict) - # Todo: Can possibly optimize by persisting edge-list - # Trade-off memory for run-time - self.num_edges_dict = {k: len(v) for k, v in data_dict.items()} - self._etype_offset_d = self.__get_etype_offset_d(self.num_edges_dict) - self.single_gpu = single_gpu - - self.ndata_storage = FeatureStore(backend="torch") - self.ndata = self.ndata_storage.fd - self.edata_storage = FeatureStore(backend="torch") - self.edata = self.edata_storage.fd - - self._etype_range_d = self.__get_etype_range_d( - self._etype_offset_d, self.num_canonical_edges_dict - ) - _edges_dict = add_edge_ids_to_edges_dict( - data_dict, self._etype_offset_d, self.id_np_type - ) - - self._edges_dict = add_node_offset_to_edges_dict( - _edges_dict, self._ntype_offset_d - ) - - # Persist the dataframes so they can be retrieved later - # for a multi-GPU workflow. 
- if not single_gpu: - for k in list(self._edges_dict.keys()): - self._edges_dict[k] = self._edges_dict[k].persist() - - self._etype_id_dict = { - etype: etype_id for etype_id, etype in enumerate(self.canonical_etypes) - } - self.uniform_sampler = None - - def add_node_data(self, feat_obj: Sequence, ntype: str, feat_name: str): - """ - Add node features - - Parameters - ---------- - df : array_like object - The node feature to save in feature store - ntype : str - The node type to be added. - For example, if dataframe contains data about users, ntype - might be "users". - feat_name : str - The name of the feature being stored - Returns - ------- - None - """ - self.ndata_storage.add_data( - feat_obj=feat_obj, - type_name=ntype, - feat_name=feat_name, - ) - - def add_edge_data( - self, - feat_obj: Sequence, - canonical_etype: Tuple[str, str, str], - feat_name: str, - ): - """ - Add edge features - - Parameters - ---------- - feat_obj : array_like object - The edge feature to save in feature store - canonical_etype : Tuple[(str, str, str)] - The edge type to be added - feat_name : string - Returns - ------- - None - """ - _assert_valid_canonical_etype(canonical_etype) - self.edata_storage.add_data( - feat_obj=feat_obj, - type_name=canonical_etype, - feat_name=feat_name, - ) - - # Sampling Function - def sample_neighbors( - self, - nodes, - fanout: int, - edge_dir: str = "in", - prob: Optional[str] = None, - exclude_edges=None, - replace: bool = False, - output_device=None, - ): - """ - Return a DGLGraph which is a subgraph induced by sampling neighboring - edges of the given nodes. - See ``dgl.sampling.sample_neighbors`` for detailed semantics. - Parameters - ---------- - nodes : Tensor or dict[str, Tensor] - Node IDs to sample neighbors from. - This argument can take a single ID tensor or a dictionary of node - types and ID tensors. If a single tensor is given, the graph must - only have one type of nodes. - fanout : int or dict[etype, int] - The number of edges to be sampled for each node on each edge type. - This argument can take a single int or a dictionary of edge types - and ints. If a single int is given, DGL will sample this number of - edges for each node for every edge type. - If -1 is given for a single edge type, all the neighboring edges - with that edge type will be selected. - edge_dir: 'in' or 'out' - The direction of edges to import - prob : str, optional - Feature name used as the (un-normalized) probabilities associated - with each neighboring edge of a node. The feature must have only - one element for each edge. - The features must be non-negative floats, and the sum of the - features of inbound/outbound edges for every node must be positive - (though they don't have to sum up to one). Otherwise, the result - will be undefined. If :attr:`prob` is not None, GPU sampling is - not supported. - exclude_edges: tensor or dict - Edge IDs to exclude during sampling neighbors for the seed nodes. - This argument can take a single ID tensor or a dictionary of edge - types and ID tensors. If a single tensor is given, the graph must - only have one type of nodes. - replace : bool, optional - If True, sample with replacement. - output_device : Framework-specific device context object, optional - The output device. Default is the same as the input graph. - Returns - ------- - DGLGraph - A sampled subgraph with the same nodes as the original graph, but - only the sampled neighboring edges. The induced edge IDs will be - in ``edata[dgl.EID]``. 
- """ - if self.uniform_sampler is None: - self.uniform_sampler = DGLUniformSampler( - self._edges_dict, - self._etype_range_d, - self._etype_id_dict, - self.single_gpu, - ) - - if prob is not None: - raise NotImplementedError( - "prob is not currently supported", - " for sample_neighbors in CuGraphStorage", - ) - - if exclude_edges is not None: - raise NotImplementedError( - "exclude_edges is not currently supported", - " for sample_neighbors in CuGraphStorage", - ) - - if not isinstance(nodes, dict): - if len(self.ntypes) > 1: - raise dgl.DGLError( - "Must specify node type when the graph is not homogeneous." - ) - nodes = cp.asarray(nodes) - nodes = {self.ntypes[0]: nodes} - else: - nodes = { - k: self.dgl_n_id_to_cugraph_id(F.tensor(n), k) for k, n in nodes.items() - } - nodes = {k: cp.asarray(F.tensor(n)) for k, n in nodes.items()} - - sampled_obj = self.uniform_sampler.sample_neighbors( - nodes, - fanout, - edge_dir=edge_dir, - prob=prob, - replace=replace, - ) - # heterograph case - if len(self.etypes) > 1: - graph_data_d, graph_eid_d = self.__convert_to_dgl_tensor_d( - sampled_obj, self.idtype - ) - sampled_graph = dgl.heterograph( - data_dict=graph_data_d, - num_nodes_dict=self.num_nodes_dict, - idtype=self.idtype, - ) - sampled_graph.edata[dgl.EID] = graph_eid_d - else: - src_ids, dst_ids, edge_ids = sampled_obj - src_ids = torch.as_tensor(src_ids, device="cuda") - dst_ids = torch.as_tensor(dst_ids, device="cuda") - edge_ids = torch.as_tensor(edge_ids, device="cuda") - total_number_of_nodes = self.total_number_of_nodes - sampled_graph = dgl.graph( - (src_ids, dst_ids), - num_nodes=total_number_of_nodes, - idtype=self.idtype, - ) - sampled_graph.edata[dgl.EID] = edge_ids - - # to device function move the dgl graph to desired devices - if output_device is not None: - sampled_graph.to(output_device) - return sampled_graph - - # Required in Cluster-GCN - def subgraph(self, nodes, relabel_nodes=False, output_device=None): - """Return a subgraph induced on given nodes. - This has the same semantics as ``dgl.node_subgraph``. - Parameters - ---------- - nodes : nodes or dict[str, nodes] - The nodes to form the subgraph. The allowed nodes formats are: - * Int Tensor: Each element is a node ID. The tensor must have the - same device type and ID data type as the graph's. - * iterable[int]: Each element is a node ID. - * Bool Tensor: Each :math:`i^{th}` element is a bool flag - indicating whether node :math:`i` is in the subgraph. - If the graph is homogeneous, directly pass the above formats. - Otherwise, the argument must be a dictionary with keys being - node types and values being the node IDs in the above formats. - relabel_nodes : bool, optional - If True, the extracted subgraph will only have the nodes in the - specified node set and it will relabel the nodes in order. - output_device : Framework-specific device context object, optional - The output device. Default is the same as the input graph. - Returns - ------- - DGLGraph - The subgraph. - """ - raise NotImplementedError("subgraph is not implemented yet") - - # Required in Link Prediction - # relabel = F we use dgl functions, - # relabel = T, we need to delete nodes and relabel - def edge_subgraph(self, edges, relabel_nodes=False, output_device=None): - """ - Return a subgraph induced on given edges. - This has the same semantics as ``dgl.edge_subgraph``. - Parameters - ---------- - edges : edges or dict[(str, str, str), edges] - The edges to form the subgraph. 
The allowed edges formats are: - * Int Tensor: Each element is an edge ID. The tensor must have the - same device type and ID data type as the graph's. - * iterable[int]: Each element is an edge ID. - * Bool Tensor: Each :math:`i^{th}` element is a bool flag - indicating whether edge :math:`i` is in the subgraph. - If the graph is homogeneous, one can directly pass the above - formats. Otherwise, the argument must be a dictionary with keys - being edge types and values being the edge IDs in the above formats - relabel_nodes : bool, optional - If True, the extracted subgraph will only have the nodes in the - specified node set and it will relabel the nodes in order. - output_device : Framework-specific device context object, optional - The output device. Default is the same as the input graph. - Returns - ------- - DGLGraph - The subgraph. - """ - raise NotImplementedError("edge_subgraph is not implemented yet") - - # Required in Link Prediction negative sampler - def find_edges( - self, eid, etype: Optional[Tuple[str, str, str]] = None, output_device=None - ): - """ - Return the source and destination node ID(s) given the edge ID(s). - - Parameters - ---------- - eid : edge ID(s) - The edge IDs. The allowed formats are: - - * ``int``: A single ID. - * Int Tensor: Each element is an ID. - The tensor must have the same device type - and ID data type as the graph's. - * iterable[int]: Each element is an ID. - - etype : Tuple[str, str, str] - The type name of the edges. - Can be omitted if the graph has only one type of edges. - - Returns - ------- - Tensor - The source node IDs of the edges. - The i-th element is the source node ID of the i-th edge. - Tensor - The destination node IDs of the edges. - The i-th element is the destination node ID of the i-th edge. - """ - - if etype: - src_type, connection_type, dst_type = etype - eid = self.dgl_e_id_to_cugraph_id(eid, etype) - # TODO: implement below - src, dst = self.find_edges(eid, etype) - src = torch.as_tensor(src, device="cuda") - dst = torch.as_tensor(dst, device="cuda") - src = self.cugraph_n_id_to_dgl_id(src, src_type) - dst = self.cugraph_n_id_to_dgl_id(dst, dst_type) - - return src, dst - - # Required in Link Prediction negative sampler - def global_uniform_negative_sampling( - self, num_samples, exclude_self_loops=True, replace=False, etype=None - ): - """ - Per source negative sampling as in ``dgl.dataloading.GlobalUniform`` - """ - raise NotImplementedError( - "global_uniform_negative_sampling not implemented yet" - ) - - def get_node_storage(self, key: str, ntype: str = None): - """ - Get storage object of node feature of - type :attr:`ntype` and name :attr:`key` - """ - if ntype is None: - if len(self.ntypes) > 1: - raise ValueError( - "ntype must be provided if multiple ntypes are present in the graph" - ) - else: - ntype = self.ntype[0] - return dgl_FeatureStorage(self.ndata_storage, type_name=ntype, feat_name=key) - - def get_edge_storage(self, key: str, etype: Optional[Tuple[str, str, str]] = None): - """ - Get storage object of edge feature of - type :attr:`ntype` and name :attr:`key` - """ - if etype is None: - if len(self.etypes) > 1: - raise ValueError( - "etype must be provided if multiple etypes are present in the graph" - ) - else: - etype = self.etypes[0] - return dgl_FeatureStorage(self.edata_storage, type_name=etype, feat_name=key) - - # Number of edges/nodes utils - def num_nodes(self, ntype: str = None) -> int: - """ - Return the number of nodes in the graph. 
- Parameters - ---------- - ntype : str, optional - The node type name. If given, it returns the number of nodes of the - type. - If not given (default), it returns the total number of nodes - of all types. - - Returns - ------- - int - The number of nodes. - """ - if ntype: - return self.num_nodes_dict[ntype] - else: - return self.total_number_of_nodes - - def number_of_nodes(self, ntype: str = None) -> int: - """ - Return the number of nodes in the graph. - Alias of ``num_nodes`` - Parameters - ---------- - ntype : str, optional - The node type name. If given, it returns the number of nodes of the - type. - If not given (default), it returns the total number of nodes - of all types. - - Returns - ------- - int - The number of nodes. - """ - return self.num_nodes(ntype) - - @property - def ntypes(self) -> Sequence[str]: - """ - Return all the node type names in the graph. - - Returns - ------- - list[str] - All the node type names in a list. - """ - ntypes = list(self.num_nodes_dict.keys()) - return ntypes - - @property - def etypes(self) -> Sequence[str]: - """ - Return all the edge type names in the graph. - - Returns - ------- - list[str] - All the edge type names in a list. - """ - - return [can_etype[1] for can_etype in self.canonical_etypes] - - def num_edges(self, etype: Optional[str] = None) -> int: - """ - Return the number of edges in the graph. - Parameters - ---------- - etype: - - Returns - ------- - int - The number of edges - """ - if etype: - if etype not in self.canonical_etypes: - etype = self.get_corresponding_canonical_etype(etype) - return self.num_edges_dict[etype] - else: - return self.total_number_of_edges - - @cached_property - def total_number_of_edges(self) -> int: - return sum(self.num_edges_dict.values()) - - @cached_property - def total_number_of_nodes(self) -> int: - return sum(self.num_nodes_dict.values()) - - @property - def num_canonical_edges_dict(self) -> dict[str, int]: - return self.num_edges_dict - - @property - def canonical_etypes(self) -> Sequence[Tuple[str, str, str]]: - return list(self.num_edges_dict.keys()) - - @property - def device(self): - """ - Get the device of the graph. - Returns - ------- - device context - The device of the graph, which should be a - framework-specific device object (e.g., ``torch.device``). 
- """ - return torch.cuda.current_device() - - # Index Conversion Utils - def get_node_id_offset(self, ntype: str) -> int: - """ - Return the integer offset for node id of type ntype - """ - return self._ntype_offset_d[ntype] - - def get_edge_id_offset(self, canonical_etype: Tuple[str, str, str]) -> int: - """ - Return the integer offset for node id of type etype - """ - _assert_valid_canonical_etype(canonical_etype) - return self._etype_offset_d[canonical_etype] - - def dgl_n_id_to_cugraph_id(self, index_t, ntype: str): - return index_t + self.get_node_id_offset(ntype) - - def cugraph_n_id_to_dgl_id(self, index_t, ntype: str): - return index_t - self.get_node_id_offset(ntype) - - def dgl_e_id_to_cugraph_id(self, index_t, canonical_etype: Tuple[str, str, str]): - return index_t + self.get_edge_id_offset(canonical_etype) - - def cugraph_e_id_to_dgl_id(self, index_t, canonical_etype: Tuple[str, str, str]): - return index_t - self.get_edge_id_offset(canonical_etype) - - # Methods for getting the offsets per type - @staticmethod - def __get_etype_offset_d(num_canonical_edges_dict): - last_st = 0 - etype_st_d = {} - for etype in sorted(num_canonical_edges_dict.keys()): - etype_st_d[etype] = last_st - last_st = last_st + num_canonical_edges_dict[etype] - return etype_st_d - - @staticmethod - def __get_etype_range_d(etype_offset_d, num_canonical_edges_dict): - # dict for edge_id_offset_start - etype_range_d = {} - for etype, st in etype_offset_d.items(): - etype_range_d[etype] = (st, st + num_canonical_edges_dict[etype]) - return etype_range_d - - @staticmethod - def __get_ntype_offset_d(num_nodes_dict): - # dict for node_id_offset_start - last_st = 0 - ntype_st_d = {} - for ntype in sorted(num_nodes_dict.keys()): - ntype_st_d[ntype] = last_st - last_st = last_st + num_nodes_dict[ntype] - return ntype_st_d - - def get_corresponding_canonical_etype(self, etype: str) -> str: - can_etypes = [ - can_etype for can_etype in self.canonical_etypes if can_etype[1] == etype - ] - if len(can_etypes) > 1: - raise dgl.DGLError( - f'Edge type "{etype}" is ambiguous. Please use canonical' - + "edge type in the form of (srctype, etype, dsttype)" - ) - return can_etypes[0] - - def __convert_to_dgl_tensor_d( - self, - graph_sampled_data_d, - o_dtype=None if isinstance(F, MissingModule) else F.int64, - ): - - graph_data_d = {} - graph_eid_d = {} - for canonical_etype, ( - src, - dst, - edge_id, - ) in graph_sampled_data_d.items(): - src_type = canonical_etype[0] - dst_type = canonical_etype[2] - - src_t = _torch_tensor_from_cp_array(src) - dst_t = _torch_tensor_from_cp_array(dst) - edge_id_t = _torch_tensor_from_cp_array(edge_id) - - src_t = self.cugraph_n_id_to_dgl_id(src_t, src_type) - dst_t = self.cugraph_n_id_to_dgl_id(dst_t, dst_type) - edge_id_t = self.cugraph_e_id_to_dgl_id(edge_id_t, canonical_etype) - graph_data_d[canonical_etype] = (src_t.to(o_dtype), dst_t.to(o_dtype)) - graph_eid_d[canonical_etype] = edge_id_t.to(o_dtype) - - return graph_data_d, graph_eid_d - - -def _torch_tensor_from_cp_array(ar): - if len(ar) == 0: - return torch.as_tensor(ar.get()).to("cuda") - return torch.as_tensor(ar, device="cuda") diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py deleted file mode 100644 index 8a2e9cd954d..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -from cugraph_dgl.dataloading.dataset import ( - HomogenousBulkSamplerDataset, - HeterogenousBulkSamplerDataset, -) - -from cugraph_dgl.dataloading.sampler import Sampler -from cugraph_dgl.dataloading.neighbor_sampler import NeighborSampler - -from cugraph_dgl.dataloading.dask_dataloader import DaskDataLoader -from cugraph_dgl.dataloading.dataloader import DataLoader as FutureDataLoader - - -def DataLoader(*args, **kwargs): - warnings.warn( - "DataLoader has been renamed to DaskDataLoader. " - "In Release 24.10, cugraph_dgl.dataloading.FutureDataLoader " - "will take over the DataLoader name.", - FutureWarning, - ) - return DaskDataLoader(*args, **kwargs) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py deleted file mode 100644 index e220b93f738..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dask_dataloader.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations -import os -import shutil -import cugraph_dgl -import cupy as cp -import cudf -from cugraph.utilities.utils import import_optional -from cugraph.gnn import BulkSampler -from dask.distributed import default_client, Event -from cugraph_dgl.dataloading import ( - HomogenousBulkSamplerDataset, - HeterogenousBulkSamplerDataset, -) -from cugraph_dgl.dataloading.utils.extract_graph_helpers import ( - create_cugraph_graph_from_edges_dict, -) - -dgl = import_optional("dgl") -torch = import_optional("torch") - - -class DaskDataLoader(torch.utils.data.DataLoader): - """ - Sampled graph data loader. Wrap a :class:`~cugraph_dgl.CuGraphStorage` and a - :class:`~cugraph_dgl.dataloading.NeighborSampler` into - an iterable over mini-batches of samples. cugraph_dgl's ``DataLoader`` extends - PyTorch's ``DataLoader`` by handling creation and - transmission of graph samples. 
- """ - - def __init__( - self, - graph: cugraph_dgl.CuGraphStorage, - indices: torch.Tensor, - graph_sampler: cugraph_dgl.dataloading.NeighborSampler, - sampling_output_dir: str, - batches_per_partition: int = 50, - seeds_per_call: int = 200_000, - device: torch.device = None, - use_ddp: bool = False, - ddp_seed: int = 0, - batch_size: int = 1024, - drop_last: bool = False, - shuffle: bool = False, - sparse_format: str = "coo", - **kwargs, - ): - """ - Constructor for DaskDataLoader: - ------------------------------- - graph : CuGraphStorage - The graph. - indices : Tensor or dict[ntype, Tensor] - The set of indices. It can either be a tensor of - integer indices or a dictionary of types and indices. - The actual meaning of the indices is defined by the :meth:`sample` method of - :attr:`graph_sampler`. - graph_sampler : cugraph_dgl.dataloading.NeighborSampler - The subgraph sampler. - sampling_output_dir: str - Output directory to share sampling results in - batches_per_partition: int - The number of batches of sampling results to write/read - seeds_per_call: int - The number of seeds to sample at once - device : device context, optional - The device of the generated MFGs in each iteration, which should be a - PyTorch device object (e.g., ``torch.device``). - By default this returns the tenors on device with the current - cuda context - use_ddp : boolean, optional - If True, tells the DataLoader to split the training set for each - participating process appropriately using - :class:`torch.utils.data.distributed.DistributedSampler`. - Overrides the :attr:`sampler` argument of - :class:`torch.utils.data.DataLoader`. - ddp_seed : int, optional - The seed for shuffling the dataset in - :class:`torch.utils.data.distributed.DistributedSampler`. - Only effective when :attr:`use_ddp` is True. - batch_size: int - Batch size. - sparse_format: str, default = "coo" - The sparse format of the emitted sampled graphs. Choose between "csc" - and "coo". When using "csc", the graphs are of type - cugraph_dgl.nn.SparseGraph. - kwargs : dict - Key-word arguments to be passed to the parent PyTorch - :py:class:`torch.utils.data.DataLoader` class. Common arguments are: - - ``batch_size`` (int): The number of indices in each batch. - - ``drop_last`` (bool): Whether to drop the last incomplete - batch. - - ``shuffle`` (bool): Whether to randomly shuffle the - indices at each epoch - Examples - -------- - To train a 3-layer GNN for node classification on a set of nodes - ``train_nid`` on a homogeneous graph where each node takes messages - from 15 neighbors on the first layer, 10 neighbors on the second, and - 5 neighbors on the third: - >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) - >>> dataloader = cugraph_dgl.dataloading.DataLoader( - ... g, train_nid, sampler, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) - >>> for input_nodes, output_nodes, blocks in dataloader: - ... train_on(input_nodes, output_nodes, blocks) - **Using with Distributed Data Parallel** - If you are using PyTorch's distributed training (e.g. when using - :mod:`torch.nn.parallel.DistributedDataParallel`), - you can train the model by turning - on the `use_ddp` option: - >>> sampler = cugraph_dgl.dataloading.NeighborSampler([15, 10, 5]) - >>> dataloader = cugraph_dgl.dataloading.DataLoader( - ... g, train_nid, sampler, use_ddp=True, - ... batch_size=1024, shuffle=True, drop_last=False, num_workers=0) - >>> for epoch in range(start_epoch, n_epochs): - ... 
for input_nodes, output_nodes, blocks in dataloader: - ... - """ - if sparse_format not in ["coo", "csc"]: - raise ValueError( - f"sparse_format must be one of 'coo', 'csc', " - f"but got {sparse_format}." - ) - self.sparse_format = sparse_format - - self.ddp_seed = ddp_seed - self.use_ddp = use_ddp - self.shuffle = shuffle - self.drop_last = drop_last - self.graph_sampler = graph_sampler - worker_init_fn = dgl.dataloading.WorkerInitWrapper( - kwargs.get("worker_init_fn", None) - ) - self.other_storages = {} - self.epoch_number = 0 - self._batch_size = batch_size - self._sampling_output_dir = sampling_output_dir - self._batches_per_partition = batches_per_partition - self._seeds_per_call = seeds_per_call - self._rank = None - - indices = _dgl_idx_to_cugraph_idx(indices, graph) - - self.tensorized_indices_ds = dgl.dataloading.create_tensorized_dataset( - indices, - batch_size, - drop_last, - use_ddp, - ddp_seed, - shuffle, - kwargs.get("persistent_workers", False), - ) - - if len(graph.ntypes) <= 1: - self.cugraph_dgl_dataset = HomogenousBulkSamplerDataset( - total_number_of_nodes=graph.total_number_of_nodes, - edge_dir=self.graph_sampler.edge_dir, - sparse_format=sparse_format, - ) - else: - etype_id_to_etype_str_dict = {v: k for k, v in graph._etype_id_dict.items()} - - self.cugraph_dgl_dataset = HeterogenousBulkSamplerDataset( - num_nodes_dict=graph.num_nodes_dict, - etype_id_dict=etype_id_to_etype_str_dict, - etype_offset_dict=graph._etype_offset_d, - ntype_offset_dict=graph._ntype_offset_d, - edge_dir=self.graph_sampler.edge_dir, - ) - - if use_ddp: - rank = torch.distributed.get_rank() - client = default_client() - self._graph_creation_event = Event("cugraph_dgl_load_mg_graph_event") - if rank == 0: - G = create_cugraph_graph_from_edges_dict( - edges_dict=graph._edges_dict, - etype_id_dict=graph._etype_id_dict, - edge_dir=graph_sampler.edge_dir, - ) - client.publish_dataset(cugraph_dgl_mg_graph_ds=G) - self._graph_creation_event.set() - else: - if self._graph_creation_event.wait(timeout=1000): - G = client.get_dataset("cugraph_dgl_mg_graph_ds") - else: - raise RuntimeError( - f"Fetch cugraph_dgl_mg_graph_ds to worker_id {rank}", - "from worker_id 0 failed", - ) - else: - rank = 0 - G = create_cugraph_graph_from_edges_dict( - edges_dict=graph._edges_dict, - etype_id_dict=graph._etype_id_dict, - edge_dir=graph_sampler.edge_dir, - ) - - self._rank = rank - self._cugraph_graph = G - super().__init__( - self.cugraph_dgl_dataset, - batch_size=None, - worker_init_fn=worker_init_fn, - collate_fn=lambda x: x, # Hack to prevent collating - **kwargs, - ) - - def __iter__(self): - output_dir = os.path.join( - self._sampling_output_dir, "epoch_" + str(self.epoch_number) - ) - kwargs = {} - if isinstance(self.cugraph_dgl_dataset, HomogenousBulkSamplerDataset): - kwargs["deduplicate_sources"] = True - kwargs["prior_sources_behavior"] = "carryover" - kwargs["renumber"] = True - - if self.sparse_format == "csc": - kwargs["compression"] = "CSR" - kwargs["compress_per_hop"] = True - # The following kwargs will be deprecated in uniform sampler. 
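# Sketch of the multi-GPU coordination pattern used above, assuming a running
# Dask cluster, an initialized torch.distributed process group, and a
# hypothetical build_graph() callable standing in for
# create_cugraph_graph_from_edges_dict: rank 0 builds and publishes the graph
# once, while the other ranks wait on a distributed Event and then fetch it.
import torch.distributed as dist
from dask.distributed import Event, default_client

def fetch_shared_graph(build_graph, name="cugraph_dgl_mg_graph_ds"):
    client = default_client()
    ready = Event("cugraph_dgl_load_mg_graph_event")
    if dist.get_rank() == 0:
        client.publish_dataset(**{name: build_graph()})
        ready.set()
    elif not ready.wait(timeout=1000):
        raise RuntimeError(f"Timed out waiting for published dataset {name!r}")
    return client.get_dataset(name)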
- kwargs["use_legacy_names"] = False - kwargs["include_hop_column"] = False - - else: - kwargs["deduplicate_sources"] = False - kwargs["prior_sources_behavior"] = None - kwargs["renumber"] = False - - bs = BulkSampler( - output_path=output_dir, - batch_size=self._batch_size, - graph=self._cugraph_graph, - batches_per_partition=self._batches_per_partition, - seeds_per_call=self._seeds_per_call, - fanout_vals=self.graph_sampler._reversed_fanout_vals, - with_replacement=self.graph_sampler.replace, - **kwargs, - ) - - if self.shuffle: - self.tensorized_indices_ds.shuffle() - - batch_df = create_batch_df(self.tensorized_indices_ds) - bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id") - bs.flush() - self.cugraph_dgl_dataset.set_input_files(input_directory=output_dir) - self.epoch_number = self.epoch_number + 1 - return super().__iter__() - - def __del__(self): - if self.use_ddp: - torch.distributed.barrier() - if self._rank == 0: - if self.use_ddp: - client = default_client() - client.unpublish_dataset("cugraph_dgl_mg_graph_ds") - self._graph_creation_event.clear() - _clean_directory(self._sampling_output_dir) - - -def get_batch_id_series(n_output_rows: int, batch_size: int) -> cudf.Series: - num_batches = (n_output_rows + batch_size - 1) // batch_size - print(f"Number of batches = {num_batches}".format(num_batches)) - batch_ar = cp.arange(0, num_batches).repeat(batch_size) - batch_ar = batch_ar[0:n_output_rows].astype(cp.int32) - return cudf.Series(batch_ar) - - -def create_batch_df(dataset: torch.Tensor) -> cudf.DataFrame: - batch_id_ls = [] - indices_ls = [] - for batch_id, b_indices in enumerate(dataset): - if isinstance(b_indices, dict): - b_indices = torch.cat(list(b_indices.values())) - batch_id_ar = cp.full(shape=len(b_indices), fill_value=batch_id, dtype=cp.int32) - batch_id_ls.append(batch_id_ar) - indices_ls.append(b_indices) - - batch_id_ar = cp.concatenate(batch_id_ls) - indices_ar = cp.asarray(torch.concat(indices_ls)) - batches_df = cudf.DataFrame( - { - "start": indices_ar, - "batch_id": batch_id_ar, - } - ) - return batches_df - - -def _dgl_idx_to_cugraph_idx(idx, cugraph_gs): - if not isinstance(idx, dict): - if len(cugraph_gs.ntypes) > 1: - raise dgl.DGLError( - "Must specify node type when the graph is not homogeneous." - ) - return idx - else: - return {k: cugraph_gs.dgl_n_id_to_cugraph_id(n, k) for k, n in idx.items()} - - -def _clean_directory(path): - """param could either be relative or absolute.""" - if os.path.isfile(path): - os.remove(path) # remove the file - elif os.path.isdir(path): - shutil.rmtree(path) # remove dir and all contains diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py deleted file mode 100644 index 4f36353cb18..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import warnings - -from typing import Union, Optional, Dict - -from cugraph.utilities.utils import import_optional - -import cugraph_dgl -from cugraph_dgl.typing import TensorType -from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor - -dgl = import_optional("dgl") -torch = import_optional("torch") - - -class DataLoader: - """ - Duck-typed version of dgl.dataloading.DataLoader - """ - - def __init__( - self, - graph: "cugraph_dgl.Graph", - indices: TensorType, - graph_sampler: "cugraph_dgl.dataloading.Sampler", - device: Union[int, str, "torch.device"] = None, - use_ddp: bool = False, - ddp_seed: int = 0, - batch_size: int = 1, - drop_last: bool = False, - shuffle: bool = False, - use_prefetch_thread: Optional[bool] = None, - use_alternate_streams: Optional[bool] = None, - pin_prefetcher: Optional[bool] = None, - use_uva=False, - gpu_cache: Dict[str, Dict[str, int]] = None, - output_format: str = "dgl.Block", - **kwargs, - ): - """ - Parameters - ---------- - graph: cugraph_dgl.Graph - The graph being sampled. Can be a single-GPU or multi-GPU graph. - indices: TensorType - The seed nodes for sampling. If use_ddp=True, then all seed - nodes should be provided. If use_ddp=False, then only the seed - nodes assigned to this worker should be provided. - graph_sampler: cugraph_dgl.dataloading.Sampler - The sampler responsible for sampling the graph and producing - output minibatches. - device: Union[int, str, torch.device] - Optional. - The device assigned to this loader ('cpu', 'cuda' or device id). - Defaults to the current device. - use_ddp: bool - Optional (default=False). - If true, this argument will assume the entire list of input seed - nodes is being passed to each worker, and will appropriately - split and shuffle the list. - It false, then it is assumed that the list of input seed nodes - is comprised of the union of the lists provided to each worker. - ddp_seed: int - Optional (default=0). - The seed used for dividing and shuffling data if use_ddp=True. - Has no effect if use_ddp=False. - use_uva: bool - Optional (default=False). - Whether to use pinned memory and unified virtual addressing - to perform sampling. - This argument is ignored by cuGraph-DGL. - use_prefetch_thread: bool - Optional (default=False). - Whether to spawn a new thread for feature fetching. - This argument is ignored by cuGraph-DGL. - use_alternate_streams: bool - Optional (default=False). - Whether to perform feature fetching on a separate stream. - This argument is ignored by cuGraph-DGL. - pin_prefetcher: bool - Optional (default=False). - Whether to pin the feature tensors. - This argument is currently ignored by cuGraph-DGL. - gpu_cache: Dict[str, Dict[str, int]] - List of features to cache using HugeCTR. - This argument is not supported by cuGraph-DGL and - will result in an error. - output_format: str - Optional (default="dgl.Block"). - The output format for blocks. - Can be either "dgl.Block" or "cugraph_dgl.nn.SparseGraph". - """ - - if use_uva: - warnings.warn("The 'use_uva' argument is ignored by cuGraph-DGL.") - if use_prefetch_thread: - warnings.warn( - "The 'use_prefetch_thread' argument is ignored by cuGraph-DGL." - ) - if use_alternate_streams: - warnings.warn( - "The 'use_alternate_streams' argument is ignored by cuGraph-DGL." - ) - if pin_prefetcher: - warnings.warn("The 'pin_prefetcher' argument is ignored by cuGraph-DGL.") - if gpu_cache: - raise ValueError( - "HugeCTR is not supported by cuGraph-DGL. 
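# Hedged usage sketch for the duck-typed DataLoader whose parameters are
# documented above (its definition continues below). In this version of the
# package it is exported as cugraph_dgl.dataloading.FutureDataLoader, while the
# bare DataLoader name still resolves to the Dask-based loader; `g` and
# `train_nid` are hypothetical, i.e. a populated single-GPU cugraph_dgl.Graph
# and a tensor of seed node ids.
import cugraph_dgl

sampler = cugraph_dgl.dataloading.NeighborSampler([10, 5])   # two-hop fanouts
loader = cugraph_dgl.dataloading.FutureDataLoader(
    g, train_nid, sampler,
    batch_size=1024,
    shuffle=True,
    use_uva=True,   # accepted for DGL API parity but ignored (emits a warning)
)
for input_nodes, output_nodes, blocks in loader:
    pass  # feed the MFGs in `blocks` to a GNN layer stack here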
" - "Consider using WholeGraph for feature storage" - " in cugraph_dgl.Graph instead." - ) - - indices = _cast_to_torch_tensor(indices) - - self.__dataset = dgl.dataloading.create_tensorized_dataset( - indices, - batch_size, - drop_last, - use_ddp, - ddp_seed, - shuffle, - kwargs.get("persistent_workers", False), - ) - - self.__output_format = output_format - self.__sampler = graph_sampler - self.__batch_size = batch_size - self.__graph = graph - self.__device = device - - @property - def _batch_size(self): - return self.__batch_size - - @property - def dataset( - self, - ) -> Union[ - "dgl.dataloading.dataloader.TensorizedDataset", - "dgl.dataloading.dataloader.DDPTensorizedDataset", - ]: - return self.__dataset - - def __iter__(self): - # TODO move to the correct device (rapidsai/cugraph-gnn#11) - return self.__sampler.sample( - self.__graph, - self.__dataset, - batch_size=self.__batch_size, - ) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py deleted file mode 100644 index f6fe38fe9f8..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations -from typing import Tuple, Dict, Optional, List, Union - -import os -import cudf -from cugraph.utilities.utils import import_optional -from cugraph_dgl.dataloading.utils.sampling_helpers import ( - create_homogeneous_sampled_graphs_from_dataframe, - create_heterogeneous_sampled_graphs_from_dataframe, - create_homogeneous_sampled_graphs_from_dataframe_csc, -) - - -dgl = import_optional("dgl") -torch = import_optional("torch") - - -# Todo: maybe should switch to __iter__ -class HomogenousBulkSamplerDataset(torch.utils.data.Dataset): - def __init__( - self, - total_number_of_nodes: int, - edge_dir: str, - return_type: str = "dgl.Block", - sparse_format: str = "coo", - ): - if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]: - raise ValueError( - "return_type must be either 'dgl.Block' or " - "'cugraph_dgl.nn.SparseGraph'." 
- ) - # TODO: Deprecate `total_number_of_nodes` - # as it is no longer needed - # in the next release - self.total_number_of_nodes = total_number_of_nodes - self.edge_dir = edge_dir - self.sparse_format = sparse_format - self._current_batch_fn = None - self._input_files = None - self._return_type = return_type - - def __len__(self): - return self.num_batches - - def __getitem__(self, idx: int): - if self._input_files is None: - raise dgl.DGLError( - "Please set input files by calling `set_input_files` " - "before trying to fetch a sample" - ) - - fn, batch_offset = self._batch_to_fn_d[idx] - if fn != self._current_batch_fn: - # Remove current batches to free up memory - # before loading new batches - if hasattr(self, "_current_batches"): - del self._current_batches - if self.sparse_format == "csc": - df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True) - self._current_batches = ( - create_homogeneous_sampled_graphs_from_dataframe_csc(df) - ) - else: - df = _load_sampled_file(dataset_obj=self, fn=fn) - self._current_batches = ( - create_homogeneous_sampled_graphs_from_dataframe( - sampled_df=df, - edge_dir=self.edge_dir, - return_type=self._return_type, - ) - ) - current_offset = idx - batch_offset - return self._current_batches[current_offset] - - def set_input_files( - self, - input_directory: Optional[str] = None, - input_file_paths: Optional[List[str]] = None, - ): - """ - Set input files that have been created by the `cugraph.gnn.BulkSampler` - Parameters - ---------- - input_directory: str - input_directory which contains all the files that will be - loaded by HomogenousBulkSamplerDataset - input_file_paths: List[str] - File paths that will be loaded by the HomogenousBulkSamplerDataset - """ - _set_input_files( - self, input_directory=input_directory, input_file_paths=input_file_paths - ) - - -class HeterogenousBulkSamplerDataset(torch.utils.data.Dataset): - def __init__( - self, - num_nodes_dict: Dict[str, int], - etype_id_dict: Dict[int, Tuple[str, str, str]], - etype_offset_dict: Dict[Tuple[str, str, str], int], - ntype_offset_dict: Dict[str, int], - edge_dir: str = "in", - ): - self.num_nodes_dict = num_nodes_dict - self.etype_id_dict = etype_id_dict - self.etype_offset_dict = etype_offset_dict - self.ntype_offset_dict = ntype_offset_dict - self.edge_dir = edge_dir - self._current_batch_fn = None - self._input_files = None - - def __len__(self): - return self.num_batches - - def __getitem__(self, idx): - if self._input_files is None: - raise dgl.DGLError( - "Please set input files by calling `set_input_files` " - "before trying to fetch a sample" - ) - - fn, batch_offset = self._batch_to_fn_d[idx] - if fn != self._current_batch_fn: - df = _load_sampled_file(dataset_obj=self, fn=fn) - self._current_batches = create_heterogeneous_sampled_graphs_from_dataframe( - sampled_df=df, - num_nodes_dict=self.num_nodes_dict, - etype_id_dict=self.etype_id_dict, - etype_offset_dict=self.etype_offset_dict, - ntype_offset_dict=self.ntype_offset_dict, - edge_dir=self.edge_dir, - ) - del df - - current_offset = idx - batch_offset - return self._current_batches[current_offset] - - def set_input_files( - self, - input_directory: Optional[str] = None, - input_file_paths: Optional[List[str]] = None, - ): - """ - Set input files that have been created by the `cugraph.gnn.BulkSampler` - Parameters - ---------- - input_directory: str - input_directory which contains all the files that will be - loaded by HeterogenousBulkSamplerDataset - input_file_paths: List[str] - File names that will be 
loaded by the HeterogenousBulkSamplerDataset - """ - _set_input_files( - self, input_directory=input_directory, input_file_paths=input_file_paths - ) - - -def _load_sampled_file(dataset_obj, fn, skip_rename=False): - df = cudf.read_parquet(os.path.join(fn)) - if dataset_obj.edge_dir == "in" and not skip_rename: - df.rename( - columns={"sources": "destinations", "destinations": "sources"}, - inplace=True, - ) - dataset_obj._current_batch_fn = fn - return df - - -def get_batch_start_end(fn): - batch_str = fn.split("batch=")[1] - batch_start, batch_end = batch_str.split("-") - batch_end = batch_end.split(".parquet")[0] - return int(batch_start), int(batch_end) - - -def get_batch_to_fn_d(files): - batch_to_fn_d = {} - batch_id = 0 - for fn in files: - start, end = get_batch_start_end(fn) - batch_offset = batch_id - for _ in range(start, end + 1): - batch_to_fn_d[batch_id] = fn, batch_offset - batch_id += 1 - return batch_to_fn_d - - -def _set_input_files( - dataset_obj: Union[HomogenousBulkSamplerDataset, HeterogenousBulkSamplerDataset], - input_directory: Optional[str] = None, - input_file_paths: Optional[List[str]] = None, -) -> None: - - if input_directory is None and input_file_paths is None: - raise ValueError("input_files or input_file_paths must be set") - - if (input_directory is not None) and (input_file_paths is not None): - raise ValueError("Only one of input_directory or input_file_paths must be set") - - if input_file_paths: - dataset_obj._input_files = input_file_paths - if input_directory: - dataset_obj._input_files = [fp.path for fp in os.scandir(input_directory)] - dataset_obj._batch_to_fn_d = get_batch_to_fn_d(dataset_obj._input_files) - dataset_obj.num_batches = len(dataset_obj._batch_to_fn_d) - dataset_obj._current_batch_fn = None diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py deleted file mode 100644 index ecc51006995..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/neighbor_sampler.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import warnings -import tempfile - -from typing import Sequence, Optional, Union, List, Tuple, Iterator - -from cugraph.gnn import UniformNeighborSampler, BiasedNeighborSampler, DistSampleWriter -from cugraph.utilities.utils import import_optional - -import cugraph_dgl -from cugraph_dgl.typing import DGLSamplerOutput -from cugraph_dgl.dataloading.sampler import Sampler, HomogeneousSampleReader - -torch = import_optional("torch") - - -class NeighborSampler(Sampler): - """Sampler that builds computational dependency of node representations via - neighbor sampling for multilayer GNN. - This sampler will make every node gather messages from a fixed number of neighbors - per edge type. The neighbors are picked uniformly. 
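# Small self-contained walk-through of the "batch=START-END.parquet" filename
# convention handled by get_batch_start_end / get_batch_to_fn_d above, using
# hypothetical BulkSampler output paths.
def batch_range(fn):
    start, end = fn.split("batch=")[1].split(".parquet")[0].split("-")
    return int(start), int(end)

files = ["out/batch=0-2.parquet", "out/batch=3-4.parquet"]
batch_to_fn, batch_id = {}, 0
for fn in files:
    start, end = batch_range(fn)
    first_in_file = batch_id                 # offset of this file's first batch
    for _ in range(start, end + 1):
        batch_to_fn[batch_id] = (fn, first_in_file)
        batch_id += 1

assert batch_to_fn[4] == ("out/batch=3-4.parquet", 3)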
- Parameters - ---------- - fanouts_per_layer : int - List of neighbors to sample for each GNN layer, with the i-th - element being the fanout for the i-th GNN layer. - If -1 is provided then all inbound/outbound edges - of that edge type will be included. - edge_dir : str, default ``'in'`` - Can be either ``'in' `` where the neighbors will be sampled according to - incoming edges, or ``'out'`` for outgoing edges - replace : bool, default False - Whether to sample with replacement - Examples - -------- - **Node classification** - To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on - a homogeneous graph where each node takes messages from 5, 10, 15 neighbors for - the first, second, and third layer respectively (assuming the backend is PyTorch): - >>> sampler = cugraph_dgl.dataloading.NeighborSampler([5, 10, 15]) - >>> dataloader = cugraph_dgl.dataloading.DataLoader( - ... g, train_nid, sampler, - ... batch_size=1024, shuffle=True) - >>> for input_nodes, output_nodes, blocks in dataloader: - ... train_on(blocks) - """ - - def __init__( - self, - fanouts_per_layer: Sequence[int], - edge_dir: str = "in", - replace: bool = False, - prob: Optional[str] = None, - mask: Optional[str] = None, - prefetch_node_feats: Optional[Union[List[str], dict[str, List[str]]]] = None, - prefetch_edge_feats: Optional[ - Union[List[str], dict[Tuple[str, str, str], List[str]]] - ] = None, - prefetch_labels: Optional[Union[List[str], dict[str, List[str]]]] = None, - output_device: Optional[Union["torch.device", int, str]] = None, - fused: Optional[bool] = None, - sparse_format="csc", - output_format="dgl.Block", - **kwargs, - ): - """ - Parameters - ---------- - fanouts_per_layer: Sequence[int] - The number of neighbors to sample per layer. - edge_dir: str - Optional (default='in'). - The direction to traverse edges. - replace: bool - Optional (default=False). - Whether to sample with replacement. - prob: str - Optional. - If provided, the probability of each neighbor being - sampled is proportional to the edge feature - with the given name. Mutually exclusive with mask. - mask: str - Optional. - If proivided, only neighbors where the edge mask - with the given name is True can be selected. - Mutually exclusive with prob. - Currently unsupported. - prefetch_node_feats: Union[List[str], dict[str, List[str]]] - Optional. - Currently ignored by cuGraph-DGL. - prefetch_edge_feats: Union[List[str], dict[Tuple[str, str, str], List[str]]] - Optional. - Currently ignored by cuGraph-DGL. - prefetch_labels: Union[List[str], dict[str, List[str]]] - Optional. - Currently ignored by cuGraph-DGL. - output_device: Union[torch.device, int, str] - Optional. - Output device for samples. Defaults to the current device. - fused: bool - Optional. - This argument is ignored by cuGraph-DGL. - sparse_format: str - Optional (default = "coo"). - The sparse format of the emitted sampled graphs. - Currently, only "csc" is supported. - output_format: str - Optional (default = "dgl.Block") - The output format of the emitted sampled graphs. - Can be either "dgl.Block" (default), or "cugraph_dgl.nn.SparseGraph". - **kwargs - Keyword arguments for the underlying cuGraph distributed sampler - and writer (directory, batches_per_partition, format, - local_seeds_per_call). 
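# Hedged usage sketch for the parameters documented above, on a hypothetical
# homogeneous cugraph_dgl.Graph whose edges carry a weight property named
# "wgt"; passing `prob` switches the underlying cuGraph sampler to biased
# (weight-proportional) neighbor selection.
import cugraph_dgl

biased_sampler = cugraph_dgl.dataloading.NeighborSampler(
    [15, 10, 5],        # fanout per GNN layer
    edge_dir="in",      # sample along incoming edges (the default)
    replace=False,      # sample without replacement
    prob="wgt",         # edge feature used as the sampling weight
)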
- """ - - if mask: - raise NotImplementedError( - "Edge masking is currently unsupported by cuGraph-DGL" - ) - if prefetch_edge_feats: - warnings.warn("'prefetch_edge_feats' is ignored by cuGraph-DGL") - if prefetch_node_feats: - warnings.warn("'prefetch_node_feats' is ignored by cuGraph-DGL") - if prefetch_labels: - warnings.warn("'prefetch_labels' is ignored by cuGraph-DGL") - if fused: - warnings.warn("'fused' is ignored by cuGraph-DGL") - - self.__prob_attr = prob - - self.fanouts = fanouts_per_layer - reverse_fanouts = fanouts_per_layer.copy() - reverse_fanouts.reverse() - self._reversed_fanout_vals = reverse_fanouts - - self.edge_dir = edge_dir - self.replace = replace - self.__kwargs = kwargs - - super().__init__( - sparse_format=sparse_format, - output_format=output_format, - ) - - def sample( - self, - g: "cugraph_dgl.Graph", - indices: Iterator["torch.Tensor"], - batch_size: int = 1, - ) -> Iterator[DGLSamplerOutput]: - kwargs = dict(**self.__kwargs) - - directory = kwargs.pop("directory", None) - if directory is None: - warnings.warn("Setting a directory to store samples is recommended.") - self._tempdir = tempfile.TemporaryDirectory() - directory = self._tempdir.name - - writer = DistSampleWriter( - directory=directory, - batches_per_partition=kwargs.pop("batches_per_partition", 256), - format=kwargs.pop("format", "parquet"), - ) - - sampling_clx = ( - UniformNeighborSampler - if self.__prob_attr is None - else BiasedNeighborSampler - ) - - ds = sampling_clx( - g._graph(self.edge_dir, prob_attr=self.__prob_attr), - writer, - compression="CSR", - fanout=self._reversed_fanout_vals, - prior_sources_behavior="carryover", - deduplicate_sources=True, - compress_per_hop=True, - with_replacement=self.replace, - **kwargs, - ) - - if g.is_homogeneous: - indices = torch.concat(list(indices)) - reader = ds.sample_from_nodes(indices.long(), batch_size=batch_size) - return HomogeneousSampleReader(reader, self.output_format, self.edge_dir) - - raise ValueError( - "Sampling heterogeneous graphs is currently" - " unsupported in the non-dask API" - ) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py b/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py deleted file mode 100644 index 7ea608e7e53..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/sampler.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Iterator, Dict, Tuple, List, Union - -import cugraph_dgl -from cugraph_dgl.nn import SparseGraph -from cugraph_dgl.typing import DGLSamplerOutput -from cugraph_dgl.dataloading.utils.sampling_helpers import ( - create_homogeneous_sampled_graphs_from_tensors_csc, -) - - -from cugraph.utilities.utils import import_optional - -torch = import_optional("torch") -dgl = import_optional("dgl") - - -class SampleReader: - """ - Iterator that processes results from the cuGraph distributed sampler. 
- """ - - def __init__( - self, - base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]], - output_format: str = "dgl.Block", - ): - """ - Constructs a new SampleReader. - - Parameters - ---------- - base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] - The iterator responsible for loading saved samples produced by - the cuGraph distributed sampler. - """ - self.__output_format = output_format - self.__base_reader = base_reader - self.__num_samples_remaining = 0 - self.__index = 0 - - @property - def output_format(self) -> str: - return self.__output_format - - def __next__(self) -> DGLSamplerOutput: - if self.__num_samples_remaining == 0: - # raw_sample_data is already a dict of tensors - self.__raw_sample_data, start_inclusive, end_inclusive = next( - self.__base_reader - ) - - self.__decoded_samples = self._decode_all(self.__raw_sample_data) - self.__num_samples_remaining = end_inclusive - start_inclusive + 1 - self.__index = 0 - - out = self.__decoded_samples[self.__index] - self.__index += 1 - self.__num_samples_remaining -= 1 - return out - - def _decode_all(self) -> List[DGLSamplerOutput]: - raise NotImplementedError("Must be implemented by subclass") - - def __iter__(self) -> DGLSamplerOutput: - return self - - -class HomogeneousSampleReader(SampleReader): - """ - Subclass of SampleReader that reads DGL homogeneous output samples - produced by the cuGraph distributed sampler. - """ - - def __init__( - self, - base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]], - output_format: str = "dgl.Block", - edge_dir="in", - ): - """ - Constructs a new HomogeneousSampleReader - - Parameters - ---------- - base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] - The reader responsible for loading saved samples produced by - the cuGraph distributed sampler. - output_format: str - The output format for blocks (either "dgl.Block" or - "cugraph_dgl.nn.SparseGraph"). - edge_dir: str - The direction sampling was performed in ("in" or "out"). - """ - - self.__edge_dir = edge_dir - super().__init__(base_reader, output_format=output_format) - - def __decode_csc( - self, raw_sample_data: Dict[str, "torch.Tensor"] - ) -> List[DGLSamplerOutput]: - return create_homogeneous_sampled_graphs_from_tensors_csc( - raw_sample_data, output_format=self.output_format - ) - - def __decode_coo( - self, raw_sample_data: Dict[str, "torch.Tensor"] - ) -> List[DGLSamplerOutput]: - raise NotImplementedError( - "COO format is currently unsupported in the non-dask API" - ) - - def _decode_all( - self, raw_sample_data: Dict[str, "torch.Tensor"] - ) -> List[DGLSamplerOutput]: - if "major_offsets" in raw_sample_data: - return self.__decode_csc(raw_sample_data) - else: - return self.__decode_coo(raw_sample_data) - - -class Sampler: - """ - Base sampler class for all cugraph-DGL samplers. - """ - - def __init__(self, sparse_format: str = "csc", output_format="dgl.Block"): - """ - Parameters - ---------- - sparse_format: str - Optional (default = "coo"). - The sparse format of the emitted sampled graphs. - Currently, only "csc" is supported. - output_format: str - Optional (default = "dgl.Block") - The output format of the emitted sampled graphs. - Can be either "dgl.Block" (default), or "cugraph_dgl.nn.SparseGraph". 
- """ - - if sparse_format != "csc": - raise ValueError("Only CSC format is supported at this time") - - self.__output_format = output_format - - @property - def output_format(self): - return self.__output_format - - @property - def sparse_format(self): - return self.__sparse_format - - def sample( - self, - g: cugraph_dgl.Graph, - indices: Iterator["torch.Tensor"], - batch_size: int = 1, - ) -> Iterator[ - Tuple["torch.Tensor", "torch.Tensor", List[Union[SparseGraph, "dgl.Block"]]] - ]: - """ - Samples the graph. - - Parameters - ---------- - g: cugraph_dgl.Graph - The graph being sampled. - indices: TensorType - The node ids of seed nodes where sampling will initiate from. - batch_size: int - The number of seed nodes per batch. - - Returns - ------- - Iterator[DGLSamplerOutput] - Iterator over batches. The returned tuples are in standard - DGL format: (input nodes, output nodes, blocks) where input - nodes are the renumbered input nodes, output nodes are - the renumbered output nodes, and blocks are the output graphs - for each hop. - """ - - raise NotImplementedError("Must be implemented by subclass") diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/__init__.py deleted file mode 100644 index a1dd01f33d4..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py deleted file mode 100644 index 0d3d5823097..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/extract_graph_helpers.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import annotations -from typing import Tuple, Dict, Union -import cugraph -import cudf -import dask_cudf -import numpy as np - - -def create_cugraph_graph_from_edges_dict( - edges_dict: Dict[Tuple(str, str, str), Union[dask_cudf.DataFrame, cudf.DataFrame]], - etype_id_dict: Dict[Dict[Tuple(str, str, str)] : int], - edge_dir: str, -): - if edge_dir == "in": - edges_dict = {k: reverse_edges(df) for k, df in edges_dict.items()} - if len(edges_dict) > 1: - has_multiple_etypes = True - edges_dict = { - k: add_etype_id(df, etype_id_dict[k]) for k, df in edges_dict.items() - } - else: - has_multiple_etypes = False - - edges_dfs = list(edges_dict.values()) - del edges_dict - if isinstance(edges_dfs[0], dask_cudf.DataFrame): - edges_df = dask_cudf.concat(edges_dfs, ignore_index=True) - else: - edges_df = cudf.concat(edges_dfs, ignore_index=True) - del edges_dfs - - G = cugraph.MultiGraph(directed=True) - if isinstance(edges_df, dask_cudf.DataFrame): - g_creation_f = G.from_dask_cudf_edgelist - else: - g_creation_f = G.from_cudf_edgelist - - if has_multiple_etypes: - edge_etp = "etp" - else: - edge_etp = None - - g_creation_f( - edges_df, - source="_SRC_", - destination="_DST_", - weight=None, - edge_id="_EDGE_ID_", - edge_type=edge_etp, - renumber=True, - ) - return G - - -def reverse_edges(df: Union[dask_cudf.DataFrame, cudf.DataFrame]): - return df.rename(columns={"_SRC_": "_DST_", "_DST_": "_SRC_"}) - - -def add_etype_id(df: Union[dask_cudf.DataFrame, cudf.DataFrame], etype_id: int): - df["etp"] = np.int32(etype_id) - return df diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py deleted file mode 100644 index 3b7e4502134..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ /dev/null @@ -1,692 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations -from typing import List, Tuple, Dict, Optional -from collections import defaultdict -import cudf -from cugraph.utilities.utils import import_optional -from cugraph_dgl.nn import SparseGraph - -dgl = import_optional("dgl") -torch = import_optional("torch") -cugraph_dgl = import_optional("cugraph_dgl") - - -def cast_to_tensor(ser: cudf.Series): - if len(ser) == 0: - # Empty series can not be converted to pytorch cuda tensor - t = torch.from_numpy(ser.values.get()) - return t.to("cuda") - - return torch.as_tensor(ser.values, device="cuda") - - -def _split_tensor(t, split_indices): - """ - Split a tensor into a list of tensors based on split_indices. 
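# Minimal pandas mirror of the two small helpers above (the real code operates
# on cudf/dask_cudf frames with the same column names): sampling along incoming
# edges is implemented by swapping _SRC_/_DST_, and each canonical edge type
# gets an integer "etp" column before the per-type frames are concatenated.
import numpy as np
import pandas as pd

edges = pd.DataFrame({"_SRC_": [0, 1], "_DST_": [2, 2], "_EDGE_ID_": [0, 1]})
reversed_edges = edges.rename(columns={"_SRC_": "_DST_", "_DST_": "_SRC_"})
reversed_edges["etp"] = np.int32(3)        # hypothetical id for this edge type
assert reversed_edges["_SRC_"].tolist() == [2, 2]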
- """ - # TODO: Switch to something below - # return [t[i:j] for i, j in zip(split_indices[:-1], split_indices[1:])] - if split_indices.device.type != "cpu": - split_indices = split_indices.to("cpu") - return torch.tensor_split(t, split_indices) - - -def _get_source_destination_range(sampled_df): - o = sampled_df.groupby(["batch_id", "hop_id"], as_index=True).agg( - {"sources": "max", "destinations": "max"} - ) - o.rename( - columns={"sources": "sources_range", "destinations": "destinations_range"}, - inplace=True, - ) - d = o.to_dict(orient="index") - return d - - -def _create_split_dict(tensor): - min_value = tensor.min() - max_value = tensor.max() - indices = torch.arange( - start=min_value + 1, - end=max_value + 1, - device=tensor.device, - ) - split_dict = {i: {} for i in range(min_value, max_value + 1)} - return split_dict, indices - - -def _get_renumber_map(df): - map = df["map"] - df.drop(columns=["map"], inplace=True) - - map_starting_offset = map.iloc[0] - renumber_map = map[map_starting_offset:].dropna().reset_index(drop=True) - renumber_map_batch_indices = map[1 : map_starting_offset - 1].reset_index(drop=True) - renumber_map_batch_indices = renumber_map_batch_indices - map_starting_offset - - map_end_offset = map_starting_offset + len(renumber_map) - # We only need to drop rows if the length of dataframe is determined by the map - # that is if map_length > sampled edges length - if map_end_offset == len(df): - df.dropna(axis=0, how="all", inplace=True) - df.reset_index(drop=True, inplace=True) - - return df, cast_to_tensor(renumber_map), cast_to_tensor(renumber_map_batch_indices) - - -def _get_tensor_d_from_sampled_df(df): - """ - Converts a sampled cuDF DataFrame into a list of tensors. - - Args: - df (cudf.DataFrame): The sampled cuDF DataFrame containing columns - Returns: - dict: A dictionary of tensors, keyed by batch_id and hop_id. 
- """ - range_d = _get_source_destination_range(df) - df, renumber_map, renumber_map_batch_indices = _get_renumber_map(df) - batch_id_tensor = cast_to_tensor(df["batch_id"]) - split_d, batch_indices = _create_split_dict(batch_id_tensor) - batch_split_indices = torch.searchsorted(batch_id_tensor, batch_indices).to("cpu") - - for column in df.columns: - if column != "batch_id": - t = cast_to_tensor(df[column]) - split_t = _split_tensor(t, batch_split_indices) - for bid, batch_t in zip(split_d.keys(), split_t): - split_d[bid][column] = batch_t - - split_t = _split_tensor(renumber_map, renumber_map_batch_indices) - for bid, batch_t in zip(split_d.keys(), split_t): - split_d[bid]["map"] = batch_t - del df - result_tensor_d = {} - # Cache hop_split_d, hop_indices - hop_split_empty_d, hop_indices = None, None - for batch_id, batch_d in split_d.items(): - hop_id_tensor = batch_d["hop_id"] - if hop_split_empty_d is None: - hop_split_empty_d, hop_indices = _create_split_dict(hop_id_tensor) - - hop_split_d = {k: {} for k in hop_split_empty_d.keys()} - hop_split_indices = torch.searchsorted(hop_id_tensor, hop_indices).to("cpu") - for column, t in batch_d.items(): - if column not in ["hop_id", "map"]: - split_t = _split_tensor(t, hop_split_indices) - for hid, ht in zip(hop_split_d.keys(), split_t): - hop_split_d[hid][column] = ht - for hid in hop_split_d.keys(): - hop_split_d[hid]["sources_range"] = range_d[(batch_id, hid)][ - "sources_range" - ] - hop_split_d[hid]["destinations_range"] = range_d[(batch_id, hid)][ - "destinations_range" - ] - - result_tensor_d[batch_id] = hop_split_d - result_tensor_d[batch_id]["map"] = batch_d["map"] - return result_tensor_d - - -def create_homogeneous_sampled_graphs_from_dataframe( - sampled_df: cudf.DataFrame, - edge_dir: str = "in", - return_type: str = "dgl.Block", -): - """ - This helper function creates DGL MFGS for - homogeneous graphs from cugraph sampled dataframe - - Args: - sampled_df (cudf.DataFrame): The sampled cuDF DataFrame containing - columns `sources`, `destinations`, `edge_id`, `batch_id` and - `hop_id`. - edge_dir (str): Direction of edges from samples - Returns: - list: A list containing three elements: - - input_nodes: The input nodes for the batch. - - output_nodes: The output nodes for the batch. - - graph_per_hop_ls: A list of DGL MFGS for each hop. - """ - if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]: - raise ValueError( - "return_type must be either dgl.Block or cugraph_dgl.nn.SparseGraph" - ) - - result_tensor_d = _get_tensor_d_from_sampled_df(sampled_df) - del sampled_df - result_mfgs = [ - _create_homogeneous_sampled_graphs_from_tensors_perhop( - tensors_batch_d, edge_dir, return_type - ) - for tensors_batch_d in result_tensor_d.values() - ] - del result_tensor_d - return result_mfgs - - -def _create_homogeneous_sampled_graphs_from_tensors_perhop( - tensors_batch_d, edge_dir, return_type -): - """ - This helper function creates sampled DGL MFGS for - homogeneous graphs from tensors per hop for a single - batch - Args: - tensors_batch_d (dict): A dictionary of tensors, keyed by hop_id. - edge_dir (str): Direction of edges from samples - metagraph (dgl.metagraph): The metagraph for the sampled graph - return_type (str): The type of graph to return - Returns: - tuple: A tuple of three elements: - - input_nodes: The input nodes for the batch. - - output_nodes: The output nodes for the batch. - - graph_per_hop_ls: A list of MFGS for each hop. 
- """ - if edge_dir not in ["in", "out"]: - raise ValueError(f"Invalid edge_dir {edge_dir} provided") - if edge_dir == "out": - raise ValueError("Outwards edges not supported yet") - graph_per_hop_ls = [] - seednodes_range = None - for hop_id, tensor_per_hop_d in tensors_batch_d.items(): - if hop_id != "map": - if return_type == "dgl.Block": - mfg = _create_homogeneous_dgl_block_from_tensor_d( - tensor_d=tensor_per_hop_d, - renumber_map=tensors_batch_d["map"], - seednodes_range=seednodes_range, - ) - elif return_type == "cugraph_dgl.nn.SparseGraph": - mfg = _create_homogeneous_cugraph_dgl_nn_sparse_graph( - tensor_d=tensor_per_hop_d, seednodes_range=seednodes_range - ) - else: - raise ValueError(f"Invalid return_type {return_type} provided") - seednodes_range = max( - tensor_per_hop_d["sources_range"], - tensor_per_hop_d["destinations_range"], - ) - graph_per_hop_ls.append(mfg) - - # default DGL behavior - if edge_dir == "in": - graph_per_hop_ls.reverse() - if return_type == "dgl.Block": - input_nodes = graph_per_hop_ls[0].srcdata[dgl.NID] - output_nodes = graph_per_hop_ls[-1].dstdata[dgl.NID] - else: - map = tensors_batch_d["map"] - input_nodes = map[0 : graph_per_hop_ls[0].num_src_nodes()] - output_nodes = map[0 : graph_per_hop_ls[-1].num_dst_nodes()] - return input_nodes, output_nodes, graph_per_hop_ls - - -def _create_homogeneous_dgl_block_from_tensor_d( - tensor_d, - renumber_map, - seednodes_range=None, -): - rs = tensor_d["sources"] - rd = tensor_d["destinations"] - max_src_nodes = tensor_d["sources_range"] - max_dst_nodes = tensor_d["destinations_range"] - if seednodes_range is not None: - # If we have vertices without outgoing edges, then - # sources can be missing from seednodes - # so we add them - # to ensure all the blocks are - # lined up correctly - max_dst_nodes = max(max_dst_nodes, seednodes_range) - - data_dict = {("_N", "_E", "_N"): (rs, rd)} - num_src_nodes = {"_N": max_src_nodes + 1} - num_dst_nodes = {"_N": max_dst_nodes + 1} - - block = dgl.create_block( - data_dict=data_dict, num_src_nodes=num_src_nodes, num_dst_nodes=num_dst_nodes - ) - if "edge_id" in tensor_d: - block.edata[dgl.EID] = tensor_d["edge_id"] - # Below adds run time overhead - block.srcdata[dgl.NID] = renumber_map[0 : max_src_nodes + 1] - block.dstdata[dgl.NID] = renumber_map[0 : max_dst_nodes + 1] - return block - - -def _create_homogeneous_cugraph_dgl_nn_sparse_graph(tensor_d, seednodes_range): - max_src_nodes = tensor_d["sources_range"] - max_dst_nodes = tensor_d["destinations_range"] - if seednodes_range is not None: - max_dst_nodes = max(max_dst_nodes, seednodes_range) - size = (max_src_nodes + 1, max_dst_nodes + 1) - sparse_graph = cugraph_dgl.nn.SparseGraph( - size=size, - src_ids=tensor_d["sources"], - dst_ids=tensor_d["destinations"], - formats=["csc"], - reduce_memory=True, - ) - return sparse_graph - - -def create_heterogeneous_sampled_graphs_from_dataframe( - sampled_df: cudf.DataFrame, - num_nodes_dict: Dict[str, int], - etype_id_dict: Dict[int, Tuple[str, str, str]], - etype_offset_dict: Dict[Tuple[str, str, str], int], - ntype_offset_dict: Dict[str, int], - edge_dir: str = "in", -): - """ - This helper function creates DGL MFGS from cugraph sampled dataframe - """ - sampled_df["batch_id"] = sampled_df["batch_id"] - sampled_df["batch_id"].min() - result_df_ls = sampled_df[ - ["sources", "destinations", "edge_id", "hop_id", "edge_type"] - ].scatter_by_map(sampled_df["batch_id"], keep_index=False) - del sampled_df - - result_df_ls = [ - batch_df[["sources", "destinations", "edge_id", 
"edge_type"]].scatter_by_map( - batch_df["hop_id"], keep_index=False - ) - for batch_df in result_df_ls - ] - - result_tensor_ls = [ - [ - _get_edges_dict_from_perhop_df( - h_df, etype_id_dict, etype_offset_dict, ntype_offset_dict - ) - for h_df in per_batch_ls - ] - for per_batch_ls in result_df_ls - ] - del result_df_ls - - result_mfgs = [ - _create_heterogenous_sampled_graphs_from_tensors_perhop( - tensors_perhop_ls, num_nodes_dict, edge_dir - ) - for tensors_perhop_ls in result_tensor_ls - ] - return result_mfgs - - -def _get_edges_dict_from_perhop_df( - df, etype_id_dict, etype_offset_dict, ntype_offset_dict -): - # Optimize below function - # based on _get_tensor_ls_from_sampled_df - edges_per_type_ls = df[["sources", "destinations", "edge_id"]].scatter_by_map( - df["edge_type"], map_size=len(etype_id_dict), keep_index=False - ) - del df - per_type_df_d = {etype_id_dict[i]: df for i, df in enumerate(edges_per_type_ls)} - del edges_per_type_ls - # reverse src,dst here - per_type_tensor_d = { - etype: ( - cast_to_tensor(etype_df["sources"]) - ntype_offset_dict[etype[0]], - cast_to_tensor(etype_df["destinations"]) - ntype_offset_dict[etype[2]], - cast_to_tensor(etype_df["edge_id"]) - etype_offset_dict[etype], - ) - for etype, etype_df in per_type_df_d.items() - } - return per_type_tensor_d - - -def _create_heterogenous_sampled_graphs_from_tensors_perhop( - tensors_perhop_ls, num_nodes_dict, edge_dir -): - if edge_dir not in ["in", "out"]: - raise ValueError(f"Invalid edge_dir {edge_dir} provided") - if edge_dir == "out": - raise ValueError("Outwards edges not supported yet") - graph_per_hop_ls = [] - output_nodes = None - - seed_nodes = None - for hop_edges_dict in tensors_perhop_ls: - block = create_heterogenous_dgl_block_from_tensors_dict( - hop_edges_dict, num_nodes_dict, seed_nodes - ) - seed_nodes = block.srcdata[dgl.NID] - if output_nodes is None: - output_nodes = block.dstdata[dgl.NID] - graph_per_hop_ls.append(block) - - # default DGL behavior - if edge_dir == "in": - graph_per_hop_ls.reverse() - return seed_nodes, output_nodes, graph_per_hop_ls - - -def create_heterogenous_dgl_block_from_tensors_dict( - edges_dict: Dict[Tuple(str, str, str), (torch.Tensor, torch.Tensor, torch.Tensor)], - num_nodes_dict: Dict[str, torch.Tensor], - seed_nodes: Optional[Dict[str, torch.Tensor]], -): - data_dict = {k: (s, d) for k, (s, d, _) in edges_dict.items()} - edge_ids_dict = {k: eid for k, (_, _, eid) in edges_dict.items()} - - sampled_graph = dgl.heterograph( - data_dict=data_dict, - num_nodes_dict=num_nodes_dict, - ) - sampled_graph.edata[dgl.EID] = edge_ids_dict - - src_d = defaultdict(list) - dst_d = defaultdict(list) - - for (s, _, d), (src_id, dst_id) in data_dict.items(): - src_d[s].append(src_id) - dst_d[d].append(dst_id) - - src_d = {k: torch.cat(v).unique() for k, v in src_d.items() if len(v) > 0} - if seed_nodes is None: - seed_nodes = {k: torch.cat(v).unique() for k, v in dst_d.items() if len(v) > 0} - - block = dgl.to_block(sampled_graph, dst_nodes=seed_nodes, src_nodes=src_d) - block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] - return block - - -def _process_sampled_tensors_csc( - tensors: Dict["torch.Tensor"], - reverse_hop_id: bool = True, -) -> Tuple[ - Dict[int, Dict[int, Dict[str, "torch.Tensor"]]], - List["torch.Tensor"], - List[List[int, int]], -]: - """ - Convert tensors generated by BulkSampler to a dictionary of tensors, to - facilitate MFG creation. The sampled graphs in the dataframe use CSC-format. 
- - Parameters - ---------- - tensors: Dict[torch.Tensor] - The output from BulkSampler compressed in CSC format. The dataframe - should be generated with `compression="CSR"` in BulkSampler, - since the sampling routine treats seed nodes as sources. - - reverse_hop_id: bool (default=True) - Reverse hop id. - - Returns - ------- - tensors_dict: dict - A nested dictionary keyed by batch id and hop id. - `tensor_dict[batch_id][hop_id]` holds "minors" and "major_offsets" - values for CSC MFGs. - - renumber_map_list: list - List of renumbering maps for looking up global indices of nodes. One - map for each batch. - - mfg_sizes: list - List of the number of nodes in each message passing layer. For the - k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and - destinations, respectively. - """ - - major_offsets = tensors["major_offsets"] - minors = tensors["minors"] - label_hop_offsets = tensors["label_hop_offsets"] - renumber_map = tensors["map"] - renumber_map_offsets = tensors["renumber_map_offsets"] - - n_batches = len(renumber_map_offsets) - 1 - n_hops = int((len(label_hop_offsets) - 1) / n_batches) - - # make global offsets local - # Have to make a clone as pytorch does not allow - # in-place operations on tensors - major_offsets -= major_offsets[0].clone() - label_hop_offsets -= label_hop_offsets[0].clone() - renumber_map_offsets -= renumber_map_offsets[0].clone() - - # get the sizes of each adjacency matrix (for MFGs) - mfg_sizes = (label_hop_offsets[1:] - label_hop_offsets[:-1]).reshape( - (n_batches, n_hops) - ) - n_nodes = renumber_map_offsets[1:] - renumber_map_offsets[:-1] - mfg_sizes = torch.hstack((mfg_sizes, n_nodes.reshape(n_batches, -1))) - if reverse_hop_id: - mfg_sizes = mfg_sizes.flip(1) - - tensors_dict = {} - renumber_map_list = [] - # Note: minors and major_offsets from BulkSampler are of type int32 - # and int64 respectively. Since pylibcugraphops binding code doesn't - # support distinct node and edge index type, we simply casting both - # to int32 for now. - minors = minors.int() - major_offsets = major_offsets.int() - # Note: We transfer tensors to CPU here to avoid the overhead of - # transferring them in each iteration of the for loop below. - major_offsets_cpu = major_offsets.to("cpu").numpy() - label_hop_offsets_cpu = label_hop_offsets.to("cpu").numpy() - - for batch_id in range(n_batches): - batch_dict = {} - for hop_id in range(n_hops): - hop_dict = {} - idx = batch_id * n_hops + hop_id # idx in label_hop_offsets - major_offsets_start = label_hop_offsets_cpu[idx] - major_offsets_end = label_hop_offsets_cpu[idx + 1] - minors_start = major_offsets_cpu[major_offsets_start] - minors_end = major_offsets_cpu[major_offsets_end] - hop_dict["minors"] = minors[minors_start:minors_end] - hop_dict["major_offsets"] = ( - major_offsets[major_offsets_start : major_offsets_end + 1] - - major_offsets[major_offsets_start] - ) - if reverse_hop_id: - batch_dict[n_hops - 1 - hop_id] = hop_dict - else: - batch_dict[hop_id] = hop_dict - - tensors_dict[batch_id] = batch_dict - - renumber_map_list.append( - renumber_map[ - renumber_map_offsets[batch_id] : renumber_map_offsets[batch_id + 1] - ], - ) - - return tensors_dict, renumber_map_list, mfg_sizes.tolist() - - -def _process_sampled_df_csc( - df: cudf.DataFrame, - reverse_hop_id: bool = True, -): - """ - Convert a dataframe generated by BulkSampler to a dictionary of tensors, to - facilitate MFG creation. The sampled graphs in the dataframe use CSC-format. 
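# Numeric walk-through of the offset arithmetic above for a single batch with
# two hops (hypothetical values): label_hop_offsets marks where each
# (batch, hop) group of CSC columns starts in major_offsets; slicing
# major_offsets for one hop and re-basing it to zero yields that hop's local
# column offsets, and the matching minors slice holds its row ids.
import torch

minors = torch.tensor([5, 6, 7, 8, 9, 9])          # row (source) ids
major_offsets = torch.tensor([0, 2, 4, 5, 6])      # CSC column offsets
label_hop_offsets = torch.tensor([0, 2, 4])        # 1 batch x 2 hops

hop_id = 1
start, end = label_hop_offsets[hop_id], label_hop_offsets[hop_id + 1]
hop_offsets = major_offsets[start : end + 1] - major_offsets[start]
hop_minors = minors[major_offsets[start] : major_offsets[end]]
assert hop_offsets.tolist() == [0, 1, 2]
assert hop_minors.tolist() == [9, 9]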
- - Parameters - ---------- - df: cudf.DataFrame - The output from BulkSampler compressed in CSC format. The dataframe - should be generated with `compression="CSR"` in BulkSampler, - since the sampling routine treats seed nodes as sources. - - reverse_hop_id: bool (default=True) - Reverse hop id. - - Returns - ------- - tensors_dict: dict - A nested dictionary keyed by batch id and hop id. - `tensor_dict[batch_id][hop_id]` holds "minors" and "major_offsets" - values for CSC MFGs. - - renumber_map_list: list - List of renumbering maps for looking up global indices of nodes. One - map for each batch. - - mfg_sizes: list - List of the number of nodes in each message passing layer. For the - k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and - destinations, respectively. - """ - - return _process_sampled_tensors_csc( - { - "major_offsets": cast_to_tensor(df.major_offsets.dropna()), - "label_hop_offsets": cast_to_tensor(df.label_hop_offsets.dropna()), - "renumber_map_offsets": cast_to_tensor(df.renumber_map_offsets.dropna()), - "map": cast_to_tensor(df["map"].dropna()), - "minors": cast_to_tensor(df.minors.dropna()), - }, - reverse_hop_id=reverse_hop_id, - ) - - -def _create_homogeneous_blocks_from_csc( - tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]], - renumber_map_list: List[torch.Tensor], - mfg_sizes: List[int, int], -): - """Create mini-batches of MFGs in the dgl.Block format. - The input arguments are the outputs of - the function `_process_sampled_df_csc`. - - Returns - ------- - output: list - A list of mini-batches. Each mini-batch is a list that consists of - `input_nodes` tensor, `output_nodes` tensor and a list of MFGs. - """ - n_batches, n_hops = len(mfg_sizes), len(mfg_sizes[0]) - 1 - output = [] - for b_id in range(n_batches): - output_batch = [] - output_batch.append(renumber_map_list[b_id]) - output_batch.append(renumber_map_list[b_id][: mfg_sizes[b_id][-1]]) - - mfgs = [ - SparseGraph( - size=(mfg_sizes[b_id][h_id], mfg_sizes[b_id][h_id + 1]), - src_ids=tensors_dict[b_id][h_id]["minors"], - cdst_ids=tensors_dict[b_id][h_id]["major_offsets"], - formats=["csc", "coo"], - reduce_memory=True, - ) - for h_id in range(n_hops) - ] - - blocks = [] - seednodes_range = None - for mfg in reversed(mfgs): - block_mfg = _create_homogeneous_dgl_block_from_tensor_d( - { - "sources": mfg.src_ids(), - "destinations": mfg.dst_ids(), - "sources_range": mfg._num_src_nodes - 1, - "destinations_range": mfg._num_dst_nodes - 1, - }, - renumber_map=renumber_map_list[b_id], - seednodes_range=seednodes_range, - ) - - seednodes_range = max( - mfg._num_src_nodes - 1, - mfg._num_dst_nodes - 1, - ) - blocks.append(block_mfg) - del mfgs - - blocks.reverse() - - output_batch.append(blocks) - - output.append(output_batch) - return output - - -def _create_homogeneous_sparse_graphs_from_csc( - tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]], - renumber_map_list: List[torch.Tensor], - mfg_sizes: List[int, int], -) -> List[List[torch.Tensor, torch.Tensor, List[SparseGraph]]]: - """Create mini-batches of MFGs. The input arguments are the outputs of - the function `_process_sampled_df_csc`. - - Returns - ------- - output: list - A list of mini-batches. Each mini-batch is a list that consists of - `input_nodes` tensor, `output_nodes` tensor and a list of MFGs. 
- """ - n_batches, n_hops = len(mfg_sizes), len(mfg_sizes[0]) - 1 - output = [] - for b_id in range(n_batches): - output_batch = [] - output_batch.append(renumber_map_list[b_id]) - output_batch.append(renumber_map_list[b_id][: mfg_sizes[b_id][-1]]) - mfgs = [ - SparseGraph( - size=(mfg_sizes[b_id][h_id], mfg_sizes[b_id][h_id + 1]), - src_ids=tensors_dict[b_id][h_id]["minors"], - cdst_ids=tensors_dict[b_id][h_id]["major_offsets"], - formats=["csc"], - reduce_memory=True, - ) - for h_id in range(n_hops) - ] - - output_batch.append(mfgs) - - output.append(output_batch) - - return output - - -def create_homogeneous_sampled_graphs_from_dataframe_csc( - sampled_df: cudf.DataFrame, output_format: str = "cugraph_dgl.nn.SparseGraph" -): - """Public API to create mini-batches of MFGs using a dataframe output by - BulkSampler, where the sampled graph is compressed in CSC format.""" - if output_format == "cugraph_dgl.nn.SparseGraph": - return _create_homogeneous_sparse_graphs_from_csc( - *(_process_sampled_df_csc(sampled_df)), - ) - elif output_format == "dgl.Block": - return _create_homogeneous_blocks_from_csc( - *(_process_sampled_df_csc(sampled_df)), - ) - else: - raise ValueError(f"Invalid output format {output_format}") - - -def create_homogeneous_sampled_graphs_from_tensors_csc( - tensors: Dict["torch.Tensor"], output_format: str = "cugraph_dgl.nn.SparseGraph" -): - """Public API to create mini-batches of MFGs using a dataframe output by - BulkSampler, where the sampled graph is compressed in CSC format.""" - if output_format == "cugraph_dgl.nn.SparseGraph": - return _create_homogeneous_sparse_graphs_from_csc( - *(_process_sampled_tensors_csc(tensors)), - ) - elif output_format == "dgl.Block": - return _create_homogeneous_blocks_from_csc( - *(_process_sampled_tensors_csc(tensors)), - ) - else: - raise ValueError(f"Invalid output format {output_format}") diff --git a/python/cugraph-dgl/cugraph_dgl/features.py b/python/cugraph-dgl/cugraph_dgl/features.py deleted file mode 100644 index 9dc009f4127..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/features.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -from cugraph.utilities.utils import import_optional, MissingModule - -torch = import_optional("torch") -dgl = import_optional("dgl") -wgth = import_optional("pylibwholegraph.torch") - - -class WholeFeatureStore( - object if isinstance(dgl, MissingModule) else dgl.storages.base.FeatureStorage -): - """ - Interface for feature storage. - """ - - def __init__( - self, - tensor: "torch.Tensor", - memory_type: str = "distributed", - location: str = "cpu", - ): - """ - Constructs a new WholeFeatureStore object that wraps a WholeGraph wholememory - distributed tensor. - - Parameters - ---------- - t: torch.Tensor - The local slice of the tensor being distributed. These should be in order - by rank (i.e. rank 0 contains elements 0-9, rank 1 contains elements 10-19, - rank 3 contains elements 20-29, etc.) 
The sizes do not need to be equal. - memory_type: str (optional, default='distributed') - The memory type of this store. Options are - 'distributed', 'chunked', and 'continuous'. - For more information consult the WholeGraph - documentation. - location: str(optional, default='cpu') - The location ('cpu' or 'cuda') where data is stored. - """ - self.__wg_comm = wgth.get_global_communicator() - - if len(tensor.shape) > 2: - raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.") - - rank = torch.distributed.get_rank() - world_size = torch.distributed.get_world_size() - - ld = torch.tensor(tensor.shape[0], device="cuda", dtype=torch.int64) - sizes = torch.empty((world_size,), device="cuda", dtype=torch.int64) - torch.distributed.all_gather_into_tensor(sizes, ld) - - sizes = sizes.cpu() - ld = sizes.sum() - - self.__td = -1 if len(tensor.shape) == 1 else tensor.shape[1] - global_shape = [ - int(ld), - self.__td if self.__td > 0 else 1, - ] - - if self.__td < 0: - tensor = tensor.reshape((tensor.shape[0], 1)) - - wg_tensor = wgth.create_wholememory_tensor( - self.__wg_comm, - memory_type, - location, - global_shape, - tensor.dtype, - [global_shape[1], 1], - ) - - offset = sizes[:rank].sum() if rank > 0 else 0 - - wg_tensor.scatter( - tensor.clone(memory_format=torch.contiguous_format).cuda(), - torch.arange( - offset, offset + tensor.shape[0], dtype=torch.int64, device="cuda" - ).contiguous(), - ) - - self.__wg_comm.barrier() - - self.__wg_tensor = wg_tensor - - def requires_ddp(self) -> bool: - return True - - def fetch( - self, - indices: torch.Tensor, - device: torch.cuda.Device, - pin_memory=False, - **kwargs, - ): - if pin_memory: - warnings.warn("pin_memory has no effect for WholeFeatureStorage.") - - t = self.__wg_tensor.gather( - indices.cuda(), - force_dtype=self.__wg_tensor.dtype, - ) - - if self.__td < 0: - t = t.reshape((t.shape[0],)) - - return t.to(torch.device(device)) diff --git a/python/cugraph-dgl/cugraph_dgl/graph.py b/python/cugraph-dgl/cugraph_dgl/graph.py deleted file mode 100644 index 88b93656fa8..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/graph.py +++ /dev/null @@ -1,931 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -from typing import Union, Optional, Dict, Tuple, List - -from cugraph.utilities.utils import import_optional -from cugraph.gnn import cugraph_comms_get_raft_handle - -import cupy -import pylibcugraph - -from cugraph_dgl.typing import TensorType -from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor -from cugraph_dgl.features import WholeFeatureStore -from cugraph_dgl.view import ( - HeteroNodeView, - HeteroNodeDataView, - HeteroEdgeView, - HeteroEdgeDataView, - EmbeddingView, -) - - -# Have to use import_optional even though these are required -# dependencies in order to build properly. 
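A sketch of how the feature store above is used. It assumes torch.distributed and the WholeGraph communicator have already been initialized (for example through cuGraph's comms utilities), and the shard sizes and indices are made up:

    import torch
    from cugraph_dgl.features import WholeFeatureStore

    # Each rank wraps only its local slice; rows are laid out contiguously by rank
    # (rank 0 holds the first block of rows, rank 1 the next block, and so on).
    local_feats = torch.randn(1000, 64)  # placeholder shard for this rank
    store = WholeFeatureStore(local_feats, memory_type="distributed", location="cpu")

    # fetch() takes global row indices and gathers across ranks onto the device.
    rows = torch.tensor([0, 999, 1500])  # hypothetical global indices
    feats = store.fetch(rows, device="cuda")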
-dgl = import_optional("dgl") -torch = import_optional("torch") -tensordict = import_optional("tensordict") - -HOMOGENEOUS_NODE_TYPE = "n" -HOMOGENEOUS_EDGE_TYPE = (HOMOGENEOUS_NODE_TYPE, "e", HOMOGENEOUS_NODE_TYPE) - - -class Graph: - """ - cuGraph-backed duck-typed version of dgl.DGLGraph that distributes - the graph across workers. This object uses lazy graph creation. - Users can repeatedly call add_edges, and the tensors won't - be converted into a cuGraph graph until one is needed - (i.e. when creating a loader). Supports - single-node/single-GPU, single-node/multi-GPU, and - multi-node/multi-GPU graph storage. - - Each worker should have a slice of the graph locally, and - call put_edge_index with its slice. - """ - - def __init__( - self, - is_multi_gpu: bool = False, - ndata_storage="torch", - edata_storage="torch", - **kwargs, - ): - """ - Parameters - ---------- - is_multi_gpu: bool (optional, default=False) - Specifies whether this graph is distributed across GPUs. - ndata_storage: str (optional, default='torch') - Specifies where node data should be stored - (options are 'torch' and 'wholegraph'). - If using PyTorch tensors for storage ('torch') - then data will be replicated across workers and data - for all nodes should be provided when calling add_nodes. - If using WholeGraph wholememory tensors for storage, - then data will be distributed across workers and only - the local slice of the data should be provided when - calling add_nodes. - edata_storage: str (optional, default='torch') - If using PyTorch tensors for storage ('torch') - then data will be replicated across workers and data - for all nodes should be provided when calling add_edge. - If using WholeGraph wholememory tensors for storage, - then data will be distributed across workers and only - the local slice of the data should be provided when - calling add_edges. - kwargs: - Optional kwargs for WholeGraph feature storage. 
- """ - - if ndata_storage not in ("torch", "wholegraph"): - raise ValueError( - "Invalid node storage type (valid types are 'torch' and 'wholegraph')" - ) - if edata_storage not in ("torch", "wholegraph"): - raise ValueError( - "Invalid edge storage type (valid types are 'torch' and 'wholegraph')" - ) - - self.__num_nodes_dict = {} - self.__num_edges_dict = {} - self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) - - self.__graph = None - self.__vertex_offsets = None - self.__handle = None - self.__is_multi_gpu = is_multi_gpu - - self.__ndata_storage_type = ( - WholeFeatureStore - if ndata_storage == "wholegraph" - else dgl.storages.pytorch_tensor.PyTorchTensorStorage - ) - self.__edata_storage_type = ( - WholeFeatureStore - if edata_storage == "wholegraph" - else dgl.storages.pytorch_tensor.PyTorchTensorStorage - ) - self.__ndata_storage = {} - self.__edata_storage = {} - self.__wg_kwargs = kwargs - - @property - def is_multi_gpu(self): - return self.__is_multi_gpu - - def to_canonical_etype( - self, etype: Union[str, Tuple[str, str, str]] - ) -> Tuple[str, str, str]: - if etype is None: - if len(self.canonical_etypes) > 1: - raise ValueError("Edge type is required for heterogeneous graphs.") - return HOMOGENEOUS_EDGE_TYPE - - if isinstance(etype, tuple) and len(etype) == 3: - return etype - - for src_type, rel_type, dst_type in self.__edge_indices.keys( - leaves_only=True, include_nested=True - ): - if etype == rel_type: - return (src_type, rel_type, dst_type) - - raise ValueError("Unknown relation type " + etype) - - def add_nodes( - self, - global_num_nodes: int, - data: Optional[Dict[str, TensorType]] = None, - ntype: Optional[str] = None, - ): - """ - Adds the given number of nodes to this graph. Can only be called once - per node type. The number of nodes specified here refers to the total - number of nodes across all workers (the entire graph). If the backing - feature store is distributed (i.e. wholegraph), then only local features - should be passed to the data argument. If the backing feature store is - replicated, then features for all nodes in the graph should be passed to - the data argument, including those for nodes not on the local worker. - - Parameters - ---------- - global_num_nodes: int - The total number of nodes of the given type in this graph. - The same number should be passed to every worker. - data: Dict[str, TensorType] (optional, default=None) - Node feature tensors. - ntype: str (optional, default=None) - The node type being modified. Required for heterogeneous graphs. - """ - if ntype is None: - if len(self.__num_nodes_dict.keys()) > 1: - raise ValueError("Node type is required for heterogeneous graphs.") - ntype = HOMOGENEOUS_NODE_TYPE - - if ntype in self.__num_nodes_dict: - raise ValueError( - "Calling add_nodes multiple types for the same " - "node type is not allowed in cuGraph-DGL" - ) - - if self.is_multi_gpu: - # Ensure all nodes got the same number of nodes passed - world_size = torch.distributed.get_world_size() - local_size = torch.tensor( - [global_num_nodes], device="cuda", dtype=torch.int64 - ) - ns = torch.empty((world_size,), device="cuda", dtype=torch.int64) - torch.distributed.all_gather_into_tensor(ns, local_size) - if not (ns == global_num_nodes).all(): - raise ValueError("The global number of nodes must match on all workers") - - # Ensure the sum of the feature shapes equals the global number of nodes. 
- if data is not None: - for feature_name, feature_tensor in data.items(): - features_size = torch.tensor( - [int(feature_tensor.shape[0])], device="cuda", dtype=torch.int64 - ) - torch.distributed.all_reduce( - features_size, op=torch.distributed.ReduceOp.SUM - ) - if features_size != global_num_nodes: - raise ValueError( - "The total length of the feature vector across workers must" - " match the global number of nodes but it does not " - f"match for {feature_name}." - ) - - self.__num_nodes_dict[ntype] = global_num_nodes - - if data is not None: - for feature_name, feature_tensor in data.items(): - self.__ndata_storage[ntype, feature_name] = self.__ndata_storage_type( - _cast_to_torch_tensor(feature_tensor), **self.__wg_kwargs - ) - - self.__graph = None - self.__vertex_offsets = None - - def __check_node_ids(self, ntype: str, ids: TensorType): - """ - Ensures all node ids in the provided id tensor are valid. - Raises a ValueError if any are invalid. - - Parameters - ---------- - ntype: str - The node type being validated against. - ids: - The tensor of ids being validated. - """ - if ntype in self.__num_nodes_dict: - if ids.max() + 1 > self.num_nodes(ntype): - raise ValueError( - f"input tensor contains invalid node ids for type {ntype}" - ) - else: - raise ValueError( - f"add_nodes() must be called for type {ntype} before calling num_edges." - ) - - def add_edges( - self, - u: TensorType, - v: TensorType, - data: Optional[Dict[str, TensorType]] = None, - etype: Optional[Union[str, Tuple[str, str, str]]] = None, - ) -> None: - """ - Adds edges to this graph. Must be called after add_nodes - is called for the src/dst node type. If the backing feature - store is distributed (i.e. wholegraph), then only local - features should be passed to the data argument. If the - backing feature store is replicated, then features for - all edges should be passed to the data argument, - including those for edges not on the local worker. - - Parameters - ---------- - u: TensorType - 1d tensor of source node ids (local slice of the distributed edgelist). - v: TensorType - 1d tensor of destination node ids (local slice of the distributed edgelist). - data: Dict[str, TensorType] (optional, default=None) - Dictionary containing edge features for the new edges. - etype: Union[str, Tuple[str, str, str]] - The edge type of the edges being inserted. Not required - for homogeneous graphs, which have only one edge type. - """ - - # Validate all inputs before proceeding - # The number of nodes for the src/dst type needs to be known and there cannot - # be any edges of this type in the graph. - dgl_can_edge_type = self.to_canonical_etype(etype) - src_type, _, dst_type = dgl_can_edge_type - if dgl_can_edge_type in self.__edge_indices.keys( - leaves_only=True, include_nested=True - ): - raise ValueError( - "This cuGraph-DGL graph already contains edges of type" - f" {dgl_can_edge_type}. Calling add_edges multiple times" - " for the same edge type is not supported." 
- ) - self.__check_node_ids(src_type, u) - self.__check_node_ids(dst_type, v) - - self.__edge_indices[dgl_can_edge_type] = torch.stack( - [ - _cast_to_torch_tensor(u), - _cast_to_torch_tensor(v), - ] - ).to(self.idtype) - - if data is not None: - for attr_name, attr_tensor in data.items(): - self.__edata_storage[ - dgl_can_edge_type, attr_name - ] = self.__edata_storage_type( - _cast_to_torch_tensor(attr_tensor), **self.__wg_kwargs - ) - - num_edges = self.__edge_indices[dgl_can_edge_type].shape[1] - if self.is_multi_gpu: - num_edges = torch.tensor([num_edges], device="cuda", dtype=torch.int64) - torch.distributed.all_reduce(num_edges, op=torch.distributed.ReduceOp.SUM) - - self.__num_edges_dict[dgl_can_edge_type] = int(num_edges) - - self.__graph = None - self.__vertex_offsets = None - - def num_nodes(self, ntype: Optional[str] = None) -> int: - """ - Returns the number of nodes of ntype, or if ntype is not provided, - the total number of nodes in the graph. - """ - if ntype is None: - return sum(self.__num_nodes_dict.values()) - - return self.__num_nodes_dict[ntype] - - def number_of_nodes(self, ntype: Optional[str] = None) -> int: - """ - Alias for num_nodes. - """ - return self.num_nodes(ntype=ntype) - - def num_edges(self, etype: Union[str, Tuple[str, str, str]] = None) -> int: - """ - Returns the number of edges of etype, or if etype is not provided, - the total number of edges in the graph. - """ - if etype is None: - return sum(self.__num_edges_dict.values()) - - etype = self.to_canonical_etype(etype) - return self.__num_edges_dict[etype] - - def number_of_edges(self, etype: Union[str, Tuple[str, str, str]] = None) -> int: - """ - Alias for num_edges. - """ - return self.num_edges(etype=etype) - - @property - def ntypes(self) -> List[str]: - """ - Returns the node type names in this graph. - """ - return list(self.__num_nodes_dict.keys()) - - @property - def etypes(self) -> List[str]: - """ - Returns the edge type names in this graph - (the second element of the canonical edge - type tuple). - """ - return [et[1] for et in self.__num_edges_dict.keys()] - - @property - def canonical_etypes(self) -> List[str]: - """ - Returns the canonical edge type names in this - graph. - """ - return list(self.__num_edges_dict.keys()) - - @property - def _vertex_offsets(self) -> Dict[str, int]: - if self.__vertex_offsets is None: - ordered_keys = sorted(list(self.ntypes)) - self.__vertex_offsets = {} - offset = 0 - for vtype in ordered_keys: - self.__vertex_offsets[vtype] = offset - offset += self.num_nodes(vtype) - - return dict(self.__vertex_offsets) - - def __get_edgelist(self, prob_attr=None) -> Dict[str, "torch.Tensor"]: - """ - This function always returns src/dst labels with respect - to the out direction. - - Returns - ------- - Dict[str, torch.Tensor] with the following keys: - src: source vertices (int64) - Note that src is the 1st element of the DGL edge index. - dst: destination vertices (int64) - Note that dst is the 2nd element of the DGL edge index. - eid: edge ids for each edge (int64) - Note that these start from 0 for each edge type. - etp: edge types for each edge (int32) - Note that these are in lexicographic order. - """ - sorted_keys = sorted( - list(self.__edge_indices.keys(leaves_only=True, include_nested=True)) - ) - - # note that this still follows the DGL convention of (src, rel, dst) - # i.e. 
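A single-GPU sketch of building the duck-typed graph with the methods above; node counts, edge lists, and feature names are toy values:

    import torch
    from cugraph_dgl.graph import Graph

    g = Graph(is_multi_gpu=False)

    # Homogeneous graph: the implicit node type "n" and edge type ("n", "e", "n").
    g.add_nodes(5, data={"feat": torch.randn(5, 8)})
    g.add_edges(
        torch.tensor([0, 1, 2, 3]),
        torch.tensor([1, 2, 3, 4]),
        data={"weight": torch.rand(4)},
    )

    assert g.num_nodes() == 5 and g.num_edges() == 4

    # For a heterogeneous graph, pass ntype= to add_nodes and a canonical
    # (src_type, rel_type, dst_type) etype= to add_edges instead.

In the multi-GPU case the same calls are made on every worker, but `global_num_nodes` must agree across ranks while the edge lists (and, for WholeGraph-backed storage, the feature tensors) hold only the local slice.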
(author, writes, paper): [[0,1,2],[2,0,1]] is referring to a - # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1), - # and (paper 1) -> (author 0) - edge_index = torch.concat( - [ - torch.stack( - [ - self.__edge_indices[src_type, rel_type, dst_type][0] - + self._vertex_offsets[src_type], - self.__edge_indices[src_type, rel_type, dst_type][1] - + self._vertex_offsets[dst_type], - ] - ) - for (src_type, rel_type, dst_type) in sorted_keys - ], - axis=1, - ).cuda() - - edge_type_array = torch.arange( - len(sorted_keys), dtype=torch.int32, device="cuda" - ).repeat_interleave( - torch.tensor( - [self.__edge_indices[et].shape[1] for et in sorted_keys], - device="cuda", - dtype=torch.int32, - ) - ) - - num_edges_t = torch.tensor( - [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" - ) - - if self.is_multi_gpu: - rank = torch.distributed.get_rank() - world_size = torch.distributed.get_world_size() - - num_edges_all_t = torch.empty( - world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda" - ) - torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t) - - start_offsets = num_edges_all_t[:rank].T.sum(axis=1) - - else: - rank = 0 - start_offsets = torch.zeros( - (len(sorted_keys),), dtype=torch.int64, device="cuda" - ) - num_edges_all_t = num_edges_t.reshape((1, num_edges_t.numel())) - - # Use pinned memory here for fast access to CPU/WG storage - edge_id_array_per_type = [ - torch.arange( - start_offsets[i], - start_offsets[i] + num_edges_all_t[rank][i], - dtype=torch.int64, - device="cpu", - ).pin_memory() - for i in range(len(sorted_keys)) - ] - - # Retrieve the weights from the appropriate feature(s) - # DGL implicitly requires all edge types use the same - # feature name. - if prob_attr is None: - weights = None - else: - if len(sorted_keys) > 1: - weights = torch.concat( - [ - self.edata[prob_attr][sorted_keys[i]][ix] - for i, ix in enumerate(edge_id_array_per_type) - ] - ) - else: - weights = self.edata[prob_attr][edge_id_array_per_type[0]] - - # Safe to move this to cuda because the consumer will always - # move it to cuda if it isn't already there. - edge_id_array = torch.concat(edge_id_array_per_type).cuda() - - edgelist_dict = { - "src": edge_index[0], - "dst": edge_index[1], - "etp": edge_type_array, - "eid": edge_id_array, - } - - if weights is not None: - edgelist_dict["wgt"] = weights - - return edgelist_dict - - @property - def is_homogeneous(self): - return len(self.__num_edges_dict) <= 1 and len(self.__num_nodes_dict) <= 1 - - @property - def idtype(self): - return torch.int64 - - @property - def _resource_handle(self): - if self.__handle is None: - if self.is_multi_gpu: - self.__handle = pylibcugraph.ResourceHandle( - cugraph_comms_get_raft_handle().getHandle() - ) - else: - self.__handle = pylibcugraph.ResourceHandle() - return self.__handle - - def _graph( - self, - direction: str, - prob_attr: Optional[str] = None, - ) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: - """ - Gets the pylibcugraph Graph object with edges pointing in the given direction - (i.e. 'out' is standard, 'in' is reverse). 
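`__get_edgelist` flattens all edge types into a single offsetted edge index before handing it to pylibcugraph. A toy sketch of that offsetting, with made-up types, counts, and ids:

    import torch

    # Per-type local edge indices (DGL convention: row 0 = src, row 1 = dst).
    edge_indices = {
        ("author", "writes", "paper"): torch.tensor([[0, 1], [0, 1]]),
        ("paper", "cites", "paper"):   torch.tensor([[0, 1], [1, 0]]),
    }
    # Vertex offsets follow the lexicographic node-type order used above:
    # 2 "author" nodes first, then 3 "paper" nodes.
    vertex_offsets = {"author": 0, "paper": 2}

    flat_edge_index = torch.concat(
        [
            torch.stack([ix[0] + vertex_offsets[src], ix[1] + vertex_offsets[dst]])
            for (src, _, dst), ix in sorted(edge_indices.items())
        ],
        axis=1,
    )
    # tensor([[0, 1, 2, 3],
    #         [2, 3, 3, 2]])

Edge ids are assigned per type starting from zero, and the edge-type column repeats each type's index once per edge, which is what the `eid` and `etp` entries of the returned dictionary hold.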
- """ - - if direction not in ["out", "in"]: - raise ValueError(f"Invalid direction {direction} (expected 'in' or 'out').") - - graph_properties = pylibcugraph.GraphProperties( - is_multigraph=True, is_symmetric=False - ) - - if self.__graph is not None: - if ( - self.__graph["direction"] != direction - or self.__graph["prob_attr"] != prob_attr - ): - self.__graph = None - - if self.__graph is None: - src_col, dst_col = ("src", "dst") if direction == "out" else ("dst", "src") - edgelist_dict = self.__get_edgelist(prob_attr=prob_attr) - - if self.is_multi_gpu: - rank = torch.distributed.get_rank() - world_size = torch.distributed.get_world_size() - - vertices_array = cupy.arange(self.num_nodes(), dtype="int64") - vertices_array = cupy.array_split(vertices_array, world_size)[rank] - - graph = pylibcugraph.MGGraph( - self._resource_handle, - graph_properties, - [cupy.asarray(edgelist_dict[src_col]).astype("int64")], - [cupy.asarray(edgelist_dict[dst_col]).astype("int64")], - vertices_array=[vertices_array], - edge_id_array=[cupy.asarray(edgelist_dict["eid"])], - edge_type_array=[cupy.asarray(edgelist_dict["etp"])], - weight_array=[cupy.asarray(edgelist_dict["wgt"])] - if "wgt" in edgelist_dict - else None, - ) - else: - graph = pylibcugraph.SGGraph( - self._resource_handle, - graph_properties, - cupy.asarray(edgelist_dict[src_col]).astype("int64"), - cupy.asarray(edgelist_dict[dst_col]).astype("int64"), - vertices_array=cupy.arange(self.num_nodes(), dtype="int64"), - edge_id_array=cupy.asarray(edgelist_dict["eid"]), - edge_type_array=cupy.asarray(edgelist_dict["etp"]), - weight_array=cupy.asarray(edgelist_dict["wgt"]) - if "wgt" in edgelist_dict - else None, - ) - - self.__graph = {"graph": graph, "direction": direction, "prob_attr": prob_attr} - - return self.__graph["graph"] - - def _has_n_emb(self, ntype: str, emb_name: str) -> bool: - return (ntype, emb_name) in self.__ndata_storage - - def _get_n_emb( - self, ntype: Union[str, None], emb_name: str, u: Union[str, TensorType] - ) -> Union["torch.Tensor", "EmbeddingView"]: - """ - Gets the embedding of a single node type. - Unlike DGL, this function takes the string node - type name instead of an integer id. - - Parameters - ---------- - ntype: str - The node type to get the embedding of. - emb_name: str - The embedding name of the embedding to get. - u: Union[str, TensorType] - Nodes to get the representation of, or ALL - to get the representation of all nodes of - the given type (returns embedding view). - - Returns - ------- - Union[torch.Tensor, cugraph_dgl.view.EmbeddingView] - The embedding of the given edge type with the given embedding name. 
- """ - - if ntype is None: - if len(self.ntypes) == 1: - ntype = HOMOGENEOUS_NODE_TYPE - else: - raise ValueError("Must provide the node type for a heterogeneous graph") - - if dgl.base.is_all(u): - return EmbeddingView( - self.__ndata_storage[ntype, emb_name], self.num_nodes(ntype) - ) - - try: - return self.__ndata_storage[ntype, emb_name].fetch( - _cast_to_torch_tensor(u), "cuda" - ) - except RuntimeError as ex: - warnings.warn( - "Got error accessing data, trying again with index on device: " - + str(ex) - ) - return self.__ndata_storage[ntype, emb_name].fetch( - _cast_to_torch_tensor(u).cuda(), "cuda" - ) - - def _has_e_emb(self, etype: Tuple[str, str, str], emb_name: str) -> bool: - return (etype, emb_name) in self.__edata_storage - - def _get_e_emb( - self, etype: Tuple[str, str, str], emb_name: str, u: Union[str, TensorType] - ) -> "torch.Tensor": - """ - Gets the embedding of a single edge type. - Unlike DGL, this function takes the canonical edge type - instead of an integer id. - - Parameters - ---------- - etype: str - The edge type to get the embedding of. - emb_name: str - The embedding name of the embedding to get. - u: Union[str, TensorType] - Edges to get the representation of, or ALL to - get the representation of all nodes of the - given type. - - Returns - ------- - torch.Tensor - The embedding of the given edge type with the given embedding name. - """ - - etype = self.to_canonical_etype(etype) - - if dgl.base.is_all(u): - return EmbeddingView( - self.__edata_storage[etype, emb_name], self.num_edges(etype) - ) - - try: - return self.__edata_storage[etype, emb_name].fetch( - _cast_to_torch_tensor(u), "cuda" - ) - except RuntimeError as ex: - warnings.warn( - "Got error accessing data, trying again with index on device: " - + str(ex) - ) - return self.__edata_storage[etype, emb_name].fetch( - _cast_to_torch_tensor(u).cuda(), "cuda" - ) - - def _set_n_emb( - self, ntype: str, u: Union[str, TensorType], kv: Dict[str, TensorType] - ) -> None: - """ - Stores or updates the embedding(s) of a single node type. - Unlike DGL, this function takes the string node type name - instead of an integer id. - - The semantics of this function match those of add_nodes - with respect to whether or not the backing feature store - is distributed. - - Parameters - ---------- - ntype: str - The node type to store an embedding of. - u: Union[str, TensorType] - The indices to update, if updating the embedding. - Currently, updating a slice of an embedding is - unsupported, so this should be ALL. - kv: Dict[str, TensorType] - A mapping of embedding names to embedding tensors. - """ - - if not dgl.base.is_all(u): - raise NotImplementedError( - "Updating a slice of an embedding is " - "currently unimplemented in cuGraph-DGL." - ) - - for k, v in kv: - self.__ndata_storage[ntype, k] = self.__ndata_storage_type( - v, - **self.__wg_kwargs, - ) - - def _set_e_emb( - self, etype: str, u: Union[str, TensorType], kv: Dict[str, TensorType] - ) -> None: - """ - Stores or updates the embedding(s) of a single edge type. - Unlike DGL, this function takes the canonical edge type name - instead of an integer id. - - The semantics of this function match those of add_edges - with respect to whether or not the backing feature store - is distributed. - - Parameters - ---------- - etype: str - The edge type to store an embedding of. - u: Union[str, TensorType] - The indices to update, if updating the embedding. - Currently, updating a slice of an embedding is - unsupported, so this should be ALL. 
- kv: Dict[str, TensorType] - A mapping of embedding names to embedding tensors. - """ - - if not dgl.base.is_all(u): - raise NotImplementedError( - "Updating a slice of an embedding is " - "currently unimplemented in cuGraph-DGL." - ) - - for k, v in kv: - self.__edata_storage[etype, k] = self.__edata_storage_type( - v, - **self.__wg_kwargs, - ) - - def _pop_n_emb(self, ntype: str, key: str) -> "torch.Tensor": - """ - Removes and returns the embedding of the given node - type with the given name. - - Parameters - ---------- - ntype:str - The node type. - key:str - The embedding name. - - Returns - ------- - The removed embedding. - """ - return self.__ndata_storage[ntype, key].pop(key) - - def _pop_e_emb(self, etype: str, key: str) -> "torch.Tensor": - """ - Removes and returns the embedding of the given edge - type with the given name. - - Parameters - ---------- - etype:str - The node type. - key:str - The embedding name. - - Returns - ------- - torch.Tensor - The removed embedding. - """ - return self.__edata_storage[etype, key].pop(key) - - def _get_n_emb_keys(self, ntype: str) -> List[str]: - """ - Gets a list of the embedding names for a given node - type. - - Parameters - ---------- - ntype: str - The node type to get embedding names for. - - Returns - ------- - List[str] - The list of embedding names for the given node type. - """ - return [k for (t, k) in self.__ndata_storage if ntype == t] - - def _get_e_emb_keys(self, etype: str) -> List[str]: - """ - Gets a list of the embedding names for a given edge - type. - - Parameters - ---------- - etype: str - The edge type to get embedding names for. - - Returns - ------- - List[str] - The list of embedding names for the given edge type. - """ - return [k for (t, k) in self.__edata_storage if etype == t] - - def all_edges( - self, - form="uv", - order="eid", - etype: Union[str, Tuple[str, str, str]] = None, - device: Union[str, int, "torch.device"] = "cpu", - ): - """ - Returns all edges with the specified edge type. - cuGraph-DGL currently only supports 'eid' format and - 'eid' order. - - Parameters - ---------- - form: str (optional, default='uv') - The format to return ('uv', 'eid', 'all'). - - order: str (optional, default='eid') - The order to return edges in ('eid', 'srcdst') - cuGraph-DGL currently only supports 'eid'. - etype: Union[str, Tuple[str, str, str]] (optional, default=None) - The edge type to get. Not required if this is - a homogeneous graph. Can be the relation type if the - relation type is unique, or the canonical edge type. - device: Union[str, int, torch.device] (optional, default='cpu') - The device where returned edges should be stored - ('cpu', 'cuda', or device id). - """ - - if order != "eid": - raise NotImplementedError("cugraph-DGL only supports eid order.") - - if etype is None and len(self.canonical_etypes) > 1: - raise ValueError("Edge type is required for heterogeneous graphs.") - - etype = self.to_canonical_etype(etype) - - if form == "eid": - return torch.arange( - 0, - self.__num_edges_dict[etype], - dtype=self.idtype, - device=device, - ) - else: - if self.is_multi_gpu: - # This can't be done because it requires collective communication. - raise ValueError( - "Calling all_edges in a distributed graph with" - " form 'uv' or 'all' is unsupported." 
- ) - - else: - eix = self.__edge_indices[etype].to(device) - if form == "uv": - return eix[0], eix[1] - elif form == "all": - return ( - eix[0], - eix[1], - torch.arange( - self.__num_edges_dict[etype], - dtype=self.idtype, - device=device, - ), - ) - else: - raise ValueError(f"Invalid form {form}") - - @property - def ndata(self) -> HeteroNodeDataView: - """ - Returns a view of the node data in this graph which can be used to - access or modify node features. - """ - - if len(self.ntypes) == 1: - ntype = self.ntypes[0] - return HeteroNodeDataView(self, ntype, dgl.base.ALL) - - return HeteroNodeDataView(self, self.ntypes, dgl.base.ALL) - - @property - def edata(self) -> HeteroEdgeDataView: - """ - Returns a view of the edge data in this graph which can be used to - access or modify edge features. - """ - if len(self.canonical_etypes) == 1: - return HeteroEdgeDataView(self, None, dgl.base.ALL) - - return HeteroEdgeDataView(self, self.canonical_etypes, dgl.base.ALL) - - @property - def nodes(self) -> HeteroNodeView: - """ - Returns a view of the nodes in this graph. - """ - return HeteroNodeView(self) - - @property - def edges(self) -> HeteroEdgeView: - """ - Returns a view of the edges in this graph. - """ - return HeteroEdgeView(self) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/__init__.py deleted file mode 100644 index 9a4a087baf4..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .conv import * # noqa diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py deleted file mode 100644 index 3e7f2f076f0..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
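A short sketch of the edge accessors and data views defined above, on the same kind of toy homogeneous graph as before; indexing `ndata`/`edata` with the default ALL slice is assumed to resolve through `_get_n_emb`/`_get_e_emb`:

    import torch
    from cugraph_dgl.graph import Graph

    g = Graph()
    g.add_nodes(5, data={"feat": torch.randn(5, 8)})
    g.add_edges(
        torch.tensor([0, 1, 2, 3]),
        torch.tensor([1, 2, 3, 4]),
        data={"weight": torch.rand(4)},
    )

    eids = g.all_edges(form="eid")   # tensor([0, 1, 2, 3])
    u, v = g.all_edges(form="uv")    # 'uv'/'all' raise on multi-GPU graphs

    node_feat = g.ndata["feat"]      # EmbeddingView over all 5 nodes (assumed)
    edge_wgt = g.edata["weight"]     # EmbeddingView over all 4 edges (assumed)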
- -from .base import SparseGraph -from .gatconv import GATConv -from .gatv2conv import GATv2Conv -from .relgraphconv import RelGraphConv -from .sageconv import SAGEConv -from .transformerconv import TransformerConv - -__all__ = [ - "SparseGraph", - "GATConv", - "GATv2Conv", - "RelGraphConv", - "SAGEConv", - "TransformerConv", -] diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py deleted file mode 100644 index fcd5a26aee6..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ /dev/null @@ -1,376 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional - -import cugraph_dgl - -torch = import_optional("torch") -ops_torch = import_optional("pylibcugraphops.pytorch") -dgl = import_optional("dgl") - - -def compress_ids(ids: torch.Tensor, size: int) -> torch.Tensor: - return torch._convert_indices_from_coo_to_csr( - ids, size, out_int32=ids.dtype == torch.int32 - ) - - -def decompress_ids(c_ids: torch.Tensor) -> torch.Tensor: - ids = torch.arange(c_ids.numel() - 1, dtype=c_ids.dtype, device=c_ids.device) - return ids.repeat_interleave(c_ids[1:] - c_ids[:-1]) - - -class SparseGraph(object): - r"""A class to create and store different sparse formats needed by - cugraph-ops. It always creates a CSC representation and can provide COO- or - CSR-format if needed. - - Parameters - ---------- - size: tuple of int - Size of the adjacency matrix: (num_src_nodes, num_dst_nodes). - - src_ids: torch.Tensor - Source indices of the edges. - - dst_ids: torch.Tensor, optional - Destination indices of the edges. - - csrc_ids: torch.Tensor, optional - Compressed source indices. It is a monotonically increasing array of - size (num_src_nodes + 1,). For the k-th source node, its neighborhood - consists of the destinations between `dst_indices[csrc_indices[k]]` and - `dst_indices[csrc_indices[k+1]]`. - - cdst_ids: torch.Tensor, optional - Compressed destination indices. It is a monotonically increasing array of - size (num_dst_nodes + 1,). For the k-th destination node, its neighborhood - consists of the sources between `src_indices[cdst_indices[k]]` and - `src_indices[cdst_indices[k+1]]`. - - values: torch.Tensor, optional - Values on the edges. - - is_sorted: bool - Whether the COO inputs (src_ids, dst_ids, values) have been sorted by - `dst_ids` in an ascending order. CSC layout creation is much faster - when sorted. - - formats: str or tuple of str, optional - The desired sparse formats to create for the graph. The formats tuple - must include "csc". Default: "csc". - - reduce_memory: bool, optional - When set, the tensors are not required by the desired formats will be - set to `None`. Default: True. - - Notes - ----- - For MFGs (sampled graphs), the node ids must have been renumbered. 
- """ - - supported_formats = { - "coo": ("_src_ids", "_dst_ids"), - "csc": ("_cdst_ids", "_src_ids"), - "csr": ("_csrc_ids", "_dst_ids", "_perm_csc2csr"), - } - - all_tensors = set( - [ - "_src_ids", - "_dst_ids", - "_csrc_ids", - "_cdst_ids", - "_perm_coo2csc", - "_perm_csc2csr", - ] - ) - - def __init__( - self, - size: Tuple[int, int], - src_ids: torch.Tensor, - dst_ids: Optional[torch.Tensor] = None, - csrc_ids: Optional[torch.Tensor] = None, - cdst_ids: Optional[torch.Tensor] = None, - values: Optional[torch.Tensor] = None, - is_sorted: bool = False, - formats: Union[str, Tuple[str]] = "csc", - reduce_memory: bool = True, - ): - self._num_src_nodes, self._num_dst_nodes = size - self._is_sorted = is_sorted - - if dst_ids is None and cdst_ids is None: - raise ValueError( - "One of 'dst_ids' and 'cdst_ids' must be given " - "to create a SparseGraph." - ) - - if src_ids is not None: - src_ids = src_ids.contiguous() - - if dst_ids is not None: - dst_ids = dst_ids.contiguous() - - if csrc_ids is not None: - if csrc_ids.numel() != self._num_src_nodes + 1: - raise RuntimeError( - f"Size mismatch for 'csrc_ids': expected ({size[0] + 1},), " - f"but got {tuple(csrc_ids.size())}" - ) - csrc_ids = csrc_ids.contiguous() - - if cdst_ids is not None: - if cdst_ids.numel() != self._num_dst_nodes + 1: - raise RuntimeError( - f"Size mismatch for 'cdst_ids': expected ({size[1] + 1},), " - f"but got {tuple(cdst_ids.size())}" - ) - cdst_ids = cdst_ids.contiguous() - - if values is not None: - values = values.contiguous() - - self._src_ids = src_ids - self._dst_ids = dst_ids - self._csrc_ids = csrc_ids - self._cdst_ids = cdst_ids - self._values = values - self._perm_coo2csc = None - self._perm_csc2csr = None - - if isinstance(formats, str): - formats = (formats,) - self._formats = formats - - if "csc" not in formats: - raise ValueError( - f"{self.__class__.__name__}.formats must contain " - f"'csc', but got {formats}." 
- ) - - # always create csc first - if self._cdst_ids is None: - if not self._is_sorted: - self._dst_ids, self._perm_coo2csc = torch.sort(self._dst_ids) - self._src_ids = self._src_ids[self._perm_coo2csc] - if self._values is not None: - self._values = self._values[self._perm_coo2csc] - self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes) - - for format_ in formats: - assert format_ in SparseGraph.supported_formats - self.__getattribute__(f"{format_}")() - - self._reduce_memory = reduce_memory - if reduce_memory: - self.reduce_memory() - - def reduce_memory(self): - """Remove the tensors that are not necessary to create the desired sparse - formats to reduce memory footprint.""" - if self._formats is None: - return - - tensors_needed = [] - for f in self._formats: - tensors_needed += SparseGraph.supported_formats[f] - for t in SparseGraph.all_tensors.difference(set(tensors_needed)): - self.__dict__[t] = None - - def src_ids(self) -> torch.Tensor: - return self._src_ids - - def cdst_ids(self) -> torch.Tensor: - return self._cdst_ids - - def dst_ids(self) -> torch.Tensor: - if self._dst_ids is None: - self._dst_ids = decompress_ids(self._cdst_ids) - return self._dst_ids - - def csrc_ids(self) -> torch.Tensor: - if self._csrc_ids is None: - src_ids, self._perm_csc2csr = torch.sort(self._src_ids) - self._csrc_ids = compress_ids(src_ids, self._num_src_nodes) - return self._csrc_ids - - def num_src_nodes(self): - return self._num_src_nodes - - def num_dst_nodes(self): - return self._num_dst_nodes - - def values(self): - return self._values - - def formats(self): - return self._formats - - def coo(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - if "coo" not in self.formats(): - raise RuntimeError( - "The SparseGraph did not create a COO layout. " - "Set 'formats' list to include 'coo' when creating the graph." - ) - return self.src_ids(), self.dst_ids(), self._values - - def csc(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - if "csc" not in self.formats(): - raise RuntimeError( - "The SparseGraph did not create a CSC layout. " - "Set 'formats' list to include 'csc' when creating the graph." - ) - return self.cdst_ids(), self.src_ids(), self._values - - def csr(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - if "csr" not in self.formats(): - raise RuntimeError( - "The SparseGraph did not create a CSR layout. " - "Set 'formats' list to include 'csr' when creating the graph." 
- ) - csrc_ids = self.csrc_ids() - dst_ids = self.dst_ids()[self._perm_csc2csr] - value = self._values - if value is not None: - value = value[self._perm_csc2csr] - return csrc_ids, dst_ids, value - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}(num_src_nodes={self._num_src_nodes}, " - f"num_dst_nodes={self._num_dst_nodes}, " - f"num_edges={self._src_ids.size(0)}, formats={self._formats})" - ) - - def to(self, device: Union[torch.device, str, int]) -> "cugraph_dgl.nn.SparseGraph": - sg = SparseGraph( - src_ids=None if self._src_ids is None else self._src_ids.to(device), - dst_ids=None if self._dst_ids is None else self._dst_ids.to(device), - csrc_ids=None if self._csrc_ids is None else self._csrc_ids.to(device), - cdst_ids=None if self._cdst_ids is None else self._cdst_ids.to(device), - values=None if self._values is None else self._values.to(device), - is_sorted=self._is_sorted, - formats=self._formats, - reduce_memory=self._reduce_memory, - ) - - sg._perm_coo2csc = ( - None if self._perm_coo2csc is None else self._perm_coo2csc.to(device) - ) - sg._perm_csc2csr = ( - None if self._perm_csc2csr is None else self._perm_csc2csr.to(device) - ) - - return sg - - -class BaseConv(torch.nn.Module): - r"""An abstract base class for cugraph-ops nn module.""" - - def __init__(self): - super().__init__() - - def reset_parameters(self): - r"""Resets all learnable parameters of the module.""" - raise NotImplementedError - - def forward(self, *args): - r"""Runs the forward pass of the module.""" - raise NotImplementedError - - def get_cugraph_ops_CSC( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - is_bipartite: bool = False, - max_in_degree: Optional[int] = None, - ) -> ops_torch.CSC: - """Create CSC structure needed by cugraph-ops.""" - - if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): - raise TypeError( - f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " - f"'dgl.DGLHeteroGraph', but got '{type(g)}'." - ) - - # TODO: max_in_degree should default to None in pylibcugraphops - if max_in_degree is None: - max_in_degree = -1 - - if isinstance(g, SparseGraph): - offsets, indices, _ = g.csc() - else: - offsets, indices, _ = g.adj_tensors("csc") - - graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - is_bipartite=is_bipartite, - ) - - return graph - - def get_cugraph_ops_HeteroCSC( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - num_edge_types: int, - etypes: Optional[torch.Tensor] = None, - is_bipartite: bool = False, - max_in_degree: Optional[int] = None, - ) -> ops_torch.HeteroCSC: - """Create HeteroCSC structure needed by cugraph-ops.""" - - if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): - raise TypeError( - f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " - f"'dgl.DGLHeteroGraph', but got '{type(g)}'." - ) - - # TODO: max_in_degree should default to None in pylibcugraphops - if max_in_degree is None: - max_in_degree = -1 - - if isinstance(g, SparseGraph): - offsets, indices, etypes = g.csc() - if etypes is None: - raise ValueError( - "SparseGraph must have 'values' to create HeteroCSC. " - "Pass in edge types as 'values' when creating the SparseGraph." - ) - etypes = etypes.int() - else: - if etypes is None: - raise ValueError( - "'etypes' is required when creating HeteroCSC " - "from dgl.DGLHeteroGraph." 
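A small worked example of the SparseGraph container with a COO input; the sizes and ids are toy values, and the resulting CSC offsets are exactly what `compress_ids` produces from the destination column:

    import torch
    from cugraph_dgl.nn import SparseGraph

    # 4 source nodes, 3 destination nodes, 5 edges in COO form (dst already sorted).
    src = torch.tensor([0, 1, 1, 2, 3])
    dst = torch.tensor([0, 0, 1, 2, 2])

    sg = SparseGraph(
        size=(4, 3),
        src_ids=src,
        dst_ids=dst,
        is_sorted=True,          # skips the sort-by-destination step
        formats=("csc", "coo"),
    )

    cdst_ids, src_ids, _ = sg.csc()
    # cdst_ids == tensor([0, 2, 3, 5]): destination 0 has 2 incoming edges,
    # destination 1 has 1, destination 2 has 2.
    # decompress_ids(torch.tensor([0, 2, 3, 5])) recovers tensor([0, 0, 1, 2, 2]).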
- ) - offsets, indices, perm = g.adj_tensors("csc") - etypes = etypes[perm].int() - - graph = ops_torch.HeteroCSC( - offsets=offsets, - indices=indices, - edge_types=etypes, - num_src_nodes=g.num_src_nodes(), - num_edge_types=num_edge_types, - dst_max_in_degree=max_in_degree, - is_bipartite=is_bipartite, - ) - - return graph diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py deleted file mode 100644 index e8813271fd8..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class GATConv(BaseConv): - r"""Graph attention layer from `Graph Attention Network - `__, with the sparse aggregation - accelerated by cugraph-ops. - - Parameters - ---------- - in_feats : int or (int, int) - Input feature size. A pair denotes feature sizes of source and - destination nodes. - out_feats : int - Output feature size. - num_heads : int - Number of heads in multi-head attention. - feat_drop : float, optional - Dropout rate on feature. Defaults: ``0``. - concat : bool, optional - If False, the multi-head attentions are averaged instead of concatenated. - Default: ``True``. - edge_feats : int, optional - Edge feature size. Default: ``None``. - negative_slope : float, optional - LeakyReLU angle of negative slope. Defaults: ``0.2``. - residual : bool, optional - If True, use residual connection. Defaults: ``False``. - allow_zero_in_degree : bool, optional - If there are 0-in-degree nodes in the graph, output for those nodes will - be invalid since no message will be passed to those nodes. This is - harmful for some applications causing silent performance regression. - This module will raise a DGLError if it detects 0-in-degree nodes in - input graph. By setting ``True``, it will suppress the check and let the - users handle it by themselves. Defaults: ``False``. - bias : bool, optional - If True, learns a bias term. Defaults: ``True``. - - Examples - -------- - >>> import dgl - >>> import torch - >>> from cugraph_dgl.nn import GATConv - ... 
- >>> device = 'cuda' - >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device) - >>> g = dgl.add_self_loop(g) - >>> feat = torch.ones(6, 10).to(device) - >>> conv = GATConv(10, 2, num_heads=3).to(device) - >>> res = conv(g, feat) - >>> res - tensor([[[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]]], device='cuda:0', grad_fn=) - """ - - def __init__( - self, - in_feats: Union[int, tuple[int, int]], - out_feats: int, - num_heads: int, - feat_drop: float = 0.0, - concat: bool = True, - edge_feats: Optional[int] = None, - negative_slope: float = 0.2, - residual: bool = False, - allow_zero_in_degree: bool = False, - bias: bool = True, - ): - super().__init__() - - if isinstance(in_feats, int): - self.in_feats_src = self.in_feats_dst = in_feats - else: - self.in_feats_src, self.in_feats_dst = in_feats - self.in_feats = in_feats - self.out_feats = out_feats - self.num_heads = num_heads - self.feat_drop = nn.Dropout(feat_drop) - self.concat = concat - self.edge_feats = edge_feats - self.negative_slope = negative_slope - self.residual = residual - self.allow_zero_in_degree = allow_zero_in_degree - - if isinstance(in_feats, int): - self.lin = nn.Linear(in_feats, num_heads * out_feats, bias=False) - else: - self.lin_src = nn.Linear( - self.in_feats_src, num_heads * out_feats, bias=False - ) - self.lin_dst = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=False - ) - - if edge_feats is not None: - self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) - self.attn_weights = nn.Parameter(torch.empty(3 * num_heads * out_feats)) - else: - self.register_parameter("lin_edge", None) - self.attn_weights = nn.Parameter(torch.empty(2 * num_heads * out_feats)) - - out_dim = num_heads * out_feats if concat else out_feats - if residual: - if self.in_feats_dst != out_dim: - self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) - else: - self.lin_res = nn.Identity() - else: - self.register_buffer("lin_res", None) - - if bias and not isinstance(self.lin_res, nn.Linear): - if concat: - self.bias = nn.Parameter(torch.empty(num_heads, out_feats)) - else: - self.bias = nn.Parameter(torch.empty(out_feats)) - else: - self.register_buffer("bias", None) - - self.reset_parameters() - - def set_allow_zero_in_degree(self, set_value): - r"""Set allow_zero_in_degree flag.""" - self.allow_zero_in_degree = set_value - - def reset_parameters(self): - r"""Reinitialize learnable parameters.""" - gain = nn.init.calculate_gain("relu") - if hasattr(self, "lin"): - nn.init.xavier_normal_(self.lin.weight, gain=gain) - else: - nn.init.xavier_normal_(self.lin_src.weight, gain=gain) - nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) - - nn.init.xavier_normal_( - self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain - ) - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - - if self.lin_res is not None: - self.lin_res.reset_parameters() - - if self.bias is not None: - nn.init.zeros_(self.bias) - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - efeat: Optional[torch.Tensor] = None, - max_in_degree: Optional[int] = None, - 
deterministic_dgrad: bool = False, - deterministic_wgrad: bool = False, - high_precision_dgrad: bool = False, - high_precision_wgrad: bool = False, - ) -> torch.Tensor: - r"""Forward computation. - - Parameters - ---------- - graph : DGLGraph or SparseGraph - The graph. - nfeat : torch.Tensor or (torch.Tensor, torch.Tensor) - Node features. If given as a tuple, the two elements correspond to - the source and destination node features, respectively, in a - bipartite graph. - efeat: torch.Tensor, optional - Optional edge features. - max_in_degree : int - Maximum in-degree of destination nodes. When :attr:`g` is generated - from a neighbor sampler, the value should be set to the corresponding - :attr:`fanout`. This option is used to invoke the MFG-variant of - cugraph-ops kernel. - deterministic_dgrad : bool, default=False - Optional flag indicating whether the feature gradients - are computed deterministically using a dedicated workspace buffer. - deterministic_wgrad: bool, default=False - Optional flag indicating whether the weight gradients - are computed deterministically using a dedicated workspace buffer. - high_precision_dgrad: bool, default=False - Optional flag indicating whether gradients for inputs in half precision - are kept in single precision as long as possible and only casted to - the corresponding input type at the very end. - high_precision_wgrad: bool, default=False - Optional flag indicating whether gradients for weights in half precision - are kept in single precision as long as possible and only casted to - the corresponding input type at the very end. - - Returns - ------- - torch.Tensor - The output feature of shape :math:`(N, H, D_{out})` where - :math:`H` is the number of heads, and :math:`D_{out}` is size of - output feature. - """ - if isinstance(g, dgl.DGLHeteroGraph): - if not self.allow_zero_in_degree: - if (g.in_degrees() == 0).any(): - raise dgl.base.DGLError( - "There are 0-in-degree nodes in the graph, " - "output for those nodes will be invalid. " - "This is harmful for some applications, " - "causing silent performance regression. " - "Adding self-loop on the input graph by " - "calling `g = dgl.add_self_loop(g)` will resolve " - "the issue. Setting ``allow_zero_in_degree`` " - "to be `True` when constructing this module will " - "suppress the check and let the code run." - ) - - bipartite = isinstance(nfeat, (list, tuple)) - - _graph = self.get_cugraph_ops_CSC( - g, is_bipartite=bipartite, max_in_degree=max_in_degree - ) - if deterministic_dgrad: - _graph.add_reverse_graph() - - if bipartite: - nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) - nfeat_dst_orig = nfeat[1] - else: - nfeat = self.feat_drop(nfeat) - nfeat_dst_orig = nfeat[: g.num_dst_nodes()] - - if efeat is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_feats must be set to " - f"accept edge features." - ) - efeat = self.lin_edge(efeat) - - if bipartite: - if not hasattr(self, "lin_src"): - nfeat_src = self.lin(nfeat[0]) - nfeat_dst = self.lin(nfeat[1]) - else: - nfeat_src = self.lin_src(nfeat[0]) - nfeat_dst = self.lin_dst(nfeat[1]) - else: - if not hasattr(self, "lin"): - raise RuntimeError( - f"{self.__class__.__name__}.in_feats is expected to be an " - f"integer when the graph is not bipartite, " - f"but got {self.in_feats}." 
- ) - nfeat = self.lin(nfeat) - - out = ops_torch.operators.mha_gat_n2n( - (nfeat_src, nfeat_dst) if bipartite else nfeat, - self.attn_weights, - _graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=efeat, - deterministic_dgrad=deterministic_dgrad, - deterministic_wgrad=deterministic_wgrad, - high_precision_dgrad=high_precision_dgrad, - high_precision_wgrad=high_precision_wgrad, - )[: g.num_dst_nodes()] - - if self.concat: - out = out.view(-1, self.num_heads, self.out_feats) - - if self.residual: - res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) - if not self.concat: - res = res.mean(dim=1) - out = out + res - - if self.bias is not None: - out = out + self.bias - - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py deleted file mode 100644 index 4f47005f8ee..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class GATv2Conv(BaseConv): - r"""GATv2 from `How Attentive are Graph Attention Networks? - `__, with the sparse aggregation - accelerated by cugraph-ops. - - Parameters - ---------- - in_feats : int or (int, int) - Input feature size. A pair denotes feature sizes of source and - destination nodes. - out_feats : int - Output feature size. - num_heads : int - Number of heads in Multi-Head Attention. - feat_drop : float, optional - Dropout rate on feature. Defaults: ``0``. - concat : bool, optional - If False, the multi-head attentions are averaged instead of concatenated. - Default: ``True``. - edge_feats : int, optional - Edge feature size. Default: ``None``. - negative_slope : float, optional - LeakyReLU angle of negative slope. Defaults: ``0.2``. - residual : bool, optional - If True, use residual connection. Defaults: ``False``. - allow_zero_in_degree : bool, optional - If there are 0-in-degree nodes in the graph, output for those nodes will - be invalid since no message will be passed to those nodes. This is - harmful for some applications causing silent performance regression. - This module will raise a DGLError if it detects 0-in-degree nodes in - input graph. By setting ``True``, it will suppress the check and let the - users handle it by themselves. Defaults: ``False``. - bias : bool, optional - If True, learns a bias term. Defaults: ``True``. - share_weights : bool, optional - If ``True``, the same matrix will be applied to the source and the - destination node features. Defaults: ``False``. 
- """ - - def __init__( - self, - in_feats: Union[int, tuple[int, int]], - out_feats: int, - num_heads: int, - feat_drop: float = 0.0, - concat: bool = True, - edge_feats: Optional[int] = None, - negative_slope: float = 0.2, - residual: bool = False, - allow_zero_in_degree: bool = False, - bias: bool = True, - share_weights: bool = False, - ): - super().__init__() - - if isinstance(in_feats, int): - self.in_feats_src = self.in_feats_dst = in_feats - else: - self.in_feats_src, self.in_feats_dst = in_feats - self.in_feats = in_feats - self.out_feats = out_feats - self.num_heads = num_heads - self.feat_drop = nn.Dropout(feat_drop) - self.concat = concat - self.edge_feats = edge_feats - self.negative_slope = negative_slope - self.residual = residual - self.allow_zero_in_degree = allow_zero_in_degree - self.share_weights = share_weights - self.bias = bias - - self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) - if share_weights: - if self.in_feats_src != self.in_feats_dst: - raise ValueError( - f"Input feature size of source and destination " - f"nodes must be identical when share_weights is enabled, " - f"but got {self.in_feats_src} and {self.in_feats_dst}." - ) - self.lin_dst = self.lin_src - else: - self.lin_dst = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) - - self.attn_weights = nn.Parameter(torch.empty(num_heads * out_feats)) - - if edge_feats is not None: - self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) - else: - self.register_parameter("lin_edge", None) - - out_dim = num_heads * out_feats if concat else out_feats - if residual: - if self.in_feats_dst != out_dim: - self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) - else: - self.lin_res = nn.Identity() - else: - self.register_buffer("lin_res", None) - - self.reset_parameters() - - def set_allow_zero_in_degree(self, set_value): - r"""Set allow_zero_in_degree flag.""" - self.allow_zero_in_degree = set_value - - def reset_parameters(self): - r"""Reinitialize learnable parameters.""" - gain = nn.init.calculate_gain("relu") - nn.init.xavier_normal_(self.lin_src.weight, gain=gain) - nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) - - nn.init.xavier_normal_( - self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain - ) - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - - if self.lin_res is not None: - self.lin_res.reset_parameters() - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - efeat: Optional[torch.Tensor] = None, - max_in_degree: Optional[int] = None, - deterministic_dgrad: bool = False, - deterministic_wgrad: bool = False, - ) -> torch.Tensor: - r"""Forward computation. - - Parameters - ---------- - graph : DGLGraph or SparseGraph - The graph. - nfeat : torch.Tensor - Input features of shape :math:`(N, D_{in})`. - efeat: torch.Tensor, optional - Optional edge features. - max_in_degree : int - Maximum in-degree of destination nodes. When :attr:`g` is generated - from a neighbor sampler, the value should be set to the corresponding - :attr:`fanout`. This option is used to invoke the MFG-variant of - cugraph-ops kernel. - deterministic_dgrad : bool, default=False - Optional flag indicating whether the feature gradients - are computed deterministically using a dedicated workspace buffer. 
- deterministic_wgrad: bool, default=False - Optional flag indicating whether the weight gradients - are computed deterministically using a dedicated workspace buffer. - - Returns - ------- - torch.Tensor - The output feature of shape :math:`(N, H, D_{out})` where - :math:`H` is the number of heads, and :math:`D_{out}` is size of - output feature. - """ - - if isinstance(g, dgl.DGLHeteroGraph): - if not self.allow_zero_in_degree: - if (g.in_degrees() == 0).any(): - raise dgl.base.DGLError( - "There are 0-in-degree nodes in the graph, " - "output for those nodes will be invalid. " - "This is harmful for some applications, " - "causing silent performance regression. " - "Adding self-loop on the input graph by " - "calling `g = dgl.add_self_loop(g)` will resolve " - "the issue. Setting ``allow_zero_in_degree`` " - "to be `True` when constructing this module will " - "suppress the check and let the code run." - ) - - nfeat_bipartite = isinstance(nfeat, (list, tuple)) - graph_bipartite = nfeat_bipartite or self.share_weights is False - - _graph = self.get_cugraph_ops_CSC( - g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree - ) - if deterministic_dgrad: - _graph.add_reverse_graph() - - if nfeat_bipartite: - nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) - nfeat_dst_orig = nfeat[1] - else: - nfeat = self.feat_drop(nfeat) - nfeat_dst_orig = nfeat[: g.num_dst_nodes()] - - if efeat is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_feats must be set to " - f"accept edge features." - ) - efeat = self.lin_edge(efeat) - - if nfeat_bipartite: - nfeat = (self.lin_src(nfeat[0]), self.lin_dst(nfeat[1])) - elif graph_bipartite: - nfeat = (self.lin_src(nfeat), self.lin_dst(nfeat[: g.num_dst_nodes()])) - else: - nfeat = self.lin_src(nfeat) - - out = ops_torch.operators.mha_gat_v2_n2n( - nfeat, - self.attn_weights, - _graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=efeat, - deterministic_dgrad=deterministic_dgrad, - deterministic_wgrad=deterministic_wgrad, - )[: g.num_dst_nodes()] - - if self.concat: - out = out.view(-1, self.num_heads, self.out_feats) - - if self.residual: - res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) - if not self.concat: - res = res.mean(dim=1) - out = out + res - - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py deleted file mode 100644 index 5c4b5dea441..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
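Unlike GATConv earlier in this patch, the GATv2Conv docstring removed above carries no usage example. The following is a minimal, hedged sketch of how the deleted layer is constructed and called; it assumes a CUDA device and reuses the small graph from the GATConv doctest, with constructor arguments matching the deleted signature (in_feats, out_feats, num_heads).

# Minimal usage sketch for the removed GATv2Conv layer. Assumes dgl, torch, and
# cugraph_dgl are installed and a CUDA device is available; this example is not
# taken from the deleted file's docstring.
import dgl
import torch
from cugraph_dgl.nn import GATv2Conv

device = "cuda"
g = dgl.graph(([0, 1, 2, 3, 2, 5], [1, 2, 3, 4, 0, 3])).to(device)
g = dgl.add_self_loop(g)            # avoids the 0-in-degree DGLError raised in forward()
feat = torch.ones(6, 10).to(device)
conv = GATv2Conv(10, 2, num_heads=3).to(device)
res = conv(g, feat)                 # shape (6, 3, 2): (num_dst_nodes, num_heads, out_feats)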
- -import math -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class RelGraphConv(BaseConv): - r"""An accelerated relational graph convolution layer from `Modeling - Relational Data with Graph Convolutional Networks - `__, with the sparse aggregation - accelerated by cugraph-ops. - - Parameters - ---------- - in_feats : int - Input feature size. - out_feats : int - Output feature size. - num_rels : int - Number of relations. - regularizer : str, optional - Which weight regularizer to use ("basis" or ``None``): - - "basis" is for basis-decomposition. - - ``None`` applies no regularization. - Default: ``None``. - num_bases : int, optional - Number of bases. It comes into effect when a regularizer is applied. - Default: ``None``. - bias : bool, optional - True if bias is added. Default: ``True``. - self_loop : bool, optional - True to include self loop message. Default: ``True``. - dropout : float, optional - Dropout rate. Default: ``0.0``. - apply_norm : bool, optional - True to normalize aggregation output by the in-degree of the destination - node per edge type, i.e. :math:`|\mathcal{N}^r_i|`. Default: ``True``. - - Examples - -------- - >>> import dgl - >>> import torch - >>> from cugraph_dgl.nn import RelGraphConv - ... - >>> device = 'cuda' - >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device) - >>> feat = torch.ones(6, 10).to(device) - >>> conv = RelGraphConv( - ... 10, 2, 3, regularizer='basis', num_bases=2).to(device) - >>> etypes = torch.tensor([0,1,2,0,1,2]).to(device) - >>> res = conv(g, feat, etypes) - >>> res - tensor([[-1.7774, -2.0184], - [-1.4335, -2.3758], - [-1.7774, -2.0184], - [-0.4698, -3.0876], - [-1.4335, -2.3758], - [-1.4331, -2.3295]], device='cuda:0', grad_fn=) - """ - - def __init__( - self, - in_feats: int, - out_feats: int, - num_rels: int, - regularizer: Optional[str] = None, - num_bases: Optional[int] = None, - bias: bool = True, - self_loop: bool = True, - dropout: float = 0.0, - apply_norm: bool = False, - ): - super().__init__() - self.in_feats = in_feats - self.out_feats = out_feats - self.num_rels = num_rels - self.apply_norm = apply_norm - self.dropout = nn.Dropout(dropout) - - dim_self_loop = 1 if self_loop else 0 - self.self_loop = self_loop - if regularizer is None: - self.W = nn.Parameter( - torch.empty(num_rels + dim_self_loop, in_feats, out_feats) - ) - self.coeff = None - elif regularizer == "basis": - if num_bases is None: - raise ValueError('Missing "num_bases" for basis regularization.') - self.W = nn.Parameter( - torch.empty(num_bases + dim_self_loop, in_feats, out_feats) - ) - self.coeff = nn.Parameter(torch.empty(num_rels, num_bases)) - self.num_bases = num_bases - else: - raise ValueError( - f"Supported regularizer options: 'basis' or None, but got " - f"'{regularizer}'." 
- ) - self.regularizer = regularizer - - if bias: - self.bias = nn.Parameter(torch.empty(out_feats)) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self): - r"""Reinitialize learnable parameters.""" - bound = 1 / math.sqrt(self.in_feats) - end = -1 if self.self_loop else None - nn.init.uniform_(self.W[:end], -bound, bound) - if self.regularizer == "basis": - nn.init.xavier_uniform_(self.coeff, gain=nn.init.calculate_gain("relu")) - if self.self_loop: - nn.init.xavier_uniform_(self.W[-1], nn.init.calculate_gain("relu")) - if self.bias is not None: - nn.init.zeros_(self.bias) - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: torch.Tensor, - etypes: torch.Tensor, - max_in_degree: Optional[int] = None, - ) -> torch.Tensor: - r"""Forward computation. - - Parameters - ---------- - g : DGLGraph - The graph. - feat : torch.Tensor - A 2D tensor of node features. Shape: :math:`(|V|, D_{in})`. - etypes : torch.Tensor - A 1D integer tensor of edge types. Shape: :math:`(|E|,)`. - Note that cugraph-ops only accepts edge type tensors in int32, - so any input of other integer types will be casted into int32, - thus introducing some overhead. Pass in int32 tensors directly - for best performance. - max_in_degree : int - Maximum in-degree of destination nodes. When :attr:`g` is generated - from a neighbor sampler, the value should be set to the corresponding - :attr:`fanout`. This option is used to invoke the MFG-variant of - cugraph-ops kernel. - - Returns - ------- - torch.Tensor - New node features. Shape: :math:`(|V|, D_{out})`. - """ - _graph = self.get_cugraph_ops_HeteroCSC( - g, - num_edge_types=self.num_rels, - etypes=etypes, - is_bipartite=False, - max_in_degree=max_in_degree, - ) - - h = ops_torch.operators.agg_hg_basis_n2n_post( - feat, - self.coeff, - _graph, - concat_own=self.self_loop, - norm_by_out_degree=self.apply_norm, - )[: g.num_dst_nodes()] - h = h @ self.W.view(-1, self.out_feats) - if self.bias is not None: - h = h + self.bias - h = self.dropout(h) - - return h diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py deleted file mode 100644 index b6198903766..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class SAGEConv(BaseConv): - r"""An accelerated GraphSAGE layer from `Inductive Representation Learning - on Large Graphs `, with the sparse - aggregation accelerated by cugraph-ops. - - Parameters - ---------- - in_feats : int or tuple - Input feature size. 
If a scalar is given, the source and destination - nodes are required to be the same. - out_feats : int - Output feature size. - aggregator_type : str - Aggregator type to use ("mean", "sum", "min", "max", "pool", "gcn"). - feat_drop : float - Dropout rate on features, default: ``0``. - bias : bool - If True, adds a learnable bias to the output. Default: ``True``. - - Examples - -------- - >>> import dgl - >>> import torch - >>> from cugraph_dgl.nn import SAGEConv - ... - >>> device = 'cuda' - >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device) - >>> g = dgl.add_self_loop(g) - >>> feat = torch.ones(6, 10).to(device) - >>> conv = SAGEConv(10, 2, 'mean').to(device) - >>> res = conv(g, feat) - >>> res - tensor([[-1.1690, 0.1952], - [-1.1690, 0.1952], - [-1.1690, 0.1952], - [-1.1690, 0.1952], - [-1.1690, 0.1952], - [-1.1690, 0.1952]], device='cuda:0', grad_fn=) - """ - valid_aggr_types = {"mean", "sum", "min", "max", "pool", "gcn"} - - def __init__( - self, - in_feats: Union[int, tuple[int, int]], - out_feats: int, - aggregator_type: str = "mean", - feat_drop: float = 0.0, - bias: bool = True, - ): - super().__init__() - - if aggregator_type not in self.valid_aggr_types: - raise ValueError( - f"Invalid aggregator_type. Must be one of {self.valid_aggr_types}. " - f"But got '{aggregator_type}' instead." - ) - - self.aggregator_type = aggregator_type - self._aggr = aggregator_type - self.in_feats = in_feats - self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) - self.feat_drop = nn.Dropout(feat_drop) - - if self.aggregator_type == "gcn": - self._aggr = "mean" - self.lin = nn.Linear(self.in_feats_src, out_feats, bias=bias) - else: - self.lin = nn.Linear( - self.in_feats_src + self.in_feats_dst, out_feats, bias=bias - ) - - if self.aggregator_type == "pool": - self._aggr = "max" - self.pre_lin = nn.Linear(self.in_feats_src, self.in_feats_src) - else: - self.register_parameter("pre_lin", None) - - self.reset_parameters() - - def reset_parameters(self): - r"""Reinitialize learnable parameters.""" - self.lin.reset_parameters() - if self.pre_lin is not None: - self.pre_lin.reset_parameters() - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - max_in_degree: Optional[int] = None, - ) -> torch.Tensor: - r"""Forward computation. - - Parameters - ---------- - g : DGLGraph or SparseGraph - The graph. - feat : torch.Tensor or tuple - Node features. Shape: :math:`(|V|, D_{in})`. - max_in_degree : int - Maximum in-degree of destination nodes. When :attr:`g` is generated - from a neighbor sampler, the value should be set to the corresponding - :attr:`fanout`. This option is used to invoke the MFG-variant of - cugraph-ops kernel. - - Returns - ------- - torch.Tensor - Output node features. Shape: :math:`(|V|, D_{out})`. 
- """ - feat_bipartite = isinstance(feat, (list, tuple)) - graph_bipartite = feat_bipartite or self.aggregator_type == "pool" - - _graph = self.get_cugraph_ops_CSC( - g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree - ) - - if feat_bipartite: - feat = (self.feat_drop(feat[0]), self.feat_drop(feat[1])) - else: - feat = self.feat_drop(feat) - - if self.aggregator_type == "pool": - if feat_bipartite: - feat = (self.pre_lin(feat[0]).relu(), feat[1]) - else: - feat = (self.pre_lin(feat).relu(), feat[: g.num_dst_nodes()]) - # force ctx.needs_input_grad=True in cugraph-ops autograd function - feat[0].requires_grad_() - feat[1].requires_grad_() - - out = ops_torch.operators.agg_concat_n2n(feat, _graph, self._aggr)[ - : g.num_dst_nodes() - ] - - if self.aggregator_type == "gcn": - out = out[:, : self.in_feats_src] - - out = self.lin(out) - - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py deleted file mode 100644 index e77556fb76f..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class TransformerConv(BaseConv): - r"""The graph transformer layer from the `"Masked Label Prediction: - Unified Message Passing Model for Semi-Supervised Classification" - `_ paper. - - Parameters - ---------- - in_node_feats : int or pair of ints - Input feature size. A pair denotes feature sizes of source and - destination nodes. - out_node_feats : int - Output feature size. - num_heads : int - Number of multi-head-attentions. - concat : bool, optional - If False, the multi-head attentions are averaged instead of concatenated. - Default: ``True``. - beta : bool, optional - If True, use a gated residual connection. Default: ``True``. - edge_feats: int, optional - Edge feature size. Default: ``None``. - bias: bool, optional - If True, learns a bias term. Default: ``True``. - root_weight: bool, optional - If False, will skip to learn a root weight matrix. Default: ``True``. 
- """ - - def __init__( - self, - in_node_feats: Union[int, tuple[int, int]], - out_node_feats: int, - num_heads: int, - concat: bool = True, - beta: bool = False, - edge_feats: Optional[int] = None, - bias: bool = True, - root_weight: bool = True, - ): - super().__init__() - - self.in_node_feats = in_node_feats - self.out_node_feats = out_node_feats - self.num_heads = num_heads - self.concat = concat - self.beta = beta - self.edge_feats = edge_feats - self.bias = bias - self.root_weight = root_weight - - if isinstance(in_node_feats, int): - in_node_feats = (in_node_feats, in_node_feats) - - self.lin_key = nn.Linear(in_node_feats[0], num_heads * out_node_feats) - self.lin_query = nn.Linear(in_node_feats[1], num_heads * out_node_feats) - self.lin_value = nn.Linear(in_node_feats[0], num_heads * out_node_feats) - - if edge_feats is not None: - self.lin_edge = nn.Linear( - edge_feats, num_heads * out_node_feats, bias=False - ) - else: - self.lin_edge = self.register_parameter("lin_edge", None) - - if concat: - self.lin_skip = nn.Linear( - in_node_feats[1], num_heads * out_node_feats, bias=bias - ) - if self.beta: - self.lin_beta = nn.Linear(3 * num_heads * out_node_feats, 1, bias=bias) - else: - self.lin_beta = self.register_parameter("lin_beta", None) - else: - self.lin_skip = nn.Linear(in_node_feats[1], out_node_feats, bias=bias) - if self.beta: - self.lin_beta = nn.Linear(3 * out_node_feats, 1, bias=False) - else: - self.lin_beta = self.register_parameter("lin_beta", None) - - self.reset_parameters() - - def reset_parameters(self): - self.lin_key.reset_parameters() - self.lin_query.reset_parameters() - self.lin_value.reset_parameters() - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - if self.lin_skip is not None: - self.lin_skip.reset_parameters() - if self.lin_beta is not None: - self.lin_beta.reset_parameters() - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - efeat: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward computation. - - Parameters - ---------- - g: DGLGraph - The graph. - nfeat: torch.Tensor or a pair of torch.Tensor - Node feature tensor. A pair denotes features for source and - destination nodes, respectively. - efeat: torch.Tensor, optional - Edge feature tensor. Default: ``None``. - """ - feat_bipartite = isinstance(nfeat, (list, tuple)) - if not feat_bipartite: - nfeat = (nfeat, nfeat) - - _graph = self.get_cugraph_ops_CSC(g, is_bipartite=True) - - query = self.lin_query(nfeat[1][: g.num_dst_nodes()]) - key = self.lin_key(nfeat[0]) - value = self.lin_value(nfeat[0]) - - if efeat is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_feats must be set to allow " - f"edge features." 
- ) - efeat = self.lin_edge(efeat) - - out = ops_torch.operators.mha_simple_n2n( - key_emb=key, - query_emb=query, - value_emb=value, - graph=_graph, - num_heads=self.num_heads, - concat_heads=self.concat, - edge_emb=efeat, - norm_by_dim=True, - score_bias=None, - )[: g.num_dst_nodes()] - - if self.root_weight: - res = self.lin_skip(nfeat[1][: g.num_dst_nodes()]) - if self.lin_beta is not None: - beta = self.lin_beta(torch.cat([out, res, out - res], dim=-1)) - beta = beta.sigmoid() - out = beta * res + (1 - beta) * out - else: - out = out + res - - return out diff --git a/python/cugraph-dgl/cugraph_dgl/tests/__init__.py b/python/cugraph-dgl/cugraph_dgl/tests/__init__.py deleted file mode 100644 index 1144e9bab3f..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph-dgl/cugraph_dgl/tests/conftest.py b/python/cugraph-dgl/cugraph_dgl/tests/conftest.py deleted file mode 100644 index ee1183f5cd1..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/conftest.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -import dgl -import torch - -from cugraph.testing.mg_utils import ( - start_dask_client, - stop_dask_client, -) - - -@pytest.fixture(scope="module") -def dask_client(): - # start_dask_client will check for the SCHEDULER_FILE and - # DASK_WORKER_DEVICES env vars and use them when creating a client if - # set. start_dask_client will also initialize the Comms singleton. 
- dask_client, dask_cluster = start_dask_client( - dask_worker_devices="0", protocol="tcp" - ) - - yield dask_client - - stop_dask_client(dask_client, dask_cluster) - - -class SparseGraphData1: - size = (6, 5) - nnz = 6 - src_ids = torch.IntTensor([0, 1, 2, 3, 2, 5]).cuda() - dst_ids = torch.IntTensor([1, 2, 3, 4, 0, 3]).cuda() - values = torch.IntTensor([10, 20, 30, 40, 50, 60]).cuda() - - # CSR - src_ids_sorted_by_src = torch.IntTensor([0, 1, 2, 2, 3, 5]).cuda() - dst_ids_sorted_by_src = torch.IntTensor([1, 2, 0, 3, 4, 3]).cuda() - csrc_ids = torch.IntTensor([0, 1, 2, 4, 5, 5, 6]).cuda() - values_csr = torch.IntTensor([10, 20, 50, 30, 40, 60]).cuda() - - # CSC - src_ids_sorted_by_dst = torch.IntTensor([2, 0, 1, 5, 2, 3]).cuda() - dst_ids_sorted_by_dst = torch.IntTensor([0, 1, 2, 3, 3, 4]).cuda() - cdst_ids = torch.IntTensor([0, 1, 2, 3, 5, 6]).cuda() - values_csc = torch.IntTensor([50, 10, 20, 60, 30, 40]).cuda() - - -@pytest.fixture -def sparse_graph_1(): - return SparseGraphData1() - - -@pytest.fixture -def dgl_graph_1(): - src = torch.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) - dst = torch.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) - return dgl.graph((src, dst)) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py deleted file mode 100644 index e2542657de4..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
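The SparseGraphData1 fixture above lists the same six edges in COO, CSR, and CSC form. As a sanity check, the CSC offsets can be recomputed from the COO arrays; the sketch below does that and then builds the equivalent SparseGraph the way the nn tests later in this patch do. A CUDA device is assumed.

# Recompute the fixture's CSC offsets from its COO arrays and build the same
# graph as a cugraph_dgl SparseGraph. Illustrative sketch; assumes a CUDA device.
import torch
from cugraph_dgl.nn.conv.base import SparseGraph

size = (6, 5)                                   # (num_src_nodes, num_dst_nodes)
src_ids = torch.tensor([0, 1, 2, 3, 2, 5], dtype=torch.int32, device="cuda")
dst_ids = torch.tensor([1, 2, 3, 4, 0, 3], dtype=torch.int32, device="cuda")

# CSC offsets are the cumulative in-edge counts per destination node.
cdst_ids = torch.zeros(size[1] + 1, dtype=torch.int64, device="cuda")
cdst_ids[1:] = torch.bincount(dst_ids, minlength=size[1]).cumsum(0)
assert cdst_ids.tolist() == [0, 1, 2, 3, 5, 6]  # matches the fixture's cdst_ids

sg = SparseGraph(size=size, src_ids=src_ids, dst_ids=dst_ids, formats="csc")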
- -import pytest - -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -import dgl -import torch as th -from cugraph_dgl import cugraph_storage_from_heterograph -import tempfile -import numpy as np - - -def sample_dgl_graphs(g, train_nid, fanouts): - # Single fanout to match cugraph - sampler = dgl.dataloading.NeighborSampler(fanouts) - dataloader = dgl.dataloading.DataLoader( - g, - train_nid, - sampler, - batch_size=1, - shuffle=False, - drop_last=False, - num_workers=0, - ) - - dgl_output = {} - for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - dgl_output[batch_id] = { - "input_nodes": input_nodes, - "output_nodes": output_nodes, - "blocks": blocks, - } - return dgl_output - - -def sample_cugraph_dgl_graphs(cugraph_gs, train_nid, fanouts): - sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) - tempdir_object = tempfile.TemporaryDirectory() - sampling_output_dir = tempdir_object - dataloader = cugraph_dgl.dataloading.DaskDataLoader( - cugraph_gs, - train_nid, - sampler, - batch_size=1, - sampling_output_dir=sampling_output_dir.name, - drop_last=False, - shuffle=False, - ) - - cugraph_dgl_output = {} - for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - cugraph_dgl_output[batch_id] = { - "input_nodes": input_nodes, - "output_nodes": output_nodes, - "blocks": blocks, - } - return cugraph_dgl_output - - -def test_same_heterograph_results(): - single_gpu = True - data_dict = { - ("B", "BA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]), - ("C", "CA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]), - } - train_nid = {"A": th.tensor([0])} - # Create a heterograph with 3 node types and 3 edges types. - dgl_g = dgl.heterograph(data_dict) - cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu) - - dgl_output = sample_dgl_graphs(dgl_g, train_nid, [{"BA": 1, "CA": 1}]) - cugraph_output = sample_cugraph_dgl_graphs(cugraph_gs, train_nid, [2]) - - cugraph_output_nodes = cugraph_output[0]["output_nodes"]["A"].cpu().numpy() - dgl_output_nodes = dgl_output[0]["output_nodes"]["A"].cpu().numpy() - np.testing.assert_array_equal(cugraph_output_nodes, dgl_output_nodes) - assert ( - dgl_output[0]["blocks"][0].num_edges() - == cugraph_output[0]["blocks"][0].num_edges() - ) - assert ( - dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_output[0]["blocks"][0].num_dst_nodes() - ) - - -def test_same_homogeneousgraph_results(): - single_gpu = True - train_nid = th.tensor([1]) - # Create a heterograph with 3 node types and 3 edges types. 
- dgl_g = dgl.graph(([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1])) - cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu) - - dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2]) - cugraph_output = sample_cugraph_dgl_graphs(cugraph_gs, train_nid, [2]) - - cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() - dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() - np.testing.assert_array_equal(cugraph_output_nodes, dgl_output_nodes) - assert ( - dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_output[0]["blocks"][0].num_dst_nodes() - ) - assert ( - dgl_output[0]["blocks"][0].num_edges() - == cugraph_output[0]["blocks"][0].num_edges() - ) - - -def test_heterograph_multi_block_results(): - data_dict = { - ("B", "BA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]), - ("C", "CA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]), - ("A", "AA", "A"): ([1], [0]), - } - dgl_g = dgl.heterograph(data_dict) - cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(dgl_g, single_gpu=True) - train_nid = {"A": th.tensor([0])} - cugraph_dgl_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [10, 10]) - assert ( - cugraph_dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_dgl_output[0]["blocks"][1].num_src_nodes() - ) - - -def test_homogenousgraph_multi_block_results(): - dgl_g = dgl.graph(data=([1, 2, 2, 3, 4, 5], [0, 0, 1, 2, 2, 3])) - cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(dgl_g, single_gpu=True) - train_nid = th.tensor([0]) - cugraph_dgl_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2, 2, 2]) - assert ( - cugraph_dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_dgl_output[0]["blocks"][1].num_src_nodes() - ) - assert ( - cugraph_dgl_output[0]["blocks"][1].num_dst_nodes() - == cugraph_dgl_output[0]["blocks"][2].num_src_nodes() - ) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py deleted file mode 100644 index d49e1293e77..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dask_dataloader_mg.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
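One detail worth noting in the heterograph comparisons above, and repeated in the multi-GPU file that follows: the DGL sampler is given a per-edge-type fanout dict, while the cugraph-dgl sampler is given a single fanout per hop, and the tests treat one neighbor per each of the two edge types as equivalent to a total fanout of two. A sketch of the two constructions, with that equivalence taken as the tests' assumption rather than verified here:

# The two sampler constructions paired by the heterograph tests. Treating the
# per-edge-type dict (DGL) and the single per-hop fanout (cugraph-dgl) as
# equivalent is carried over from the tests, not checked here.
import dgl
import cugraph_dgl

dgl_sampler = dgl.dataloading.NeighborSampler([{"BA": 1, "CA": 1}])  # 1 per edge type
cugraph_sampler = cugraph_dgl.dataloading.NeighborSampler([2])       # 2 in total per hop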
-import pytest - -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -import dgl -import torch as th -from cugraph_dgl import cugraph_storage_from_heterograph -import tempfile -import numpy as np - - -def sample_dgl_graphs(g, train_nid, fanouts): - # Single fanout to match cugraph - sampler = dgl.dataloading.NeighborSampler(fanouts) - dataloader = dgl.dataloading.DataLoader( - g, - train_nid, - sampler, - batch_size=1, - shuffle=False, - drop_last=False, - num_workers=0, - ) - - dgl_output = {} - for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - dgl_output[batch_id] = { - "input_nodes": input_nodes, - "output_nodes": output_nodes, - "blocks": blocks, - } - return dgl_output - - -def sample_cugraph_dgl_graphs(cugraph_gs, train_nid, fanouts): - sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts) - tempdir_object = tempfile.TemporaryDirectory() - sampling_output_dir = tempdir_object - dataloader = cugraph_dgl.dataloading.DaskDataLoader( - cugraph_gs, - train_nid, - sampler, - batch_size=1, - sampling_output_dir=sampling_output_dir.name, - drop_last=False, - shuffle=False, - ) - - cugraph_dgl_output = {} - for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - cugraph_dgl_output[batch_id] = { - "input_nodes": input_nodes, - "output_nodes": output_nodes, - "blocks": blocks, - } - return cugraph_dgl_output - - -def test_same_heterograph_results(dask_client): - single_gpu = False - data_dict = { - ("B", "BA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]), - ("C", "CA", "A"): ([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1]), - } - train_nid = {"A": th.tensor([0])} - # Create a heterograph with 3 node types and 3 edges types. - dgl_g = dgl.heterograph(data_dict) - cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu) - - dgl_output = sample_dgl_graphs(dgl_g, train_nid, [{"BA": 1, "CA": 1}]) - cugraph_output = sample_cugraph_dgl_graphs(cugraph_gs, train_nid, [2]) - - cugraph_output_nodes = cugraph_output[0]["output_nodes"]["A"].cpu().numpy() - dgl_output_nodes = dgl_output[0]["output_nodes"]["A"].cpu().numpy() - np.testing.assert_array_equal(cugraph_output_nodes, dgl_output_nodes) - assert ( - dgl_output[0]["blocks"][0].num_edges() - == cugraph_output[0]["blocks"][0].num_edges() - ) - assert ( - dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_output[0]["blocks"][0].num_dst_nodes() - ) - - -def test_same_homogeneousgraph_results(dask_client): - single_gpu = False - train_nid = th.tensor([1]) - # Create a heterograph with 3 node types and 3 edges types. 
- dgl_g = dgl.graph(([1, 2, 3, 4, 5, 6, 7, 8], [0, 0, 0, 0, 1, 1, 1, 1])) - cugraph_gs = cugraph_storage_from_heterograph(dgl_g, single_gpu=single_gpu) - - dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2]) - cugraph_output = sample_cugraph_dgl_graphs(cugraph_gs, train_nid, [2]) - - cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() - dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() - np.testing.assert_array_equal(cugraph_output_nodes, dgl_output_nodes) - assert ( - dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_output[0]["blocks"][0].num_dst_nodes() - ) - assert ( - dgl_output[0]["blocks"][0].num_edges() - == cugraph_output[0]["blocks"][0].num_edges() - ) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py deleted file mode 100644 index 419ec7790a9..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import cugraph_dgl.dataloading -import pytest - -import cugraph_dgl - -from cugraph.datasets import karate -from cugraph.utilities.utils import import_optional, MissingModule - -import numpy as np - -torch = import_optional("torch") -dgl = import_optional("dgl") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -def test_dataloader_basic_homogeneous(): - graph = cugraph_dgl.Graph(is_multi_gpu=False) - - num_nodes = karate.number_of_nodes() - graph.add_nodes(num_nodes, data={"z": torch.arange(num_nodes)}) - - edf = karate.get_edgelist() - graph.add_edges( - u=edf["src"], v=edf["dst"], data={"q": torch.arange(karate.number_of_edges())} - ) - - sampler = cugraph_dgl.dataloading.NeighborSampler([5, 5, 5]) - loader = cugraph_dgl.dataloading.FutureDataLoader( - graph, torch.arange(num_nodes), sampler, batch_size=2 - ) - - for in_t, out_t, blocks in loader: - assert len(blocks) == 3 - assert len(out_t) <= 2 - - -def sample_dgl_graphs(g, train_nid, fanouts, batch_size=1, prob_attr=None): - # Single fanout to match cugraph - sampler = dgl.dataloading.NeighborSampler( - fanouts, - prob=prob_attr, - ) - dataloader = dgl.dataloading.DataLoader( - g, - train_nid, - sampler, - batch_size=batch_size, - shuffle=False, - drop_last=False, - num_workers=0, - ) - - dgl_output = {} - for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - dgl_output[batch_id] = { - "input_nodes": input_nodes, - "output_nodes": output_nodes, - "blocks": blocks, - } - return dgl_output - - -def sample_cugraph_dgl_graphs( - cugraph_g, train_nid, fanouts, batch_size=1, prob_attr=None -): - sampler = cugraph_dgl.dataloading.NeighborSampler( - fanouts, - prob=prob_attr, - ) - - dataloader = cugraph_dgl.dataloading.FutureDataLoader( - cugraph_g, - train_nid, - sampler, - batch_size=batch_size, - 
drop_last=False, - shuffle=False, - ) - - cugraph_dgl_output = {} - for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - cugraph_dgl_output[batch_id] = { - "input_nodes": input_nodes, - "output_nodes": output_nodes, - "blocks": blocks, - } - return cugraph_dgl_output - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -@pytest.mark.parametrize("ix", [[1], [1, 0]]) -@pytest.mark.parametrize("batch_size", [1, 2]) -def test_same_homogeneousgraph_results(ix, batch_size): - src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) - dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) - - train_nid = torch.tensor(ix) - # Create a heterograph with 3 node types and 3 edges types. - dgl_g = dgl.graph((src, dst)) - - cugraph_g = cugraph_dgl.Graph(is_multi_gpu=False) - cugraph_g.add_nodes(9) - cugraph_g.add_edges(u=src, v=dst) - - dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2], batch_size=batch_size) - cugraph_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2], batch_size) - - cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() - dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() - - np.testing.assert_array_equal( - np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes) - ) - assert ( - dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_output[0]["blocks"][0].num_dst_nodes() - ) - assert ( - dgl_output[0]["blocks"][0].num_edges() - == cugraph_output[0]["blocks"][0].num_edges() - ) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -def test_dataloader_biased_homogeneous(): - src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) - dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) - wgt = torch.tensor([1, 1, 2, 0, 0, 0, 2, 1], dtype=torch.float32) - - train_nid = torch.tensor([0, 1]) - # Create a heterograph with 3 node types and 3 edges types. - dgl_g = dgl.graph((src, dst)) - dgl_g.edata["wgt"] = wgt - - cugraph_g = cugraph_dgl.Graph(is_multi_gpu=False) - cugraph_g.add_nodes(9) - cugraph_g.add_edges(u=src, v=dst, data={"wgt": wgt}) - - dgl_output = sample_dgl_graphs(dgl_g, train_nid, [4], batch_size=2, prob_attr="wgt") - cugraph_output = sample_cugraph_dgl_graphs( - cugraph_g, train_nid, [4], batch_size=2, prob_attr="wgt" - ) - - cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() - dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() - - np.testing.assert_array_equal( - np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes) - ) - assert ( - dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_output[0]["blocks"][0].num_dst_nodes() - ) - assert ( - dgl_output[0]["blocks"][0].num_edges() - == cugraph_output[0]["blocks"][0].num_edges() - ) - assert 5 == cugraph_output[0]["blocks"][0].num_edges() diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py deleted file mode 100644 index 061f4fa2077..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataloader_mg.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -import numpy as np - -import cugraph_dgl - -from cugraph.datasets import karate -from cugraph.utilities.utils import import_optional, MissingModule - -from cugraph.gnn import ( - cugraph_comms_create_unique_id, - cugraph_comms_shutdown, -) - -from cugraph_dgl.tests.utils import init_pytorch_worker - -torch = import_optional("torch") -dgl = import_optional("dgl") - - -def run_test_dataloader_basic_homogeneous(rank, world_size, uid): - init_pytorch_worker(rank, world_size, uid) - - graph = cugraph_dgl.Graph(is_multi_gpu=True) - - num_nodes = karate.number_of_nodes() - graph.add_nodes( - num_nodes, - ) - - edf = karate.get_edgelist() - graph.add_edges( - u=torch.tensor_split(torch.as_tensor(edf["src"], device="cuda"), world_size)[ - rank - ], - v=torch.tensor_split(torch.as_tensor(edf["dst"], device="cuda"), world_size)[ - rank - ], - ) - - sampler = cugraph_dgl.dataloading.NeighborSampler([5, 5, 5]) - loader = cugraph_dgl.dataloading.FutureDataLoader( - graph, - torch.arange(num_nodes), - sampler, - batch_size=2, - use_ddp=True, - ) - - for in_t, out_t, blocks in loader: - assert len(blocks) == 3 - assert len(out_t) <= 2 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -def test_dataloader_basic_homogeneous(): - uid = cugraph_comms_create_unique_id() - # Limit the number of GPUs this rest is run with - world_size = min(torch.cuda.device_count(), 4) - - torch.multiprocessing.spawn( - run_test_dataloader_basic_homogeneous, - args=( - world_size, - uid, - ), - nprocs=world_size, - ) - - -def sample_dgl_graphs( - g, - train_nid, - fanouts, - batch_size=1, - prob_attr=None, -): - # Single fanout to match cugraph - sampler = dgl.dataloading.NeighborSampler( - fanouts, - prob=prob_attr, - ) - dataloader = dgl.dataloading.DataLoader( - g, - train_nid, - sampler, - batch_size=batch_size, - shuffle=False, - drop_last=False, - num_workers=0, - ) - - dgl_output = {} - for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - dgl_output[batch_id] = { - "input_nodes": input_nodes, - "output_nodes": output_nodes, - "blocks": blocks, - } - return dgl_output - - -def sample_cugraph_dgl_graphs( - cugraph_g, - train_nid, - fanouts, - batch_size=1, - prob_attr=None, -): - sampler = cugraph_dgl.dataloading.NeighborSampler( - fanouts, - prob=prob_attr, - ) - - dataloader = cugraph_dgl.dataloading.FutureDataLoader( - cugraph_g, - train_nid, - sampler, - batch_size=batch_size, - drop_last=False, - shuffle=False, - ) - - cugraph_dgl_output = {} - for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - cugraph_dgl_output[batch_id] = { - "input_nodes": input_nodes, - "output_nodes": output_nodes, - "blocks": blocks, - } - return cugraph_dgl_output - - -def run_test_same_homogeneousgraph_results(rank, world_size, uid, ix, batch_size): - init_pytorch_worker(rank, world_size, uid) - - src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) - dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) - - local_src = torch.tensor_split(src, world_size)[rank] - 
local_dst = torch.tensor_split(dst, world_size)[rank] - - train_nid = torch.tensor(ix) - # Create a heterograph with 3 node types and 3 edges types. - dgl_g = dgl.graph((src, dst)) - - cugraph_g = cugraph_dgl.Graph(is_multi_gpu=True) - cugraph_g.add_nodes(9) - cugraph_g.add_edges(u=local_src, v=local_dst) - - dgl_output = sample_dgl_graphs(dgl_g, train_nid, [2], batch_size=batch_size) - cugraph_output = sample_cugraph_dgl_graphs(cugraph_g, train_nid, [2], batch_size) - - cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() - dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() - - np.testing.assert_array_equal( - np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes) - ) - assert ( - dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_output[0]["blocks"][0].num_dst_nodes() - ) - assert ( - dgl_output[0]["blocks"][0].num_edges() - == cugraph_output[0]["blocks"][0].num_edges() - ) - - cugraph_comms_shutdown() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -@pytest.mark.parametrize("ix", [[1], [1, 0]]) -@pytest.mark.parametrize("batch_size", [1, 2]) -def test_same_homogeneousgraph_results_mg(ix, batch_size): - uid = cugraph_comms_create_unique_id() - # Limit the number of GPUs this rest is run with - world_size = min(torch.cuda.device_count(), 4) - - torch.multiprocessing.spawn( - run_test_same_homogeneousgraph_results, - args=(world_size, uid, ix, batch_size), - nprocs=world_size, - ) - - -def run_test_dataloader_biased_homogeneous(rank, world_size, uid): - init_pytorch_worker(rank, world_size, uid, True) - - src = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]) + (rank * 9) - dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1]) + (rank * 9) - wgt = torch.tensor( - [0.1, 0.1, 0.2, 0, 0, 0, 0.2, 0.1] * world_size, dtype=torch.float32 - ) - - train_nid = torch.tensor([0, 1]) + (rank * 9) - # Create a heterograph with 3 node types and 3 edge types. 
- dgl_g = dgl.graph((src, dst)) - dgl_g.edata["wgt"] = wgt[:8] - - cugraph_g = cugraph_dgl.Graph(is_multi_gpu=True) - cugraph_g.add_nodes(9 * world_size) - cugraph_g.add_edges(u=src, v=dst, data={"wgt": wgt}) - - dgl_output = sample_dgl_graphs(dgl_g, train_nid, [4], batch_size=2, prob_attr="wgt") - cugraph_output = sample_cugraph_dgl_graphs( - cugraph_g, train_nid, [4], batch_size=2, prob_attr="wgt" - ) - - cugraph_output_nodes = cugraph_output[0]["output_nodes"].cpu().numpy() - dgl_output_nodes = dgl_output[0]["output_nodes"].cpu().numpy() - - np.testing.assert_array_equal( - np.sort(cugraph_output_nodes), np.sort(dgl_output_nodes) - ) - assert ( - dgl_output[0]["blocks"][0].num_dst_nodes() - == cugraph_output[0]["blocks"][0].num_dst_nodes() - ) - assert ( - dgl_output[0]["blocks"][0].num_edges() - == cugraph_output[0]["blocks"][0].num_edges() - ) - - assert 5 == cugraph_output[0]["blocks"][0].num_edges() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -def test_dataloader_biased_homogeneous_mg(): - uid = cugraph_comms_create_unique_id() - # Limit the number of GPUs this test is run with - world_size = torch.cuda.device_count() - - torch.multiprocessing.spawn( - run_test_dataloader_biased_homogeneous, - args=(world_size, uid), - nprocs=world_size, - ) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py b/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py deleted file mode 100644 index 5db443dc0d8..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/dataloading/test_dataset.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
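Both biased-sampling tests above end with a hard-coded assertion that the first block has exactly 5 edges. The count follows from the weight vectors: with a fanout of 4 and zero-weight edges excluded from biased sampling (the behavior the tests rely on), seed node 0 keeps its 3 positive-weight in-edges and seed node 1 keeps its 2. A small arithmetic sketch:

# Why the biased tests expect exactly 5 edges in the first block: with fanout 4
# and zero-weight edges excluded (the tests' assumption), only the positive-weight
# in-edges of the two seed nodes survive.
import torch

dst = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1])
wgt = torch.tensor([1, 1, 2, 0, 0, 0, 2, 1], dtype=torch.float32)  # same zero pattern as the MG weights

fanout = 4
seeds = [0, 1]
expected = sum(min(fanout, int(((dst == s) & (wgt > 0)).sum())) for s in seeds)
assert expected == 5  # 3 in-edges for node 0 + 2 for node 1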
- -import pytest - -try: - import cugraph_dgl - - del cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from dgl.dataloading import MultiLayerNeighborSampler -import dgl -import torch -import cudf -import pandas as pd -import cupy as cp -import numpy as np -from cugraph_dgl.dataloading.utils.sampling_helpers import ( - create_homogeneous_sampled_graphs_from_dataframe, -) - - -def get_edge_df_from_homogenous_block(block): - block = block.to("cpu") - src, dst, eid = block.edges("all") - src = block.srcdata[dgl.NID][src] - dst = block.dstdata[dgl.NID][dst] - eid = block.edata[dgl.EID][eid] - df = pd.DataFrame({"src": src, "dst": dst, "eid": eid}) - return df.sort_values(by="eid").reset_index(drop=True) - - -def create_dgl_mfgs(g, seed_nodes, fanout): - sampler = MultiLayerNeighborSampler(fanout) - return sampler.sample_blocks(g, seed_nodes) - - -def create_cugraph_dgl_homogenous_mfgs(dgl_blocks, return_type): - df_ls = [] - unique_vertices_ls = [] - for hop_id, block in enumerate(reversed(dgl_blocks)): - block = block.to("cpu") - src, dst, eid = block.edges("all") - eid = block.edata[dgl.EID][eid] - - og_src = block.srcdata[dgl.NID][src] - og_dst = block.dstdata[dgl.NID][dst] - unique_vertices = pd.concat( - [pd.Series(og_dst.numpy()), pd.Series(og_src.numpy())] - ).drop_duplicates(keep="first") - unique_vertices_ls.append(unique_vertices) - df = cudf.DataFrame( - { - "sources": cp.asarray(src), - "destinations": cp.asarray(dst), - "edge_id": cp.asarray(eid), - } - ) - df["hop_id"] = hop_id - df_ls.append(df) - df = cudf.concat(df_ls, ignore_index=True) - df["batch_id"] = 0 - - # Add map column - # to the dataframe - renumberd_map = pd.concat(unique_vertices_ls).drop_duplicates(keep="first").values - offsets = np.asarray([2, 2 + len(renumberd_map)]) - map_ar = np.concatenate([offsets, renumberd_map]) - map_ser = cudf.Series(map_ar) - # Have to reindex cause map_ser can be of larger length than df - df = df.reindex(df.index.union(map_ser.index)) - df["map"] = map_ser - return create_homogeneous_sampled_graphs_from_dataframe( - df, return_type=return_type - )[0] - - -@pytest.mark.parametrize("return_type", ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]) -@pytest.mark.parametrize("seed_node", [3, 4, 5]) -def test_homogeneous_sampled_graphs_from_dataframe(return_type, seed_node): - g = dgl.graph(([0, 1, 2, 3, 4], [1, 2, 3, 4, 5])) - fanout = [1, 1, 1] - seed_node = torch.as_tensor([seed_node]) - - dgl_seed_nodes, dgl_output_nodes, dgl_mfgs = create_dgl_mfgs(g, seed_node, fanout) - ( - cugraph_seed_nodes, - cugraph_output_nodes, - cugraph_mfgs, - ) = create_cugraph_dgl_homogenous_mfgs(dgl_mfgs, return_type=return_type) - - np.testing.assert_equal( - cugraph_seed_nodes.cpu().numpy().copy().sort(), - dgl_seed_nodes.cpu().numpy().copy().sort(), - ) - - np.testing.assert_equal( - dgl_output_nodes.cpu().numpy().copy().sort(), - cugraph_output_nodes.cpu().numpy().copy().sort(), - ) - - if return_type == "dgl.Block": - for dgl_block, cugraph_dgl_block in zip(dgl_mfgs, cugraph_mfgs): - dgl_df = get_edge_df_from_homogenous_block(dgl_block) - cugraph_dgl_df = get_edge_df_from_homogenous_block(cugraph_dgl_block) - pd.testing.assert_frame_equal(dgl_df, cugraph_dgl_df) - else: - for dgl_block, cugraph_dgl_graph in zip(dgl_mfgs, cugraph_mfgs): - # Can not verify edge ids as they are not - # preserved in cugraph_dgl.nn.SparseGraph - assert dgl_block.num_src_nodes() == cugraph_dgl_graph.num_src_nodes() - assert dgl_block.num_dst_nodes() == 
cugraph_dgl_graph.num_dst_nodes() - dgl_offsets, dgl_indices, _ = dgl_block.adj_tensors("csc") - cugraph_offsets, cugraph_indices, _ = cugraph_dgl_graph.csc() - assert torch.equal(dgl_offsets.to("cpu"), cugraph_offsets.to("cpu")) - assert torch.equal(dgl_indices.to("cpu"), cugraph_indices.to("cpu")) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py deleted file mode 100644 index de27efc6329..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import GATConv as CuGraphGATConv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-6 - - -@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_heads", [1, 2, 7]) -@pytest.mark.parametrize("residual", [False, True]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_gatconv_equality( - dgl_graph_1, - mode, - idx_type, - max_in_degree, - num_heads, - residual, - to_block, - sparse_format, -): - from dgl.nn.pytorch import GATConv - - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - size = (g.num_src_nodes(), g.num_dst_nodes()) - - if mode == "bipartite": - in_feats = (10, 3) - nfeat = ( - torch.randn(size[0], in_feats[0]).to(device), - torch.randn(size[1], in_feats[1]).to(device), - ) - elif mode == "share_weights": - in_feats = 5 - nfeat = ( - torch.randn(size[0], in_feats).to(device), - torch.randn(size[1], in_feats).to(device), - ) - else: - in_feats = 7 - nfeat = torch.randn(size[0], in_feats).to(device) - out_feats = 2 - - if sparse_format == "coo": - sg = SparseGraph( - size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" - ) - elif sparse_format == "csc": - offsets, indices, _ = g.adj_tensors("csc") - sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - - args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True, "residual": residual} - - conv1 = GATConv(*args, **kwargs).to(device) - conv2 = CuGraphGATConv(*args, **kwargs).to(device) - - dim = num_heads * out_feats - with torch.no_grad(): - conv2.attn_weights[:dim].copy_(conv1.attn_l.flatten()) - conv2.attn_weights[dim:].copy_(conv1.attn_r.flatten()) - if mode == "bipartite": - conv2.lin_src.weight.copy_(conv1.fc_src.weight) - conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) - else: - conv2.lin.weight.copy_(conv1.fc.weight) - if residual and conv1.has_linear_res: - 
conv2.lin_res.weight.copy_(conv1.res_fc.weight) - - out1 = conv1(g, nfeat) - if sparse_format is not None: - out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) - else: - out2 = conv2(g, nfeat, max_in_degree=max_in_degree) - - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out1 = torch.randn_like(out1) - grad_out2 = grad_out1.detach().clone() - out1.backward(grad_out1) - out2.backward(grad_out2) - - if mode == "bipartite": - assert torch.allclose( - conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL - ) - assert torch.allclose( - conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL - ) - else: - assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) - - if residual and conv1.has_linear_res: - assert torch.allclose( - conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL - ) - - assert torch.allclose( - torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), - conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=1e-5, # Note: using a loosened tolerance here due to numerical error - ) - - -@pytest.mark.parametrize("bias", [False, True]) -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_heads", [1, 2, 7]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("use_edge_feats", [False, True]) -def test_gatconv_edge_feats( - dgl_graph_1, - bias, - bipartite, - concat, - max_in_degree, - num_heads, - to_block, - use_edge_feats, -): - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device) - - if to_block: - g = dgl.to_block(g) - - if bipartite: - in_feats = (10, 3) - nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).to(device), - torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), - ) - else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) - out_feats = 2 - - if use_edge_feats: - edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).to(device) - else: - edge_feats = None - efeat = None - - conv = CuGraphGATConv( - in_feats, - out_feats, - num_heads, - concat=concat, - edge_feats=edge_feats, - bias=bias, - allow_zero_in_degree=True, - ).to(device) - out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - - grad_out = torch.randn_like(out) - out.backward(grad_out) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py deleted file mode 100644 index 2d26b7fdc28..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-5 - - -@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_heads", [1, 2, 7]) -@pytest.mark.parametrize("residual", [False, True]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_gatv2conv_equality( - dgl_graph_1, - mode, - idx_type, - max_in_degree, - num_heads, - residual, - to_block, - sparse_format, -): - from dgl.nn.pytorch import GATv2Conv - - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - size = (g.num_src_nodes(), g.num_dst_nodes()) - - if mode == "bipartite": - in_feats = (10, 3) - nfeat = ( - torch.randn(size[0], in_feats[0]).to(device), - torch.randn(size[1], in_feats[1]).to(device), - ) - elif mode == "share_weights": - in_feats = 5 - nfeat = ( - torch.randn(size[0], in_feats).to(device), - torch.randn(size[1], in_feats).to(device), - ) - else: - in_feats = 7 - nfeat = torch.randn(size[0], in_feats).to(device) - out_feats = 2 - - if sparse_format == "coo": - sg = SparseGraph( - size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" - ) - elif sparse_format == "csc": - offsets, indices, _ = g.adj_tensors("csc") - sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - - args = (in_feats, out_feats, num_heads) - kwargs = { - "bias": False, - "allow_zero_in_degree": True, - "residual": residual, - "share_weights": mode == "share_weights", - } - - conv1 = GATv2Conv(*args, **kwargs).to(device) - conv2 = CuGraphGATv2Conv(*args, **kwargs).to(device) - - with torch.no_grad(): - conv2.attn_weights.copy_(conv1.attn.flatten()) - conv2.lin_src.weight.copy_(conv1.fc_src.weight) - conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) - if residual: - conv2.lin_res.weight.copy_(conv1.res_fc.weight) - - out1 = conv1(g, nfeat) - if sparse_format is not None: - out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) - else: - out2 = conv2(g, nfeat, max_in_degree=max_in_degree) - - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out1 = torch.randn_like(out1) - grad_out2 = grad_out1.detach().clone() - out1.backward(grad_out1) - out2.backward(grad_out2) - - assert torch.allclose( - conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL - ) - assert torch.allclose( - conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL - ) - - if residual: - assert torch.allclose( - conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL - ) - - assert torch.allclose( - conv1.attn.grad, - conv2.attn_weights.grad.view(1, num_heads, out_feats), - atol=ATOL, - ) - - -@pytest.mark.parametrize("bias", [False, True]) -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_heads", [1, 2, 7]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("use_edge_feats", [False, True]) -def test_gatv2conv_edge_feats( - dgl_graph_1, - bias, - bipartite, - concat, - max_in_degree, - num_heads, - 
to_block, - use_edge_feats, -): - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device) - - if to_block: - g = dgl.to_block(g) - - if bipartite: - in_feats = (10, 3) - nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).to(device), - torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), - ) - else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) - out_feats = 2 - - if use_edge_feats: - edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).to(device) - else: - edge_feats = None - efeat = None - - conv = CuGraphGATv2Conv( - in_feats, - out_feats, - num_heads, - concat=concat, - edge_feats=edge_feats, - bias=bias, - allow_zero_in_degree=True, - ).to(device) - out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - - grad_out = torch.randn_like(out) - out.backward(grad_out) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py deleted file mode 100644 index b5d3686c609..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-6 - - -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_bases", [1, 2, 5]) -@pytest.mark.parametrize("regularizer", [None, "basis"]) -@pytest.mark.parametrize("self_loop", [False, True]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_relgraphconv_equality( - dgl_graph_1, - idx_type, - max_in_degree, - num_bases, - regularizer, - self_loop, - to_block, - sparse_format, -): - from dgl.nn.pytorch import RelGraphConv - - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - in_feat, out_feat, num_rels = 10, 2, 3 - args = (in_feat, out_feat, num_rels) - kwargs = { - "num_bases": num_bases, - "regularizer": regularizer, - "bias": False, - "self_loop": self_loop, - } - - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) - size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = torch.rand(g.num_src_nodes(), in_feat).to(device) - - if sparse_format == "coo": - sg = SparseGraph( - size=size, - src_ids=g.edges()[0], - dst_ids=g.edges()[1], - values=g.edata[dgl.ETYPE], - formats="csc", - ) - elif sparse_format == "csc": - offsets, indices, perm = g.adj_tensors("csc") - etypes = g.edata[dgl.ETYPE][perm] - sg = SparseGraph( - size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" - ) - - conv1 = 
RelGraphConv(*args, **kwargs).to(device) - conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).to(device) - - with torch.no_grad(): - if self_loop: - conv2.W[:-1].copy_(conv1.linear_r.W) - conv2.W[-1].copy_(conv1.loop_weight) - else: - conv2.W.copy_(conv1.linear_r.W) - - if regularizer is not None: - conv2.coeff.copy_(conv1.linear_r.coeff) - - out1 = conv1(g, feat, g.edata[dgl.ETYPE]) - - if sparse_format is not None: - out2 = conv2(sg, feat, sg.values(), max_in_degree=max_in_degree) - else: - out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree) - - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out = torch.randn_like(out1) - out1.backward(grad_out) - out2.backward(grad_out) - - end = -1 if self_loop else None - assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=ATOL) - - if self_loop: - assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=ATOL) - - if regularizer is not None: - assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, atol=ATOL) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py deleted file mode 100644 index 3f1c2b1b3fe..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-6 - - -@pytest.mark.parametrize("aggr", ["mean", "pool"]) -@pytest.mark.parametrize("bias", [False, True]) -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_sageconv_equality( - dgl_graph_1, aggr, bias, bipartite, idx_type, max_in_degree, to_block, sparse_format -): - from dgl.nn.pytorch import SAGEConv - - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - size = (g.num_src_nodes(), g.num_dst_nodes()) - - if bipartite: - in_feats = (5, 3) - feat = ( - torch.rand(size[0], in_feats[0], requires_grad=True).to(device), - torch.rand(size[1], in_feats[1], requires_grad=True).to(device), - ) - else: - in_feats = 5 - feat = torch.rand(size[0], in_feats).to(device) - out_feats = 2 - - if sparse_format == "coo": - sg = SparseGraph( - size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" - ) - elif sparse_format == "csc": - offsets, indices, _ = g.adj_tensors("csc") - sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - - kwargs = {"aggregator_type": aggr, "bias": bias} - conv1 = SAGEConv(in_feats, out_feats, **kwargs).to(device) - conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).to(device) - - in_feats_src = conv2.in_feats_src - with torch.no_grad(): - conv2.lin.weight[:, :in_feats_src].copy_(conv1.fc_neigh.weight) - conv2.lin.weight[:, in_feats_src:].copy_(conv1.fc_self.weight) - if bias: - conv2.lin.bias.copy_(conv1.fc_self.bias) - if aggr == "pool": - conv2.pre_lin.weight.copy_(conv1.fc_pool.weight) - conv2.pre_lin.bias.copy_(conv1.fc_pool.bias) - - out1 = conv1(g, feat) - if sparse_format is not None: - out2 = conv2(sg, feat, max_in_degree=max_in_degree) - else: - out2 = conv2(g, feat, max_in_degree=max_in_degree) - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out = torch.randn_like(out1) - out1.backward(grad_out) - out2.backward(grad_out) - assert torch.allclose( - conv1.fc_neigh.weight.grad, - conv2.lin.weight.grad[:, :in_feats_src], - atol=ATOL, - ) - assert torch.allclose( - conv1.fc_self.weight.grad, - conv2.lin.weight.grad[:, in_feats_src:], - atol=ATOL, - ) - if bias: - assert torch.allclose(conv1.fc_self.bias.grad, conv2.lin.bias.grad, atol=ATOL) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py deleted file mode 100644 index 09c0df202ff..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sparsegraph.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities.utils import import_optional -from cugraph_dgl.nn import SparseGraph - -torch = import_optional("torch") - - -def test_coo2csc(sparse_graph_1): - data = sparse_graph_1 - - g = SparseGraph( - size=data.size, - src_ids=data.src_ids, - dst_ids=data.dst_ids, - values=data.values, - formats=["csc"], - ) - cdst_ids, src_ids, values = g.csc() - - new = torch.sparse_csc_tensor(cdst_ids, src_ids, values).cuda() - old = torch.sparse_coo_tensor( - torch.vstack((data.src_ids, data.dst_ids)), data.values - ).cuda() - torch.allclose(new.to_dense(), old.to_dense()) - - -def test_csc_input(sparse_graph_1): - data = sparse_graph_1 - - g = SparseGraph( - size=data.size, - src_ids=data.src_ids_sorted_by_dst, - cdst_ids=data.cdst_ids, - values=data.values_csc, - formats=["coo", "csc", "csr"], - ) - src_ids, dst_ids, values = g.coo() - - new = torch.sparse_coo_tensor(torch.vstack((src_ids, dst_ids)), values).cuda() - old = torch.sparse_csc_tensor( - data.cdst_ids, data.src_ids_sorted_by_dst, data.values_csc - ).cuda() - torch.allclose(new.to_dense(), old.to_dense()) - - csrc_ids, dst_ids, values = g.csr() - - new = torch.sparse_csr_tensor(csrc_ids, dst_ids, values).cuda() - torch.allclose(new.to_dense(), old.to_dense()) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py deleted file mode 100644 index 28d13dedec8..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import TransformerConv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-6 - - -@pytest.mark.parametrize("beta", [False, True]) -@pytest.mark.parametrize("bipartite_node_feats", [False, True]) -@pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("num_heads", [1, 3, 4]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("use_edge_feats", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_transformerconv( - dgl_graph_1, - beta, - bipartite_node_feats, - concat, - idx_type, - num_heads, - to_block, - use_edge_feats, - sparse_format, -): - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - size = (g.num_src_nodes(), g.num_dst_nodes()) - if sparse_format == "coo": - sg = SparseGraph( - size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" - ) - elif sparse_format == "csc": - offsets, indices, _ = g.adj_tensors("csc") - sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - - if bipartite_node_feats: - in_node_feats = (5, 3) - nfeat = ( - torch.rand(g.num_src_nodes(), in_node_feats[0], device=device), - torch.rand(g.num_dst_nodes(), in_node_feats[1], device=device), - ) - else: - in_node_feats = 3 - nfeat = torch.rand(g.num_src_nodes(), in_node_feats, device=device) - out_node_feats = 2 - - if use_edge_feats: - edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats, device=device) - else: - edge_feats = None - efeat = None - - conv = TransformerConv( - in_node_feats, - out_node_feats, - num_heads=num_heads, - concat=concat, - beta=beta, - edge_feats=edge_feats, - ).to(device) - - if sparse_format is not None: - out = conv(sg, nfeat, efeat) - else: - out = conv(g, nfeat, efeat) - - grad_out = torch.randn_like(out) - out.backward(grad_out) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py b/python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py deleted file mode 100644 index 0a99d4d65b7..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_cugraph_storage.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional -import cudf -import numpy as np -from cugraph_dgl import CuGraphStorage -from .utils import assert_same_sampling_len - -th = import_optional("torch") -dgl = import_optional("dgl") - - -@pytest.fixture() -def dgl_graph(): - graph_data = { - ("nt.a", "connects", "nt.b"): ( - th.tensor([0, 1, 2]), - th.tensor([0, 1, 2]), - ), - ("nt.a", "connects", "nt.c"): ( - th.tensor([0, 1, 2]), - th.tensor([0, 1, 2]), - ), - ("nt.c", "connects", "nt.c"): ( - th.tensor([1, 3, 4, 5]), - th.tensor([0, 0, 0, 0]), - ), - } - g = dgl.heterograph(graph_data) - return g - - -def test_cugraphstore_basic_apis(): - - num_nodes_dict = {"drug": 3, "gene": 2, "disease": 1} - # edges - drug_interacts_drug_df = cudf.DataFrame({"src": [0, 1], "dst": [1, 2]}) - drug_interacts_gene = cudf.DataFrame({"src": [0, 1], "dst": [0, 1]}) - drug_treats_disease = cudf.DataFrame({"src": [1], "dst": [0]}) - data_dict = { - ("drug", "interacts", "drug"): drug_interacts_drug_df, - ("drug", "interacts", "gene"): drug_interacts_gene, - ("drug", "treats", "disease"): drug_treats_disease, - } - gs = CuGraphStorage(data_dict=data_dict, num_nodes_dict=num_nodes_dict) - # add node data - gs.add_node_data( - ntype="drug", - feat_name="node_feat", - feat_obj=th.as_tensor([0.1, 0.2, 0.3], dtype=th.float64), - ) - # add edge data - gs.add_edge_data( - canonical_etype=("drug", "interacts", "drug"), - feat_name="edge_feat", - feat_obj=th.as_tensor([0.2, 0.4], dtype=th.float64), - ) - - assert gs.num_nodes() == 6 - - assert gs.num_edges(("drug", "interacts", "drug")) == 2 - assert gs.num_edges(("drug", "interacts", "gene")) == 2 - assert gs.num_edges(("drug", "treats", "disease")) == 1 - - node_feat = ( - gs.get_node_storage(key="node_feat", ntype="drug") - .fetch([0, 1, 2]) - .to("cpu") - .numpy() - ) - np.testing.assert_equal(node_feat, np.asarray([0.1, 0.2, 0.3])) - - edge_feat = ( - gs.get_edge_storage(key="edge_feat", etype=("drug", "interacts", "drug")) - .fetch([0, 1]) - .to("cpu") - .numpy() - ) - np.testing.assert_equal(edge_feat, np.asarray([0.2, 0.4])) - - -def test_sampling_heterograph(dgl_graph): - cugraph_gs = cugraph_dgl.cugraph_storage_from_heterograph(dgl_graph) - - for fanout in [1, 2, 3, -1]: - for ntype in ["nt.a", "nt.b", "nt.c"]: - for d in ["in", "out"]: - assert_same_sampling_len( - dgl_graph, - cugraph_gs, - nodes={ntype: [0]}, - fanout=fanout, - edge_dir=d, - ) - - -def test_sampling_homogenous(): - src_ar = np.asarray([0, 1, 2, 0, 1, 2, 7, 9, 10, 11], dtype=np.int32) - dst_ar = np.asarray([3, 4, 5, 6, 7, 8, 6, 6, 6, 6], dtype=np.int32) - g = dgl.heterograph({("a", "connects", "a"): (src_ar, dst_ar)}) - cugraph_gs = cugraph_dgl.cugraph_storage_from_heterograph(g) - # Convert to homogeneous - g = dgl.to_homogeneous(g) - nodes = [6] - # Test for multiple fanouts - for fanout in [1, 2, 3]: - exp_g = g.sample_neighbors(nodes, fanout=fanout) - cu_g = cugraph_gs.sample_neighbors(nodes, fanout=fanout) - exp_src, exp_dst = exp_g.edges() - cu_src, cu_dst = cu_g.edges() - assert len(exp_src) == len(cu_src) - - # Test same results for all neighbours - exp_g = g.sample_neighbors(nodes, fanout=-1) - cu_g = cugraph_gs.sample_neighbors(nodes, fanout=-1) - exp_src, exp_dst = exp_g.edges() - exp_src, exp_dst = exp_src.numpy(), exp_dst.numpy() - - cu_src, cu_dst = cu_g.edges() - cu_src, cu_dst = cu_src.to("cpu").numpy(), cu_dst.to("cpu").numpy() - 
- # Assert same values sorted by src - exp_src_perm = exp_src.argsort() - exp_src = exp_src[exp_src_perm] - exp_dst = exp_dst[exp_src_perm] - - cu_src_perm = cu_src.argsort() - cu_src = cu_src[cu_src_perm] - cu_dst = cu_dst[cu_src_perm] - - np.testing.assert_equal(exp_dst, cu_dst) - np.testing.assert_equal(exp_src, cu_src) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py deleted file mode 100644 index 667a4a2e66d..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_from_dgl_heterograph.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import pytest - -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional -from .utils import ( - assert_same_edge_feats, - assert_same_edge_feats_daskapi, - assert_same_node_feats, - assert_same_node_feats_daskapi, - assert_same_num_edges_can_etypes, - assert_same_num_edges_etypes, - assert_same_num_nodes, -) - -th = import_optional("torch") -dgl = import_optional("dgl") -F = import_optional("dgl.backend") - - -def create_heterograph1(idtype): - ctx = th.device("cuda") - graph_data = { - ("nt.a", "join.1", "nt.a"): ( - F.tensor([0, 1, 2], dtype=idtype), - F.tensor([0, 1, 2], dtype=idtype), - ), - ("nt.a", "join.2", "nt.a"): ( - F.tensor([0, 1, 2], dtype=idtype), - F.tensor([0, 1, 2], dtype=idtype), - ), - } - g = dgl.heterograph(graph_data, device=th.device("cuda")) - g.nodes["nt.a"].data["h"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx) - return g - - -def create_heterograph2(idtype): - ctx = th.device("cuda") - - g = dgl.heterograph( - { - ("user", "plays", "game"): ( - F.tensor([0, 1, 1, 2], dtype=idtype), - F.tensor([0, 0, 1, 1], dtype=idtype), - ), - ("developer", "develops", "game"): ( - F.tensor([0, 1], dtype=idtype), - F.tensor([0, 1], dtype=idtype), - ), - ("developer", "tests", "game"): ( - F.tensor([0, 1], dtype=idtype), - F.tensor([0, 1], dtype=idtype), - ), - }, - idtype=idtype, - device=th.device("cuda"), - ) - - g.nodes["user"].data["h"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx) - g.nodes["user"].data["p"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx) - g.nodes["game"].data["h"] = F.copy_to(F.tensor([2, 2], dtype=idtype), ctx=ctx) - g.nodes["developer"].data["h"] = F.copy_to(F.tensor([3, 3], dtype=idtype), ctx=ctx) - g.edges["plays"].data["h"] = F.copy_to( - F.tensor([1, 1, 1, 1], dtype=idtype), ctx=ctx - ) - return g - - -def create_heterograph3(idtype): - ctx = th.device("cuda") - - g = dgl.heterograph( - { - ("user", "follows", "user"): ( - F.tensor([0, 1, 1, 2, 2, 2], dtype=idtype), - F.tensor([0, 0, 1, 1, 2, 2], dtype=idtype), - ), - ("user", "plays", "game"): ( - F.tensor([0, 1], dtype=idtype), - F.tensor([0, 1], dtype=idtype), - ), - }, - idtype=idtype, - device=th.device("cuda"), - ) - 
g.nodes["user"].data["h"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx) - g.nodes["game"].data["h"] = F.copy_to(F.tensor([2, 2], dtype=idtype), ctx=ctx) - g.edges["follows"].data["h"] = F.copy_to( - F.tensor([10, 20, 30, 40, 50, 60], dtype=idtype), ctx=ctx - ) - g.edges["follows"].data["p"] = F.copy_to( - F.tensor([1, 2, 3, 4, 5, 6], dtype=idtype), ctx=ctx - ) - g.edges["plays"].data["h"] = F.copy_to(F.tensor([1, 2], dtype=idtype), ctx=ctx) - return g - - -def create_heterograph4(idtype): - ctx = th.device("cuda") - - g = dgl.heterograph( - { - ("user", "follows", "user"): ( - F.tensor([1, 2], dtype=idtype), - F.tensor([0, 1], dtype=idtype), - ), - ("user", "plays", "game"): ( - F.tensor([0, 1], dtype=idtype), - F.tensor([0, 1], dtype=idtype), - ), - }, - idtype=idtype, - device=th.device("cuda"), - ) - g.nodes["user"].data["h"] = F.copy_to(F.tensor([1, 1, 1], dtype=idtype), ctx=ctx) - g.nodes["game"].data["h"] = F.copy_to(F.tensor([2, 2], dtype=idtype), ctx=ctx) - g.edges["follows"].data["h"] = F.copy_to(F.tensor([1, 2], dtype=idtype), ctx=ctx) - g.edges["plays"].data["h"] = F.copy_to(F.tensor([1, 2], dtype=idtype), ctx=ctx) - return g - - -@pytest.mark.parametrize("idxtype", [th.int32, th.int64]) -def test_heterograph_conversion_nodes_daskapi(idxtype): - graph_fs = [ - create_heterograph1, - create_heterograph2, - create_heterograph3, - create_heterograph4, - ] - for graph_f in graph_fs: - g = graph_f(idxtype) - gs = cugraph_dgl.cugraph_storage_from_heterograph(g) - - assert_same_num_nodes(gs, g) - assert_same_node_feats_daskapi(gs, g) - - -@pytest.mark.parametrize("idxtype", [th.int32, th.int64]) -def test_heterograph_conversion_edges_daskapi(idxtype): - graph_fs = [ - create_heterograph1, - create_heterograph2, - create_heterograph3, - create_heterograph4, - ] - for graph_f in graph_fs: - g = graph_f(idxtype) - gs = cugraph_dgl.cugraph_storage_from_heterograph(g) - - assert_same_num_edges_can_etypes(gs, g) - assert_same_num_edges_etypes(gs, g) - assert_same_edge_feats_daskapi(gs, g) - - -@pytest.mark.parametrize("idxtype", [th.int32, th.int64]) -def test_heterograph_conversion_nodes(idxtype): - graph_fs = [ - create_heterograph1, - create_heterograph2, - create_heterograph3, - create_heterograph4, - ] - for graph_f in graph_fs: - g = graph_f(idxtype) - gs = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g) - - assert_same_num_nodes(gs, g) - assert_same_node_feats(gs, g) - - -@pytest.mark.parametrize("idxtype", [th.int32, th.int64]) -def test_heterograph_conversion_edges(idxtype): - graph_fs = [ - create_heterograph1, - create_heterograph2, - create_heterograph3, - create_heterograph4, - ] - for graph_f in graph_fs: - g = graph_f(idxtype) - gs = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g) - - assert_same_num_edges_can_etypes(gs, g) - assert_same_num_edges_etypes(gs, g) - assert_same_edge_feats(gs, g) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py deleted file mode 100644 index a60db97b8d6..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -import cugraph_dgl -import pylibcugraph -import cupy -import numpy as np - -from cugraph.datasets import karate -from cugraph.utilities.utils import import_optional, MissingModule - -torch = import_optional("torch") -dgl = import_optional("dgl") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -@pytest.mark.parametrize("direction", ["out", "in"]) -def test_graph_make_homogeneous_graph(direction): - df = karate.get_edgelist() - df.src = df.src.astype("int64") - df.dst = df.dst.astype("int64") - wgt = np.random.random((len(df),)) - - graph = cugraph_dgl.Graph() - num_nodes = max(df.src.max(), df.dst.max()) + 1 - node_x = np.random.random((num_nodes,)) - - graph.add_nodes( - num_nodes, data={"num": torch.arange(num_nodes, dtype=torch.int64), "x": node_x} - ) - graph.add_edges(df.src, df.dst, {"weight": wgt}) - plc_dgl_graph = graph._graph(direction=direction) - - assert graph.num_nodes() == num_nodes - assert graph.num_edges() == len(df) - assert graph.is_homogeneous - assert not graph.is_multi_gpu - - assert ( - graph.nodes() == torch.arange(num_nodes, dtype=torch.int64, device="cuda") - ).all() - - assert graph.nodes[None]["x"] is not None - assert (graph.nodes[None]["x"] == torch.as_tensor(node_x, device="cuda")).all() - assert ( - graph.nodes[None]["num"] - == torch.arange(num_nodes, dtype=torch.int64, device="cuda") - ).all() - - assert ( - graph.edges("eid", device="cuda") - == torch.arange(len(df), dtype=torch.int64, device="cuda") - ).all() - assert (graph.edges[None]["weight"] == torch.as_tensor(wgt, device="cuda")).all() - - plc_expected_graph = pylibcugraph.SGGraph( - pylibcugraph.ResourceHandle(), - pylibcugraph.GraphProperties(is_multigraph=True, is_symmetric=False), - df.src if direction == "out" else df.dst, - df.dst if direction == "out" else df.src, - vertices_array=cupy.arange(num_nodes, dtype="int64"), - ) - - # Do the expensive check to make sure this test fails if an invalid - # graph is constructed. 
- v_actual, d_in_actual, d_out_actual = pylibcugraph.degrees( - pylibcugraph.ResourceHandle(), - plc_dgl_graph, - source_vertices=cupy.arange(num_nodes, dtype="int64"), - do_expensive_check=True, - ) - - v_exp, d_in_exp, d_out_exp = pylibcugraph.degrees( - pylibcugraph.ResourceHandle(), - plc_expected_graph, - source_vertices=cupy.arange(num_nodes, dtype="int64"), - do_expensive_check=True, - ) - - assert (v_actual == v_exp).all() - assert (d_in_actual == d_in_exp).all() - assert (d_out_actual == d_out_exp).all() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -@pytest.mark.parametrize("direction", ["out", "in"]) -def test_graph_make_heterogeneous_graph(direction): - df = karate.get_edgelist() - df.src = df.src.astype("int64") - df.dst = df.dst.astype("int64") - - graph = cugraph_dgl.Graph() - total_num_nodes = max(df.src.max(), df.dst.max()) + 1 - - num_nodes_group_1 = total_num_nodes // 2 - num_nodes_group_2 = total_num_nodes - num_nodes_group_1 - - node_x_1 = np.random.random((num_nodes_group_1,)) - node_x_2 = np.random.random((num_nodes_group_2,)) - - graph.add_nodes(num_nodes_group_1, {"x": node_x_1}, "type1") - graph.add_nodes(num_nodes_group_2, {"x": node_x_2}, "type2") - - edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)] - edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)] - edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)] - edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)] - - edges_12.dst -= num_nodes_group_1 - edges_21.src -= num_nodes_group_1 - edges_22.dst -= num_nodes_group_1 - edges_22.src -= num_nodes_group_1 - - graph.add_edges(edges_11.src, edges_11.dst, etype=("type1", "e1", "type1")) - graph.add_edges(edges_12.src, edges_12.dst, etype=("type1", "e2", "type2")) - graph.add_edges(edges_21.src, edges_21.dst, etype=("type2", "e3", "type1")) - graph.add_edges(edges_22.src, edges_22.dst, etype=("type2", "e4", "type2")) - - assert not graph.is_homogeneous - assert not graph.is_multi_gpu - - # Verify graph.nodes() - assert ( - graph.nodes() == torch.arange(total_num_nodes, dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.nodes("type1") - == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.nodes("type2") - == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda") - ).all() - - # Verify graph.edges() - assert ( - graph.edges("eid", etype=("type1", "e1", "type1")) - == torch.arange(len(edges_11), dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.edges("eid", etype=("type1", "e2", "type2")) - == torch.arange(len(edges_12), dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.edges("eid", etype=("type2", "e3", "type1")) - == torch.arange(len(edges_21), dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.edges("eid", etype=("type2", "e4", "type2")) - == torch.arange(len(edges_22), dtype=torch.int64, device="cuda") - ).all() - - # Use sampling call to check graph creation - # This isn't a test of cuGraph sampling with DGL; the options are - # set to verify the graph only. 
- plc_graph = graph._graph(direction) - sampling_output = pylibcugraph.uniform_neighbor_sample( - pylibcugraph.ResourceHandle(), - plc_graph, - start_list=cupy.arange(total_num_nodes, dtype="int64"), - h_fan_out=np.array([1, 1], dtype="int32"), - with_replacement=False, - do_expensive_check=True, - with_edge_properties=True, - prior_sources_behavior="exclude", - return_dict=True, - ) - - expected_etypes = { - 0: "e1", - 1: "e2", - 2: "e3", - 3: "e4", - } - expected_offsets = { - 0: (0, 0), - 1: (0, num_nodes_group_1), - 2: (num_nodes_group_1, 0), - 3: (num_nodes_group_1, num_nodes_group_1), - } - if direction == "in": - src_col = "minors" - dst_col = "majors" - else: - src_col = "majors" - dst_col = "minors" - - # Looping over the output verifies that all edges are valid - # (and therefore, the graph is valid) - for i, etype in enumerate(sampling_output["edge_type"].tolist()): - eid = int(sampling_output["edge_id"][i]) - - srcs, dsts, eids = graph.edges( - "all", etype=expected_etypes[etype], device="cpu" - ) - - assert eids[eid] == eid - assert ( - srcs[eid] == int(sampling_output[src_col][i]) - expected_offsets[etype][0] - ) - assert ( - dsts[eid] == int(sampling_output[dst_col][i]) - expected_offsets[etype][1] - ) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py b/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py deleted file mode 100644 index eedda664c52..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_graph_mg.py +++ /dev/null @@ -1,310 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest - -import cugraph_dgl -import pylibcugraph -import cupy -import numpy as np - -import cudf - -from cugraph.datasets import karate -from cugraph.utilities.utils import import_optional, MissingModule - -from cugraph.gnn import ( - cugraph_comms_shutdown, - cugraph_comms_create_unique_id, - cugraph_comms_get_raft_handle, -) - -from .utils import init_pytorch_worker - -pylibwholegraph = import_optional("pylibwholegraph") -torch = import_optional("torch") -dgl = import_optional("dgl") - - -def run_test_graph_make_homogeneous_graph_mg(rank, uid, world_size, direction): - init_pytorch_worker(rank, world_size, uid, init_wholegraph=True) - - df = karate.get_edgelist() - df.src = df.src.astype("int64") - df.dst = df.dst.astype("int64") - wgt = np.random.random((len(df),)) - - graph = cugraph_dgl.Graph( - is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph" - ) - - # The number of nodes is set globally but features can have - # any distribution across workers as long as they are in order. 
- global_num_nodes = max(df.src.max(), df.dst.max()) + 1 - node_x = np.array_split(np.arange(global_num_nodes, dtype="int64"), world_size)[ - rank - ] - - # Each worker gets a shuffled, permuted version of the edgelist - df = df.sample(frac=1.0) - df.src = (df.src + rank) % global_num_nodes - df.dst = (df.dst + rank + 1) % global_num_nodes - - graph.add_nodes(global_num_nodes, data={"x": node_x}) - graph.add_edges(df.src, df.dst, {"weight": wgt}) - plc_dgl_graph = graph._graph(direction=direction) - - assert graph.num_nodes() == global_num_nodes - assert graph.num_edges() == len(df) * world_size - assert graph.is_homogeneous - assert graph.is_multi_gpu - - assert ( - graph.nodes() - == torch.arange(global_num_nodes, dtype=torch.int64, device="cuda") - ).all() - ix = torch.arange(len(node_x) * rank, len(node_x) * (rank + 1), dtype=torch.int64) - assert graph.nodes[ix]["x"] is not None - assert (graph.nodes[ix]["x"] == torch.as_tensor(node_x, device="cuda")).all() - - assert ( - graph.edges("eid", device="cuda") - == torch.arange(world_size * len(df), dtype=torch.int64, device="cuda") - ).all() - ix = torch.arange(len(df) * rank, len(df) * (rank + 1), dtype=torch.int64) - assert (graph.edges[ix]["weight"] == torch.as_tensor(wgt, device="cuda")).all() - - plc_handle = pylibcugraph.ResourceHandle( - cugraph_comms_get_raft_handle().getHandle() - ) - - plc_expected_graph = pylibcugraph.MGGraph( - plc_handle, - pylibcugraph.GraphProperties(is_multigraph=True, is_symmetric=False), - [df.src] if direction == "out" else [df.dst], - [df.dst] if direction == "out" else [df.src], - vertices_array=[ - cupy.array_split(cupy.arange(global_num_nodes, dtype="int64"), world_size)[ - rank - ] - ], - ) - - # Do the expensive check to make sure this test fails if an invalid - # graph is constructed. 
- v_actual, d_in_actual, d_out_actual = pylibcugraph.degrees( - plc_handle, - plc_dgl_graph, - source_vertices=cupy.arange(global_num_nodes, dtype="int64"), - do_expensive_check=True, - ) - - v_exp, d_in_exp, d_out_exp = pylibcugraph.degrees( - plc_handle, - plc_expected_graph, - source_vertices=cupy.arange(global_num_nodes, dtype="int64"), - do_expensive_check=True, - ) - - assert (v_actual == v_exp).all() - assert (d_in_actual == d_in_exp).all() - assert (d_out_actual == d_out_exp).all() - - cugraph_comms_shutdown() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif( - isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" -) -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -@pytest.mark.parametrize("direction", ["out", "in"]) -def test_graph_make_homogeneous_graph_mg(direction): - uid = cugraph_comms_create_unique_id() - world_size = torch.cuda.device_count() - - torch.multiprocessing.spawn( - run_test_graph_make_homogeneous_graph_mg, - args=( - uid, - world_size, - direction, - ), - nprocs=world_size, - ) - - -def run_test_graph_make_heterogeneous_graph_mg(rank, uid, world_size, direction): - init_pytorch_worker(rank, world_size, uid) - - df = karate.get_edgelist() - df.src = df.src.astype("int64") - df.dst = df.dst.astype("int64") - - graph = cugraph_dgl.Graph(is_multi_gpu=True) - total_num_nodes = max(df.src.max(), df.dst.max()) + 1 - - # Each worker gets a shuffled, permuted version of the edgelist - df = df.sample(frac=1.0) - df.src = (df.src + rank) % total_num_nodes - df.dst = (df.dst + rank + 1) % total_num_nodes - - num_nodes_group_1 = total_num_nodes // 2 - num_nodes_group_2 = total_num_nodes - num_nodes_group_1 - - node_x_1 = np.array_split(np.random.random((num_nodes_group_1,)), world_size)[rank] - node_x_2 = np.array_split(np.random.random((num_nodes_group_2,)), world_size)[rank] - - graph.add_nodes(num_nodes_group_1, {"x": node_x_1}, "type1") - graph.add_nodes(num_nodes_group_2, {"x": node_x_2}, "type2") - - edges_11 = df[(df.src < num_nodes_group_1) & (df.dst < num_nodes_group_1)] - edges_12 = df[(df.src < num_nodes_group_1) & (df.dst >= num_nodes_group_1)] - edges_21 = df[(df.src >= num_nodes_group_1) & (df.dst < num_nodes_group_1)] - edges_22 = df[(df.src >= num_nodes_group_1) & (df.dst >= num_nodes_group_1)] - - edges_12.dst -= num_nodes_group_1 - edges_21.src -= num_nodes_group_1 - edges_22.dst -= num_nodes_group_1 - edges_22.src -= num_nodes_group_1 - - total_edges_11 = torch.tensor(len(edges_11), device="cuda", dtype=torch.int64) - torch.distributed.all_reduce(total_edges_11, torch.distributed.ReduceOp.SUM) - total_edges_12 = torch.tensor(len(edges_12), device="cuda", dtype=torch.int64) - torch.distributed.all_reduce(total_edges_12, torch.distributed.ReduceOp.SUM) - total_edges_21 = torch.tensor(len(edges_21), device="cuda", dtype=torch.int64) - torch.distributed.all_reduce(total_edges_21, torch.distributed.ReduceOp.SUM) - total_edges_22 = torch.tensor(len(edges_22), device="cuda", dtype=torch.int64) - torch.distributed.all_reduce(total_edges_22, torch.distributed.ReduceOp.SUM) - - graph.add_edges(edges_11.src, edges_11.dst, etype=("type1", "e1", "type1")) - graph.add_edges(edges_12.src, edges_12.dst, etype=("type1", "e2", "type2")) - graph.add_edges(edges_21.src, edges_21.dst, etype=("type2", "e3", "type1")) - graph.add_edges(edges_22.src, edges_22.dst, etype=("type2", "e4", "type2")) - - assert not graph.is_homogeneous - assert graph.is_multi_gpu - - # 
Verify graph.nodes() - assert ( - graph.nodes() == torch.arange(total_num_nodes, dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.nodes("type1") - == torch.arange(num_nodes_group_1, dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.nodes("type2") - == torch.arange(num_nodes_group_2, dtype=torch.int64, device="cuda") - ).all() - - # Verify graph.edges() - assert ( - graph.edges("eid", etype=("type1", "e1", "type1")) - == torch.arange(total_edges_11, dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.edges("eid", etype=("type1", "e2", "type2")) - == torch.arange(total_edges_12, dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.edges("eid", etype=("type2", "e3", "type1")) - == torch.arange(total_edges_21, dtype=torch.int64, device="cuda") - ).all() - assert ( - graph.edges("eid", etype=("type2", "e4", "type2")) - == torch.arange(total_edges_22, dtype=torch.int64, device="cuda") - ).all() - - # Use sampling call to check graph creation - # This isn't a test of cuGraph sampling with DGL; the options are - # set to verify the graph only. - plc_graph = graph._graph(direction) - assert isinstance(plc_graph, pylibcugraph.MGGraph) - sampling_output = pylibcugraph.uniform_neighbor_sample( - graph._resource_handle, - plc_graph, - start_list=cupy.arange(total_num_nodes, dtype="int64"), - batch_id_list=cupy.full(total_num_nodes, rank, dtype="int32"), - label_list=cupy.arange(world_size, dtype="int32"), - label_to_output_comm_rank=cupy.arange(world_size, dtype="int32"), - h_fan_out=np.array([-1], dtype="int32"), - with_replacement=False, - do_expensive_check=True, - with_edge_properties=True, - prior_sources_behavior="exclude", - return_dict=True, - ) - - sdf = cudf.DataFrame( - { - "majors": sampling_output["majors"], - "minors": sampling_output["minors"], - "edge_id": sampling_output["edge_id"], - "edge_type": sampling_output["edge_type"], - } - ) - - expected_offsets = { - 0: (0, 0), - 1: (0, num_nodes_group_1), - 2: (num_nodes_group_1, 0), - 3: (num_nodes_group_1, num_nodes_group_1), - } - if direction == "in": - src_col = "minors" - dst_col = "majors" - else: - src_col = "majors" - dst_col = "minors" - - edges_11["etype"] = 0 - edges_12["etype"] = 1 - edges_21["etype"] = 2 - edges_22["etype"] = 3 - - cdf = cudf.concat([edges_11, edges_12, edges_21, edges_22]) - for i in range(len(cdf)): - row = cdf.iloc[i] - etype = row["etype"] - src = row["src"] + expected_offsets[etype][0] - dst = row["dst"] + expected_offsets[etype][1] - - f = sdf[ - (sdf[src_col] == src) & (sdf[dst_col] == dst) & (sdf["edge_type"] == etype) - ] - assert len(f) > 0 # may be multiple, some could be on other GPU - - cugraph_comms_shutdown() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif( - isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" -) -@pytest.mark.skipif(isinstance(dgl, MissingModule), reason="dgl not available") -@pytest.mark.parametrize("direction", ["out", "in"]) -def test_graph_make_heterogeneous_graph_mg(direction): - uid = cugraph_comms_create_unique_id() - world_size = torch.cuda.device_count() - - torch.multiprocessing.spawn( - run_test_graph_make_heterogeneous_graph_mg, - args=( - uid, - world_size, - direction, - ), - nprocs=world_size, - ) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py b/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py deleted file mode 100644 index 4be66758b43..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py +++ 
/dev/null @@ -1,206 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import cupy as cp -import numpy as np -from cugraph_dgl.dataloading.utils.sampling_helpers import ( - cast_to_tensor, - _get_renumber_map, - _split_tensor, - _get_tensor_d_from_sampled_df, - create_homogeneous_sampled_graphs_from_dataframe, - _get_source_destination_range, - _create_homogeneous_cugraph_dgl_nn_sparse_graph, - create_homogeneous_sampled_graphs_from_dataframe_csc, -) -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -cugraph_dgl = import_optional("cugraph_dgl") - - -def test_casting_empty_array(): - ar = cp.zeros(shape=0, dtype=cp.int32) - ser = cudf.Series(ar) - output_tensor = cast_to_tensor(ser) - assert output_tensor.dtype == torch.int32 - - -def get_dummy_sampled_df(): - df = cudf.DataFrame() - df["sources"] = [0, 0, 1, 0, 0, 1, 0, 0, 2] + [np.nan] * 4 - df["destinations"] = [1, 2, 0, 1, 2, 1, 2, 0, 1] + [np.nan] * 4 - df["batch_id"] = [0, 0, 0, 1, 1, 1, 2, 2, 2] + [np.nan] * 4 - df["hop_id"] = [0, 1, 1, 0, 1, 1, 0, 1, 1] + [np.nan] * 4 - df["map"] = [4, 7, 10, 13, 10, 11, 12, 13, 14, 15, 16, 17, 18] - df = df.astype("int32") - df["hop_id"] = df["hop_id"].astype("uint8") - df["map"] = df["map"].astype("int64") - return df - - -def get_dummy_sampled_df_csc(): - df_dict = dict( - minors=np.array( - [1, 1, 2, 1, 0, 3, 1, 3, 2, 3, 2, 4, 0, 1, 1, 0, 3, 2], dtype=np.int32 - ), - major_offsets=np.arange(19, dtype=np.int64), - map=np.array( - [26, 29, 33, 22, 23, 32, 18, 29, 33, 33, 8, 30, 32], dtype=np.int32 - ), - renumber_map_offsets=np.array([0, 4, 9, 13], dtype=np.int64), - label_hop_offsets=np.array([0, 1, 3, 6, 7, 9, 13, 14, 16, 18], dtype=np.int64), - ) - - # convert values to Series so that NaNs are padded automatically - return cudf.DataFrame({k: cudf.Series(v) for k, v in df_dict.items()}) - - -def test_get_renumber_map(): - - sampled_df = get_dummy_sampled_df() - - df, renumber_map, renumber_map_batch_indices = _get_renumber_map(sampled_df) - - # Ensure that map was dropped - assert "map" not in df.columns - - expected_map = torch.as_tensor( - [10, 11, 12, 13, 14, 15, 16, 17, 18], dtype=torch.int32, device="cuda" - ) - assert torch.equal(renumber_map, expected_map) - - expected_batch_indices = torch.as_tensor([3, 6], dtype=torch.int32, device="cuda") - assert torch.equal(renumber_map_batch_indices, expected_batch_indices) - - # Ensure we dropped the Nans for rows corresponding to the renumber_map - assert len(df) == 9 - - t_ls = _split_tensor(renumber_map, renumber_map_batch_indices) - assert torch.equal( - t_ls[0], torch.as_tensor([10, 11, 12], dtype=torch.int64, device="cuda") - ) - assert torch.equal( - t_ls[1], torch.as_tensor([13, 14, 15], dtype=torch.int64, device="cuda") - ) - assert torch.equal( - t_ls[2], torch.as_tensor([16, 17, 18], dtype=torch.int64, device="cuda") - ) - - -def test_get_tensor_d_from_sampled_df(): - df = get_dummy_sampled_df() - tensor_d = 
_get_tensor_d_from_sampled_df(df) - - expected_maps = {} - expected_maps[0] = torch.as_tensor([10, 11, 12], dtype=torch.int64, device="cuda") - expected_maps[1] = torch.as_tensor([13, 14, 15], dtype=torch.int64, device="cuda") - expected_maps[2] = torch.as_tensor([16, 17, 18], dtype=torch.int64, device="cuda") - - for batch_id, batch_td in tensor_d.items(): - batch_df = df[df["batch_id"] == batch_id] - for hop_id, hop_t in batch_td.items(): - if hop_id != "map": - hop_df = batch_df[batch_df["hop_id"] == hop_id] - assert torch.equal(hop_t["sources"], cast_to_tensor(hop_df["sources"])) - assert torch.equal( - hop_t["destinations"], cast_to_tensor(hop_df["destinations"]) - ) - - assert torch.equal(batch_td["map"], expected_maps[batch_id]) - - -def test_create_homogeneous_sampled_graphs_from_dataframe(): - sampler = dgl.dataloading.MultiLayerNeighborSampler([2, 2]) - g = dgl.graph(([0, 10, 20], [0, 0, 10])).to("cuda") - dgl_input_nodes, dgl_output_nodes, dgl_blocks = sampler.sample_blocks( - g, torch.as_tensor([0]).to("cuda") - ) - - # Directions are reversed in dgl - s1, d1 = dgl_blocks[0].edges() - s0, d0 = dgl_blocks[1].edges() - srcs = cp.concatenate([cp.asarray(s0), cp.asarray(s1)]) - dsts = cp.concatenate([cp.asarray(d0), cp.asarray(d1)]) - - nids = dgl_blocks[0].srcdata[dgl.NID] - nids = cp.concatenate( - [cp.asarray([2]), cp.asarray([len(nids) + 2]), cp.asarray(nids)] - ) - - df = cudf.DataFrame() - df["sources"] = srcs - df["destinations"] = dsts - df["hop_id"] = [0] * len(s0) + [1] * len(s1) - df["batch_id"] = 0 - df["map"] = nids - - ( - cugraph_input_nodes, - cugraph_output_nodes, - cugraph_blocks, - ) = create_homogeneous_sampled_graphs_from_dataframe(df)[0] - - assert torch.equal(dgl_input_nodes, cugraph_input_nodes) - assert torch.equal(dgl_output_nodes, cugraph_output_nodes) - - for c_block, d_block in zip(cugraph_blocks, dgl_blocks): - ce, cd = c_block.edges() - de, dd = d_block.edges() - assert torch.equal(ce, de) - assert torch.equal(cd, dd) - - -def test_get_source_destination_range(): - df = get_dummy_sampled_df() - output_d = _get_source_destination_range(df) - - expected_output = { - (0, 0): {"sources_range": 0, "destinations_range": 1}, - (0, 1): {"sources_range": 1, "destinations_range": 2}, - (1, 0): {"sources_range": 0, "destinations_range": 1}, - (1, 1): {"sources_range": 1, "destinations_range": 2}, - (2, 0): {"sources_range": 0, "destinations_range": 2}, - (2, 1): {"sources_range": 2, "destinations_range": 1}, - } - - assert output_d == expected_output - - -def test__create_homogeneous_cugraph_dgl_nn_sparse_graph(): - tensor_d = { - "sources_range": 1, - "destinations_range": 2, - "sources": torch.as_tensor([0, 0, 1, 1], dtype=torch.int64, device="cuda"), - "destinations": torch.as_tensor([0, 0, 1, 2], dtype=torch.int64, device="cuda"), - } - - seednodes_range = 10 - sparse_graph = _create_homogeneous_cugraph_dgl_nn_sparse_graph( - tensor_d, seednodes_range - ) - assert sparse_graph.num_src_nodes() == 2 - assert sparse_graph.num_dst_nodes() == seednodes_range + 1 - assert isinstance(sparse_graph, cugraph_dgl.nn.SparseGraph) - - -def test_create_homogeneous_sampled_graphs_from_dataframe_csc(): - df = get_dummy_sampled_df_csc() - batches = create_homogeneous_sampled_graphs_from_dataframe_csc(df) - - assert len(batches) == 3 - assert torch.equal(batches[0][0], torch.IntTensor([26, 29, 33, 22]).cuda()) - assert torch.equal(batches[1][0], torch.IntTensor([23, 32, 18, 29, 33]).cuda()) - assert torch.equal(batches[2][0], torch.IntTensor([33, 8, 30, 32]).cuda()) diff 
--git a/python/cugraph-dgl/cugraph_dgl/tests/utils.py b/python/cugraph-dgl/cugraph_dgl/tests/utils.py deleted file mode 100644 index fa4eb05f297..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/utils.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from cugraph.utilities.utils import import_optional -from cugraph.gnn import cugraph_comms_init - -th = import_optional("torch") - - -def assert_same_node_feats_daskapi(gs, g): - assert set(gs.ndata.keys()) == set(g.ndata.keys()) - - for key in g.ndata.keys(): - for ntype in g.ntypes: - indices = th.arange(0, g.num_nodes(ntype), dtype=g.idtype).cuda() - if len(g.ntypes) <= 1 or ntype in g.ndata[key]: - g_output = g.get_node_storage(key=key, ntype=ntype).fetch( - indices, device="cuda" - ) - gs_output = gs.get_node_storage(key=key, ntype=ntype).fetch(indices) - equal_t = (gs_output != g_output).sum().cpu() - assert equal_t == 0 - - -def assert_same_node_feats(gs, g): - assert set(gs.ndata.keys()) == set(g.ndata.keys()) - assert set(gs.ntypes) == set(g.ntypes) - - for key in g.ndata.keys(): - for ntype in g.ntypes: - if len(g.ntypes) <= 1 or ntype in g.ndata[key]: - indices = th.arange(0, g.num_nodes(ntype), dtype=g.idtype) - - g_output = g.ndata[key] - gs_output = gs.ndata[key] - - if len(g.ntypes) > 1: - g_output = g_output[ntype] - gs_output = gs_output[ntype] - - g_output = g_output[indices] - gs_output = gs_output[indices] - - equal_t = (gs_output != g_output).sum() - assert equal_t == 0 - - -def assert_same_num_nodes(gs, g): - for ntype in g.ntypes: - assert g.num_nodes(ntype) == gs.num_nodes(ntype) - - -def assert_same_num_edges_can_etypes(gs, g): - for can_etype in g.canonical_etypes: - assert g.num_edges(can_etype) == gs.num_edges(can_etype) - - -def assert_same_num_edges_etypes(gs, g): - for etype in g.etypes: - assert g.num_edges(etype) == gs.num_edges(etype) - - -def assert_same_edge_feats_daskapi(gs, g): - assert set(gs.edata.keys()) == set(g.edata.keys()) - for key in g.edata.keys(): - for etype in g.canonical_etypes: - indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda() - if len(g.etypes) <= 1 or etype in g.edata[key]: - g_output = g.get_edge_storage(key=key, etype=etype).fetch( - indices, device="cuda" - ) - gs_output = gs.get_edge_storage(key=key, etype=etype).fetch(indices) - equal_t = (gs_output != g_output).sum().cpu() - assert equal_t == 0 - - -def assert_same_edge_feats(gs, g): - assert set(gs.edata.keys()) == set(g.edata.keys()) - assert set(gs.canonical_etypes) == set(g.canonical_etypes) - assert set(gs.etypes) == set(g.etypes) - - for key in g.edata.keys(): - for etype in g.canonical_etypes: - if len(g.etypes) <= 1 or etype in g.edata[key]: - indices = th.arange(0, g.num_edges(etype), dtype=g.idtype).cuda() - g_output = g.edata[key] - gs_output = gs.edata[key] - - if len(g.etypes) > 1: - g_output = g_output[etype] - gs_output = gs_output[etype] - - g_output = g_output[indices] - gs_output = gs_output[indices] - - 
equal_t = (gs_output != g_output).sum().cpu() - assert equal_t == 0 - - -def assert_same_sampling_len(dgl_g, cugraph_gs, nodes, fanout, edge_dir): - dgl_o = dgl_g.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir) - cugraph_o = cugraph_gs.sample_neighbors(nodes, fanout=fanout, edge_dir=edge_dir) - assert cugraph_o.num_edges() == dgl_o.num_edges() - for etype in dgl_o.canonical_etypes: - assert dgl_o.num_edges(etype) == cugraph_o.num_edges(etype) - - -def init_pytorch_worker(rank, world_size, cugraph_id, init_wholegraph=False): - import rmm - - rmm.reinitialize( - devices=rank, - ) - - import cupy - - cupy.cuda.Device(rank).use() - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - from cugraph.testing.mg_utils import enable_spilling - - enable_spilling() - - th.cuda.set_device(rank) - - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - th.distributed.init_process_group("nccl", rank=rank, world_size=world_size) - - if init_wholegraph: - import pylibwholegraph - - pylibwholegraph.torch.initialize.init( - rank, - world_size, - rank, - world_size, - ) - - cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) diff --git a/python/cugraph-dgl/cugraph_dgl/typing.py b/python/cugraph-dgl/cugraph_dgl/typing.py deleted file mode 100644 index a68463c3fd9..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/typing.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List, Union, Tuple -from cugraph.utilities.utils import import_optional - -from cugraph_dgl.nn import SparseGraph - -import pandas -import numpy -import cupy -import cudf - -torch = import_optional("torch") -dgl = import_optional("dgl") - -TensorType = Union[ - "torch.Tensor", - "cupy.ndarray", - "numpy.ndarray", - "cudf.Series", - "pandas.Series", - List[int], -] - -DGLSamplerOutput = Tuple[ - "torch.Tensor", - "torch.Tensor", - List[Union["dgl.Block", SparseGraph]], -] diff --git a/python/cugraph-dgl/cugraph_dgl/utils/__init__.py b/python/cugraph-dgl/cugraph_dgl/utils/__init__.py deleted file mode 100644 index 081b2ae8260..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py deleted file mode 100644 index 2ba04bd916f..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_conversion_utils.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Utils to convert b/w dgl heterograph to cugraph GraphStore -from __future__ import annotations -from typing import Dict, Tuple, Union - -from cugraph_dgl.typing import TensorType - -import cudf -import pandas as pd -import dask.dataframe as dd -import dask_cudf -from dask.distributed import get_client -import cupy as cp -import numpy as np -from cugraph.utilities.utils import import_optional -from cugraph.gnn.dgl_extensions.dgl_uniform_sampler import src_n, dst_n - -dgl = import_optional("dgl") -F = import_optional("dgl.backend") -torch = import_optional("torch") - - -# Feature Tensor to DataFrame Utils -def convert_to_column_major(t: torch.Tensor): - return t.t().contiguous().t() - - -def create_ar_from_tensor(t: torch.Tensor): - t = convert_to_column_major(t) - if t.device.type == "cuda": - ar = cp.asarray(t) - else: - ar = t.numpy() - return ar - - -def _create_edge_frame(src_t: torch.Tensor, dst_t: torch.Tensor, single_gpu: bool): - """ - Create a edge dataframe from src_t and dst_t - """ - src_ar = create_ar_from_tensor(src_t) - dst_ar = create_ar_from_tensor(dst_t) - edge_df = _create_df_from_edge_ar(src_ar, dst_ar, single_gpu=single_gpu) - edge_df = edge_df.rename( - columns={edge_df.columns[0]: src_n, edge_df.columns[1]: dst_n} - ) - return edge_df - - -def _create_df_from_edge_ar(src_ar, dst_ar, single_gpu=True): - if not single_gpu: - nworkers = len(get_client().scheduler_info()["workers"]) - npartitions = nworkers * 1 - if single_gpu: - df = cudf.DataFrame(data={src_n: src_ar, dst_n: dst_ar}) - else: - if isinstance(src_ar, cp.ndarray): - src_ar = src_ar.get() - if isinstance(dst_ar, cp.ndarray): - dst_ar = dst_ar.get() - - df = pd.DataFrame(data={src_n: src_ar, dst_n: dst_ar}) - # Only save stuff in host memory - df = dd.from_pandas(df, npartitions=npartitions).persist() - df = df.map_partitions(cudf.DataFrame.from_pandas) - - df = df.reset_index(drop=True) - return df - - -def get_edges_dict_from_dgl_HeteroGraph( - graph: dgl.DGLHeteroGraph, single_gpu: bool -) -> Dict[Tuple[str, str, str], Union[cudf.DataFrame, dask_cudf.DataFrame]]: - etype_d = {} - for can_etype in graph.canonical_etypes: - src_t, dst_t = graph.edges(form="uv", etype=can_etype) - etype_d[can_etype] = _create_edge_frame(src_t, dst_t, single_gpu) - return etype_d - - -def add_ndata_from_dgl_HeteroGraph(gs, g): - for feat_name, feat in g.ndata.items(): - if isinstance(feat, torch.Tensor): - assert len(g.ntypes) == 1 - ntype = g.ntypes[0] - gs.ndata_storage.add_data( - feat_name=feat_name, type_name=ntype, feat_obj=feat - ) - else: - for ntype, feat_t in feat.items(): - gs.ndata_storage.add_data( - feat_name=feat_name, 
type_name=ntype, feat_obj=feat_t - ) - - -def add_edata_from_dgl_HeteroGraph(gs, g): - for feat_name, feat in g.edata.items(): - if isinstance(feat, torch.Tensor): - assert len(g.etypes) == 1 - etype = g.etypes[0] - gs.edata_storage.add_data( - feat_name=feat_name, type_name=etype, feat_obj=feat - ) - else: - for etype, feat_t in feat.items(): - gs.edata_storage.add_data( - feat_name=feat_name, type_name=etype, feat_obj=feat_t - ) - - -def _cast_to_torch_tensor(t: TensorType) -> "torch.Tensor": - if isinstance(t, torch.Tensor): - return t - elif isinstance(t, (cp.ndarray, cudf.Series)): - return torch.as_tensor(t, device="cuda") - elif isinstance(t, (pd.Series, np.ndarray)): - return torch.as_tensor(t, device="cpu") - return torch.as_tensor(t) diff --git a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_storage_utils.py b/python/cugraph-dgl/cugraph_dgl/utils/cugraph_storage_utils.py deleted file mode 100644 index cc23aa910a5..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/utils/cugraph_storage_utils.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import numpy as np -from cugraph.gnn.dgl_extensions.utils.sampling import eid_n, src_n, dst_n -from cugraph.utilities.utils import import_optional, MissingModule - -dgl = import_optional("dgl") -F = import_optional("dgl.backend") - - -def _assert_valid_canonical_etype(canonical_etype): - if not _is_valid_canonical_etype: - error_message = ( - f"Invalid canonical_etype {canonical_etype} " - + "canonical etype should be is a string triplet (str, str, str)" - + "for source node type, edge type and destination node type" - ) - raise dgl.DGLError(error_message) - - -def _is_valid_canonical_etype(canonical_etype): - if not isinstance(canonical_etype, tuple): - return False - - if len(canonical_etype) != 3: - return False - - for t in canonical_etype: - if not isinstance(t, str): - return False - return True - - -def add_edge_ids_to_edges_dict(edge_data_dict, edge_id_offset_d, id_dtype): - eids_data_dict = {} - for etype, df in edge_data_dict.items(): - # Do not modify input by user - if len(df.columns) != 2: - raise ValueError( - "Provided dataframe in edge_dict contains more than 2 columns", - "DataFrame with only 2 columns is supported", - "Where first is treated as src and second as dst", - ) - df = df.copy(deep=False) - df = df.rename(columns={df.columns[0]: src_n, df.columns[1]: dst_n}) - df[eid_n] = id_dtype(1) - df[eid_n] = df[eid_n].cumsum() - df[eid_n] = df[eid_n] + edge_id_offset_d[etype] - 1 - df[eid_n] = df[eid_n].astype(id_dtype) - eids_data_dict[etype] = df - return eids_data_dict - - -def add_node_offset_to_edges_dict(edge_data_dict, node_id_offset_d): - for etype, df in edge_data_dict.items(): - src_type, _, dst_type = etype - df[src_n] = df[src_n] + node_id_offset_d[src_type] - df[dst_n] = df[dst_n] + node_id_offset_d[dst_type] - return edge_data_dict - - -if isinstance(F, MissingModule): - backend_dtype_to_np_dtype_dict = MissingModule("dgl") -else: - 
backend_dtype_to_np_dtype_dict = { - F.bool: bool, - F.uint8: np.uint8, - F.int8: np.int8, - F.int16: np.int16, - F.int32: np.int32, - F.int64: np.int64, - F.float16: np.float16, - F.float32: np.float32, - F.float64: np.float64, - } diff --git a/python/cugraph-dgl/cugraph_dgl/utils/feature_storage.py b/python/cugraph-dgl/cugraph_dgl/utils/feature_storage.py deleted file mode 100644 index 31917661557..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/utils/feature_storage.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations -from cugraph.gnn import FeatureStore -from cugraph.utilities.utils import import_optional - -torch = import_optional("torch") - - -class dgl_FeatureStorage: - """ - Storage for node/edge feature data. - """ - - def __init__(self, fs: FeatureStore, type_name: str, feat_name: str): - self.fs = fs - self.type_name = type_name - self.feat_name = feat_name - - def fetch(self, indices, device=None, pin_memory=False, **kwargs): - """Fetch the features of the given node/edge IDs to the - given device. - Parameters - ---------- - indices : Tensor - Node or edge IDs. - device : Device - Device context. - pin_memory : bool - Wether to use pin_memory for fetching features - pin_memory=True is currently not supported - - Returns - ------- - Tensor - Feature data stored in PyTorch Tensor. - """ - if pin_memory: - raise ValueError("pinned memory not supported in dgl_FeatureStorage") - if isinstance(indices, torch.Tensor): - indices = indices.long() - t = self.fs.get_data( - indices=indices, type_name=self.type_name, feat_name=self.feat_name - ) - if device: - return t.to(device) - else: - return t diff --git a/python/cugraph-dgl/cugraph_dgl/view.py b/python/cugraph-dgl/cugraph_dgl/view.py deleted file mode 100644 index 4de9406be07..00000000000 --- a/python/cugraph-dgl/cugraph_dgl/view.py +++ /dev/null @@ -1,346 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import warnings - -from collections import defaultdict -from collections.abc import MutableMapping -from typing import Union, Dict, List, Tuple - -from cugraph.utilities.utils import import_optional - -import cugraph_dgl -from cugraph_dgl.typing import TensorType -from cugraph_dgl.utils.cugraph_conversion_utils import _cast_to_torch_tensor - -torch = import_optional("torch") -dgl = import_optional("dgl") - - -class EmbeddingView: - def __init__(self, storage: "dgl.storages.base.FeatureStorage", ld: int): - self.__ld = ld - self.__storage = storage - - def __getitem__(self, u: TensorType) -> "torch.Tensor": - u = _cast_to_torch_tensor(u) - try: - return self.__storage.fetch( - u, - "cuda", - ) - except RuntimeError as ex: - warnings.warn( - "Got error accessing data, trying again with index on device: " - + str(ex) - ) - return self.__storage.fetch( - u.cuda(), - "cuda", - ) - - @property - def shape(self) -> "torch.Size": - try: - f = self.__storage.fetch(torch.tensor([0]), "cpu") - except RuntimeError: - f = self.__storage.fetch(torch.tensor([0], device="cuda"), "cuda") - sz = [s for s in f.shape] - sz[0] = self.__ld - return torch.Size(tuple(sz)) - - -class HeteroEdgeDataView(MutableMapping): - """ - Duck-typed version of DGL's HeteroEdgeDataView. - Used for accessing and modifying edge features. - """ - - def __init__( - self, - graph: "cugraph_dgl.Graph", - etype: Union[Tuple[str, str, str], List[Tuple[str, str, str]]], - edges: TensorType, - ): - self.__graph = graph - self.__etype = etype - self.__edges = edges - - @property - def _etype(self) -> Tuple[str, str, str]: - return self.__etype - - @property - def _graph(self) -> "cugraph_dgl.Graph": - return self.__graph - - @property - def _edges(self) -> TensorType: - return self.__edges - - def __getitem__(self, key: str): - if isinstance(self._etype, list): - return { - t: self._graph._get_e_emb(t, key, self._edges) - for t in self._etype - if self._graph._has_e_emb(t, key) - } - - return self._graph._get_e_emb(self._etype, key, self._edges) - - def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): - if isinstance(self._etype, list): - if not isinstance(val, dict): - raise ValueError( - "There are multiple edge types in this view. " - "Expected a dictionary of values." - ) - for t, v in val.items(): - if t not in self._etype: - raise ValueError("Attempted to modify a type out of view.") - self._graph.set_e_emb(t, self._edges, {key: v}) - else: - if isinstance(val, dict): - raise ValueError( - "There is only one edge type in this view. " - "Expected a single tensor." 
- ) - self._graph.set_e_emb(self._etype, self._edges, {key: v}) - - def __delitem__(self, key: str): - if isinstance(self._etype, list): - for t in self._etype: - self._graph.pop_e_emb(t, key) - else: - self._graph.pop_e_emb(self._etype, key) - - def _transpose(self, fetch_vals=True): - if isinstance(self._etype, list): - tr = defaultdict(dict) - for etype in self._etype: - for key in self._graph._get_e_emb_keys(etype): - tr[key][etype] = ( - self._graph._get_e_emb(etype, key, self._edges) - if fetch_vals - else [] - ) - else: - tr = {} - for key in self._graph._get_e_emb_keys(self._etype): - tr[key] = ( - self._graph._get_e_emb(self._etype, key, self._edges) - if fetch_vals - else [] - ) - - return tr - - def __len__(self): - return len(self._transpose(fetch_vals=False)) - - def __iter__(self): - return iter(self._transpose()) - - def keys(self): - return self._transpose(fetch_vals=False).keys() - - def values(self): - return self._transpose().values() - - def __repr__(self): - return repr(self._transpose(fetch_vals=False)) - - -class HeteroNodeDataView(MutableMapping): - """ - Duck-typed version of DGL's HeteroNodeDataView. - Used for accessing and modifying node features. - """ - - def __init__( - self, - graph: "cugraph_dgl.Graph", - ntype: Union[str, List[str]], - nodes: TensorType, - ): - self.__graph = graph - self.__ntype = ntype - self.__nodes = nodes - - @property - def _ntype(self) -> str: - return self.__ntype - - @property - def _graph(self) -> "cugraph_dgl.Graph": - return self.__graph - - @property - def _nodes(self) -> TensorType: - return self.__nodes - - def __getitem__(self, key: str): - if isinstance(self._ntype, list): - return { - t: self._graph._get_n_emb(t, key, self._nodes) - for t in self._ntype - if self._graph._has_n_emb(t, key) - } - else: - return self._graph._get_n_emb(self._ntype, key, self._nodes) - - def __setitem__(self, key: str, val: Union[TensorType, Dict[str, TensorType]]): - if isinstance(self._ntype, list): - if not isinstance(val, dict): - raise ValueError( - "There are multiple node types in this view. " - "Expected a dictionary of values." - ) - for t, v in val.items(): - if t not in self._ntype: - raise ValueError("Attempted to modify a type out of view.") - self._graph._set_n_emb(t, self._nodes, {key: v}) - else: - if isinstance(val, dict): - raise ValueError( - "There is only one node type in this view. " - "Expected a single value tensor." - ) - self._graph._set_n_emb(self._ntype, self._nodes, {key: val}) - - def __delitem__(self, key: str): - if isinstance(self._ntype, list): - for t in self._ntype: - self._graph._pop_n_emb(t, key) - else: - self._graph.pop_n_emb(self._ntype, key) - - def _transpose(self, fetch_vals=True): - if isinstance(self._ntype, list): - tr = defaultdict(dict) - for ntype in self._ntype: - for key in self._graph._get_n_emb_keys(ntype): - tr[key][ntype] = ( - self._graph._get_n_emb(ntype, key, self._nodes) - if fetch_vals - else [] - ) - else: - tr = {} - for key in self._graph._get_n_emb_keys(self._ntype): - tr[key] = ( - self._graph._get_n_emb(self._ntype, key, self._nodes) - if fetch_vals - else [] - ) - - return tr - - def __len__(self): - return len(self._transpose(fetch_vals=False)) - - def __iter__(self): - return iter(self._transpose()) - - def keys(self): - return self._transpose(fetch_vals=False).keys() - - def values(self): - return self._transpose().values() - - def __repr__(self): - return repr(self._transpose(fetch_vals=False)) - - -class HeteroEdgeView: - """ - Duck-typed version of DGL's HeteroEdgeView. 
- """ - - def __init__(self, graph): - self.__graph = graph - - @property - def _graph(self) -> "cugraph_dgl.Graph": - return self.__graph - - def __getitem__(self, key): - if isinstance(key, slice): - if not (key.start is None and key.stop is None and key.stop is None): - raise ValueError("Only full slices are supported in DGL.") - edges = dgl.base.ALL - etype = None - elif key is None: - edges = dgl.base.ALL - etype = None - elif isinstance(key, tuple): - if len(key) == 3: - edges = dgl.base.ALL - etype = key - else: - edges = key - etype = None - elif isinstance(key, str): - edges = dgl.base.ALL - etype = key - else: - edges = key - etype = None - - return HeteroEdgeDataView( - graph=self.__graph, - etype=etype, - edges=edges, - ) - - def __call__(self, *args, **kwargs): - if "device" in kwargs: - return self.__graph.all_edges(*args, **kwargs) - - return self.__graph.all_edges(*args, **kwargs, device="cuda") - - -class HeteroNodeView: - """ - Duck-typed version of DGL's HeteroNodeView. - """ - - def __init__(self, graph: "cugraph_dgl.Graph"): - self.__graph = graph - - @property - def _graph(self) -> "cugraph_dgl.Graph": - return self.__graph - - def __getitem__(self, key): - if isinstance(key, slice): - if not (key.start is None and key.stop is None and key.stop is None): - raise ValueError("Only full slices are supported in DGL.") - nodes = dgl.base.ALL - ntype = None - elif isinstance(key, tuple): - nodes, ntype = key - elif key is None or isinstance(key, str): - nodes = dgl.base.ALL - ntype = key - else: - nodes = key - ntype = None - - return HeteroNodeDataView(graph=self.__graph, ntype=ntype, nodes=nodes) - - def __call__(self, ntype=None): - return torch.arange( - 0, self.__graph.num_nodes(ntype), dtype=self.__graph.idtype, device="cuda" - ) diff --git a/python/cugraph-dgl/examples/dataset_from_disk_cudf.ipynb b/python/cugraph-dgl/examples/dataset_from_disk_cudf.ipynb deleted file mode 100644 index 15708f5dea6..00000000000 --- a/python/cugraph-dgl/examples/dataset_from_disk_cudf.ipynb +++ /dev/null @@ -1,269 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "bc952178-34c0-4f13-9003-478d4aa8cd4d", - "metadata": {}, - "source": [ - "# Testing Notebook for cugraph DGL vs DGL Upstream" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "d92a81b3-50ac-42ff-97e0-d636945f1f80", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"4\"\n", - "import cudf\n", - "import rmm\n", - "import torch\n", - "from rmm.allocators.torch import rmm_torch_allocator\n", - "rmm.reinitialize(initial_pool_size=15e9)\n", - "#Switch to async pool in case of memory issues due to fragmentation of the pool\n", - "#rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource(initial_pool_size=15e9))\n", - "torch.cuda.memory.change_current_allocator(rmm_torch_allocator)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f304a5dd-1465-4054-846f-2308a19153fa", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "single_gpu = True" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b6f899ee", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def load_dgl_dataset(dataset_name='ogbn-products'):\n", - " from ogb.nodeproppred import DglNodePropPredDataset\n", - " dataset_root = '/raid/vjawa/gnn/'\n", - " dataset = DglNodePropPredDataset(name = dataset_name, root=dataset_root)\n", - " split_idx = dataset.get_idx_split()\n", - " train_idx, valid_idx, 
test_idx = split_idx[\"train\"], split_idx[\"valid\"], split_idx[\"test\"]\n", - " g, label = dataset[0]\n", - " g.ndata['label'] = label\n", - " g = g.add_self_loop()\n", - " g = g.to('cpu')\n", - " return g, train_idx" - ] - }, - { - "cell_type": "markdown", - "id": "fdd59d3a-0c1d-425f-a337-34b09c675622", - "metadata": {}, - "source": [ - "# cuGraph DGL DataLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e1e84844-634e-451e-be74-939f9477562f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import cugraph_dgl\n", - "import tempfile" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "3808055c-2d7d-4cc7-b1bd-2fe9edd6eb95", - "metadata": {}, - "outputs": [], - "source": [ - "!rm -rf \"/raid/vjawa/obgn_products_sampling/\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "eff3d77b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "g, train_idx = load_dgl_dataset()\n", - "g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)\n", - "\n", - "batch_size = 1024*2\n", - "fanout_vals=[25, 25]\n", - "sampler = cugraph_dgl.dataloading.NeighborSampler(fanout_vals)\n", - "dataloader = cugraph_dgl.dataloading.DataLoader(\n", - " g, \n", - " train_idx.to('cuda'), # train_nid must be on GPU.\n", - " sampler,\n", - " sampling_output_dir=\"/raid/vjawa/obgn_products_sampling/\", # Path to save sampling results to, Change to the fastest IO path available\n", - " device=torch.device('cuda'), # The device argument must be GPU.\n", - " num_workers=0, # Number of workers must be 0.\n", - " batch_size=batch_size,\n", - " batches_per_partition=50,\n", - " seeds_per_call=50*batch_size,\n", - " drop_last=False,\n", - " shuffle=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "94003c30-756f-4cdb-856a-dec16a5fb4dc", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7.08 s ± 596 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%%timeit\n", - "batch_stats = {}\n", - "for batch_id,(input_nodes, output_nodes, blocks) in enumerate(dataloader):\n", - " batch_stats[batch_id]={'input_nodes':len(input_nodes),'output_nodes':len(output_nodes)}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d8488e64-ba92-40c6-8e76-3898b1ca4317", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "del dataloader\n", - "del g" - ] - }, - { - "cell_type": "markdown", - "id": "b0a17523-53e9-4780-a9e1-eac4edd464e5", - "metadata": {}, - "source": [ - "# Pure DGL DataLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0d147756-6410-4b71-aac1-9ef1e3df8fff", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from dgl.dataloading import DataLoader, NeighborSampler\n", - "import dgl" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "7cb2cc68-b4ff-43f2-8b12-b2808510b3f2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "g, train_idx = load_dgl_dataset()\n", - "batch_size = 1024*2\n", - "fanout_vals = [25, 25]\n", - "sampler = dgl.dataloading.MultiLayerNeighborSampler(fanout_vals)\n", - "dataloader = dgl.dataloading.DataLoader(\n", - " g, \n", - " train_idx.to(g.device), # train_nid must be on GPU.\n", - " sampler,\n", - " device=torch.device('cuda'), # The device argument must be GPU.\n", - " num_workers=0, # Number of workers must be 0.\n", - " use_uva=False,\n", - " batch_size=batch_size,\n", - " drop_last=False,\n", - " shuffle=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "7988aca2-7bfb-4200-ac87-008e30c670fb", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7.34 s ± 353 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%%timeit\n", - "dgl_batch_stats = {}\n", - "for batch_id,(input_nodes, output_nodes, blocks) in enumerate(dataloader):\n", - " dgl_batch_stats[batch_id]={'input_nodes':len(input_nodes),'output_nodes':len(output_nodes)}" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "57022ea6-d2fc-4334-a086-82201e8814c8", - "metadata": {}, - "outputs": [], - "source": [ - "del dataloader\n", - "del g" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "vscode": { - "interpreter": { - "hash": "a1325b9b48ed9084674a30242e696fec2a1a44bbc4c0ef7ed1d4392854f3d402" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/python/cugraph-dgl/examples/graphsage/README.MD b/python/cugraph-dgl/examples/graphsage/README.MD deleted file mode 100644 index ca867f0b634..00000000000 --- a/python/cugraph-dgl/examples/graphsage/README.MD +++ /dev/null @@ -1,26 +0,0 @@ -Inductive Representation Learning on Large Graphs (GraphSAGE) -============ - -- Paper link: [http://papers.nips.cc/paper/6703-inductive-representation-learning-on-large-graphs.pdf](http://papers.nips.cc/paper/6703-inductive-representation-learning-on-large-graphs.pdf) -- Author's code repo: [https://github.com/williamleif/graphsage-simple](https://github.com/williamleif/graphsage-simple) - -For advanced usages, including training with multi-gpu/multi-node, and PyTorch Lightning, etc., more examples can be found in [advanced](https://github.com/dmlc/dgl/tree/master/examples/pytorch/graphsage/advanced) and [dist](https://github.com/dmlc/dgl/tree/master/examples/pytorch/graphsage/dist) directory. - -Requirements ------------- - -```bash -mamba install ogb torchmetrics -c conda-forge -``` - -How to run -------- - - -### Minibatch training for node classification - -Train w/ mini-batch sampling with cugraph_storage backend for node classification on "ogbn-products" - -```bash -python3 node_classification.py --mode=gpu_cugraph_dgl -``` diff --git a/python/cugraph-dgl/examples/graphsage/node-classification-dask.py b/python/cugraph-dgl/examples/graphsage/node-classification-dask.py deleted file mode 100644 index 0481f9566bc..00000000000 --- a/python/cugraph-dgl/examples/graphsage/node-classification-dask.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -# Example modified from: -# https://github.com/dmlc/dgl/blob/master/examples/pytorch/graphsage/node_classification.py - -# Ignore Warning -import warnings -import time -import cugraph_dgl -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchmetrics.functional as MF -import dgl -import dgl.nn as dglnn -from dgl.data import AsNodePredDataset -from dgl.dataloading import ( - DataLoader, - NeighborSampler, - MultiLayerFullNeighborSampler, -) -from ogb.nodeproppred import DglNodePropPredDataset -import tqdm -import argparse - -warnings.filterwarnings("ignore") - - -def set_allocators(): - import rmm - import cudf - import cupy - from rmm.allocators.torch import rmm_torch_allocator - from rmm.allocators.cupy import rmm_cupy_allocator - - mr = rmm.mr.CudaAsyncMemoryResource() - rmm.mr.set_current_device_resource(mr) - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - cupy.cuda.set_allocator(rmm_cupy_allocator) - cudf.set_option("spill", True) - - -class SAGE(nn.Module): - def __init__(self, in_size, hid_size, out_size): - super().__init__() - self.layers = nn.ModuleList() - # three-layer GraphSAGE-mean - self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean")) - self.layers.append(dglnn.SAGEConv(hid_size, hid_size, "mean")) - self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean")) - self.dropout = nn.Dropout(0.5) - self.hid_size = hid_size - self.out_size = out_size - - def forward(self, blocks, x): - h = x - for l_id, (layer, block) in enumerate(zip(self.layers, blocks)): - h = layer(block, h) - if l_id != len(self.layers) - 1: - h = F.relu(h) - h = self.dropout(h) - return h - - def inference(self, g, device, batch_size): - """Conduct layer-wise inference to get all the node embeddings.""" - all_node_ids = torch.arange(0, g.num_nodes()).to(device) - feat = g.get_node_storage(key="feat", ntype="_N").fetch( - all_node_ids, device=device - ) - - sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"]) - dataloader = DataLoader( - g, - torch.arange(g.num_nodes()).to(g.device), - sampler, - device=device, - batch_size=batch_size, - shuffle=False, - drop_last=False, - num_workers=0, - ) - buffer_device = torch.device("cpu") - pin_memory = buffer_device != device - - for l_id, layer in enumerate(self.layers): - y = torch.empty( - g.num_nodes(), - self.hid_size if l_id != len(self.layers) - 1 else self.out_size, - device=buffer_device, - pin_memory=pin_memory, - ) - feat = feat.to(device) - for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader): - x = feat[input_nodes] - h = layer(blocks[0], x) # len(blocks) = 1 - if l_id != len(self.layers) - 1: - h = F.relu(h) - h = self.dropout(h) - # by design, our output nodes are contiguous - y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device) - feat = y - return y - - -def evaluate(model, graph, dataloader): - model.eval() - ys = [] - y_hats = [] - for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - with torch.no_grad(): - if isinstance(graph.ndata["feat"], dict): - x = graph.ndata["feat"]["_N"][input_nodes] - label = graph.ndata["label"]["_N"][output_nodes] - else: - x = graph.ndata["feat"][input_nodes] - label = graph.ndata["label"][output_nodes] - ys.append(label) - y_hats.append(model(blocks, x)) - num_classes = y_hats[0].shape[1] - return MF.accuracy( - torch.cat(y_hats), - torch.cat(ys), - task="multiclass", - num_classes=num_classes, - ) - - -def layerwise_infer(device, graph, nid, model, batch_size): - model.eval() - with 
torch.no_grad(): - pred = model.inference(graph, device, batch_size) # pred in buffer_device - pred = pred[nid] - label = graph.ndata["label"] - if isinstance(label, dict): - label = label["_N"] - label = label[nid].to(device).to(pred.device) - num_classes = pred.shape[1] - return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes) - - -def train(args, device, g, dataset, model): - # create sampler & dataloader - train_idx = dataset.train_idx.to(device) - val_idx = dataset.val_idx.to(device) - - use_uva = args.mode == "mixed" - batch_size = 1024 - fanouts = [5, 10, 15] - sampler = NeighborSampler(fanouts) - train_dataloader = DataLoader( - g, - train_idx, - sampler, - device=device, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=0, - use_uva=use_uva, - ) - val_dataloader = DataLoader( - g, - val_idx, - sampler, - device=device, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=0, - use_uva=use_uva, - ) - - opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4) - - for epoch in range(10): - model.train() - total_loss = 0 - st = time.time() - for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader): - if isinstance(g.ndata["feat"], dict): - x = g.ndata["feat"]["_N"][input_nodes] - y = g.ndata["label"]["_N"][output_nodes] - else: - x = g.ndata["feat"][input_nodes] - y = g.ndata["label"][output_nodes] - - y_hat = model(blocks, x) - loss = F.cross_entropy(y_hat, y) - opt.zero_grad() - loss.backward() - opt.step() - total_loss += loss.item() - - et = time.time() - - print( - f"Time taken for epoch {epoch} with batch_size {batch_size} = {et - st} s" - ) - acc = evaluate(model, g, val_dataloader) - print( - "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format( - epoch, total_loss / (it + 1), acc.item() - ) - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--mode", - default="gpu_cugraph_dgl", - choices=["cpu", "mixed", "gpu_dgl", "gpu_cugraph_dgl"], - help="Training mode." 
- " 'cpu' for CPU training," - " 'mixed' for CPU-GPU mixed training, " - " 'gpu_dgl' for pure-GPU training, " - " 'gpu_cugraph_dgl' for pure-GPU training.", - ) - args = parser.parse_args() - if not torch.cuda.is_available(): - args.mode = "cpu" - if args.mode == "gpu_cugraph_dgl": - set_allocators() - print(f"Training in {args.mode} mode.") - - # load and preprocess dataset - print("Loading data") - dataset = AsNodePredDataset(DglNodePropPredDataset("ogbn-products")) - g = dataset[0] - g = dgl.add_self_loop(g) - if args.mode == "gpu_cugraph_dgl": - g = cugraph_dgl.cugraph_storage_from_heterograph(g.to("cuda")) - del dataset.g - - else: - g = g.to("cuda" if args.mode == "gpu_dgl" else "cpu") - device = torch.device( - "cpu" if args.mode == "cpu" or args.mode == "mixed" else "cuda" - ) - - # create GraphSAGE model - feat_shape = ( - g.get_node_storage(key="feat", ntype="_N") - .fetch(torch.LongTensor([0]).to(device), device=device) - .shape[1] - ) - print(feat_shape) - # no ndata in cugraph storage object - in_size = feat_shape - out_size = dataset.num_classes - model = SAGE(in_size, 256, out_size).to(device) - - # model training - print("Training...") - train(args, device, g, dataset, model) - - # test the model - print("Testing...") - acc = layerwise_infer(device, g, dataset.test_idx, model, batch_size=4096) - print("Test Accuracy {:.4f}".format(acc.item())) diff --git a/python/cugraph-dgl/examples/graphsage/node-classification.py b/python/cugraph-dgl/examples/graphsage/node-classification.py deleted file mode 100644 index 56ac41c09b4..00000000000 --- a/python/cugraph-dgl/examples/graphsage/node-classification.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -# Example modified from: -# https://github.com/dmlc/dgl/blob/master/examples/pytorch/graphsage/node_classification.py - -# Ignore Warning -import warnings -import tempfile -import time -import cugraph_dgl -import cugraph_dgl.dataloading -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchmetrics.functional as MF -import dgl -import dgl.nn as dglnn -from dgl.data import AsNodePredDataset -from dgl.dataloading import ( - DataLoader, - NeighborSampler, - MultiLayerFullNeighborSampler, -) -from ogb.nodeproppred import DglNodePropPredDataset -import tqdm -import argparse - -warnings.filterwarnings("ignore") - - -def set_allocators(): - import rmm - import cudf - import cupy - from rmm.allocators.torch import rmm_torch_allocator - from rmm.allocators.cupy import rmm_cupy_allocator - - mr = rmm.mr.CudaAsyncMemoryResource() - rmm.mr.set_current_device_resource(mr) - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - cupy.cuda.set_allocator(rmm_cupy_allocator) - cudf.set_option("spill", True) - - -class SAGE(nn.Module): - def __init__(self, in_size, hid_size, out_size): - super().__init__() - self.layers = nn.ModuleList() - # three-layer GraphSAGE-mean - self.layers.append(dglnn.SAGEConv(in_size, hid_size, "mean")) - self.layers.append(dglnn.SAGEConv(hid_size, hid_size, "mean")) - self.layers.append(dglnn.SAGEConv(hid_size, out_size, "mean")) - self.dropout = nn.Dropout(0.5) - self.hid_size = hid_size - self.out_size = out_size - - def forward(self, blocks, x): - h = x - for l_id, (layer, block) in enumerate(zip(self.layers, blocks)): - h = layer(block, h) - if l_id != len(self.layers) - 1: - h = F.relu(h) - h = self.dropout(h) - return h - - def inference(self, g, device, batch_size): - """Conduct layer-wise inference to get all the node embeddings.""" - all_node_ids = torch.arange(0, g.num_nodes()).to(device) - feat = g.ndata["feat"][all_node_ids].to(device) - - if isinstance(g, cugraph_dgl.Graph): - sampler = cugraph_dgl.dataloading.NeighborSampler([-1]) - loader_cls = cugraph_dgl.dataloading.FutureDataLoader - else: - sampler = MultiLayerFullNeighborSampler(1, prefetch_node_feats=["feat"]) - loader_cls = DataLoader - dataloader = loader_cls( - g, - torch.arange(g.num_nodes()).to(device), - sampler, - device=device, - batch_size=batch_size, - shuffle=False, - drop_last=False, - num_workers=0, - ) - buffer_device = torch.device("cpu") - pin_memory = buffer_device != device - - for l_id, layer in enumerate(self.layers): - y = torch.empty( - g.num_nodes(), - self.hid_size if l_id != len(self.layers) - 1 else self.out_size, - device=buffer_device, - pin_memory=pin_memory, - ) - feat = feat.to(device) - for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader): - x = feat[input_nodes] - h = layer(blocks[0], x) # len(blocks) = 1 - if l_id != len(self.layers) - 1: - h = F.relu(h) - h = self.dropout(h) - # by design, our output nodes are contiguous - y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device) - feat = y - return y - - -def evaluate(model, graph, dataloader): - model.eval() - ys = [] - y_hats = [] - for it, (input_nodes, output_nodes, blocks) in enumerate(dataloader): - with torch.no_grad(): - if isinstance(graph.ndata["feat"], dict): - x = graph.ndata["feat"]["_N"][input_nodes] - label = graph.ndata["label"]["_N"][output_nodes] - else: - x = graph.ndata["feat"][input_nodes] - label = graph.ndata["label"][output_nodes] - ys.append(label) - y_hats.append(model(blocks, x)) - num_classes = y_hats[0].shape[1] - return MF.accuracy( - 
torch.cat(y_hats), - torch.cat(ys), - task="multiclass", - num_classes=num_classes, - ) - - -def layerwise_infer(device, graph, nid, model, batch_size): - model.eval() - with torch.no_grad(): - pred = model.inference(graph, device, batch_size) # pred in buffer_device - pred = pred[nid] - label = graph.ndata["label"] - if isinstance(label, dict): - label = label["_N"] - label = label[nid].to(device).to(pred.device) - num_classes = pred.shape[1] - return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes) - - -def train(args, device, g, dataset, model, directory): - # create sampler & dataloader - train_idx = dataset.train_idx.to(device) - val_idx = dataset.val_idx.to(device) - - use_uva = args.mode == "mixed" - batch_size = 1024 - fanouts = [5, 10, 15] - if isinstance(g, cugraph_dgl.Graph): - sampler = cugraph_dgl.dataloading.NeighborSampler(fanouts, directory=directory) - loader_cls = cugraph_dgl.dataloading.FutureDataLoader - else: - sampler = NeighborSampler(fanouts) - loader_cls = DataLoader - train_dataloader = loader_cls( - g, - train_idx, - sampler, - device=device, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=0, - use_uva=use_uva, - ) - val_dataloader = loader_cls( - g, - val_idx, - sampler, - device=device, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=0, - use_uva=use_uva, - ) - - opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4) - - for epoch in range(10): - model.train() - total_loss = 0 - st = time.time() - for it, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader): - if isinstance(g.ndata["feat"], dict): - x = g.ndata["feat"]["_N"][input_nodes] - y = g.ndata["label"]["_N"][output_nodes] - else: - x = g.ndata["feat"][input_nodes] - y = g.ndata["label"][output_nodes] - - y_hat = model(blocks, x) - loss = F.cross_entropy(y_hat, y) - opt.zero_grad() - loss.backward() - opt.step() - total_loss += loss.item() - - et = time.time() - - print( - f"Time taken for epoch {epoch} with batch_size {batch_size} = {et - st} s" - ) - acc = evaluate(model, g, val_dataloader) - print( - "Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} ".format( - epoch, total_loss / (it + 1), acc.item() - ) - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--mode", - default="gpu_cugraph_dgl", - choices=["cpu", "mixed", "gpu_dgl", "gpu_cugraph_dgl"], - help="Training mode." 
- " 'cpu' for CPU training," - " 'mixed' for CPU-GPU mixed training, " - " 'gpu_dgl' for pure-GPU training, " - " 'gpu_cugraph_dgl' for pure-GPU training.", - ) - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--tempdir_root", type=str, default=None) - args = parser.parse_args() - if not torch.cuda.is_available(): - args.mode = "cpu" - if args.mode == "gpu_cugraph_dgl": - set_allocators() - print(f"Training in {args.mode} mode.") - - # load and preprocess dataset - print("Loading data") - dataset = AsNodePredDataset( - DglNodePropPredDataset("ogbn-products", root=args.dataset_root) - ) - g = dataset[0] - g = dgl.add_self_loop(g) - if args.mode == "gpu_cugraph_dgl": - g = cugraph_dgl.cugraph_dgl_graph_from_heterograph(g.to("cuda")) - del dataset.g - - else: - g = g.to("cuda" if args.mode == "gpu_dgl" else "cpu") - device = torch.device( - "cpu" if args.mode == "cpu" or args.mode == "mixed" else "cuda" - ) - - # create GraphSAGE model - feat_shape = g.ndata["feat"].shape[1] - print(feat_shape) - - in_size = feat_shape - out_size = dataset.num_classes - model = SAGE(in_size, 256, out_size).to(device) - - # model training - print("Training...") - with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory: - train(args, device, g, dataset, model, directory) - - # test the model - print("Testing...") - acc = layerwise_infer(device, g, dataset.test_idx, model, batch_size=4096) - print("Test Accuracy {:.4f}".format(acc.item())) diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py deleted file mode 100644 index 3e0c0454905..00000000000 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/model.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# A graphsage GNN model using dgl for node classification -# with three layers and mean aggregation -import time -import dgl -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchmetrics.functional as MF -from cugraph_dgl.nn import SAGEConv -import tqdm - - -class Sage(nn.Module): - def __init__(self, in_size, hid_size, out_size): - super().__init__() - self.layers = nn.ModuleList() - # 2-layer GraphSAGE-mean - self.layers.append(SAGEConv(in_size, hid_size, "mean")) - self.layers.append(SAGEConv(hid_size, out_size, "mean")) - self.dropout = nn.Dropout(0.5) - self.hid_size = hid_size - self.out_size = out_size - - def forward(self, blocks, x): - h = x - for l_id, (layer, block) in enumerate(zip(self.layers, blocks)): - h = layer(block, h) - if l_id != len(self.layers) - 1: - h = F.relu(h) - h = self.dropout(h) - return h - - def inference(self, g, batch_size, device): - """ - Inference with the GraphSAGE model on - full neighbors (i.e. without neighbor sampling). - g : the entire graph. 
- batch_size : the node number of each inference output - device : the inference device - """ - # During inference with sampling, - # multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. - # The nodes on each layer are of course splitted in batches. - - all_node_ids = torch.arange(0, g.num_nodes()).to(device) - feat = g.ndata["feat"][all_node_ids].to(device) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler( - 1, prefetch_node_feats=["feat"] - ) - dataloader = dgl.dataloading.DataLoader( - g, - torch.arange(g.num_nodes(), dtype=torch.int32).to(g.device), - sampler, - device=device, - batch_size=batch_size, - shuffle=False, - drop_last=False, - num_workers=0, - ) - buffer_device = torch.device("cpu") - pin_memory = buffer_device != device - - for l_id, layer in enumerate(self.layers): - y = torch.empty( - g.num_nodes(), - self.hid_size if l_id != len(self.layers) - 1 else self.out_size, - device=buffer_device, - pin_memory=pin_memory, - ) - feat = feat.to(device) - for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader): - x = feat[input_nodes] - h = layer(blocks[0], x) # len(blocks) = 1 - if l_id != len(self.layers) - 1: - h = F.relu(h) - h = self.dropout(h) - # by design, our output nodes are contiguous - y[output_nodes[0] : output_nodes[-1] + 1] = h.to(buffer_device) - feat = y - return y - - -def layerwise_infer(graph, nid, model, batch_size, device): - model.eval() - with torch.no_grad(): - pred = model.module.inference( - graph, batch_size, device - ) # pred in buffer_device - pred = pred[nid] - label = graph.ndata["label"] - if isinstance(label, dict): - label = label["_N"] - label = label[nid].to(pred.device) - num_classes = pred.shape[1] - label = label.squeeze(1) - return MF.accuracy(pred, label, task="multiclass", num_classes=num_classes) - - -def train_model(model, g, opt, train_dataloader, num_epochs, rank, val_nid): - st = time.time() - model.train() - for epoch in range(num_epochs): - total_loss = 0 - for _, (input_nodes, output_nodes, blocks) in enumerate(train_dataloader): - x = g.ndata["feat"][input_nodes].to(torch.float32) - y = g.ndata["label"][output_nodes].to(torch.int64) - y_hat = model(blocks, x) - y = y.squeeze(1) - loss = F.cross_entropy(y_hat, y) - opt.zero_grad() - loss.backward() - opt.step() - total_loss += loss.item() - print( - f"total loss: {total_loss} for epoch = {epoch} for rank = {rank}", - flush=True, - ) - et = time.time() - print( - f"Total time taken for num_epochs {num_epochs} " - f"with batch_size {train_dataloader._batch_size} = {et - st} s on rank ={rank}" - ) - if rank == 0: - val_acc = layerwise_infer(g, val_nid, model, 1024 * 5, "cuda") - print("---" * 30) - print("Validation Accuracy {:.4f}".format(val_acc)) diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py deleted file mode 100644 index 11afe466014..00000000000 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_mnmg.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import dgl -import torch -import time -import tempfile -import argparse -import json -import os -import warnings - -from datetime import timedelta - -import cugraph_dgl - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_shutdown, - cugraph_comms_create_unique_id, -) - -from pylibwholegraph.torch.initialize import ( - init as wm_init, - finalize as wm_finalize, -) - -# Allow computation on objects that are larger than GPU memory -# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory -os.environ["CUDF_SPILL"] = "1" - - -def init_ddp_worker(global_rank, local_rank, world_size, cugraph_id): - import rmm - - rmm.reinitialize( - devices=local_rank, - managed_memory=True, - pool_allocator=True, - ) - - import cupy - - cupy.cuda.Device(local_rank).use() - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - from cugraph.testing.mg_utils import enable_spilling - - enable_spilling() - - torch.cuda.set_device(local_rank) - - cugraph_comms_init( - rank=global_rank, world_size=world_size, uid=cugraph_id, device=local_rank - ) - - wm_init(global_rank, world_size, local_rank, torch.cuda.device_count()) - - -def load_dgl_dataset(dataset_root="dataset", dataset_name="ogbn-products"): - from ogb.nodeproppred import DglNodePropPredDataset - - dataset = DglNodePropPredDataset(root=dataset_root, name=dataset_name) - split_idx = dataset.get_idx_split() - train_idx, valid_idx, test_idx = ( - split_idx["train"], - split_idx["valid"], - split_idx["test"], - ) - g, label = dataset[0] - g.ndata["label"] = label - if len(g.etypes) <= 1: - g = dgl.add_self_loop(g) - else: - for etype in g.etypes: - if etype[0] == etype[2]: - # only add self loops for src->dst - g = dgl.add_self_loop(g, etype=etype) - - g = g.int() - idx = { - "train": train_idx.int(), - "valid": valid_idx.int(), - "test": test_idx.int(), - } - - return g, idx, dataset.num_classes - - -def partition_data( - g, split_idx, num_classes, edge_path, feature_path, label_path, meta_path -): - # Split and save edge index - os.makedirs( - edge_path, - exist_ok=True, - ) - src, dst = g.all_edges(form="uv", order="eid") - edge_index = torch.stack([src, dst]) - for (r, e) in enumerate(torch.tensor_split(edge_index, world_size, dim=1)): - rank_path = os.path.join(edge_path, f"rank={r}.pt") - torch.save( - e.clone(), - rank_path, - ) - - # Split and save features - os.makedirs( - feature_path, - exist_ok=True, - ) - - nix = torch.arange(g.num_nodes()) - for (r, f) in enumerate(torch.tensor_split(nix, world_size)): - feat_path = os.path.join(feature_path, f"rank={r}_feat.pt") - torch.save(g.ndata["feat"][f], feat_path) - - label_f_path = os.path.join(feature_path, f"rank={r}_label.pt") - torch.save(g.ndata["label"][f], label_f_path) - - # Split and save labels - os.makedirs( - label_path, - exist_ok=True, - ) - for (d, i) in split_idx.items(): - i_parts = torch.tensor_split(i, world_size) - for r, i_part in enumerate(i_parts): - rank_path = os.path.join(label_path, f"rank={r}") - os.makedirs(rank_path, exist_ok=True) - torch.save(i_part, 
os.path.join(rank_path, f"{d}.pt")) - - # Save metadata - meta = { - "num_classes": int(num_classes), - "num_nodes": int(g.num_nodes()), - } - with open(meta_path, "w") as f: - json.dump(meta, f) - - -def load_partitioned_data(rank, edge_path, feature_path, label_path, meta_path): - g = cugraph_dgl.Graph( - is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph" - ) - - # Load metadata - with open(meta_path, "r") as f: - meta = json.load(f) - - # Load labels - split_idx = {} - for split in ["train", "test", "valid"]: - split_idx[split] = torch.load( - os.path.join(label_path, f"rank={rank}", f"{split}.pt") - ) - - # Load features - feat_t = torch.load(os.path.join(feature_path, f"rank={rank}_feat.pt")) - label_f_t = torch.load(os.path.join(feature_path, f"rank={rank}_label.pt")) - ndata = {"feat": feat_t, "label": label_f_t} - g.add_nodes(meta["num_nodes"], data=ndata) - - # Load edge index - src, dst = torch.load(os.path.join(edge_path, f"rank={rank}.pt")) - g.add_edges(src.cuda(), dst.cuda(), data=None) - - return g, split_idx, meta["num_classes"] - - -def create_dataloader(gs, train_idx, device, temp_dir, stage): - import cugraph_dgl - - temp_path = os.path.join(temp_dir, f"{stage}_{device}") - os.mkdir(temp_path) - - sampler = cugraph_dgl.dataloading.NeighborSampler( - [10, 20], - directory=temp_path, - batches_per_partition=10, - ) - - dataloader = cugraph_dgl.dataloading.FutureDataLoader( - gs, - train_idx, - sampler, - device=device, # Put the sampled MFGs on CPU or GPU - use_ddp=True, # Make it work with distributed data parallel - batch_size=1024, - shuffle=False, # Whether to shuffle the nodes for every epoch - drop_last=False, - num_workers=0, - ) - return dataloader - - -def run_workflow( - global_rank, local_rank, world_size, g, split_idx, num_classes, temp_dir -): - from model import Sage, train_model - - # Below sets gpu_number - dev_id = local_rank - device = torch.device(f"cuda:{dev_id}") - - dataloader = create_dataloader(g, split_idx["train"], device, temp_dir, "train") - print("Dataloader Creation Complete", flush=True) - num_feats = g.ndata["feat"].shape[1] - hid_size = 256 - # Load Training example - model = Sage(num_feats, hid_size, num_classes).to(device) - model = torch.nn.parallel.DistributedDataParallel( - model, - device_ids=[device], - output_device=device, - ) - torch.distributed.barrier() - n_epochs = 10 - total_st = time.time() - opt = torch.optim.Adam(model.parameters(), lr=0.01) - train_model(model, g, opt, dataloader, n_epochs, global_rank, split_idx["valid"]) - torch.distributed.barrier() - total_et = time.time() - print( - f"Total time taken on n_epochs {n_epochs} = {total_et - total_st} s", - f"measured by worker = {global_rank}", - ) - - wm_finalize() - cugraph_comms_shutdown() - - -if __name__ == "__main__": - if "LOCAL_RANK" in os.environ: - parser = argparse.ArgumentParser() - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--tempdir_root", type=str, default=None) - parser.add_argument("--dataset", type=str, default="ogbn-products") - parser.add_argument("--skip_partition", action="store_true") - args = parser.parse_args() - - torch.distributed.init_process_group( - "nccl", - timeout=timedelta(minutes=60), - ) - world_size = torch.distributed.get_world_size() - global_rank = torch.distributed.get_rank() - local_rank = int(os.environ["LOCAL_RANK"]) - device = torch.device(local_rank) - - # Create the uid needed for cuGraph comms - if global_rank == 0: - cugraph_id = 
[cugraph_comms_create_unique_id()] - else: - cugraph_id = [None] - torch.distributed.broadcast_object_list(cugraph_id, src=0, device=device) - cugraph_id = cugraph_id[0] - - init_ddp_worker(global_rank, local_rank, world_size, cugraph_id) - - # Split the data - edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part") - feature_path = os.path.join(args.dataset_root, args.dataset + "_fea_part") - label_path = os.path.join(args.dataset_root, args.dataset + "_label_part") - meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json") - - if not args.skip_partition and global_rank == 0: - partition_data( - *load_dgl_dataset(args.dataset_root, args.dataset), - edge_path, - feature_path, - label_path, - meta_path, - ) - torch.distributed.barrier() - - print("loading partitions...") - g, split_idx, num_classes = load_partitioned_data( - rank=global_rank, - edge_path=edge_path, - feature_path=feature_path, - label_path=label_path, - meta_path=meta_path, - ) - print(f"rank {global_rank} has loaded its partition") - torch.distributed.barrier() - - with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory: - run_workflow( - global_rank, - local_rank, - world_size, - g, - split_idx, - num_classes, - directory, - ) - else: - warnings.warn("This script should be run with 'torchrun`. Exiting.") diff --git a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py b/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py deleted file mode 100644 index 001d7fb82dc..00000000000 --- a/python/cugraph-dgl/examples/multi_trainer_MG_example/workflow_snmg.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
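# A minimal sketch of the create-on-rank-0-then-broadcast pattern used in the
# workflow_mnmg.py main block above, assuming an already-initialized
# torch.distributed process group. `make_uid` and `broadcast_unique_id` are
# illustrative placeholders, not cuGraph APIs; in the script above the factory
# is cugraph_comms_create_unique_id().

import torch
import torch.distributed as dist


def broadcast_unique_id(make_uid, device: torch.device):
    # Only rank 0 creates the object; every other rank starts with a None slot.
    obj = [make_uid()] if dist.get_rank() == 0 else [None]
    # broadcast_object_list pickles the entry on the source rank and unpickles
    # it everywhere else, so all ranks end up holding the same UID.
    dist.broadcast_object_list(obj, src=0, device=device)
    return obj[0]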
- -import dgl -import torch -import time -import tempfile -import argparse -import os - -import cugraph_dgl - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_shutdown, - cugraph_comms_create_unique_id, -) - -from pylibwholegraph.torch.initialize import ( - init as wm_init, - finalize as wm_finalize, -) - -# Allow computation on objects that are larger than GPU memory -# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory -os.environ["CUDF_SPILL"] = "1" - - -def initalize_pytorch_worker(dev_id): - import cupy as cp - import rmm - from rmm.allocators.cupy import rmm_cupy_allocator - - dev = cp.cuda.Device( - dev_id - ) # Create cuda context on the right gpu, defaults to gpu-0 - dev.use() - rmm.reinitialize( - pool_allocator=True, - initial_pool_size=10e9, - maximum_pool_size=15e9, - devices=[dev_id], - ) - - from cugraph.testing.mg_utils import enable_spilling - - enable_spilling() - - torch.cuda.set_device(dev_id) - cp.cuda.set_allocator(rmm_cupy_allocator) - print("device_id", dev_id, flush=True) - - -def load_dgl_dataset( - dataset_name="ogbn-products", - dataset_root=None, -): - from ogb.nodeproppred import DglNodePropPredDataset - - dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root) - split_idx = dataset.get_idx_split() - train_idx, valid_idx, test_idx = ( - split_idx["train"], - split_idx["valid"], - split_idx["test"], - ) - g, label = dataset[0] - g.ndata["label"] = label - if len(g.etypes) <= 1: - g = dgl.add_self_loop(g) - else: - for etype in g.etypes: - if etype[0] == etype[2]: - # only add self loops for src->dst - g = dgl.add_self_loop(g, etype=etype) - - g = g.int() - train_idx = train_idx.int() - valid_idx = valid_idx.int() - test_idx = test_idx.int() - return g, train_idx, valid_idx, test_idx, dataset.num_classes - - -def create_cugraph_graphstore_from_dgl_dataset(dataset, rank, world_size): - (g, train_idx, valid_idx, test_idx, num_classes) = dataset - # Partition the data - cg = cugraph_dgl.Graph( - is_multi_gpu=True, ndata_storage="wholegraph", edata_storage="wholegraph" - ) - - nix = torch.tensor_split(torch.arange(g.num_nodes()), world_size)[rank] - ndata = {k: g.ndata[k][nix].cuda() for k in g.ndata.keys()} - - eix = torch.tensor_split(torch.arange(g.num_edges()), world_size)[rank] - src, dst = g.all_edges(form="uv", order="eid") - edata = {k: g.edata[k][eix].cuda() for k in g.edata.keys()} - - cg.add_nodes(g.num_nodes(), data=ndata) - cg.add_edges( - torch.tensor_split(src, world_size)[rank].cuda(), - torch.tensor_split(dst, world_size)[rank].cuda(), - data=edata, - ) - - return ( - cg, - torch.tensor_split(train_idx, world_size)[rank].to(torch.int64), - torch.tensor_split(valid_idx, world_size)[rank].to(torch.int64), - torch.tensor_split(test_idx, world_size)[rank].to(torch.int64), - num_classes, - ) - - -def create_dataloader(gs, train_idx, device, temp_dir, stage): - import cugraph_dgl - - temp_path = os.path.join(temp_dir, f"{stage}_{device}") - os.mkdir(temp_path) - - sampler = cugraph_dgl.dataloading.NeighborSampler( - [10, 20], - directory=temp_path, - batches_per_partition=10, - ) - dataloader = cugraph_dgl.dataloading.FutureDataLoader( - gs, - train_idx, - sampler, - device=device, # Put the sampled MFGs on CPU or GPU - use_ddp=True, # Make it work with distributed data parallel - batch_size=1024, - shuffle=False, # Whether to shuffle the nodes for every epoch - drop_last=False, - num_workers=0, - ) - return dataloader - - -def run_workflow(rank, world_size, cugraph_id, dataset, 
temp_dir): - from model import Sage, train_model - - # Below sets gpu_number - dev_id = rank - initalize_pytorch_worker(dev_id) - device = torch.device(f"cuda:{dev_id}") - - # Pytorch training worker initialization - dist_init_method = "tcp://{master_ip}:{master_port}".format( - master_ip="127.0.0.1", master_port="12346" - ) - - torch.distributed.init_process_group( - backend="nccl", - init_method=dist_init_method, - world_size=world_size, - rank=rank, - ) - - cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) - wm_init(rank, world_size, rank, world_size) - - print(f"rank {rank}.", flush=True) - print("Initalized across GPUs.") - - ( - gs, - train_idx, - valid_idx, - test_idx, - num_classes, - ) = create_cugraph_graphstore_from_dgl_dataset( - dataset, - rank, - world_size, - ) - del dataset - - torch.distributed.barrier() - print(f"Loading graph to worker {rank} is complete", flush=True) - - dataloader = create_dataloader(gs, train_idx, device, temp_dir, "train") - print("Dataloader Creation Complete", flush=True) - num_feats = gs.ndata["feat"].shape[1] - hid_size = 256 - # Load Training example - model = Sage(num_feats, hid_size, num_classes).to(device) - model = torch.nn.parallel.DistributedDataParallel( - model, - device_ids=[device], - output_device=device, - ) - torch.distributed.barrier() - n_epochs = 10 - total_st = time.time() - opt = torch.optim.Adam(model.parameters(), lr=0.01) - train_model(model, gs, opt, dataloader, n_epochs, rank, valid_idx) - torch.distributed.barrier() - total_et = time.time() - print( - f"Total time taken on n_epochs {n_epochs} = {total_et - total_st} s", - f"measured by worker = {rank}", - ) - - torch.cuda.synchronize() - wm_finalize() - cugraph_comms_shutdown() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--tempdir_root", type=str, default=None) - parser.add_argument("--dataset", type=str, default="ogbn-products") - args = parser.parse_args() - - from rmm.allocators.torch import rmm_torch_allocator - - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - - # Create the uid needed for cuGraph comms - cugraph_id = cugraph_comms_create_unique_id() - - ds = load_dgl_dataset(args.dataset, args.dataset_root) - - world_size = torch.cuda.device_count() - - with tempfile.TemporaryDirectory(dir=args.tempdir_root) as directory: - torch.multiprocessing.spawn( - run_workflow, - args=(world_size, cugraph_id, ds, directory), - nprocs=world_size, - ) diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml deleted file mode 100644 index af9e91a988e..00000000000 --- a/python/cugraph-dgl/pyproject.toml +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -[build-system] - -requires = [ - "rapids-build-backend>=0.3.1,<0.4.0.dev0", - "setuptools>=61.0.0", - "wheel", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
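# A small sketch of the torch.tensor_split idiom used above (in partition_data
# and create_cugraph_graphstore_from_dgl_dataset) to give each rank its own
# slice of node/edge index tensors. `local_slice` is an illustrative helper,
# not part of the example scripts.

import torch


def local_slice(num_items: int, rank: int, world_size: int) -> torch.Tensor:
    # tensor_split returns world_size nearly equal chunks; the first
    # (num_items % world_size) chunks receive one extra element.
    return torch.tensor_split(torch.arange(num_items), world_size)[rank]


# 10 items across 4 ranks:
#   rank 0 -> [0, 1, 2], rank 1 -> [3, 4, 5], rank 2 -> [6, 7], rank 3 -> [8, 9]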
-build-backend = "rapids_build_backend.build" - -[project] -name = "cugraph-dgl" -dynamic = ["version"] -description = "cugraph extensions for DGL" -readme = { file = "README.md", content-type = "text/markdown" } -authors = [ - { name = "NVIDIA Corporation" }, -] -license = { text = "Apache 2.0" } -requires-python = ">=3.10" -classifiers = [ - "Intended Audience :: Developers", - "Programming Language :: Python", -] -dependencies = [ - "cugraph==24.12.*,>=0.0.0a0", - "numba>=0.57", - "numpy>=1.23,<3.0a0", - "pylibcugraphops==24.12.*,>=0.0.0a0", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. - -[project.optional-dependencies] -test = [ - "pandas", - "pylibwholegraph==24.12.*,>=0.0.0a0", - "pytest", - "pytest-benchmark", - "pytest-cov", - "pytest-xdist", - "scipy", - "tensordict>=0.1.2", - "torch>=2.3", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. - -[project.urls] -Homepage = "https://github.com/rapidsai/cugraph" -Documentation = "https://docs.rapids.ai/api/cugraph/stable/" - -[tool.setuptools] -license-files = ["LICENSE"] - -[tool.setuptools.dynamic] -version = {file = "cugraph_dgl/VERSION"} - -[tool.setuptools.packages.find] -include = [ - "cugraph_dgl*", -] - -[tool.rapids-build-backend] -build-backend = "setuptools.build_meta" -dependencies-file = "../../dependencies.yaml" -matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-dgl/tests/test_version.py b/python/cugraph-dgl/tests/test_version.py deleted file mode 100644 index 343e4fb2675..00000000000 --- a/python/cugraph-dgl/tests/test_version.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -import cugraph_dgl - - -def test_version_constants_are_populated(): - # __git_commit__ will only be non-empty in a built distribution - assert isinstance(cugraph_dgl.__git_commit__, str) - - # __version__ should always be non-empty - assert isinstance(cugraph_dgl.__version__, str) - assert len(cugraph_dgl.__version__) > 0 diff --git a/python/cugraph-pyg/LICENSE b/python/cugraph-pyg/LICENSE deleted file mode 120000 index 30cff7403da..00000000000 --- a/python/cugraph-pyg/LICENSE +++ /dev/null @@ -1 +0,0 @@ -../../LICENSE \ No newline at end of file diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml deleted file mode 100644 index 5fbd947965f..00000000000 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# This file is generated by `rapids-dependency-file-generator`. -# To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. 
-channels: -- rapidsai -- rapidsai-nightly -- dask/label/dev -- dglteam/label/th23_cu118 -- conda-forge -- nvidia -dependencies: -- cugraph==24.12.*,>=0.0.0a0 -- pandas -- pre-commit -- pylibcugraphops==24.12.*,>=0.0.0a0 -- pytest -- pytest-benchmark -- pytest-cov -- pytest-xdist -- pytorch-cuda==11.8 -- pytorch>=2.3 -- pytorch_geometric>=2.5,<2.6 -- scipy -- tensordict>=0.1.2 -name: cugraph_pyg_dev_cuda-118 diff --git a/python/cugraph-pyg/cugraph_pyg/VERSION b/python/cugraph-pyg/cugraph_pyg/VERSION deleted file mode 120000 index d62dc733efd..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/VERSION +++ /dev/null @@ -1 +0,0 @@ -../../../VERSION \ No newline at end of file diff --git a/python/cugraph-pyg/cugraph_pyg/__init__.py b/python/cugraph-pyg/cugraph_pyg/__init__.py deleted file mode 100644 index e566e6e9fdd..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph_pyg._version import __git_commit__, __version__ - -import cugraph_pyg.data -import cugraph_pyg.loader -import cugraph_pyg.sampler -import cugraph_pyg.nn diff --git a/python/cugraph-pyg/cugraph_pyg/_version.py b/python/cugraph-pyg/cugraph_pyg/_version.py deleted file mode 100644 index 053b163116d..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/_version.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import importlib.resources - -# Read VERSION file from the module that is symlinked to VERSION file -# in the root of the repo at build time or copied to the moudle at -# installation. VERSION is a separate file that allows CI build-time scripts -# to update version info (including commit hashes) without modifying -# source files. -__version__ = ( - importlib.resources.files(__package__).joinpath("VERSION").read_text().strip() -) -try: - __git_commit__ = ( - importlib.resources.files(__package__) - .joinpath("GIT_COMMIT") - .read_text() - .strip() - ) -except FileNotFoundError: - __git_commit__ = "" - -__all__ = ["__git_commit__", "__version__"] diff --git a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py deleted file mode 100644 index 6d51fd5ea01..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -from cugraph_pyg.data.dask_graph_store import DaskGraphStore -from cugraph_pyg.data.graph_store import GraphStore -from cugraph_pyg.data.feature_store import ( - TensorDictFeatureStore, - WholeFeatureStore, -) - - -def CuGraphStore(*args, **kwargs): - warnings.warn("CuGraphStore has been renamed to DaskGraphStore", FutureWarning) - return DaskGraphStore(*args, **kwargs) diff --git a/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py deleted file mode 100644 index 6195f3118a4..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/data/dask_graph_store.py +++ /dev/null @@ -1,1321 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Any, Union, List, Dict - -from enum import Enum, auto - -from dataclasses import dataclass -from collections import defaultdict -from itertools import chain -from functools import cached_property - -import numpy as np -import cupy -import pandas -import cudf -import cugraph -import warnings - -import dask.array as dar -import dask.dataframe as dd -import dask.distributed as distributed -import dask_cudf - -from cugraph.utilities.utils import import_optional, MissingModule - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - -Tensor = None if isinstance(torch, MissingModule) else torch.Tensor -NdArray = None if isinstance(cupy, MissingModule) else cupy.ndarray -DaskCudfSeries = None if isinstance(dask_cudf, MissingModule) else dask_cudf.Series - -TensorType = Union[Tensor, NdArray, cudf.Series, DaskCudfSeries] -NodeType = ( - None - if isinstance(torch_geometric, MissingModule) - else torch_geometric.typing.NodeType -) -EdgeType = ( - None - if isinstance(torch_geometric, MissingModule) - else torch_geometric.typing.EdgeType -) - - -class EdgeLayout(Enum): - COO = "coo" - CSC = "csc" - CSR = "csr" - - -@dataclass -class CuGraphEdgeAttr: - """ - Defines the attributes of an :obj:`GraphStore` edge. - """ - - # The type of the edge - edge_type: Optional[Any] - - # The layout of the edge representation - layout: EdgeLayout - - # Whether the edge index is sorted, by destination node. Useful for - # avoiding sorting costs when performing neighbor sampling, and only - # meaningful for COO (CSC and CSR are sorted by definition) - is_sorted: bool = False - - # The number of nodes in this edge type. 
If set to None, will attempt to - # infer with the simple heuristic int(self.edge_index.max()) + 1 - size: Optional[Tuple[int, int]] = None - - # NOTE we define __post_init__ to force-cast layout - def __post_init__(self): - self.layout = EdgeLayout(self.layout) - - @classmethod - def cast(cls, *args, **kwargs): - """ - Cast to a CuGraphTensorAttr from a tuple, list, or dict. - - Returns - ------- - CuGraphTensorAttr - contains the data of the tuple, list, or dict passed in - """ - if len(args) == 1 and len(kwargs) == 0: - elem = args[0] - if elem is None: - return None - if isinstance(elem, CuGraphEdgeAttr): - return elem - if isinstance(elem, (tuple, list)): - return cls(*elem) - if isinstance(elem, dict): - return cls(**elem) - return cls(*args, **kwargs) - - -class _field_status(Enum): - UNSET = auto() - - -@dataclass -class CuGraphTensorAttr: - """ - Defines the attributes of a class:`FeatureStore` tensor; in particular, - all the parameters necessary to uniquely identify a tensor from the feature - store. - - Note that the order of the attributes is important; this is the order in - which attributes must be provided for indexing calls. Feature store - implementor classes can define a different ordering by overriding - :meth:`TensorAttr.__init__`. - """ - - # The group name that the tensor corresponds to. Defaults to UNSET. - group_name: Optional[str] = _field_status.UNSET - - # The name of the tensor within its group. Defaults to UNSET. - attr_name: Optional[str] = _field_status.UNSET - - # The node indices the rows of the tensor correspond to. Defaults to UNSET. - index: Optional[Any] = _field_status.UNSET - - # The properties in the FeatureStore the rows of the tensor correspond to. - # Defaults to UNSET. - properties: Optional[Any] = _field_status.UNSET - - # The datatype of the tensor. Defaults to UNSET. - dtype: Optional[Any] = _field_status.UNSET - - # Convenience methods - - def is_set(self, key): - """ - Whether an attribute is set in :obj:`TensorAttr`. - """ - if key not in self.__dataclass_fields__: - raise KeyError(key) - attr = getattr(self, key) - return type(attr) is not _field_status or attr != _field_status.UNSET - - def is_fully_specified(self): - """ - Whether the :obj:`TensorAttr` has no unset fields. - """ - return all([self.is_set(key) for key in self.__dataclass_fields__]) - - def fully_specify(self): - """ - Sets all :obj:`UNSET` fields to :obj:`None`. - """ - for key in self.__dataclass_fields__: - if not self.is_set(key): - setattr(self, key, None) - return self - - def update(self, attr): - """ - Updates an :class:`TensorAttr` with set attributes from another - :class:`TensorAttr`. - """ - for key in self.__dataclass_fields__: - if attr.is_set(key): - setattr(self, key, getattr(attr, key)) - - @classmethod - def cast(cls, *args, **kwargs): - """ - Casts to a CuGraphTensorAttr from a tuple, list, or dict - - Returns - ------- - CuGraphTensorAttr - contains the data of the tuple, list, or dict passed in - """ - if len(args) == 1 and len(kwargs) == 0: - elem = args[0] - if elem is None: - return None - if isinstance(elem, CuGraphTensorAttr): - return elem - if isinstance(elem, (tuple, list)): - return cls(*elem) - if isinstance(elem, dict): - return cls(**elem) - return cls(*args, **kwargs) - - -class DaskGraphStore: - """ - Duck-typed version of PyG's GraphStore and FeatureStore that uses - Dask to distribute the graph structure across GPUs and a - cugraph.gnn.FeatureStore to store node/edge features. 
Supports - single-node/single-GPU, single-node/multi-GPU, and multi-node/multi-GPU - configurations. Supports both homogeneous and heterogeneous graphs. - """ - - # TODO allow (and possibly require) separate stores for node, edge attrs - # For now edge attrs are entirely unsupported. - # TODO add an "expensive check" argument that ensures the graph store - # and feature store are valid and compatible with PyG. - def __init__( - self, - F: cugraph.gnn.FeatureStore, - G: Union[ - Dict[Tuple[str, str, str], Tuple[TensorType]], - Dict[Tuple[str, str, str], int], - ], - num_nodes_dict: Dict[str, int], - *, - multi_gpu: bool = False, - order: str = "CSR", - ): - """ - Constructs a new DaskGraphStore from the provided - arguments. - - Parameters - ---------- - F: cugraph.gnn.FeatureStore (Required) - The feature store containing this graph's features. - Typed lexicographic-ordered numbering convention - should match that of the graph. - - G: dict[str, tuple[TensorType]] or dict[str, int] (Required) - Dictionary of edge indices. - Option 1 (graph in memory): - - Pass the edge indices: i.e. - { - ('author', 'writes', 'paper'): [[0,1,2],[2,0,1]], - ('author', 'affiliated', 'institution'): [[0,1],[0,1]] - } - - - Option 2 (graph not in memory): - - Pass the number of edges: i.e. - { - ('author', 'writes', 'paper'): 2, - ('author', 'affiliated', 'institution'): 2 - } - If the graph is not in memory, manipulating the edge indices - or calling sampling is not possible. This is for cases where - sampling has already been done and samples were written to disk. - - Note: the internal cugraph representation will use - offsetted vertex and edge ids. - - num_nodes_dict: dict (Required) - A dictionary mapping each node type to the count of nodes - of that type in the graph. - - multi_gpu: bool (Optional, default = False) - Whether the store should be backed by a multi-GPU graph. - Requires dask to have been set up. - - order: str (Optional ["CSR", "CSC"], default = CSR) - The order to use for sampling. CSR corresponds to the - standard OGB dataset order that is usually used in PyG. - CSC order constructs the same graph as CSR, but with - edges in the opposite direction. - """ - - if None in G: - raise ValueError("Unspecified edge types not allowed in PyG") - - if order != "CSR" and order != "CSC": - raise ValueError("invalid valid for order") - - self.__vertex_dtype = torch.int64 - - self._tensor_attr_cls = CuGraphTensorAttr - self._tensor_attr_dict = defaultdict(list) - - construct_graph = True - if isinstance(next(iter(G.values())), int): - # User has passed in the number of edges - # (not the actual edge index), so the number of edges - # does not need to be counted. - num_edges_dict = dict(G) # make sure the cugraph store owns this dict - construct_graph = False - else: - # User has passed in the actual edge index, so the - # number of edges needs to be counted. 
- num_edges_dict = { - pyg_can_edge_type: len(ei[0]) for pyg_can_edge_type, ei in G.items() - } - - self.__infer_offsets(num_nodes_dict, num_edges_dict) - self.__infer_existing_tensors(F) - self.__infer_edge_types(num_nodes_dict, num_edges_dict) - - self._edge_attr_cls = CuGraphEdgeAttr - - self.__features = F - self.__graph = None - self.__is_graph_owner = False - self.__order = order - - if construct_graph: - if multi_gpu: - self.__graph = distributed.get_client().get_dataset( - "cugraph_graph", default=None - ) - - if self.__graph is None: - self.__graph = self.__construct_graph( - G, multi_gpu=multi_gpu, order=order - ) - self.__is_graph_owner = True - - self.__subgraphs = {} - - def __del__(self): - if self.__is_graph_owner: - if isinstance(self.__graph._plc_graph, dict): - try: - distributed.get_client().unpublish_dataset("cugraph_graph") - except TypeError: - warnings.warn( - "Could not unpublish graph dataset, most likely because" - " dask has already shut down." - ) - del self.__graph - - def __make_offsets(self, input_dict): - offsets = {} - offsets["stop"] = [input_dict[v] for v in sorted(input_dict.keys())] - offsets["stop"] = torch.tensor(offsets["stop"]).cuda() - - cumsum = offsets["stop"].cumsum(0) - offsets["start"] = cumsum - offsets["stop"] - offsets["stop"] = cumsum - 1 - - offsets["type"] = np.array(sorted(input_dict.keys())) - - return offsets - - def __infer_offsets( - self, - num_nodes_dict: Dict[str, int], - num_edges_dict: Dict[Tuple[str, str, str], int], - ) -> None: - """ - Sets the vertex offsets for this store. - """ - self.__vertex_type_offsets = self.__make_offsets(num_nodes_dict) - - # Need to convert tuples to string in order to use searchsorted - # Can convert back using x.split('__') - # Lexicographic ordering is unchanged. - self.__edge_type_offsets = self.__make_offsets( - { - "__".join(pyg_can_edge_type): n - for pyg_can_edge_type, n in num_edges_dict.items() - } - ) - - def __dask_array_from_numpy(self, array: np.ndarray, npartitions: int): - return dar.from_array( - array, - meta=np.array([], dtype=array.dtype), - chunks=max(1, len(array) // npartitions), - ) - - def __construct_graph( - self, - edge_info: Dict[Tuple[str, str, str], List[TensorType]], - multi_gpu: bool = False, - order: str = "CSC", - ) -> cugraph.MultiGraph: - """ - This function takes edge information and uses it to construct - a cugraph Graph. It determines the numerical edge type by - sorting the keys of the input dictionary - (the canonical edge types). - - Parameters - ---------- - edge_info: Dict[Tuple[str, str, str], List[TensorType]] (Required) - Input edge info dictionary, where keys are the canonical - edge type and values are the edge index (src/dst). - - multi_gpu: bool (Optional, default=False) - Whether to construct a single-GPU or multi-GPU cugraph Graph. - Defaults to a single-GPU graph. - - order: str (CSC or CSR) - Essentially whether to reverse edges so that the cuGraph - sampling algorithm operates on the CSC matrix instead of - the CSR matrix. Should nearly always be CSC unless there - is a specific expectation of reverse sampling, or correctness - testing is being performed. - - Returns - ------- - A newly-constructed directed cugraph.MultiGraph object. - """ - - # Ensure the original dict is not modified. 
- edge_info_cg = {} - - if order != "CSR" and order != "CSC": - raise ValueError("Order must be either CSC (default) or CSR!") - - # Iterate over the keys in sorted order so that the created - # numerical types correspond to the lexicographic order - # of the keys, which is critical to converting the numeric - # keys back to canonical edge types later. - # FIXME don't always convert to host arrays (#3383) - for pyg_can_edge_type in sorted(edge_info.keys()): - src_type, _, dst_type = pyg_can_edge_type - srcs, dsts = edge_info[pyg_can_edge_type] - - src_offset = np.searchsorted(self.__vertex_type_offsets["type"], src_type) - srcs_t = srcs + int(self.__vertex_type_offsets["start"][src_offset]) - if isinstance(srcs_t, torch.Tensor): - srcs_t = srcs_t.cpu() - else: - if isinstance(srcs_t, dask_cudf.Series): - srcs_t = srcs_t.compute() - if isinstance(srcs_t, cudf.Series): - srcs_t = srcs_t.values_host - - dst_offset = np.searchsorted(self.__vertex_type_offsets["type"], dst_type) - dsts_t = dsts + int(self.__vertex_type_offsets["start"][dst_offset]) - if isinstance(dsts_t, torch.Tensor): - dsts_t = dsts_t.cpu() - else: - if isinstance(dsts_t, dask_cudf.Series): - dsts_t = dsts_t.compute() - if isinstance(dsts_t, cudf.Series): - dsts_t = dsts_t.values_host - - edge_info_cg[pyg_can_edge_type] = (srcs_t, dsts_t) - - na_src = np.concatenate( - [ - edge_info_cg[pyg_can_edge_type][0] - for pyg_can_edge_type in sorted(edge_info_cg.keys()) - ] - ) - - na_dst = np.concatenate( - [ - edge_info_cg[pyg_can_edge_type][1] - for pyg_can_edge_type in sorted(edge_info_cg.keys()) - ] - ) - - et_offsets = self.__edge_type_offsets - na_etp = np.concatenate( - [ - np.full( - int(et_offsets["stop"][i] - et_offsets["start"][i] + 1), - i, - dtype="int32", - ) - for i in range(len(self.__edge_type_offsets["start"])) - ] - ) - - vertex_dtype = na_src.dtype - - if multi_gpu: - client = distributed.get_client() - nworkers = len(client.scheduler_info()["workers"]) - npartitions = nworkers * 4 - - src_dar = self.__dask_array_from_numpy(na_src, npartitions) - del na_src - - dst_dar = self.__dask_array_from_numpy(na_dst, npartitions) - del na_dst - - etp_dar = self.__dask_array_from_numpy(na_etp, npartitions) - del na_etp - - df = dd.from_dask_array(etp_dar, columns=["etp"]) - df["src"] = dst_dar if order == "CSC" else src_dar - df["dst"] = src_dar if order == "CSC" else dst_dar - - del src_dar - del dst_dar - del etp_dar - - if df.etp.dtype != "int32": - raise ValueError("Edge type must be int32!") - - # Ensure the dataframe is constructed on each partition - # instead of adding additional synchronization head from potential - # host to device copies. 
- def get_empty_df(): - return cudf.DataFrame( - { - "etp": cudf.Series([], dtype="int32"), - "src": cudf.Series([], dtype=vertex_dtype), - "dst": cudf.Series([], dtype=vertex_dtype), - } - ) - - # Have to check for empty partitions and handle them appropriately - df = df.persist() - df = df.map_partitions( - lambda f: cudf.DataFrame.from_pandas(f) - if len(f) > 0 - else get_empty_df(), - meta=get_empty_df(), - ).reset_index( - drop=True - ) # should be ok for dask - else: - df = pandas.DataFrame( - { - "src": pandas.Series(na_dst) - if order == "CSC" - else pandas.Series(na_src), - "dst": pandas.Series(na_src) - if order == "CSC" - else pandas.Series(na_dst), - "etp": pandas.Series(na_etp), - } - ) - df = cudf.from_pandas(df) - df.reset_index(drop=True, inplace=True) - - graph = cugraph.MultiGraph(directed=True) - if multi_gpu: - graph.from_dask_cudf_edgelist( - df, - source="src", - destination="dst", - edge_type="etp", - ) - distributed.get_client().publish_dataset(cugraph_graph=graph) - else: - graph.from_cudf_edgelist( - df, - source="src", - destination="dst", - edge_type="etp", - ) - - del df - return graph - - @property - def _edge_types_to_attrs(self) -> dict: - return dict(self.__edge_types_to_attrs) - - @property - def order(self) -> str: - return self.__order - - @property - def node_types(self) -> List[NodeType]: - return list(self.__vertex_type_offsets["type"]) - - @property - def edge_types(self) -> List[EdgeType]: - return list(self.__edge_types_to_attrs.keys()) - - def canonical_edge_type_to_numeric(self, etype: EdgeType) -> int: - return np.searchsorted(self.__edge_type_offsets["type"], "__".join(etype)) - - def numeric_edge_type_to_canonical(self, etype: int) -> EdgeType: - return tuple(self.__edge_type_offsets["type"][etype].split("__")) - - @cached_property - def _is_delayed(self): - if self.__graph is None: - return False - return self.__graph.is_multi_gpu() - - def _numeric_vertex_type_from_name(self, vertex_type_name: str) -> int: - return np.searchsorted(self.__vertex_type_offsets["type"], vertex_type_name) - - def get_vertex_index(self, vtypes) -> TensorType: - if isinstance(vtypes, str): - vtypes = [vtypes] - - ix = torch.tensor([], dtype=torch.int64) - - if isinstance(self.__vertex_type_offsets, dict): - vtypes = np.searchsorted(self.__vertex_type_offsets["type"], vtypes) - for vtype in vtypes: - start = int(self.__vertex_type_offsets["start"][vtype]) - stop = int(self.__vertex_type_offsets["stop"][vtype]) - ix = torch.concatenate( - [ - ix, - torch.arange( - start, stop + 1, 1, dtype=self.__vertex_dtype, device="cuda" - ), - ] - ) - - return ix - - def put_edge_index(self, edge_index, edge_attr): - """ - Adds additional edges to the graph. - Not yet implemented. - """ - raise NotImplementedError("Adding indices not supported.") - - def get_all_edge_attrs(self): - """ - Gets a list of all edge types and indices in this store. - - Returns - ------- - list[str] - All edge types and indices in this store. - """ - return self.__edge_types_to_attrs.values() - - def _get_edge_index(self, attr: CuGraphEdgeAttr) -> Tuple[TensorType, TensorType]: - """ - Returns the edge index in the requested format - (as defined by attr). Currently, only unsorted - COO is supported, which is returned as a (src,dst) - tuple as expected by the PyG API. - - Parameters - ---------- - attr: CuGraphEdgeAttr - The CuGraphEdgeAttr specifying the - desired edge type, layout (i.e. CSR, COO, CSC), and - whether the returned index should be sorted (if COO). 
- Currently, only unsorted COO is supported. - - Returns - ------- - (src, dst) : Tuple[tensor type] - Tuple of the requested edge index in COO form. - Currently, only COO form is supported. - """ - - if self.__graph is None: - raise ValueError("Graph is not in memory, cannot access edge index!") - - if attr.layout != EdgeLayout.COO: - # TODO support returning CSR/CSC (Issue #3802) - raise TypeError("Only COO direct access is supported!") - - # Currently, graph creation enforces that input vertex ids are always of - # integer type. Therefore, it is currently safe to assume that for MG - # graphs, the src/dst col names are renumbered_src/dst - # and for SG graphs, the src/dst col names are src/dst. - # This may change in the future if/when renumbering or the graph - # creation process is refactored. - # See Issue #3201 for more details. - # Also note src/dst are flipped so that cuGraph sampling is done in - # CSC format rather than CSR format. - if self._is_delayed: - dst_col_name = self.__graph.renumber_map.renumbered_src_col_name - src_col_name = self.__graph.renumber_map.renumbered_dst_col_name - else: - dst_col_name = self.__graph.srcCol - src_col_name = self.__graph.dstCol - - # If there is only one edge type (homogeneous graph) then - # bypass the edge filters for a significant speed improvement. - if len(self.__edge_types_to_attrs) == 1: - if attr.edge_type not in self.__edge_types_to_attrs: - raise ValueError( - f"Requested edge type {attr.edge_type}" "is not present in graph." - ) - - df = self.__graph.edgelist.edgelist_df[[src_col_name, dst_col_name]] - src_offset = 0 - dst_offset = 0 - else: - src_type, _, dst_type = attr.edge_type - src_offset = int( - self.__vertex_type_offsets["start"][ - self._numeric_vertex_type_from_name(src_type) - ] - ) - dst_offset = int( - self.__vertex_type_offsets["start"][ - self._numeric_vertex_type_from_name(dst_type) - ] - ) - coli = np.searchsorted( - self.__edge_type_offsets["type"], "__".join(attr.edge_type) - ) - - df = self.__graph.edgelist.edgelist_df[ - [src_col_name, dst_col_name, self.__graph.edgeTypeCol] - ] - df = df[df[self.__graph.edgeTypeCol] == coli] - df = df[[src_col_name, dst_col_name]] - - if self._is_delayed: - df = df.compute() - - src = torch.as_tensor(df[src_col_name], device="cuda") - src_offset - dst = torch.as_tensor(df[dst_col_name], device="cuda") - dst_offset - - src = src.to(self.__vertex_dtype) - dst = dst.to(self.__vertex_dtype) - - if src.shape[0] != dst.shape[0]: - raise IndexError("src and dst shape do not match!") - - return (src, dst) - - def get_edge_index(self, *args, **kwargs) -> Tuple[TensorType, TensorType]: - """ - Synchronously gets an edge_index tensor from the materialized - graph. - - Args: - **attr(EdgeAttr): the edge attributes. - - Returns: - EdgeTensorType: an edge_index tensor corresonding to the provided - attributes, or None if there is no such tensor. - - Raises: - KeyError: if the edge index corresponding to attr was not found. - """ - - edge_attr = self._edge_attr_cls.cast(*args, **kwargs) - edge_attr.layout = EdgeLayout(edge_attr.layout) - # Override is_sorted for CSC and CSR: - # TODO treat is_sorted specially in this function, where is_sorted=True - # returns an edge index sorted by column. 
- edge_attr.is_sorted = edge_attr.is_sorted or ( - edge_attr.layout in [EdgeLayout.CSC, EdgeLayout.CSR] - ) - edge_index = self._get_edge_index(edge_attr) - if edge_index is None: - raise KeyError(f"An edge corresponding to '{edge_attr}' was not " f"found") - return edge_index - - def _subgraph(self, edge_types: List[tuple] = None) -> cugraph.MultiGraph: - """ - Returns a subgraph with edges limited to those of a given type - - Parameters - ---------- - edge_types : list of pyg canonical edge types - Directly references the graph's internal edge types. Does - not accept PyG edge type tuples. - - Returns - ------- - The appropriate extracted subgraph. Will extract the subgraph - if it has not already been extracted. - - """ - if self.__graph is None: - raise ValueError("Graph is not in memory, cannot get subgraph") - - if edge_types is not None and set(edge_types) != set( - self.__edge_types_to_attrs.keys() - ): - raise ValueError( - "Subgraphing is currently unsupported, please" - " specify all edge types in the graph or leave" - " this argument empty." - ) - - return self.__graph - - def _get_vertex_groups_from_sample( - self, nodes_of_interest: TensorType, is_sorted: bool = False - ) -> Dict[str, torch.Tensor]: - """ - Given a tensor of nodes of interest, this - method a single dictionary, noi_index. - - noi_index is the original vertex ids grouped by vertex type. - - Example Input: [5, 2, 1, 10, 11, 8] - Output: {'red_vertex': [5, 1, 8], 'blue_vertex': [2], 'green_vertex': [10, 11]} - - """ - - noi_index = {} - - vtypes = cudf.Series(self.__vertex_type_offsets["type"]) - if len(vtypes) == 1: - noi_index[vtypes.iloc[0]] = nodes_of_interest - else: - noi_type_indices = torch.searchsorted( - torch.as_tensor(self.__vertex_type_offsets["stop"], device="cuda"), - nodes_of_interest, - ) - - noi_types = vtypes.iloc[cupy.asarray(noi_type_indices)].reset_index( - drop=True - ) - noi_starts = self.__vertex_type_offsets["start"][noi_type_indices] - - noi_types = cudf.Series(noi_types, name="t").groupby("t").groups - - for type_name, ix in noi_types.items(): - # store the renumbering for this vertex type - # renumbered vertex id is the index of the old id - ix = torch.as_tensor(ix, device="cuda") - # subtract off the offsets - noi_index[type_name] = nodes_of_interest[ix] - noi_starts[ix] - - return noi_index - - def _get_sample_from_vertex_groups( - self, vertex_groups: Dict[str, TensorType] - ) -> TensorType: - """ - Inverse of _get_vertex_groups_from_sample() (although with de-offsetted ids). - Given a dictionary of node types and de-offsetted node ids, return - the global (non-renumbered) vertex ids. - - Example Input: {'horse': [1, 3, 5], 'duck': [1, 2]} - Output: [1, 3, 5, 14, 15] - """ - t = torch.tensor([], dtype=torch.int64, device="cuda") - - for group_name, ix in vertex_groups.items(): - type_id = self._numeric_vertex_type_from_name(group_name) - if not ix.is_cuda: - ix = ix.cuda() - offset = self.__vertex_type_offsets["start"][type_id] - u = ix + offset - t = torch.concatenate([t, u]) - - return t - - def _get_renumbered_edge_groups_from_sample( - self, sampling_results: cudf.DataFrame, noi_index: dict - ) -> Tuple[ - Dict[Tuple[str, str, str], torch.Tensor], - Tuple[Dict[Tuple[str, str, str], torch.Tensor]], - ]: - """ - Given a cudf (NOT dask_cudf) DataFrame of sampling results and a dictionary - of non-renumbered vertex ids grouped by vertex type, this method - outputs two dictionaries: - 1. row_dict - 2. 
col_dict - (1) row_dict corresponds to the renumbered source vertex ids grouped - by PyG edge type - (src, type, dst) tuple. - (2) col_dict corresponds to the renumbered destination vertex ids grouped - by PyG edge type (src, type, dst) tuple. - * The two outputs combined make a PyG "edge index". - * The ith element of each array corresponds to the same edge. - * The _get_vertex_groups_from_sample() method is usually called - before this one to get the noi_index. - - Example Input: Series({ - 'majors': [0, 5, 11, 3], - 'minors': [8, 2, 3, 5]}, - 'edge_type': [1, 3, 5, 14] - }), - { - 'blue_vertex': [0, 5], - 'red_vertex': [3, 11], - 'green_vertex': [2, 8] - } - Output: { - ('blue', 'etype1', 'green'): [0, 1], - ('red', 'etype2', 'red'): [1], - ('red', 'etype3', 'blue'): [0] - }, - { - ('blue', 'etype1', 'green'): [1, 0], - ('red', 'etype2', 'red'): [0], - ('red', 'etype3', 'blue'): [1] - } - - """ - row_dict = {} - col_dict = {} - # If there is only 1 edge type (includes heterogeneous graphs) - if len(self.edge_types) == 1: - t_pyg_type = list(self.__edge_types_to_attrs.values())[0].edge_type - src_type, _, dst_type = t_pyg_type - - # If there is only 1 node type (homogeneous) - # This should only occur if the cuGraph loader was - # not used. This logic is deprecated. - if len(self.node_types) == 1: - warnings.warn( - "Renumbering after sampling for homogeneous graphs is deprecated.", - FutureWarning, - ) - - # Create a dataframe mapping old ids to new ids. - vtype = src_type - id_table = noi_index[vtype] - id_map = cudf.Series( - cupy.arange(id_table.shape[0], dtype="int32"), - name="new_id", - index=cupy.asarray(id_table), - ).sort_index() - - # Renumber the majors using binary search - # Step 1: get the index of the new id - ix_r = torch.searchsorted( - torch.as_tensor(id_map.index.values, device="cuda"), - torch.as_tensor(sampling_results.majors.values, device="cuda"), - ) - # Step 2: Go from id indices to actual ids - row_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[ - ix_r - ] - - # Renumber the minors using binary search - # Step 1: get the index of the new id - ix_c = torch.searchsorted( - torch.as_tensor(id_map.index.values, device="cuda"), - torch.as_tensor(sampling_results.minors.values, device="cuda"), - ) - # Step 2: Go from id indices to actual ids - col_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[ - ix_c - ] - else: - # Handle the heterogeneous case where there is only 1 edge type - dst_id_table = noi_index[dst_type] - dst_id_map = cudf.DataFrame( - { - "dst": cupy.asarray(dst_id_table), - "new_id": cupy.arange(dst_id_table.shape[0]), - } - ).set_index("dst") - dst = dst_id_map["new_id"].loc[sampling_results.minors] - col_dict[t_pyg_type] = torch.as_tensor(dst.values, device="cuda") - - src_id_table = noi_index[src_type] - src_id_map = cudf.DataFrame( - { - "src": cupy.asarray(src_id_table), - "new_id": cupy.arange(src_id_table.shape[0]), - } - ).set_index("src") - src = src_id_map["new_id"].loc[sampling_results.majors] - row_dict[t_pyg_type] = torch.as_tensor(src.values, device="cuda") - - else: - # This will retrieve the single string representation. - # It needs to be converted to a tuple in the for loop below. 
- eoi_types = ( - cudf.Series(self.__edge_type_offsets["type"]) - .iloc[sampling_results.edge_type.astype("int32")] - .reset_index(drop=True) - ) - - eoi_types = cudf.Series(eoi_types, name="t").groupby("t").groups - - for pyg_can_edge_type_str, ix in eoi_types.items(): - pyg_can_edge_type = tuple(pyg_can_edge_type_str.split("__")) - - if self.__order == "CSR": - src_type, _, dst_type = pyg_can_edge_type - else: # CSC - dst_type, _, src_type = pyg_can_edge_type - - # Get the de-offsetted minors - dst_num_type = self._numeric_vertex_type_from_name(dst_type) - minors = torch.as_tensor( - sampling_results.minors.iloc[ix].values, device="cuda" - ) - minors -= self.__vertex_type_offsets["start"][dst_num_type] - - # Create the col entry for this type - dst_id_table = noi_index[dst_type] - dst_id_map = ( - cudf.Series(cupy.asarray(dst_id_table), name="dst") - .reset_index() - .rename(columns={"index": "new_id"}) - .set_index("dst") - ) - dst = dst_id_map["new_id"].loc[cupy.asarray(minors)] - col_dict[pyg_can_edge_type] = torch.as_tensor(dst.values, device="cuda") - - # Get the de-offsetted majors - src_num_type = self._numeric_vertex_type_from_name(src_type) - majors = torch.as_tensor( - sampling_results.majors.iloc[ix].values, device="cuda" - ) - majors -= self.__vertex_type_offsets["start"][src_num_type] - - # Create the row entry for this type - src_id_table = noi_index[src_type] - src_id_map = ( - cudf.Series(cupy.asarray(src_id_table), name="src") - .reset_index() - .rename(columns={"index": "new_id"}) - .set_index("src") - ) - src = src_id_map["new_id"].loc[cupy.asarray(majors)] - row_dict[pyg_can_edge_type] = torch.as_tensor(src.values, device="cuda") - - return row_dict, col_dict - - def put_tensor(self, tensor, attr) -> None: - raise NotImplementedError("Adding properties not supported.") - - def create_named_tensor( - self, attr_name: str, properties: List[str], vertex_type: str, dtype: str - ) -> None: - """ - Create a named tensor that contains a subset of - properties in the graph. - - Parameters - ---------- - attr_name : str - The name of the tensor within its group. - properties : list[str] - The properties the rows - of the tensor correspond to. - vertex_type : str - The vertex type associated with this new tensor property. - dtype : numpy/cupy dtype (i.e. 'int32') or torch dtype (i.e. torch.float) - The datatype of the tensor. Usually float32/float64. - """ - self._tensor_attr_dict[vertex_type].append( - CuGraphTensorAttr( - vertex_type, attr_name, properties=properties, dtype=dtype - ) - ) - - def __infer_edge_types( - self, - num_nodes_dict: Dict[str, int], - num_edges_dict: Dict[Tuple[str, str, str], int], - ) -> None: - self.__edge_types_to_attrs = {} - - for pyg_can_edge_type in sorted(num_edges_dict.keys()): - sz_src = num_nodes_dict[pyg_can_edge_type[0]] - sz_dst = num_nodes_dict[pyg_can_edge_type[-1]] - self.__edge_types_to_attrs[pyg_can_edge_type] = CuGraphEdgeAttr( - edge_type=pyg_can_edge_type, - layout=EdgeLayout.COO, - is_sorted=False, - size=(sz_src, sz_dst), - ) - - def __infer_existing_tensors(self, F) -> None: - """ - Infers the tensor attributes/features. - """ - for attr_name, types_with_attr in F.get_feature_list().items(): - for vt in types_with_attr: - attr_dtype = F.get_data(np.array([0]), vt, attr_name).dtype - self.create_named_tensor( - attr_name=attr_name, - properties=None, - vertex_type=vt, - dtype=attr_dtype, - ) - - def get_all_tensor_attrs(self) -> List[CuGraphTensorAttr]: - """ - Obtains all tensor attributes stored in this feature store. 
- """ - # unpack and return the list of lists - it = chain.from_iterable(self._tensor_attr_dict.values()) - return [CuGraphTensorAttr.cast(c) for c in it] - - def _get_tensor(self, attr: CuGraphTensorAttr) -> TensorType: - feature_backend = self.__features.backend - cols = attr.properties - - idx = attr.index - if idx is not None: - if feature_backend in ["torch", "wholegraph"]: - if not isinstance(idx, torch.Tensor): - raise TypeError( - f"Type {type(idx)} invalid" - f" for feature store backend {feature_backend}" - ) - elif feature_backend == "numpy": - # allow feature indexing through cupy arrays - if isinstance(idx, cupy.ndarray): - idx = idx.get() - elif isinstance(idx, torch.Tensor): - idx = np.asarray(idx.cpu()) - - if cols is None: - t = self.__features.get_data(idx, attr.group_name, attr.attr_name) - if idx is None: - t = t[-1] - - if isinstance(t, np.ndarray): - t = torch.as_tensor(t, device="cpu") - - return t - - else: - t = self.__features.get_data(idx, attr.group_name, cols[0]) - - if len(t.shape) == 1: - t = torch.tensor([t]) - - for col in cols[1:]: - u = self.__features.get_data(idx, attr.group_name, col) - - if len(u.shape) == 1: - u = torch.tensor([u]) - - t = torch.concatenate([t, u]) - - return t - - def _multi_get_tensor(self, attrs: List[CuGraphTensorAttr]) -> List[TensorType]: - return [self._get_tensor(attr) for attr in attrs] - - def multi_get_tensor(self, attrs: List[CuGraphTensorAttr]) -> List[TensorType]: - """ - Synchronously obtains a :class:`FeatureTensorType` object from the - feature store for each tensor associated with the attributes in - `attrs`. - - Parameters - ---------- - attrs (List[TensorAttr]): a list of :class:`TensorAttr` attributes - that identify the tensors to get. - - Returns - ------- - List[FeatureTensorType]: a Tensor of the same type as the index for - each attribute. - - Raises - ------ - KeyError: if a tensor corresponding to an attr was not found. - ValueError: if any input `TensorAttr` is not fully specified. - """ - attrs = [ - self._infer_unspecified_attr(self._tensor_attr_cls.cast(attr)) - for attr in attrs - ] - bad_attrs = [attr for attr in attrs if not attr.is_fully_specified()] - if len(bad_attrs) > 0: - raise ValueError( - f"The input TensorAttr(s) '{bad_attrs}' are not fully " - f"specified. Please fully specify them by specifying all " - f"'UNSET' fields" - ) - - tensors = self._multi_get_tensor(attrs) - - bad_attrs = [attrs[i] for i, v in enumerate(tensors) if v is None] - if len(bad_attrs) > 0: - raise KeyError( - f"Tensors corresponding to attributes " f"'{bad_attrs}' were not found" - ) - - return [tensor for attr, tensor in zip(attrs, tensors)] - - def get_tensor(self, *args, **kwargs) -> TensorType: - """ - Synchronously obtains a :class:`FeatureTensorType` object from the - feature store. Feature store implementors guarantee that the call - :obj:`get_tensor(put_tensor(tensor, attr), attr) = tensor` holds. - - Parameters - ---------- - **attr (TensorAttr): Any relevant tensor attributes that correspond - to the feature tensor. See the :class:`TensorAttr` - documentation for required and optional attributes. It is the - job of implementations of a :class:`FeatureStore` to store this - metadata in a meaningful way that allows for tensor retrieval - from a :class:`TensorAttr` object. - - Returns - ------- - FeatureTensorType: a Tensor of the same type as the index. - - Raises - ------ - KeyError: if the tensor corresponding to attr was not found. - ValueError: if the input `TensorAttr` is not fully specified. 
- """ - - attr = self._tensor_attr_cls.cast(*args, **kwargs) - attr = self._infer_unspecified_attr(attr) - - if not attr.is_fully_specified(): - raise ValueError( - f"The input TensorAttr '{attr}' is not fully " - f"specified. Please fully specify the input by " - f"specifying all 'UNSET' fields." - ) - - tensor = self._get_tensor(attr) - if tensor is None: - raise KeyError(f"A tensor corresponding to '{attr}' was not found") - return tensor - - def _get_tensor_size(self, attr: CuGraphTensorAttr) -> Union[List, int]: - return self._get_tensor(attr).size() - - def get_tensor_size(self, *args, **kwargs) -> Union[List, int]: - """ - Obtains the size of a tensor given its attributes, or :obj:`None` - if the tensor does not exist. - """ - attr = self._tensor_attr_cls.cast(*args, **kwargs) - if not attr.is_set("index"): - attr.index = None - return self._get_tensor_size(attr) - - def _remove_tensor(self, attr): - raise NotImplementedError("Removing features not supported") - - def _infer_unspecified_attr(self, attr: CuGraphTensorAttr) -> CuGraphTensorAttr: - if attr.properties == _field_status.UNSET: - # attempt to infer property names - if attr.group_name in self._tensor_attr_dict: - for n in self._tensor_attr_dict[attr.group_name]: - if attr.attr_name == n.attr_name: - attr.properties = n.properties - else: - raise KeyError(f"Invalid group name {attr.group_name}") - - if attr.dtype == _field_status.UNSET: - # attempt to infer dtype - if attr.group_name in self._tensor_attr_dict: - for n in self._tensor_attr_dict[attr.group_name]: - if attr.attr_name == n.attr_name: - attr.dtype = n.dtype - - return attr - - def filter( - self, - format: str, - node_dict: Dict[str, torch.Tensor], - row_dict: Dict[str, torch.Tensor], - col_dict: Dict[str, torch.Tensor], - edge_dict: Dict[str, Tuple[torch.Tensor]], - ) -> torch_geometric.data.HeteroData: - """ - Parameters - ---------- - format: str - COO or CSC - node_dict: Dict[str, torch.Tensor] - IDs of nodes in original store being outputted - row_dict: Dict[str, torch.Tensor] - Renumbered output edge index row - col_dict: Dict[str, torch.Tensor] - Renumbered output edge index column - edge_dict: Dict[str, Tuple[torch.Tensor]] - Currently unused original edge mapping - """ - data = torch_geometric.data.HeteroData() - - # TODO use torch_geometric.EdgeIndex in release 24.04 (Issue #4051) - for attr in self.get_all_edge_attrs(): - key = attr.edge_type - if key in row_dict and key in col_dict: - if format == "CSC": - data.put_edge_index( - (row_dict[key], col_dict[key]), - edge_type=key, - layout="csc", - is_sorted=True, - ) - else: - data[key].edge_index = torch.stack( - [ - row_dict[key], - col_dict[key], - ], - dim=0, - ) - - required_attrs = [] - # To prevent copying multiple times, we use a cache; - # the original node_dict serves as the gpu cache if needed - node_dict_cpu = {} - for attr in self.get_all_tensor_attrs(): - if attr.group_name in node_dict: - device = self.__features.get_storage(attr.group_name, attr.attr_name) - attr.index = node_dict[attr.group_name] - if not isinstance(attr.index, torch.Tensor): - raise ValueError("Node index must be a tensor!") - if attr.index.is_cuda and device == "cpu": - if attr.group_name not in node_dict_cpu: - node_dict_cpu[attr.group_name] = attr.index.cpu() - attr.index = node_dict_cpu[attr.group_name] - elif attr.index.is_cpu and device == "cuda": - node_dict_cpu[attr.group_name] = attr.index - node_dict[attr.group_name] = attr.index.cuda() - attr.index = node_dict[attr.group_name] - - required_attrs.append(attr) 
- data[attr.group_name].num_nodes = attr.index.size(0) - - tensors = self.multi_get_tensor(required_attrs) - for i, attr in enumerate(required_attrs): - data[attr.group_name][attr.attr_name] = tensors[i] - - return data - - def __len__(self): - return len(self.get_all_tensor_attrs()) diff --git a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py b/python/cugraph-pyg/cugraph_pyg/data/feature_store.py deleted file mode 100644 index b6450e7b192..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/data/feature_store.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -from typing import Optional, Tuple, List - -from cugraph.utilities.utils import import_optional, MissingModule - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") -tensordict = import_optional("tensordict") -wgth = import_optional("pylibwholegraph.torch") - - -class TensorDictFeatureStore( - object - if isinstance(torch_geometric, MissingModule) - else torch_geometric.data.FeatureStore -): - """ - A basic implementation of the PyG FeatureStore interface that stores - feature data in a single TensorDict. This type of feature store is - not distributed, so each node will have to load the entire graph's - features into memory. - """ - - def __init__(self): - """ - Constructs an empty TensorDictFeatureStore. 
- """ - super().__init__() - - self.__features = {} - - def _put_tensor( - self, - tensor: "torch_geometric.typing.FeatureTensorType", - attr: "torch_geometric.data.feature_store.TensorAttr", - ) -> bool: - if attr.group_name in self.__features: - td = self.__features[attr.group_name] - batch_size = td.batch_size[0] - - if attr.is_set("index"): - if attr.attr_name in td.keys(): - if attr.index.shape[0] != batch_size: - raise ValueError( - "Leading size of index tensor " - "does not match existing tensors for group name " - f"{attr.group_name}; Expected {batch_size}, " - f"got {attr.index.shape[0]}" - ) - td[attr.attr_name][attr.index] = tensor - return True - else: - warnings.warn( - "Ignoring index parameter " - f"(attribute does not exist for group {attr.group_name})" - ) - - if tensor.shape[0] != batch_size: - raise ValueError( - "Leading size of input tensor does not match " - f"existing tensors for group name {attr.group_name};" - f" Expected {batch_size}, got {tensor.shape[0]}" - ) - else: - batch_size = tensor.shape[0] - self.__features[attr.group_name] = tensordict.TensorDict( - {}, batch_size=batch_size - ) - - self.__features[attr.group_name][attr.attr_name] = tensor - return True - - def _get_tensor( - self, attr: "torch_geometric.data.feature_store.TensorAttr" - ) -> Optional["torch_geometric.typing.FeatureTensorType"]: - if attr.group_name not in self.__features: - return None - - if attr.attr_name not in self.__features[attr.group_name].keys(): - return None - - tensor = self.__features[attr.group_name][attr.attr_name] - return ( - tensor - if (attr.index is None or (not attr.is_set("index"))) - else tensor[attr.index] - ) - - def _remove_tensor( - self, attr: "torch_geometric.data.feature_store.TensorAttr" - ) -> bool: - if attr.group_name not in self.__features: - return False - - if attr.attr_name not in self.__features[attr.group_name].keys(): - return False - - del self.__features[attr.group_name][attr.attr_name] - return True - - def _get_tensor_size( - self, attr: "torch_geometric.data.feature_store.TensorAttr" - ) -> Tuple: - return self._get_tensor(attr).size() - - def get_all_tensor_attrs( - self, - ) -> List["torch_geometric.data.feature_store.TensorAttr"]: - attrs = [] - for group_name, td in self.__features.items(): - for attr_name in td.keys(): - attrs.append( - torch_geometric.data.feature_store.TensorAttr( - group_name, - attr_name, - ) - ) - - return attrs - - -class WholeFeatureStore( - object - if isinstance(torch_geometric, MissingModule) - else torch_geometric.data.FeatureStore -): - """ - A basic implementation of the PyG FeatureStore interface that stores - feature data in WholeGraph WholeMemory. This type of feature store is - distributed, and avoids data replication across workers. - - Data should be sliced before being passed into this feature store. - That means each worker should have its own partition and put_tensor - should be called for each worker's local partition. When calling - get_tensor, multi_get_tensor, etc., the entire tensor can be accessed - regardless of what worker's partition the desired slice of the tensor - is on. - """ - - def __init__(self, memory_type="distributed", location="cpu"): - """ - Constructs an empty WholeFeatureStore. - - Parameters - ---------- - memory_type: str (optional, default='distributed') - The memory type of this store. Options are - 'distributed', 'chunked', and 'continuous'. - For more information consult the WholeGraph - documentation. 
- location: str(optional, default='cpu') - The location ('cpu' or 'cuda') where data is stored. - """ - super().__init__() - - self.__features = {} - - self.__wg_comm = wgth.get_global_communicator() - self.__wg_type = memory_type - self.__wg_location = location - - def _put_tensor( - self, - tensor: "torch_geometric.typing.FeatureTensorType", - attr: "torch_geometric.data.feature_store.TensorAttr", - ) -> bool: - wg_comm_obj = self.__wg_comm - - if attr.is_set("index"): - if (attr.group_name, attr.attr_name) in self.__features: - raise NotImplementedError( - "Updating an embedding from an index" - " is not supported by WholeGraph." - ) - else: - warnings.warn( - "Ignoring index parameter " - f"(attribute does not exist for group {attr.group_name})" - ) - - if len(tensor.shape) > 2: - raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.") - - rank = torch.distributed.get_rank() - world_size = torch.distributed.get_world_size() - - ld = torch.tensor(tensor.shape[0], device="cuda", dtype=torch.int64) - sizes = torch.empty((world_size,), device="cuda", dtype=torch.int64) - torch.distributed.all_gather_into_tensor(sizes, ld) - - sizes = sizes.cpu() - ld = sizes.sum() - - td = -1 if len(tensor.shape) == 1 else tensor.shape[1] - global_shape = [ - int(ld), - td if td > 0 else 1, - ] - - if td < 0: - tensor = tensor.reshape((tensor.shape[0], 1)) - - wg_embedding = wgth.create_wholememory_tensor( - wg_comm_obj, - self.__wg_type, - self.__wg_location, - global_shape, - tensor.dtype, - [global_shape[1], 1], - ) - - offset = sizes[:rank].sum() if rank > 0 else 0 - - wg_embedding.scatter( - tensor.clone(memory_format=torch.contiguous_format).cuda(), - torch.arange( - offset, offset + tensor.shape[0], dtype=torch.int64, device="cuda" - ).contiguous(), - ) - - wg_comm_obj.barrier() - - self.__features[attr.group_name, attr.attr_name] = (wg_embedding, td) - return True - - def _get_tensor( - self, attr: "torch_geometric.data.feature_store.TensorAttr" - ) -> Optional["torch_geometric.typing.FeatureTensorType"]: - if (attr.group_name, attr.attr_name) not in self.__features: - return None - - emb, td = self.__features[attr.group_name, attr.attr_name] - - if attr.index is None or (not attr.is_set("index")): - attr.index = torch.arange(emb.shape[0], dtype=torch.int64) - - attr.index = attr.index.cuda() - t = emb.gather( - attr.index, - force_dtype=emb.dtype, - ) - - if td < 0: - t = t.reshape((t.shape[0],)) - - return t - - def _remove_tensor( - self, attr: "torch_geometric.data.feature_store.TensorAttr" - ) -> bool: - if (attr.group_name, attr.attr_name) not in self.__features: - return False - - del self.__features[attr.group_name, attr.attr_name] - return True - - def _get_tensor_size( - self, attr: "torch_geometric.data.feature_store.TensorAttr" - ) -> Tuple: - return self.__features[attr.group_name, attr.attr_name].shape - - def get_all_tensor_attrs( - self, - ) -> List["torch_geometric.data.feature_store.TensorAttr"]: - attrs = [] - for (group_name, attr_name) in self.__features.keys(): - attrs.append( - torch_geometric.data.feature_store.TensorAttr( - group_name=group_name, - attr_name=attr_name, - ) - ) - - return attrs diff --git a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py b/python/cugraph-pyg/cugraph_pyg/data/graph_store.py deleted file mode 100644 index c47dda5eaa5..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/data/graph_store.py +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import cupy -import cudf -import pandas - -import pylibcugraph - -from cugraph.utilities.utils import import_optional, MissingModule -from cugraph.gnn.comms import cugraph_comms_get_raft_handle - -from typing import Union, Optional, List, Dict, Tuple - - -# Have to use import_optional even though these are required -# dependencies in order to build properly. -torch_geometric = import_optional("torch_geometric") -torch = import_optional("torch") -tensordict = import_optional("tensordict") - -TensorType = Union["torch.Tensor", cupy.ndarray, np.ndarray, cudf.Series, pandas.Series] - - -class GraphStore( - object - if isinstance(torch_geometric, MissingModule) - else torch_geometric.data.GraphStore -): - """ - cuGraph-backed PyG GraphStore implementation that distributes - the graph across workers. This object uses lazy graph creation. - Users can repeatedly call put_edge_index, and the tensors won't - be converted into a cuGraph graph until one is needed - (i.e. when creating a loader). Supports - single-node/single-GPU, single-node/multi-GPU, and - multi-node/multi-GPU graph storage. - - Each worker should have a slice of the graph locally, and - call put_edge_index with its slice. - """ - - def __init__(self, is_multi_gpu: bool = False): - """ - Constructs a new, empty GraphStore object. This object - represents one slice of a graph on particular worker. 
- """ - self.__edge_indices = tensordict.TensorDict({}, batch_size=(2,)) - self.__sizes = {} - - self.__handle = None - self.__is_multi_gpu = is_multi_gpu - - self.__clear_graph() - - super().__init__() - - def __clear_graph(self): - self.__graph = None - self.__vertex_offsets = None - self.__weight_attr = None - - def _put_edge_index( - self, - edge_index: "torch_geometric.typing.EdgeTensorType", - edge_attr: "torch_geometric.data.EdgeAttr", - ) -> bool: - if edge_attr.layout != torch_geometric.data.graph_store.EdgeLayout.COO: - raise ValueError("Only COO format supported") - - if isinstance(edge_index, (cupy.ndarray, cudf.Series)): - edge_index = torch.as_tensor(edge_index, device="cuda") - elif isinstance(edge_index, (np.ndarray)): - edge_index = torch.as_tensor(edge_index, device="cpu") - elif isinstance(edge_index, pandas.Series): - edge_index = torch.as_tensor(edge_index.values, device="cpu") - elif isinstance(edge_index, cudf.Series): - edge_index = torch.as_tensor(edge_index.values, device="cuda") - - self.__edge_indices[edge_attr.edge_type] = torch.stack( - [edge_index[0], edge_index[1]] - ) - self.__sizes[edge_attr.edge_type] = edge_attr.size - - # invalidate the graph - self.__clear_graph() - return True - - def _get_edge_index( - self, edge_attr: "torch_geometric.data.EdgeAttr" - ) -> Optional["torch_geometric.typing.EdgeTensorType"]: - ei = torch_geometric.EdgeIndex(self.__edge_indices[edge_attr.edge_type]) - - if edge_attr.layout == "csr": - return ei.sort_by("row").values.get_csr() - elif edge_attr.layout == "csc": - return ei.sort_by("col").values.get_csc() - - return ei - - def _remove_edge_index(self, edge_attr: "torch_geometric.data.EdgeAttr") -> bool: - del self.__edge_indices[edge_attr.edge_type] - - # invalidate the graph - self.__clear_graph() - return True - - def get_all_edge_attrs(self) -> List["torch_geometric.data.EdgeAttr"]: - attrs = [] - for et in self.__edge_indices.keys(leaves_only=True, include_nested=True): - attrs.append( - torch_geometric.data.EdgeAttr( - edge_type=et, layout="coo", is_sorted=False, size=self.__sizes[et] - ) - ) - - return attrs - - @property - def is_multi_gpu(self): - return self.__is_multi_gpu - - @property - def _resource_handle(self): - if self.__handle is None: - if self.is_multi_gpu: - self.__handle = pylibcugraph.ResourceHandle( - cugraph_comms_get_raft_handle().getHandle() - ) - else: - self.__handle = pylibcugraph.ResourceHandle() - return self.__handle - - @property - def _graph(self) -> Union[pylibcugraph.SGGraph, pylibcugraph.MGGraph]: - graph_properties = pylibcugraph.GraphProperties( - is_multigraph=True, is_symmetric=False - ) - - if self.__graph is None: - edgelist_dict = self.__get_edgelist() - - if self.is_multi_gpu: - rank = torch.distributed.get_rank() - world_size = torch.distributed.get_world_size() - - vertices_array = cupy.arange( - sum(self._num_vertices().values()), dtype="int64" - ) - vertices_array = cupy.array_split(vertices_array, world_size)[rank] - - self.__graph = pylibcugraph.MGGraph( - self._resource_handle, - graph_properties, - [cupy.asarray(edgelist_dict["src"]).astype("int64")], - [cupy.asarray(edgelist_dict["dst"]).astype("int64")], - vertices_array=[vertices_array], - edge_id_array=[cupy.asarray(edgelist_dict["eid"])], - edge_type_array=[cupy.asarray(edgelist_dict["etp"])], - weight_array=[cupy.asarray(edgelist_dict["wgt"])] - if "wgt" in edgelist_dict - else None, - ) - else: - self.__graph = pylibcugraph.SGGraph( - self._resource_handle, - graph_properties, - 
cupy.asarray(edgelist_dict["src"]).astype("int64"), - cupy.asarray(edgelist_dict["dst"]).astype("int64"), - vertices_array=cupy.arange( - sum(self._num_vertices().values()), dtype="int64" - ), - edge_id_array=cupy.asarray(edgelist_dict["eid"]), - edge_type_array=cupy.asarray(edgelist_dict["etp"]), - weight_array=cupy.asarray(edgelist_dict["wgt"]) - if "wgt" in edgelist_dict - else None, - ) - - return self.__graph - - def _num_vertices(self) -> Dict[str, int]: - num_vertices = {} - for edge_attr in self.get_all_edge_attrs(): - if edge_attr.size is not None: - num_vertices[edge_attr.edge_type[0]] = ( - max(num_vertices[edge_attr.edge_type[0]], edge_attr.size[0]) - if edge_attr.edge_type[0] in num_vertices - else edge_attr.size[0] - ) - num_vertices[edge_attr.edge_type[2]] = ( - max(num_vertices[edge_attr.edge_type[2]], edge_attr.size[1]) - if edge_attr.edge_type[2] in num_vertices - else edge_attr.size[1] - ) - else: - if edge_attr.edge_type[0] != edge_attr.edge_type[2]: - if edge_attr.edge_type[0] not in num_vertices: - num_vertices[edge_attr.edge_type[0]] = int( - self.__edge_indices[edge_attr.edge_type][0].max() + 1 - ) - if edge_attr.edge_type[2] not in num_vertices: - num_vertices[edge_attr.edge_type[1]] = int( - self.__edge_indices[edge_attr.edge_type][1].max() + 1 - ) - elif edge_attr.edge_type[0] not in num_vertices: - num_vertices[edge_attr.edge_type[0]] = int( - self.__edge_indices[edge_attr.edge_type].max() + 1 - ) - - if self.is_multi_gpu: - vtypes = num_vertices.keys() - for vtype in vtypes: - sz = torch.tensor(num_vertices[vtype], device="cuda") - torch.distributed.all_reduce(sz, op=torch.distributed.ReduceOp.MAX) - num_vertices[vtype] = int(sz) - return num_vertices - - @property - def _vertex_offsets(self) -> Dict[str, int]: - if self.__vertex_offsets is None: - num_vertices = self._num_vertices() - ordered_keys = sorted(list(num_vertices.keys())) - self.__vertex_offsets = {} - offset = 0 - for vtype in ordered_keys: - self.__vertex_offsets[vtype] = offset - offset += num_vertices[vtype] - - return dict(self.__vertex_offsets) - - @property - def is_homogeneous(self) -> bool: - return len(self._vertex_offsets) == 1 - - def _set_weight_attr(self, attr: Tuple["torch_geometric.data.FeatureStore", str]): - if attr != self.__weight_attr: - self.__clear_graph() - self.__weight_attr = attr - - def __get_weight_tensor( - self, - sorted_keys: List[Tuple[str, str, str]], - start_offsets: "torch.Tensor", - num_edges_t: "torch.Tensor", - ): - feature_store, attr_name = self.__weight_attr - - weights = [] - for i, et in enumerate(sorted_keys): - ix = torch.arange( - start_offsets[i], - start_offsets[i] + num_edges_t[i], - dtype=torch.int64, - device="cpu", - ) - - weights.append(feature_store[et, attr_name][ix]) - - return torch.concat(weights) - - def __get_edgelist(self): - """ - Returns - ------- - Dict[str, torch.Tensor] with the following keys: - src: source vertices (int64) - Note that src is the 2nd element of the PyG edge index. - dst: destination vertices (int64) - Note that dst is the 1st element of the PyG edge index. - eid: edge ids for each edge (int64) - Note that these start from 0 for each edge type. - etp: edge types for each edge (int32) - Note that these are in lexicographic order. - """ - sorted_keys = sorted( - list(self.__edge_indices.keys(leaves_only=True, include_nested=True)) - ) - - # note that this still follows the PyG convention of (dst, rel, src) - # i.e. 
(author, writes, paper): [[0,1,2],[2,0,1]] is referring to a - # cuGraph graph where (paper 2) -> (author 0), (paper 0) -> (author 1), - # and (paper 1) -> (author 0) - edge_index = torch.concat( - [ - torch.stack( - [ - self.__edge_indices[dst_type, rel_type, src_type][0] - + self._vertex_offsets[dst_type], - self.__edge_indices[dst_type, rel_type, src_type][1] - + self._vertex_offsets[src_type], - ] - ) - for (dst_type, rel_type, src_type) in sorted_keys - ], - axis=1, - ).cuda() - - edge_type_array = torch.arange( - len(sorted_keys), dtype=torch.int32, device="cuda" - ).repeat_interleave( - torch.tensor( - [self.__edge_indices[et].shape[1] for et in sorted_keys], - device="cuda", - dtype=torch.int64, - ) - ) - - num_edges_t = torch.tensor( - [self.__edge_indices[et].shape[1] for et in sorted_keys], device="cuda" - ) - - if self.is_multi_gpu: - rank = torch.distributed.get_rank() - world_size = torch.distributed.get_world_size() - - num_edges_all_t = torch.empty( - world_size, num_edges_t.numel(), dtype=torch.int64, device="cuda" - ) - torch.distributed.all_gather_into_tensor(num_edges_all_t, num_edges_t) - - start_offsets = num_edges_all_t[:rank].T.sum(axis=1) - else: - rank = 0 - start_offsets = torch.zeros( - (len(sorted_keys),), dtype=torch.int64, device="cuda" - ) - num_edges_all_t = num_edges_t.reshape((1, num_edges_t.numel())) - - edge_id_array = torch.concat( - [ - torch.arange( - start_offsets[i], - start_offsets[i] + num_edges_all_t[rank][i], - dtype=torch.int64, - device="cuda", - ) - for i in range(len(sorted_keys)) - ] - ) - - d = { - "dst": edge_index[0], - "src": edge_index[1], - "etp": edge_type_array, - "eid": edge_id_array, - } - - if self.__weight_attr is not None: - d["wgt"] = self.__get_weight_tensor( - sorted_keys, start_offsets.cpu(), num_edges_t.cpu() - ).cuda() - - return d diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py deleted file mode 100644 index 31cbaf69ca5..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_mg.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This example shows how to use cuGraph nccl-only comms, pylibcuGraph, -# and PyTorch DDP to run a multi-GPU sampling workflow. Most users of the -# GNN packages will not interact with cuGraph directly. This example -# is intented for users who want to extend cuGraph within a DDP workflow. 
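In outline, each worker process in this example stands up comms before touching the graph: set the rendezvous environment variables, join the PyTorch nccl process group, initialize cuGraph's nccl-only comms with a shared unique id, and tear both down at the end. A minimal sketch, using only the cugraph.gnn helpers and torch.distributed calls that already appear in this script; the bootstrap() helper name and the explicit destroy_process_group() at the end are illustrative additions, not part of the script itself:

import os

import torch.distributed as dist

from cugraph.gnn import (
    cugraph_comms_init,
    cugraph_comms_shutdown,
    cugraph_comms_create_unique_id,
)


def bootstrap(rank: int, world_size: int, uid) -> None:
    # 1. Join the PyTorch process group (nccl backend, one process per GPU).
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    # 2. Bring up cuGraph's nccl-only comms on the same ranks/devices.
    #    `uid` is created once on the launching process with
    #    cugraph_comms_create_unique_id() and shared with every worker
    #    before spawning.
    cugraph_comms_init(rank, world_size, uid, device=rank)

    # ... build the MGGraph and run the distributed sampler here ...

    # 3. Tear down in reverse order.
    cugraph_comms_shutdown()
    dist.destroy_process_group()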
- -import os -import re -import tempfile - -import numpy as np -import torch -import torch.multiprocessing as tmp -import torch.distributed as dist - -import cudf - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_shutdown, - cugraph_comms_create_unique_id, - cugraph_comms_get_raft_handle, - DistSampleWriter, - UniformNeighborSampler, -) - -from pylibcugraph import MGGraph, ResourceHandle, GraphProperties - -from ogb.nodeproppred import NodePropPredDataset - - -def init_pytorch(rank, world_size): - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - dist.init_process_group("nccl", rank=rank, world_size=world_size) - - -def sample(rank: int, world_size: int, uid, edgelist, directory): - init_pytorch(rank, world_size) - - device = rank - cugraph_comms_init(rank, world_size, uid, device) - - print(f"rank {rank} initialized cugraph") - - src = cudf.Series(np.array_split(edgelist[0], world_size)[rank]) - dst = cudf.Series(np.array_split(edgelist[1], world_size)[rank]) - - seeds_per_rank = 50 - seeds = cudf.Series(np.arange(rank * seeds_per_rank, (rank + 1) * seeds_per_rank)) - handle = ResourceHandle(cugraph_comms_get_raft_handle().getHandle()) - - print("constructing graph") - G = MGGraph( - handle, - GraphProperties(is_multigraph=True, is_symmetric=False), - [src], - [dst], - ) - print("graph constructed") - - sample_writer = DistSampleWriter(directory=directory, batches_per_partition=2) - sampler = UniformNeighborSampler( - G, - sample_writer, - fanout=[5, 5], - ) - - sampler.sample_from_nodes(seeds, batch_size=16, random_state=62) - - dist.barrier() - cugraph_comms_shutdown() - print(f"rank {rank} shut down cugraph") - - -def main(): - world_size = torch.cuda.device_count() - uid = cugraph_comms_create_unique_id() - - dataset = NodePropPredDataset("ogbn-products") - el = dataset[0][0]["edge_index"].astype("int64") - - with tempfile.TemporaryDirectory() as directory: - tmp.spawn( - sample, - args=(world_size, uid, el, directory), - nprocs=world_size, - ) - - print("Printing samples...") - for file in os.listdir(directory): - m = re.match(r"batch=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet", file) - rank, start, _, end = int(m[1]), int(m[2]), int(m[3]), int(m[4]) - print(f"File: {file} (batches {start} to {end} for rank {rank})") - print(cudf.read_parquet(os.path.join(directory, file))) - print("\n") - - -if __name__ == "__main__": - main() diff --git a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py deleted file mode 100644 index de45acc7456..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/cugraph_dist_sampling_sg.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This example shows how to use cuGraph nccl-only comms, pylibcuGraph, -# and PyTorch to run a single-GPU sampling workflow. Most users of the -# GNN packages will not interact with cuGraph directly. 
This example -# is intented for users who want to extend cuGraph within a PyTorch workflow. - -import os -import re -import tempfile - -import numpy as np - -import cudf - -from cugraph.gnn import ( - DistSampleWriter, - UniformNeighborSampler, -) - -from pylibcugraph import SGGraph, ResourceHandle, GraphProperties - -from ogb.nodeproppred import NodePropPredDataset - - -def sample(edgelist, directory): - src = cudf.Series(edgelist[0]) - dst = cudf.Series(edgelist[1]) - - seeds_per_rank = 50 - seeds = cudf.Series(np.arange(0, seeds_per_rank)) - - print("constructing graph") - G = SGGraph( - ResourceHandle(), - GraphProperties(is_multigraph=True, is_symmetric=False), - src, - dst, - ) - print("graph constructed") - - sample_writer = DistSampleWriter(directory=directory, batches_per_partition=2) - sampler = UniformNeighborSampler( - G, - sample_writer, - fanout=[5, 5], - compression="CSR", - retain_original_seeds=True, - ) - - sampler.sample_from_nodes(seeds, batch_size=16, random_state=62) - - -def main(): - dataset = NodePropPredDataset("ogbn-products") - el = dataset[0][0]["edge_index"].astype("int64") - - with tempfile.TemporaryDirectory() as directory: - sample(el, directory) - - print("Printing samples...") - for file in os.listdir(directory): - m = re.match(r"batch=([0-9]+)\.([0-9]+)\-([0-9]+)\.([0-9]+)\.parquet", file) - rank, start, _, end = int(m[1]), int(m[2]), int(m[3]), int(m[4]) - print(f"File: {file} (batches {start} to {end} for rank {rank})") - print(cudf.read_parquet(os.path.join(directory, file))) - print("\n") - - -if __name__ == "__main__": - main() diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py deleted file mode 100644 index 127ca809d91..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_mnmg.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Multi-node, multi-GPU example with WholeGraph feature storage. -# Can be run with torchrun. - -import argparse -import os -import warnings -import tempfile -import time -import json - - -import torch -import torch.distributed as dist -import torch.nn.functional as F -from ogb.nodeproppred import PygNodePropPredDataset -from torch.nn.parallel import DistributedDataParallel - -import torch_geometric - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_shutdown, - cugraph_comms_create_unique_id, -) - -from pylibwholegraph.torch.initialize import ( - init as wm_init, - finalize as wm_finalize, -) - -# Allow computation on objects that are larger than GPU memory -# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory -os.environ["CUDF_SPILL"] = "1" - -# Ensures that a CUDA context is not created on import of rapids. 
-# Allows pytorch to create the context instead -os.environ["RAPIDS_NO_INITIALIZE"] = "1" - - -def init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id): - import rmm - - rmm.reinitialize( - devices=local_rank, - managed_memory=True, - pool_allocator=True, - ) - - import cupy - - cupy.cuda.Device(local_rank).use() - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - from cugraph.testing.mg_utils import enable_spilling - - enable_spilling() - - torch.cuda.set_device(local_rank) - - cugraph_comms_init( - rank=global_rank, world_size=world_size, uid=cugraph_id, device=local_rank - ) - - wm_init(global_rank, world_size, local_rank, torch.cuda.device_count()) - - -def partition_data(dataset, split_idx, edge_path, feature_path, label_path, meta_path): - data = dataset[0] - - # Split and save edge index - os.makedirs( - edge_path, - exist_ok=True, - ) - for (r, e) in enumerate(torch.tensor_split(data.edge_index, world_size, dim=1)): - rank_path = os.path.join(edge_path, f"rank={r}.pt") - torch.save( - e.clone(), - rank_path, - ) - - # Split and save features - os.makedirs( - feature_path, - exist_ok=True, - ) - - for (r, f) in enumerate(torch.tensor_split(data.x, world_size)): - rank_path = os.path.join(feature_path, f"rank={r}_x.pt") - torch.save( - f.clone(), - rank_path, - ) - for (r, f) in enumerate(torch.tensor_split(data.y, world_size)): - rank_path = os.path.join(feature_path, f"rank={r}_y.pt") - torch.save( - f.clone(), - rank_path, - ) - - # Split and save labels - os.makedirs( - label_path, - exist_ok=True, - ) - for (d, i) in split_idx.items(): - i_parts = torch.tensor_split(i, world_size) - for r, i_part in enumerate(i_parts): - rank_path = os.path.join(label_path, f"rank={r}") - os.makedirs(rank_path, exist_ok=True) - torch.save(i_part, os.path.join(rank_path, f"{d}.pt")) - - # Save metadata - meta = { - "num_classes": int(dataset.num_classes), - "num_features": int(dataset.num_features), - "num_nodes": int(data.num_nodes), - } - with open(meta_path, "w") as f: - json.dump(meta, f) - - -def load_partitioned_data( - rank, edge_path, feature_path, label_path, meta_path, wg_mem_type -): - from cugraph_pyg.data import GraphStore, WholeFeatureStore - - graph_store = GraphStore(is_multi_gpu=True) - feature_store = WholeFeatureStore(memory_type=wg_mem_type) - - # Load metadata - with open(meta_path, "r") as f: - meta = json.load(f) - - # Load labels - split_idx = {} - for split in ["train", "test", "valid"]: - split_idx[split] = torch.load( - os.path.join(label_path, f"rank={rank}", f"{split}.pt") - ) - - # Load features - feature_store["node", "x"] = torch.load( - os.path.join(feature_path, f"rank={rank}_x.pt") - ) - feature_store["node", "y"] = torch.load( - os.path.join(feature_path, f"rank={rank}_y.pt") - ) - - # Load edge index - eix = torch.load(os.path.join(edge_path, f"rank={rank}.pt")) - graph_store[ - ("node", "rel", "node"), "coo", False, (meta["num_nodes"], meta["num_nodes"]) - ] = eix - - return (feature_store, graph_store), split_idx, meta - - -def run_train( - global_rank, - data, - split_idx, - world_size, - device, - model, - epochs, - batch_size, - fan_out, - num_classes, - wall_clock_start, - tempdir=None, - num_layers=3, - in_memory=False, - seeds_per_call=-1, -): - optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) - - kwargs = dict( - num_neighbors=[fan_out] * num_layers, - batch_size=batch_size, - ) - # Set Up Neighbor Loading - from cugraph_pyg.loader import NeighborLoader 
- - ix_train = split_idx["train"].cuda() - train_path = None if in_memory else os.path.join(tempdir, f"train_{global_rank}") - if train_path: - os.mkdir(train_path) - train_loader = NeighborLoader( - data, - input_nodes=ix_train, - directory=train_path, - shuffle=True, - drop_last=True, - local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None, - **kwargs, - ) - - ix_test = split_idx["test"].cuda() - test_path = None if in_memory else os.path.join(tempdir, f"test_{global_rank}") - if test_path: - os.mkdir(test_path) - test_loader = NeighborLoader( - data, - input_nodes=ix_test, - directory=test_path, - shuffle=True, - drop_last=True, - local_seeds_per_call=80000, - **kwargs, - ) - - ix_valid = split_idx["valid"].cuda() - valid_path = None if in_memory else os.path.join(tempdir, f"valid_{global_rank}") - if valid_path: - os.mkdir(valid_path) - valid_loader = NeighborLoader( - data, - input_nodes=ix_valid, - directory=valid_path, - shuffle=True, - drop_last=True, - local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None, - **kwargs, - ) - - dist.barrier() - - eval_steps = 1000 - warmup_steps = 20 - dist.barrier() - torch.cuda.synchronize() - - if global_rank == 0: - prep_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total time before training begins (prep_time) =", prep_time, "seconds") - print("Beginning training...") - - for epoch in range(epochs): - for i, batch in enumerate(train_loader): - if i == warmup_steps: - torch.cuda.synchronize() - start = time.time() - - batch = batch.to(device) - batch_size = batch.batch_size - - batch.y = batch.y.view(-1).to(torch.long) - optimizer.zero_grad() - out = model(batch.x, batch.edge_index) - loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) - loss.backward() - optimizer.step() - if global_rank == 0 and i % 10 == 0: - print( - "Epoch: " - + str(epoch) - + ", Iteration: " - + str(i) - + ", Loss: " - + str(loss) - ) - nb = i + 1.0 - - if global_rank == 0: - print( - "Average Training Iteration Time:", - (time.time() - start) / (nb - warmup_steps), - "s/iter", - ) - - with torch.no_grad(): - total_correct = total_examples = 0 - for i, batch in enumerate(valid_loader): - if i >= eval_steps: - break - - batch = batch.to(device) - batch_size = batch.batch_size - - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index)[:batch_size] - - pred = out.argmax(dim=-1) - y = batch.y[:batch_size].view(-1).to(torch.long) - - total_correct += int((pred == y).sum()) - total_examples += y.size(0) - - acc_val = total_correct / total_examples - if global_rank == 0: - print( - f"Validation Accuracy: {acc_val * 100.0:.4f}%", - ) - - torch.cuda.synchronize() - - with torch.no_grad(): - total_correct = total_examples = 0 - for i, batch in enumerate(test_loader): - batch = batch.to(device) - batch_size = batch.batch_size - - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index)[:batch_size] - - pred = out.argmax(dim=-1) - y = batch.y[:batch_size].view(-1).to(torch.long) - - total_correct += int((pred == y).sum()) - total_examples += y.size(0) - - acc_test = total_correct / total_examples - if global_rank == 0: - print( - f"Test Accuracy: {acc_test * 100.0:.4f}%", - ) - - if global_rank == 0: - total_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total Program Runtime (total_time) =", total_time, "seconds") - print("total_time - prep_time =", total_time - prep_time, "seconds") - - wm_finalize() - cugraph_comms_shutdown() - - -def parse_args(): - parser = 
argparse.ArgumentParser() - parser.add_argument("--hidden_channels", type=int, default=256) - parser.add_argument("--num_layers", type=int, default=2) - parser.add_argument("--lr", type=float, default=0.001) - parser.add_argument("--epochs", type=int, default=4) - parser.add_argument("--batch_size", type=int, default=1024) - parser.add_argument("--fan_out", type=int, default=30) - parser.add_argument("--tempdir_root", type=str, default=None) - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--dataset", type=str, default="ogbn-products") - parser.add_argument("--skip_partition", action="store_true") - parser.add_argument("--wg_mem_type", type=str, default="distributed") - - parser.add_argument("--in_memory", action="store_true", default=False) - parser.add_argument("--seeds_per_call", type=int, default=-1) - - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - wall_clock_start = time.perf_counter() - - if "LOCAL_RANK" in os.environ: - dist.init_process_group("nccl") - world_size = dist.get_world_size() - global_rank = dist.get_rank() - local_rank = int(os.environ["LOCAL_RANK"]) - device = torch.device(local_rank) - - # Create the uid needed for cuGraph comms - if global_rank == 0: - cugraph_id = [cugraph_comms_create_unique_id()] - else: - cugraph_id = [None] - dist.broadcast_object_list(cugraph_id, src=0, device=device) - cugraph_id = cugraph_id[0] - - init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id) - - # Split the data - edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part") - feature_path = os.path.join(args.dataset_root, args.dataset + "_fea_part") - label_path = os.path.join(args.dataset_root, args.dataset + "_label_part") - meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json") - - # We partition the data to avoid loading it in every worker, which will - # waste memory and can lead to an out of memory exception. - # cugraph_pyg.GraphStore and cugraph_pyg.WholeFeatureStore are always - # constructed from partitions of the edge index and features, respectively, - # so this works well. - if not args.skip_partition and global_rank == 0: - dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root) - split_idx = dataset.get_idx_split() - - partition_data( - dataset, - split_idx, - meta_path=meta_path, - label_path=label_path, - feature_path=feature_path, - edge_path=edge_path, - ) - - dist.barrier() - data, split_idx, meta = load_partitioned_data( - rank=global_rank, - edge_path=edge_path, - feature_path=feature_path, - label_path=label_path, - meta_path=meta_path, - wg_mem_type=args.wg_mem_type, - ) - dist.barrier() - - model = torch_geometric.nn.models.GCN( - meta["num_features"], - args.hidden_channels, - args.num_layers, - meta["num_classes"], - ).to(device) - model = DistributedDataParallel(model, device_ids=[local_rank]) - - with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir: - run_train( - global_rank, - data, - split_idx, - world_size, - device, - model, - args.epochs, - args.batch_size, - args.fan_out, - meta["num_classes"], - wall_clock_start, - tempdir, - args.num_layers, - args.in_memory, - args.seeds_per_call, - ) - else: - warnings.warn("This script should be run with 'torchrun`. 
Exiting.") diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py deleted file mode 100644 index 0f9c39bf04d..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_sg.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time -import argparse -import tempfile -import os -import warnings - -from typing import Optional, Tuple, Dict - -import torch -import cupy - -import rmm -from rmm.allocators.cupy import rmm_cupy_allocator -from rmm.allocators.torch import rmm_torch_allocator - -# Must change allocators immediately upon import -# or else other imports will cause memory to be -# allocated and prevent changing the allocator -rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True) -cupy.cuda.set_allocator(rmm_cupy_allocator) -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - -import torch.nn.functional as F # noqa: E402 -import torch_geometric # noqa: E402 -import cugraph_pyg # noqa: E402 -from cugraph_pyg.loader import NeighborLoader # noqa: E402 - -# Enable cudf spilling to save gpu memory -from cugraph.testing.mg_utils import enable_spilling # noqa: E402 - -enable_spilling() - - -def train(epoch: int): - model.train() - for i, batch in enumerate(train_loader): - if i == warmup_steps: - torch.cuda.synchronize() - start_avg_time = time.perf_counter() - batch = batch.to(device) - - optimizer.zero_grad() - batch_size = batch.batch_size - out = model(batch.x, batch.edge_index)[:batch_size] - y = batch.y[:batch_size].view(-1).to(torch.long) - - loss = F.cross_entropy(out, y) - loss.backward() - optimizer.step() - - if i % 10 == 0: - print(f"Epoch: {epoch:02d}, Iteration: {i}, Loss: {loss:.4f}") - torch.cuda.synchronize() - print( - f"Average Training Iteration Time (s/iter): \ - {(time.perf_counter() - start_avg_time) / (i - warmup_steps):.6f}" - ) - - -@torch.no_grad() -def test(loader: NeighborLoader, val_steps: Optional[int] = None): - model.eval() - - total_correct = total_examples = 0 - for i, batch in enumerate(loader): - if val_steps is not None and i >= val_steps: - break - batch = batch.to(device) - batch_size = batch.batch_size - out = model(batch.x, batch.edge_index)[:batch_size] - pred = out.argmax(dim=-1) - y = batch.y[:batch_size].view(-1).to(torch.long) - - total_correct += int((pred == y).sum()) - total_examples += y.size(0) - - return total_correct / total_examples - - -def create_loader( - data, - num_neighbors, - input_nodes, - replace, - batch_size, - samples_dir, - stage_name, - local_seeds_per_call, -): - if samples_dir is not None: - directory = os.path.join(samples_dir, stage_name) - os.mkdir(directory) - else: - directory = None - return NeighborLoader( - data, - num_neighbors=num_neighbors, - input_nodes=input_nodes, - replace=replace, - batch_size=batch_size, - directory=directory, - local_seeds_per_call=local_seeds_per_call, - ) - - -def load_data( - dataset, dataset_root -) -> 
Tuple[ - Tuple[torch_geometric.data.FeatureStore, torch_geometric.data.GraphStore], - Dict[str, torch.Tensor], - int, - int, -]: - from ogb.nodeproppred import PygNodePropPredDataset - - dataset = PygNodePropPredDataset(dataset, root=dataset_root) - split_idx = dataset.get_idx_split() - data = dataset[0] - - graph_store = cugraph_pyg.data.GraphStore() - graph_store[ - ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes) - ] = data.edge_index - - feature_store = cugraph_pyg.data.TensorDictFeatureStore() - feature_store["node", "x"] = data.x - feature_store["node", "y"] = data.y - - return ( - (feature_store, graph_store), - split_idx, - dataset.num_features, - dataset.num_classes, - ) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--hidden_channels", type=int, default=256) - parser.add_argument("--num_layers", type=int, default=2) - parser.add_argument("--lr", type=float, default=0.001) - parser.add_argument("--epochs", type=int, default=4) - parser.add_argument("--batch_size", type=int, default=1024) - parser.add_argument("--fan_out", type=int, default=30) - parser.add_argument("--tempdir_root", type=str, default=None) - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--dataset", type=str, default="ogbn-products") - parser.add_argument("--in_memory", action="store_true", default=False) - parser.add_argument("--seeds_per_call", type=int, default=-1) - - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - wall_clock_start = time.perf_counter() - device = torch.device("cuda") - - data, split_idx, num_features, num_classes = load_data( - args.dataset, args.dataset_root - ) - - if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1": - warnings.warn("Pruning test dataset for CI run.") - split_idx["test"] = split_idx["test"][:1000] - - with tempfile.TemporaryDirectory(dir=args.tempdir_root) as samples_dir: - loader_kwargs = { - "data": data, - "num_neighbors": [args.fan_out] * args.num_layers, - "replace": False, - "batch_size": args.batch_size, - "samples_dir": None if args.in_memory else samples_dir, - "local_seeds_per_call": None - if args.seeds_per_call <= 0 - else args.seeds_per_call, - } - - train_loader = create_loader( - input_nodes=split_idx["train"], - stage_name="train", - **loader_kwargs, - ) - - val_loader = create_loader( - input_nodes=split_idx["valid"], - stage_name="val", - **loader_kwargs, - ) - - test_loader = create_loader( - input_nodes=split_idx["test"], - stage_name="test", - **loader_kwargs, - ) - - model = torch_geometric.nn.models.GCN( - num_features, - args.hidden_channels, - args.num_layers, - num_classes, - ).to(device) - - optimizer = torch.optim.Adam( - model.parameters(), lr=args.lr, weight_decay=0.0005 - ) - - warmup_steps = 20 - - torch.cuda.synchronize() - prep_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total time before training begins (prep_time)=", prep_time, "seconds") - print("Beginning training...") - for epoch in range(1, 1 + args.epochs): - train(epoch) - val_acc = test(val_loader, val_steps=100) - print(f"Val Acc: ~{val_acc:.4f}") - - test_acc = test(test_loader) - print(f"Test Acc: {test_acc:.4f}") - total_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total Program Runtime (total_time) =", total_time, "seconds") - print("total_time - prep_time =", total_time - prep_time, "seconds") diff --git a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py 
b/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py deleted file mode 100644 index 73efbc92a24..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/gcn_dist_snmg.py +++ /dev/null @@ -1,339 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Single-node, multi-GPU example. - -import argparse -import os -import tempfile -import time -import warnings - -import torch -import torch.distributed as dist -import torch.multiprocessing as mp -import torch.nn.functional as F -from ogb.nodeproppred import PygNodePropPredDataset -from torch.nn.parallel import DistributedDataParallel - -import torch_geometric - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_shutdown, - cugraph_comms_create_unique_id, -) - -# Allow computation on objects that are larger than GPU memory -# https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory -os.environ["CUDF_SPILL"] = "1" - -# Ensures that a CUDA context is not created on import of rapids. -# Allows pytorch to create the context instead -os.environ["RAPIDS_NO_INITIALIZE"] = "1" - - -def init_pytorch_worker(rank, world_size, cugraph_id): - import rmm - - rmm.reinitialize( - devices=rank, - managed_memory=True, - pool_allocator=True, - ) - - import cupy - - cupy.cuda.Device(rank).use() - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - from cugraph.testing.mg_utils import enable_spilling - - enable_spilling() - - torch.cuda.set_device(rank) - - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - dist.init_process_group("nccl", rank=rank, world_size=world_size) - - cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) - - -def run_train( - rank, - data, - world_size, - cugraph_id, - model, - epochs, - batch_size, - fan_out, - split_idx, - num_classes, - wall_clock_start, - tempdir=None, - num_layers=3, - in_memory=False, - seeds_per_call=-1, -): - - init_pytorch_worker( - rank, - world_size, - cugraph_id, - ) - - model = model.to(rank) - model = DistributedDataParallel(model, device_ids=[rank]) - optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005) - - kwargs = dict( - num_neighbors=[fan_out] * num_layers, - batch_size=batch_size, - ) - # Set Up Neighbor Loading - from cugraph_pyg.data import GraphStore, TensorDictFeatureStore - from cugraph_pyg.loader import NeighborLoader - - graph_store = GraphStore(is_multi_gpu=True) - ixr = torch.tensor_split(data.edge_index, world_size, dim=1)[rank] - graph_store[ - ("node", "rel", "node"), "coo", False, (data.num_nodes, data.num_nodes) - ] = ixr - - feature_store = TensorDictFeatureStore() - feature_store["node", "x"] = data.x - feature_store["node", "y"] = data.y - - dist.barrier() - - ix_train = torch.tensor_split(split_idx["train"], world_size)[rank].cuda() - train_path = None if in_memory else os.path.join(tempdir, f"train_{rank}") - if train_path: - os.mkdir(train_path) - 
train_loader = NeighborLoader( - (feature_store, graph_store), - input_nodes=ix_train, - directory=train_path, - shuffle=True, - drop_last=True, - local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None, - **kwargs, - ) - - ix_test = torch.tensor_split(split_idx["test"], world_size)[rank].cuda() - test_path = None if in_memory else os.path.join(tempdir, f"test_{rank}") - if test_path: - os.mkdir(test_path) - test_loader = NeighborLoader( - (feature_store, graph_store), - input_nodes=ix_test, - directory=test_path, - shuffle=True, - drop_last=True, - local_seeds_per_call=80000, - **kwargs, - ) - - ix_valid = torch.tensor_split(split_idx["valid"], world_size)[rank].cuda() - valid_path = None if in_memory else os.path.join(tempdir, f"valid_{rank}") - if valid_path: - os.mkdir(valid_path) - valid_loader = NeighborLoader( - (feature_store, graph_store), - input_nodes=ix_valid, - directory=valid_path, - shuffle=True, - drop_last=True, - local_seeds_per_call=seeds_per_call if seeds_per_call > 0 else None, - **kwargs, - ) - - dist.barrier() - - eval_steps = 1000 - warmup_steps = 20 - dist.barrier() - torch.cuda.synchronize() - - if rank == 0: - prep_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total time before training begins (prep_time) =", prep_time, "seconds") - print("Beginning training...") - for epoch in range(epochs): - for i, batch in enumerate(train_loader): - if i == warmup_steps: - torch.cuda.synchronize() - start = time.time() - - batch = batch.to(rank) - batch_size = batch.batch_size - - batch.y = batch.y.to(torch.long) - optimizer.zero_grad() - out = model(batch.x, batch.edge_index) - loss = F.cross_entropy(out[:batch_size], batch.y[:batch_size]) - loss.backward() - optimizer.step() - if rank == 0 and i % 10 == 0: - print( - "Epoch: " - + str(epoch) - + ", Iteration: " - + str(i) - + ", Loss: " - + str(loss) - ) - nb = i + 1.0 - - if rank == 0: - print( - "Average Training Iteration Time:", - (time.time() - start) / (nb - warmup_steps), - "s/iter", - ) - - with torch.no_grad(): - total_correct = total_examples = 0 - for i, batch in enumerate(valid_loader): - if i >= eval_steps: - break - - batch = batch.to(rank) - batch_size = batch.batch_size - - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index)[:batch_size] - - pred = out.argmax(dim=-1) - y = batch.y[:batch_size].view(-1).to(torch.long) - - total_correct += int((pred == y).sum()) - total_examples += y.size(0) - - acc_val = total_correct / total_examples - if rank == 0: - print( - f"Validation Accuracy: {acc_val * 100.0:.4f}%", - ) - - torch.cuda.synchronize() - - with torch.no_grad(): - total_correct = total_examples = 0 - for i, batch in enumerate(test_loader): - batch = batch.to(rank) - batch_size = batch.batch_size - - batch.y = batch.y.to(torch.long) - out = model(batch.x, batch.edge_index)[:batch_size] - - pred = out.argmax(dim=-1) - y = batch.y[:batch_size].view(-1).to(torch.long) - - total_correct += int((pred == y).sum()) - total_examples += y.size(0) - - acc_test = total_correct / total_examples - if rank == 0: - print( - f"Test Accuracy: {acc_test * 100.0:.4f}%", - ) - - if rank == 0: - total_time = round(time.perf_counter() - wall_clock_start, 2) - print("Total Program Runtime (total_time) =", total_time, "seconds") - print("total_time - prep_time =", total_time - prep_time, "seconds") - - cugraph_comms_shutdown() - dist.destroy_process_group() - - -if __name__ == "__main__": - if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1": - warnings.warn("Skipping 
SMNG example in CI due to memory limit") - else: - parser = argparse.ArgumentParser() - parser.add_argument("--hidden_channels", type=int, default=256) - parser.add_argument("--num_layers", type=int, default=2) - parser.add_argument("--lr", type=float, default=0.001) - parser.add_argument("--epochs", type=int, default=4) - parser.add_argument("--batch_size", type=int, default=1024) - parser.add_argument("--fan_out", type=int, default=30) - parser.add_argument("--tempdir_root", type=str, default=None) - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--dataset", type=str, default="ogbn-products") - parser.add_argument("--in_memory", action="store_true", default=False) - parser.add_argument("--seeds_per_call", type=int, default=-1) - - parser.add_argument( - "--n_devices", - type=int, - default=-1, - help="1-8 to use that many GPUs. Defaults to all available GPUs", - ) - - args = parser.parse_args() - wall_clock_start = time.perf_counter() - - from rmm.allocators.torch import rmm_torch_allocator - - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - - dataset = PygNodePropPredDataset(name=args.dataset, root=args.dataset_root) - split_idx = dataset.get_idx_split() - data = dataset[0] - data.y = data.y.reshape(-1) - - model = torch_geometric.nn.models.GCN( - dataset.num_features, - args.hidden_channels, - args.num_layers, - dataset.num_classes, - ) - - print("Data =", data) - if args.n_devices == -1: - world_size = torch.cuda.device_count() - else: - world_size = args.n_devices - print("Using", world_size, "GPUs...") - - # Create the uid needed for cuGraph comms - cugraph_id = cugraph_comms_create_unique_id() - - with tempfile.TemporaryDirectory(dir=args.tempdir_root) as tempdir: - mp.spawn( - run_train, - args=( - data, - world_size, - cugraph_id, - model, - args.epochs, - args.batch_size, - args.fan_out, - split_idx, - dataset.num_classes, - wall_clock_start, - tempdir, - args.num_layers, - args.in_memory, - args.seeds_per_call, - ), - nprocs=world_size, - join=True, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py deleted file mode 100644 index 145675c8a06..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# For this script, dask must be started first in a separate process. -# To do this, the `start_dask.sh` script has been provided. This scripts starts -# a dask scheduler and dask workers. To select the GPUs and amount of memory -# allocated to dask per GPU, the `CUDA_VISIBLE_DEVICES` and `WORKER_RMM_POOL_SIZE` -# arguments in that script can be modified. -# To connect to dask, the scheduler JSON file must be provided. This can be done -# using the `--dask_scheduler_file` argument in the mg python script being run. 
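Concretely, once `start_dask.sh` has the scheduler and workers running, each training process only needs to attach to them and enable cuGraph's comms layer. A minimal sketch of that hand-off, mirroring the start_cugraph_dask_client()/stop_cugraph_dask_client() helpers defined later in this file; the attach_to_dask/detach_from_dask names are illustrative only:

from distributed import Client
from cugraph.dask.comms import comms as Comms


def attach_to_dask(dask_scheduler_file: str) -> Client:
    # Connect to the scheduler started out-of-band by start_dask.sh.
    client = Client(scheduler_file=dask_scheduler_file)
    # Enable peer-to-peer comms for cuGraph's multi-GPU algorithms.
    Comms.initialize(p2p=True)
    return client


def detach_from_dask(client: Client) -> None:
    # Tear down in reverse order: cuGraph comms first, then the dask client.
    Comms.destroy()
    client.close()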
- -from ogb.nodeproppred import NodePropPredDataset - -import time -import argparse -import gc -import warnings - -import torch -import numpy as np - -from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv - -import torch.nn as nn -import torch.nn.functional as F - -import torch.distributed as td -import torch.multiprocessing as tmp -from torch.nn.parallel import DistributedDataParallel as ddp - -from typing import List - - -class CuGraphSAGE(nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, num_layers): - super().__init__() - - self.convs = torch.nn.ModuleList() - self.convs.append(CuGraphSAGEConv(in_channels, hidden_channels)) - for _ in range(num_layers - 1): - conv = CuGraphSAGEConv(hidden_channels, hidden_channels) - self.convs.append(conv) - - self.lin = nn.Linear(hidden_channels, out_channels) - - def forward(self, x, edge, size): - edge_csc = CuGraphSAGEConv.to_csc(edge, (size[0], size[0])) - for conv in self.convs: - x = conv(x, edge_csc)[: size[1]] - x = F.relu(x) - x = F.dropout(x, p=0.5) - - return self.lin(x) - - -def enable_cudf_spilling(): - import cudf - - cudf.set_option("spill", True) - - -def init_pytorch_worker(rank, devices, manager_ip, manager_port) -> None: - import cupy - import rmm - - device_id = devices[rank] - - rmm.reinitialize( - devices=[device_id], - pool_allocator=False, - ) - - # torch.cuda.change_current_allocator(rmm.rmm_torch_allocator) - # cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) - - cupy.cuda.Device(device_id).use() - torch.cuda.set_device(device_id) - - # Pytorch training worker initialization - dist_init_method = f"tcp://{manager_ip}:{manager_port}" - - torch.distributed.init_process_group( - backend="nccl", - init_method=dist_init_method, - world_size=len(devices), - rank=rank, - ) - - # enable_cudf_spilling() - - -def start_cugraph_dask_client(rank, dask_scheduler_file): - print( - "Connecting to dask... " - "(warning: this may take a while depending on your configuration)" - ) - start_time_connect_dask = time.perf_counter_ns() - from distributed import Client - from cugraph.dask.comms import comms as Comms - - client = Client(scheduler_file=dask_scheduler_file) - Comms.initialize(p2p=True) - - end_time_connect_dask = time.perf_counter_ns() - print( - f"Successfully connected to dask on rank {rank}, took " - f"{(end_time_connect_dask - start_time_connect_dask) / 1e9:3.4f} s" - ) - return client - - -def stop_cugraph_dask_client(): - from cugraph.dask.comms import comms as Comms - - Comms.destroy() - - from dask.distributed import get_client - - get_client().close() - - -def train( - rank, - torch_devices: List[int], - manager_ip: str, - manager_port: int, - dask_scheduler_file: str, - num_epochs: int, - features_on_gpu=True, -) -> None: - """ - Parameters - ---------- - device: int - The CUDA device where the model, graph data, and node labels will be stored. - features_on_gpu: bool - Whether to store a replica of features on each worker's GPU. If False, - all features will be stored on the CPU. 
- """ - - start_time_preprocess = time.perf_counter_ns() - - world_size = len(torch_devices) - device_id = torch_devices[rank] - features_device = device_id if features_on_gpu else "cpu" - init_pytorch_worker(rank, torch_devices, manager_ip, manager_port) - td.barrier() - - client = start_cugraph_dask_client(rank, dask_scheduler_file) - - from distributed import Event as Dask_Event - - event = Dask_Event("cugraph_store_creation_event") - download_event = Dask_Event("dataset_download_event") - - td.barrier() - - import cugraph - from cugraph_pyg.data import DaskGraphStore - from cugraph_pyg.loader import DaskNeighborLoader - - if rank == 0: - print("Rank 0 downloading dataset") - dataset = NodePropPredDataset(name="ogbn-mag") - data = dataset[0] - download_event.set() - print("Dataset downloaded") - else: - if download_event.wait(timeout=1000): - print(f"Rank {rank} loading dataset") - dataset = NodePropPredDataset(name="ogbn-mag") - data = dataset[0] - print(f"Rank {rank} loaded dataset successfully") - - ei = data[0]["edge_index_dict"][("paper", "cites", "paper")] - G = { - ("paper", "cites", "paper"): np.stack( - [np.concatenate([ei[0], ei[1]]), np.concatenate([ei[1], ei[0]])] - ) - } - N = {"paper": data[0]["num_nodes_dict"]["paper"]} - - fs = cugraph.gnn.FeatureStore(backend="torch") - - fs.add_data( - torch.as_tensor(data[0]["node_feat_dict"]["paper"], device=features_device), - "paper", - "x", - ) - - fs.add_data(torch.as_tensor(data[1]["paper"].T[0], device=device_id), "paper", "y") - - num_papers = data[0]["num_nodes_dict"]["paper"] - - if rank == 0: - train_perc = 0.1 - all_train_nodes = torch.randperm(num_papers) - all_train_nodes = all_train_nodes[: int(train_perc * num_papers)] - train_nodes = all_train_nodes[: int(len(all_train_nodes) / world_size)] - - train_mask = torch.full((num_papers,), -1, device=device_id) - train_mask[train_nodes] = 1 - fs.add_data(train_mask, "paper", "train") - - print(f"Rank {rank} finished loading graph and feature data") - - if rank == 0: - print("Rank 0 creating its cugraph store and initializing distributed graph") - # Rank 0 will initialize the distributed cugraph graph. - cugraph_store_create_start = time.perf_counter_ns() - print("G:", G[("paper", "cites", "paper")].shape) - cugraph_store = DaskGraphStore(fs, G, N, multi_gpu=True) - cugraph_store_create_end = time.perf_counter_ns() - print( - "cuGraph Store created on rank 0 in " - f"{(cugraph_store_create_end - cugraph_store_create_start) / 1e9:3.4f} s" - ) - client.publish_dataset(train_nodes=all_train_nodes) - event.set() - print("Rank 0 done with cugraph store creation") - else: - if event.wait(timeout=1000): - print(f"Rank {rank} creating cugraph store") - train_nodes = client.get_dataset("train_nodes") - train_nodes = train_nodes[ - int(rank * len(train_nodes) / world_size) : int( - (rank + 1) * len(train_nodes) / world_size - ) - ] - - train_mask = torch.full((num_papers,), -1, device=device_id) - train_mask[train_nodes] = 1 - fs.add_data(train_mask, "paper", "train") - - # Will automatically use the stored distributed cugraph graph on rank 0. 
- cugraph_store_create_start = time.perf_counter_ns() - cugraph_store = DaskGraphStore(fs, G, N, multi_gpu=True) - cugraph_store_create_end = time.perf_counter_ns() - print( - f"Rank {rank} created cugraph store in " - f"{(cugraph_store_create_end - cugraph_store_create_start) / 1e9:3.4f}" - " s" - ) - print(f"Rank {rank} done with cugraph store creation") - - end_time_preprocess = time.perf_counter_ns() - print(f"rank {rank}: train {train_nodes.shape}", flush=True) - print( - f"rank {rank}: all preprocessing took" - f" {(end_time_preprocess - start_time_preprocess) / 1e9:3.4f}", - flush=True, - ) - td.barrier() - model = ( - CuGraphSAGE(in_channels=128, hidden_channels=64, out_channels=349, num_layers=3) - .to(torch.float32) - .to(device_id) - ) - model = ddp(model, device_ids=[device_id], output_device=device_id) - td.barrier() - - optimizer = torch.optim.Adam(model.parameters(), lr=0.01) - - for epoch in range(num_epochs): - start_time_train = time.perf_counter_ns() - model.train() - - start_time_loader = time.perf_counter_ns() - cugraph_bulk_loader = DaskNeighborLoader( - cugraph_store, - train_nodes, - batch_size=250, - num_neighbors=[10, 10, 10], - seeds_per_call=1000, - batches_per_partition=2, - replace=False, - ) - end_time_loader = time.perf_counter_ns() - total_time_loader = (end_time_loader - start_time_loader) / 1e9 - - total_loss = 0 - num_batches = 0 - - print(f"rank {rank} starting epoch {epoch}") - with td.algorithms.join.Join([model]): - total_time_sample = 0 - total_time_forward = 0 - total_time_backward = 0 - - start_time_sample = time.perf_counter_ns() - for iter_i, hetero_data in enumerate(cugraph_bulk_loader): - end_time_sample = time.perf_counter_ns() - total_time_sample += (end_time_sample - start_time_sample) / 1e9 - num_batches += 1 - - if iter_i % 20 == 0: - print(f"iteration {iter_i}") - - # train - train_mask = hetero_data.train_dict["paper"] - y_true = hetero_data.y_dict["paper"] - - start_time_forward = time.perf_counter_ns() - y_pred = model( - hetero_data.x_dict["paper"].to(device_id).to(torch.float32), - hetero_data.edge_index_dict[("paper", "cites", "paper")].to( - device_id - ), - (len(y_true), len(y_true)), - ) - end_time_forward = time.perf_counter_ns() - total_time_forward += (end_time_forward - start_time_forward) / 1e9 - - y_true = F.one_hot( - y_true[train_mask].to(torch.int64), num_classes=349 - ).to(torch.float32) - - y_pred = y_pred[train_mask] - - loss = F.cross_entropy(y_pred, y_true) - - start_time_backward = time.perf_counter_ns() - optimizer.zero_grad() - loss.backward() - optimizer.step() - end_time_backward = time.perf_counter_ns() - total_time_backward += (end_time_backward - start_time_backward) / 1e9 - - total_loss += loss.item() - - del y_true - del y_pred - del loss - del hetero_data - gc.collect() - - start_time_sample = time.perf_counter_ns() - - end_time_train = time.perf_counter_ns() - print( - f"epoch {epoch} " - f"total time: {(end_time_train - start_time_train) / 1e9:3.4f} s" - f"\nloader create time per batch: {total_time_loader / num_batches} s" - f"\nsampling/load time per batch: {total_time_sample / num_batches} s" - f"\nforward time per batch: {total_time_forward / num_batches} s" - f"\nbackward time per batch: {total_time_backward / num_batches} s" - f"\nnum batches: {num_batches}" - ) - print(f"loss after epoch {epoch}: {total_loss / num_batches}") - - td.barrier() - if rank == 0: - print("DONE", flush=True) - client.unpublish_dataset("train_nodes") - event.clear() - - td.destroy_process_group() - - -def parse_args(): 
- parser = argparse.ArgumentParser() - parser.add_argument( - "--torch_devices", - type=str, - default="0,1", - help="GPU to allocate to pytorch for model, graph data, and node label storage", - required=False, - ) - - parser.add_argument( - "--num_epochs", - type=int, - default=1, - help="Number of training epochs", - required=False, - ) - - parser.add_argument( - "--features_on_gpu", - type=bool, - default=True, - help="Whether to store the features on each worker's GPU", - required=False, - ) - - parser.add_argument( - "--torch_manager_ip", - type=str, - default="127.0.0.1", - help="The torch distributed manager ip address", - required=False, - ) - - parser.add_argument( - "--torch_manager_port", - type=str, - default="12346", - help="The torch distributed manager port", - required=False, - ) - - parser.add_argument( - "--dask_scheduler_file", - type=str, - help="The path to the dask scheduler file", - required=False, - default=None, - ) - - return parser.parse_args() - - -def main(): - args = parse_args() - if args.dask_scheduler_file is None: - warnings.warn( - "You must provide the dask scheduler file " "to run this example. Exiting." - ) - - else: - torch_devices = [int(d) for d in args.torch_devices.split(",")] - - train_args = ( - torch_devices, - args.torch_manager_ip, - args.torch_manager_port, - args.dask_scheduler_file, - args.num_epochs, - args.features_on_gpu, - ) - - tmp.spawn(train, args=train_args, nprocs=len(torch_devices)) - - -if __name__ == "__main__": - main() diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py deleted file mode 100644 index e0169ee2c25..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import time -import argparse -import gc - -import torch - -from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv - -import torch.nn as nn -import torch.nn.functional as F - -from typing import Union - - -class CuGraphSAGE(nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, num_layers): - super().__init__() - - self.convs = torch.nn.ModuleList() - self.convs.append(CuGraphSAGEConv(in_channels, hidden_channels)) - for _ in range(num_layers - 1): - conv = CuGraphSAGEConv(hidden_channels, hidden_channels) - self.convs.append(conv) - - self.lin = nn.Linear(hidden_channels, out_channels) - - def forward(self, x, edge, size): - edge_csc = CuGraphSAGEConv.to_csc(edge, (size[0], size[0])) - for conv in self.convs: - x = conv(x, edge_csc)[: size[1]] - x = F.relu(x) - x = F.dropout(x, p=0.5) - - return self.lin(x) - - -def init_pytorch_worker(device_id: int) -> None: - import cupy - import rmm - - rmm.reinitialize( - devices=[device_id], - pool_allocator=False, - ) - - cupy.cuda.Device(device_id).use() - torch.cuda.set_device(device_id) - - -def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -> None: - """ - Parameters - ---------- - device: int - The CUDA device where the model, graph data, and node labels will be stored. - features_device: Union[str, int] - The device (CUDA device or CPU) where features will be stored. - """ - - init_pytorch_worker(device) - - import cugraph - from cugraph_pyg.data import DaskGraphStore - from cugraph_pyg.loader import DaskNeighborLoader - - from ogb.nodeproppred import NodePropPredDataset - - dataset = NodePropPredDataset(name="ogbn-mag") - data = dataset[0] - - G = data[0]["edge_index_dict"] - N = data[0]["num_nodes_dict"] - - fs = cugraph.gnn.FeatureStore(backend="torch") - - fs.add_data( - torch.as_tensor(data[0]["node_feat_dict"]["paper"], device=features_device), - "paper", - "x", - ) - - fs.add_data(torch.as_tensor(data[1]["paper"].T[0], device=device), "paper", "y") - - num_papers = data[0]["num_nodes_dict"]["paper"] - train_perc = 0.1 - - train_nodes = torch.randperm(num_papers) - train_nodes = train_nodes[: int(train_perc * num_papers)] - - train_mask = torch.full((num_papers,), -1, device=device) - train_mask[train_nodes] = 1 - - fs.add_data(train_mask, "paper", "train") - - cugraph_store = DaskGraphStore(fs, G, N) - - model = ( - CuGraphSAGE(in_channels=128, hidden_channels=64, out_channels=349, num_layers=3) - .to(torch.float32) - .to(device) - ) - - optimizer = torch.optim.Adam(model.parameters(), lr=0.01) - - for epoch in range(num_epochs): - start_time_train = time.perf_counter_ns() - model.train() - - cugraph_bulk_loader = DaskNeighborLoader( - cugraph_store, train_nodes, batch_size=500, num_neighbors=[10, 25] - ) - - total_loss = 0 - num_batches = 0 - - # This context manager will handle different # batches per rank - # barrier() cannot do this since the number of ops per rank is - # different. It essentially acts like barrier would if the - # number of ops per rank was the same. 
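The context manager the comment above refers to (used in the multi-GPU variant of this example) is torch.distributed's Join. A minimal sketch of the uneven-inputs pattern it enables, assuming an already-initialized process group, a DistributedDataParallel model named ddp_model, and a per-rank iterable loader (all placeholders, not part of this example):

    from torch.distributed.algorithms.join import Join

    with Join([ddp_model]):                       # DDP models are Joinable
        for batch in loader:                      # ranks may see different batch counts
            loss = compute_loss(ddp_model, batch) # placeholder loss computation
            loss.backward()                       # ranks that finish early keep collectives in sync
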
- for iter_i, hetero_data in enumerate(cugraph_bulk_loader): - num_batches += 1 - if iter_i % 20 == 0: - print(f"iteration {iter_i}") - - # train - train_mask = hetero_data.train_dict["paper"] - y_true = hetero_data.y_dict["paper"] - - y_pred = model( - hetero_data.x_dict["paper"].to(device).to(torch.float32), - hetero_data.edge_index_dict[("paper", "cites", "paper")].to(device), - (len(y_true), len(y_true)), - ) - - y_true = F.one_hot(y_true[train_mask].to(torch.int64), num_classes=349).to( - torch.float32 - ) - - y_pred = y_pred[train_mask] - - loss = F.cross_entropy(y_pred, y_true) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - total_loss += loss.item() - - del y_true - del y_pred - del loss - del hetero_data - gc.collect() - - end_time_train = time.perf_counter_ns() - print( - f"epoch {epoch} time: " - f"{(end_time_train - start_time_train) / 1e9:3.4f} s" - ) - print(f"loss after epoch {epoch}: {total_loss / num_batches}") - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--device", - type=int, - default=0, - help="GPU to allocate to pytorch for model, graph data, and node label storage", - required=False, - ) - - parser.add_argument( - "--features_device", - type=str, - default="0", - help="Device to allocate to pytorch for feature storage", - required=False, - ) - - parser.add_argument( - "--num_epochs", - type=int, - default=1, - help="Number of training epochs", - required=False, - ) - - return parser.parse_args() - - -def main(): - args = parse_args() - - try: - features_device = int(args.features_device) - except ValueError: - features_device = args.features_device - - train(args.device, features_device, args.num_epochs) - - -if __name__ == "__main__": - main() diff --git a/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_mg.py deleted file mode 100644 index 832c5ec74f0..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_mg.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This example shows how to use cuGraph nccl-only comms, pylibcuGraph, -# and PyTorch DDP to run a multi-GPU workflow. Most users of the -# GNN packages will not interact with cuGraph directly. This example -# is intented for users who want to extend cuGraph within a DDP workflow. 
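Underneath the cuGraph-specific pieces, the example below is a standard PyTorch DDP launch. A stripped-down sketch of just that bootstrap, with the cuGraph calls elided (it uses the same rendezvous settings as the example's init_pytorch helper and needs one GPU per rank):

    import os
    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp

    def worker(rank: int, world_size: int):
        # Rendezvous settings matching the example below.
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        dist.init_process_group("nccl", rank=rank, world_size=world_size)
        torch.cuda.set_device(rank)

        # ... per-rank cuGraph comms setup and graph work would go here ...

        dist.barrier()
        dist.destroy_process_group()

    if __name__ == "__main__":
        world_size = torch.cuda.device_count()
        mp.spawn(worker, args=(world_size,), nprocs=world_size)
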
- -import os - -import pandas -import numpy as np -import torch -import torch.multiprocessing as tmp -import torch.distributed as dist - -import cudf - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_shutdown, - cugraph_comms_create_unique_id, - cugraph_comms_get_raft_handle, -) - -from pylibcugraph import MGGraph, ResourceHandle, GraphProperties, degrees - -from ogb.nodeproppred import NodePropPredDataset - - -def init_pytorch(rank, world_size): - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - dist.init_process_group("nccl", rank=rank, world_size=world_size) - - -def calc_degree(rank: int, world_size: int, uid, edgelist): - init_pytorch(rank, world_size) - - device = rank - cugraph_comms_init(rank, world_size, uid, device) - - print(f"rank {rank} initialized cugraph") - - src = cudf.Series(np.array_split(edgelist[0], world_size)[rank]) - dst = cudf.Series(np.array_split(edgelist[1], world_size)[rank]) - - seeds = cudf.Series(np.arange(rank * 50, (rank + 1) * 50)) - handle = ResourceHandle(cugraph_comms_get_raft_handle().getHandle()) - - print("constructing graph") - G = MGGraph( - handle, - GraphProperties(is_multigraph=True, is_symmetric=False), - [src], - [dst], - ) - print("graph constructed") - - print("calculating degrees") - vertices, in_deg, out_deg = degrees(handle, G, seeds, do_expensive_check=False) - print("degrees calculated") - - print("constructing dataframe") - df = pandas.DataFrame( - {"v": vertices.get(), "in": in_deg.get(), "out": out_deg.get()} - ) - print(df) - - dist.barrier() - cugraph_comms_shutdown() - print(f"rank {rank} shut down cugraph") - - -def main(): - world_size = torch.cuda.device_count() - uid = cugraph_comms_create_unique_id() - - dataset = NodePropPredDataset("ogbn-products") - el = dataset[0][0]["edge_index"].astype("int64") - - tmp.spawn( - calc_degree, - args=(world_size, uid, el), - nprocs=world_size, - ) - - -if __name__ == "__main__": - main() diff --git a/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_sg.py deleted file mode 100644 index 2f273ee581e..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/pylibcugraph_sg.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This example shows how to use cuGraph and pylibcuGraph to run a -# single-GPU workflow. Most users of the GNN packages will not interact -# with cuGraph directly. This example is intented for users who want -# to extend cuGraph within a PyTorch workflow. 
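Because pylibcugraph hands results back as CuPy arrays (the example below calls .get() on them to print a host-side dataframe), feeding them into a PyTorch workflow is mostly a matter of zero-copy conversion. An illustrative sketch, assuming PyTorch >= 1.10 and CuPy >= 10 so DLPack interchange is available:

    import cupy
    import torch

    out_deg = cupy.arange(1, 11)        # stand-in for an out-degree array returned by degrees()
    deg_t = torch.from_dlpack(out_deg)  # zero-copy view of the same GPU buffer
    norm = 1.0 / deg_t.float().clamp(min=1.0)  # e.g. a degree-based normalization term for a GNN layer
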
- -import pandas -import numpy as np - -import cudf - -from pylibcugraph import SGGraph, ResourceHandle, GraphProperties, degrees - -from ogb.nodeproppred import NodePropPredDataset - - -def calc_degree(edgelist): - src = cudf.Series(edgelist[0]) - dst = cudf.Series(edgelist[1]) - - seeds = cudf.Series(np.arange(256)) - - print("constructing graph") - G = SGGraph( - ResourceHandle(), - GraphProperties(is_multigraph=True, is_symmetric=False), - src, - dst, - ) - print("graph constructed") - - print("calculating degrees") - vertices, in_deg, out_deg = degrees( - ResourceHandle(), G, seeds, do_expensive_check=False - ) - print("degrees calculated") - - print("constructing dataframe") - df = pandas.DataFrame( - {"v": vertices.get(), "in": in_deg.get(), "out": out_deg.get()} - ) - print(df) - - print("done") - - -def main(): - dataset = NodePropPredDataset("ogbn-products") - el = dataset[0][0]["edge_index"].astype("int64") - calc_degree(el) - - -if __name__ == "__main__": - main() diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py deleted file mode 100644 index 5c75e01e6f5..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_mnmg.py +++ /dev/null @@ -1,418 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This example illustrates link classification using the ogbl-wikikg2 dataset. - -import os -import json -import argparse -import warnings - -import torch - -import torch.nn.functional as F -from torch.nn import Parameter -from torch_geometric.nn import FastRGCNConv, GAE -from torch.nn.parallel import DistributedDataParallel - -from ogb.linkproppred import PygLinkPropPredDataset - -import cugraph_pyg - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_create_unique_id, - cugraph_comms_shutdown, -) - -from pylibwholegraph.torch.initialize import ( - init as wm_init, - finalize as wm_finalize, -) - - -# Enable cudf spilling to save gpu memory -from cugraph.testing.mg_utils import enable_spilling - -# Ensures that a CUDA context is not created on import of rapids. 
-# Allows pytorch to create the context instead -os.environ["RAPIDS_NO_INITIALIZE"] = "1" - - -def init_pytorch_worker(global_rank, local_rank, world_size, uid): - import rmm - - rmm.reinitialize(devices=[local_rank], pool_allocator=True, managed_memory=True) - - import cupy - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - cugraph_comms_init( - global_rank, - world_size, - uid, - local_rank, - ) - - wm_init(global_rank, world_size, local_rank, torch.cuda.device_count()) - - enable_spilling() - - -class RGCNEncoder(torch.nn.Module): - def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30): - super().__init__() - self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels)) - self.conv1 = FastRGCNConv( - hidden_channels, hidden_channels, num_relations, num_bases=num_bases - ) - self.conv2 = FastRGCNConv( - hidden_channels, hidden_channels, num_relations, num_bases=num_bases - ) - self.reset_parameters() - - def reset_parameters(self): - torch.nn.init.xavier_uniform_(self.node_emb) - self.conv1.reset_parameters() - self.conv2.reset_parameters() - - def forward(self, edge_index, edge_type): - x = self.node_emb - x = self.conv1(x, edge_index, edge_type).relu_() - x = F.dropout(x, p=0.2, training=self.training) - x = self.conv2(x, edge_index, edge_type) - return x - - -def train(epoch, model, optimizer, train_loader, edge_feature_store, num_steps=None): - model.train() - optimizer.zero_grad() - - for i, batch in enumerate(train_loader): - r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda() - z = model.encode(batch.edge_index, r) - - loss = model.recon_loss(z, batch.edge_index) - loss.backward() - optimizer.step() - - if i % 10 == 0: - print( - f"Epoch: {epoch:02d}, Iteration: {i:02d}, Loss: {loss:.4f}", flush=True - ) - if num_steps and i == num_steps: - break - - -def test(stage, epoch, model, loader, num_steps=None): - # TODO support ROC-AUC metric - # Predict probabilities of future edges - model.eval() - - rr = 0.0 - for i, (h, h_neg, t, t_neg, r) in enumerate(loader): - if num_steps and i >= num_steps: - break - - ei = torch.concatenate( - [ - torch.stack([h, t]).cuda(), - torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(), - ], - dim=-1, - ) - - r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda() - - z = model.encode(ei, r) - q = model.decode(z, ei) - - _, ix = torch.sort(q, descending=True) - rr += 1.0 / (1.0 + ix[0]) - - print(f"epoch {epoch:02d} {stage} mrr:", rr / i, flush=True) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--hidden_channels", type=int, default=128) - parser.add_argument("--num_layers", type=int, default=1) - parser.add_argument("--lr", type=float, default=0.001) - parser.add_argument("--epochs", type=int, default=4) - parser.add_argument("--batch_size", type=int, default=16384) - parser.add_argument("--num_neg", type=int, default=500) - parser.add_argument("--num_pos", type=int, default=-1) - parser.add_argument("--fan_out", type=int, default=10) - parser.add_argument("--dataset", type=str, default="ogbl-wikikg2") - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--seeds_per_call", type=int, default=-1) - parser.add_argument("--n_devices", type=int, default=-1) - parser.add_argument("--skip_partition", action="store_true") - - return parser.parse_args() - - -def run_train(rank, world_size, model, data, edge_feature_store, meta, splits, args): - model = 
model.to(rank) - model = GAE(DistributedDataParallel(model, device_ids=[rank])) - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - - eli = torch.stack([splits["train"]["head"], splits["train"]["tail"]]) - - train_loader = cugraph_pyg.loader.LinkNeighborLoader( - data, - [args.fan_out] * args.num_layers, - edge_label_index=eli, - local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None, - batch_size=args.batch_size, - shuffle=True, - drop_last=True, - ) - - def get_eval_loader(stage: str): - head = splits[stage]["head"] - tail = splits[stage]["tail"] - - head_neg = splits[stage]["head_neg"][:, : args.num_neg] - tail_neg = splits[stage]["tail_neg"][:, : args.num_neg] - - rel = splits[stage]["relation"] - - return torch.utils.data.DataLoader( - torch.utils.data.TensorDataset( - head.pin_memory(), - head_neg.pin_memory(), - tail.pin_memory(), - tail_neg.pin_memory(), - rel.pin_memory(), - ), - batch_size=1, - shuffle=False, - drop_last=True, - ) - - test_loader = get_eval_loader("test") - valid_loader = get_eval_loader("valid") - - num_train_steps = (args.num_pos // args.batch_size) if args.num_pos > 0 else 100 - - for epoch in range(1, 1 + args.epochs): - train( - epoch, - model, - optimizer, - train_loader, - edge_feature_store, - num_steps=num_train_steps, - ) - test("validation", epoch, model, valid_loader, num_steps=1024) - - test("test", epoch, model, test_loader, num_steps=1024) - - wm_finalize() - cugraph_comms_shutdown() - - -def partition_data( - data, splits, meta, edge_path, rel_path, pos_path, neg_path, meta_path -): - # Split and save edge index - os.makedirs( - edge_path, - exist_ok=True, - ) - for (r, e) in enumerate(torch.tensor_split(data.edge_index, world_size, dim=1)): - rank_path = os.path.join(edge_path, f"rank={r}.pt") - torch.save( - e.clone(), - rank_path, - ) - - # Split and save edge reltypes - os.makedirs( - rel_path, - exist_ok=True, - ) - for (r, f) in enumerate(torch.tensor_split(data.edge_reltype, world_size)): - rank_path = os.path.join(rel_path, f"rank={r}.pt") - torch.save( - f.clone(), - rank_path, - ) - - # Split and save positive edges - os.makedirs( - pos_path, - exist_ok=True, - ) - for stage in ["train", "test", "valid"]: - for (r, n) in enumerate( - torch.tensor_split( - torch.stack([splits[stage]["head"], splits[stage]["tail"]]), - world_size, - dim=-1, - ) - ): - rank_path = os.path.join(pos_path, f"rank={r}_{stage}.pt") - torch.save( - n.clone(), - rank_path, - ) - - # Split and save negative edges - os.makedirs( - neg_path, - exist_ok=True, - ) - for stage in ["test", "valid"]: - for (r, n) in enumerate( - torch.tensor_split( - torch.stack([splits[stage]["head_neg"], splits[stage]["tail_neg"]]), - world_size, - dim=1, - ) - ): - rank_path = os.path.join(neg_path, f"rank={r}_{stage}.pt") - torch.save(n.clone(), rank_path) - for (r, n) in enumerate( - torch.tensor_split(splits[stage]["relation"], world_size, dim=-1) - ): - print(n) - rank_path = os.path.join(neg_path, f"rank={r}_{stage}_relation.pt") - torch.save(n.clone(), rank_path) - - with open(meta_path, "w") as f: - json.dump(meta, f) - - -def load_partitioned_data(rank, edge_path, rel_path, pos_path, neg_path, meta_path): - from cugraph_pyg.data import GraphStore, WholeFeatureStore, TensorDictFeatureStore - - graph_store = GraphStore() - feature_store = TensorDictFeatureStore() - edge_feature_store = WholeFeatureStore() - - # Load edge index - graph_store[("n", "e", "n"), "coo"] = torch.load( - os.path.join(edge_path, f"rank={rank}.pt") - ) - - # Load edge 
rel type - edge_feature_store[("n", "e", "n"), "rel"] = torch.load( - os.path.join(rel_path, f"rank={rank}.pt") - ) - - splits = {} - - # Load positive edges - for stage in ["train", "test", "valid"]: - head, tail = torch.load(os.path.join(pos_path, f"rank={rank}_{stage}.pt")) - splits[stage] = { - "head": head, - "tail": tail, - } - - # Load negative edges - for stage in ["test", "valid"]: - head_neg, tail_neg = torch.load( - os.path.join(neg_path, f"rank={rank}_{stage}.pt") - ) - relation = torch.load( - os.path.join(neg_path, f"rank={rank}_{stage}_relation.pt") - ) - splits[stage]["head_neg"] = head_neg - splits[stage]["tail_neg"] = tail_neg - splits[stage]["relation"] = relation - - with open(meta_path, "r") as f: - meta = json.load(f) - - return (feature_store, graph_store), edge_feature_store, splits, meta - - -if __name__ == "__main__": - args = parse_args() - - if "LOCAL_RANK" in os.environ: - torch.distributed.init_process_group("nccl") - world_size = torch.distributed.get_world_size() - global_rank = torch.distributed.get_rank() - local_rank = int(os.environ["LOCAL_RANK"]) - device = torch.device(local_rank) - - # Create the uid needed for cuGraph comms - if global_rank == 0: - cugraph_id = [cugraph_comms_create_unique_id()] - else: - cugraph_id = [None] - torch.distributed.broadcast_object_list(cugraph_id, src=0, device=device) - cugraph_id = cugraph_id[0] - - init_pytorch_worker(global_rank, local_rank, world_size, cugraph_id) - - # Split the data - edge_path = os.path.join(args.dataset_root, args.dataset + "_eix_part") - rel_path = os.path.join(args.dataset_root, args.dataset + "_rel_part") - pos_path = os.path.join(args.dataset_root, args.dataset + "_e_pos_part") - neg_path = os.path.join(args.dataset_root, args.dataset + "_e_neg_part") - meta_path = os.path.join(args.dataset_root, args.dataset + "_meta.json") - - if not args.skip_partition and global_rank == 0: - data = PygLinkPropPredDataset(args.dataset, root=args.dataset_root) - dataset = data[0] - - splits = data.get_edge_split() - - meta = {} - meta["num_nodes"] = int(dataset.num_nodes) - meta["num_rels"] = int(dataset.edge_reltype.max()) + 1 - - partition_data( - dataset, - splits, - meta, - edge_path=edge_path, - rel_path=rel_path, - pos_path=pos_path, - neg_path=neg_path, - meta_path=meta_path, - ) - del data - del dataset - del splits - torch.distributed.barrier() - - # Load partitions - data, edge_feature_store, splits, meta = load_partitioned_data( - rank=global_rank, - edge_path=edge_path, - rel_path=rel_path, - pos_path=pos_path, - neg_path=neg_path, - meta_path=meta_path, - ) - torch.distributed.barrier() - - model = RGCNEncoder( - meta["num_nodes"], - hidden_channels=args.hidden_channels, - num_relations=meta["num_rels"], - ) - - run_train( - global_rank, world_size, model, data, edge_feature_store, meta, splits, args - ) - else: - warnings.warn("This script should be run with 'torchrun`. Exiting.") diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py deleted file mode 100644 index 67d7eecc7c2..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_sg.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This example illustrates link classification using the ogbl-wikikg2 dataset. - -import argparse - -from typing import Tuple, Dict, Any - -import torch -import cupy - -import rmm -from rmm.allocators.cupy import rmm_cupy_allocator -from rmm.allocators.torch import rmm_torch_allocator - -# Must change allocators immediately upon import -# or else other imports will cause memory to be -# allocated and prevent changing the allocator -rmm.reinitialize(devices=[0], pool_allocator=True, managed_memory=True) -cupy.cuda.set_allocator(rmm_cupy_allocator) -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - -import torch.nn.functional as F # noqa: E402 -from torch.nn import Parameter # noqa: E402 -from torch_geometric.nn import FastRGCNConv, GAE # noqa: E402 -import torch_geometric # noqa: E402 -import cugraph_pyg # noqa: E402 - -# Enable cudf spilling to save gpu memory -from cugraph.testing.mg_utils import enable_spilling # noqa: E402 - -enable_spilling() - - -class RGCNEncoder(torch.nn.Module): - def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30): - super().__init__() - self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels)) - self.conv1 = FastRGCNConv( - hidden_channels, hidden_channels, num_relations, num_bases=num_bases - ) - self.conv2 = FastRGCNConv( - hidden_channels, hidden_channels, num_relations, num_bases=num_bases - ) - self.reset_parameters() - - def reset_parameters(self): - torch.nn.init.xavier_uniform_(self.node_emb) - self.conv1.reset_parameters() - self.conv2.reset_parameters() - - def forward(self, edge_index, edge_type): - x = self.node_emb - x = self.conv1(x, edge_index, edge_type).relu_() - x = F.dropout(x, p=0.2, training=self.training) - x = self.conv2(x, edge_index, edge_type) - return x - - -def load_data( - dataset_str, dataset_root: str -) -> Tuple[ - Tuple["torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"], - "torch_geometric.data.FeatureStore", - Dict[str, Dict[str, "torch.Tensor"]], - Dict[str, Any], -]: - from ogb.linkproppred import PygLinkPropPredDataset - - data = PygLinkPropPredDataset(dataset_str, root=dataset_root) - dataset = data[0] - - splits = data.get_edge_split() - - from cugraph_pyg.data import GraphStore, TensorDictFeatureStore - - graph_store = GraphStore() - feature_store = TensorDictFeatureStore() - edge_feature_store = TensorDictFeatureStore() - meta = {} - - graph_store[("n", "e", "n"), "coo"] = dataset.edge_index - edge_feature_store[("n", "e", "n"), "rel"] = dataset.edge_reltype.pin_memory() - meta["num_nodes"] = dataset.num_nodes - meta["num_rels"] = dataset.edge_reltype.max() + 1 - - return (feature_store, graph_store), edge_feature_store, splits, meta - - -def train(epoch, model, optimizer, train_loader, edge_feature_store): - model.train() - optimizer.zero_grad() - - for i, batch in enumerate(train_loader): - r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda() - z = model.encode(batch.edge_index, r) - - loss = model.recon_loss(z, batch.edge_index) - loss.backward() - optimizer.step() - - if i % 10 == 0: - print(f"Epoch: {epoch:02d}, 
Iteration: {i:02d}, Loss: {loss:.4f}") - if i == 100: - break - - -def test(stage, epoch, model, loader, num_steps=None): - # TODO support ROC-AUC metric - # Predict probabilities of future edges - model.eval() - - rr = 0.0 - for i, (h, h_neg, t, t_neg, r) in enumerate(loader): - if num_steps and i >= num_steps: - break - - ei = torch.concatenate( - [ - torch.stack([h, t]).cuda(), - torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(), - ], - dim=-1, - ) - - r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda() - - z = model.encode(ei, r) - q = model.decode(z, ei) - - _, ix = torch.sort(q, descending=True) - rr += 1.0 / (1.0 + ix[0]) - - print(f"epoch {epoch:02d} {stage} mrr:", rr / i) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--hidden_channels", type=int, default=128) - parser.add_argument("--num_layers", type=int, default=1) - parser.add_argument("--lr", type=float, default=0.001) - parser.add_argument("--epochs", type=int, default=4) - parser.add_argument("--batch_size", type=int, default=16384) - parser.add_argument("--num_neg", type=int, default=500) - parser.add_argument("--fan_out", type=int, default=10) - parser.add_argument("--dataset", type=str, default="ogbl-wikikg2") - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--seeds_per_call", type=int, default=-1) - - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - data, edge_feature_store, splits, meta = load_data(args.dataset, args.dataset_root) - - model = GAE( - RGCNEncoder( - meta["num_nodes"], - hidden_channels=args.hidden_channels, - num_relations=meta["num_rels"], - ) - ).cuda() - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - - train_loader = cugraph_pyg.loader.LinkNeighborLoader( - data, - [args.fan_out] * args.num_layers, - edge_label_index=torch.stack( - [splits["train"]["head"], splits["train"]["tail"]] - ), - local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None, - batch_size=args.batch_size, - shuffle=True, - drop_last=True, - ) - - def get_eval_loader(stage: str): - head = splits[stage]["head"] - tail = splits[stage]["tail"] - - head_neg = splits[stage]["head_neg"][:, : args.num_neg] - tail_neg = splits[stage]["tail_neg"][:, : args.num_neg] - - rel = splits[stage]["relation"] - - return torch.utils.data.DataLoader( - torch.utils.data.TensorDataset( - head.pin_memory(), - head_neg.pin_memory(), - tail.pin_memory(), - tail_neg.pin_memory(), - rel.pin_memory(), - ), - batch_size=1, - shuffle=False, - drop_last=True, - ) - - test_loader = get_eval_loader("test") - valid_loader = get_eval_loader("valid") - - for epoch in range(1, 1 + args.epochs): - train(epoch, model, optimizer, train_loader, edge_feature_store) - test("validation", epoch, model, valid_loader, num_steps=1024) - - test("test", epoch, model, test_loader, num_steps=1024) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py b/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py deleted file mode 100644 index 2c0ae53a08e..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/rgcn_link_class_snmg.py +++ /dev/null @@ -1,320 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This example illustrates link classification using the ogbl-wikikg2 dataset. - -import os -import argparse -import warnings - -from typing import Tuple, Any - -import torch - -import torch.nn.functional as F -from torch.nn import Parameter -from torch_geometric.nn import FastRGCNConv, GAE -from torch.nn.parallel import DistributedDataParallel - -import torch_geometric -import cugraph_pyg - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_create_unique_id, - cugraph_comms_shutdown, -) - -from pylibwholegraph.torch.initialize import ( - init as wm_init, - finalize as wm_finalize, -) - - -# Enable cudf spilling to save gpu memory -from cugraph.testing.mg_utils import enable_spilling - -# Ensures that a CUDA context is not created on import of rapids. -# Allows pytorch to create the context instead -os.environ["RAPIDS_NO_INITIALIZE"] = "1" - - -def init_pytorch_worker(rank, world_size, uid): - import rmm - - rmm.reinitialize(devices=[rank], pool_allocator=True, managed_memory=True) - - import cupy - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - cugraph_comms_init( - rank, - world_size, - uid, - rank, - ) - - wm_init(rank, world_size, rank, world_size) - - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - torch.distributed.init_process_group( - "nccl", - rank=rank, - world_size=world_size, - ) - - enable_spilling() - - -class RGCNEncoder(torch.nn.Module): - def __init__(self, num_nodes, hidden_channels, num_relations, num_bases=30): - super().__init__() - self.node_emb = Parameter(torch.empty(num_nodes, hidden_channels)) - self.conv1 = FastRGCNConv( - hidden_channels, hidden_channels, num_relations, num_bases=num_bases - ) - self.conv2 = FastRGCNConv( - hidden_channels, hidden_channels, num_relations, num_bases=num_bases - ) - self.reset_parameters() - - def reset_parameters(self): - torch.nn.init.xavier_uniform_(self.node_emb) - self.conv1.reset_parameters() - self.conv2.reset_parameters() - - def forward(self, edge_index, edge_type): - x = self.node_emb - x = self.conv1(x, edge_index, edge_type).relu_() - x = F.dropout(x, p=0.2, training=self.training) - x = self.conv2(x, edge_index, edge_type) - return x - - -def load_data( - rank: int, - world_size: int, - data: Any, -) -> Tuple[ - Tuple["torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore"], - "torch_geometric.data.FeatureStore", -]: - from cugraph_pyg.data import GraphStore, WholeFeatureStore, TensorDictFeatureStore - - graph_store = GraphStore() - feature_store = TensorDictFeatureStore() # empty fs required by PyG - edge_feature_store = WholeFeatureStore() - - graph_store[("n", "e", "n"), "coo"] = torch.tensor_split( - data.edge_index.cuda(), world_size, dim=1 - )[rank] - - edge_feature_store[("n", "e", "n"), "rel"] = torch.tensor_split( - data.edge_reltype.cuda(), - world_size, - )[rank] - - return (feature_store, graph_store), edge_feature_store - - -def train(epoch, model, optimizer, train_loader, edge_feature_store, num_steps=None): - model.train() - optimizer.zero_grad() - - for i, batch in 
enumerate(train_loader): - r = edge_feature_store[("n", "e", "n"), "rel"][batch.e_id].flatten().cuda() - z = model.encode(batch.edge_index, r) - - loss = model.recon_loss(z, batch.edge_index) - loss.backward() - optimizer.step() - - if i % 10 == 0: - print( - f"Epoch: {epoch:02d}, Iteration: {i:02d}, Loss: {loss:.4f}", flush=True - ) - if num_steps and i == num_steps: - break - - -def test(stage, epoch, model, loader, num_steps=None): - # TODO support ROC-AUC metric - # Predict probabilities of future edges - model.eval() - - rr = 0.0 - for i, (h, h_neg, t, t_neg, r) in enumerate(loader): - if num_steps and i >= num_steps: - break - - ei = torch.concatenate( - [ - torch.stack([h, t]).cuda(), - torch.stack([h_neg.flatten(), t_neg.flatten()]).cuda(), - ], - dim=-1, - ) - - r = torch.concatenate([r, torch.repeat_interleave(r, h_neg.shape[-1])]).cuda() - - z = model.encode(ei, r) - q = model.decode(z, ei) - - _, ix = torch.sort(q, descending=True) - rr += 1.0 / (1.0 + ix[0]) - - print(f"epoch {epoch:02d} {stage} mrr:", rr / i, flush=True) - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--hidden_channels", type=int, default=128) - parser.add_argument("--num_layers", type=int, default=1) - parser.add_argument("--lr", type=float, default=0.001) - parser.add_argument("--epochs", type=int, default=4) - parser.add_argument("--batch_size", type=int, default=16384) - parser.add_argument("--num_neg", type=int, default=500) - parser.add_argument("--num_pos", type=int, default=-1) - parser.add_argument("--fan_out", type=int, default=10) - parser.add_argument("--dataset", type=str, default="ogbl-wikikg2") - parser.add_argument("--dataset_root", type=str, default="dataset") - parser.add_argument("--seeds_per_call", type=int, default=-1) - parser.add_argument("--n_devices", type=int, default=-1) - - return parser.parse_args() - - -def run_train(rank, world_size, uid, model, data, meta, splits, args): - init_pytorch_worker( - rank, - world_size, - uid, - ) - - model = model.to(rank) - model = GAE(DistributedDataParallel(model, device_ids=[rank])) - optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) - - data, edge_feature_store = load_data(rank, world_size, data) - - eli = torch.stack( - [ - torch.tensor_split(splits["train"]["head"], world_size)[rank], - torch.tensor_split(splits["train"]["tail"], world_size)[rank], - ] - ) - - train_loader = cugraph_pyg.loader.LinkNeighborLoader( - data, - [args.fan_out] * args.num_layers, - edge_label_index=eli, - local_seeds_per_call=args.seeds_per_call if args.seeds_per_call > 0 else None, - batch_size=args.batch_size, - shuffle=True, - drop_last=True, - ) - - def get_eval_loader(stage: str): - head = torch.tensor_split(splits[stage]["head"], world_size)[rank] - tail = torch.tensor_split(splits[stage]["tail"], world_size)[rank] - - head_neg = torch.tensor_split( - splits[stage]["head_neg"][:, : args.num_neg], world_size - )[rank] - tail_neg = torch.tensor_split( - splits[stage]["tail_neg"][:, : args.num_neg], world_size - )[rank] - - rel = torch.tensor_split(splits[stage]["relation"], world_size)[rank] - - return torch.utils.data.DataLoader( - torch.utils.data.TensorDataset( - head.pin_memory(), - head_neg.pin_memory(), - tail.pin_memory(), - tail_neg.pin_memory(), - rel.pin_memory(), - ), - batch_size=1, - shuffle=False, - drop_last=True, - ) - - test_loader = get_eval_loader("test") - valid_loader = get_eval_loader("valid") - - num_train_steps = (args.num_pos // args.batch_size) if args.num_pos > 0 else 100 - - for epoch in 
range(1, 1 + args.epochs): - train( - epoch, - model, - optimizer, - train_loader, - edge_feature_store, - num_steps=num_train_steps, - ) - test("validation", epoch, model, valid_loader, num_steps=1024) - - test("test", epoch, model, test_loader, num_steps=1024) - - wm_finalize() - cugraph_comms_shutdown() - - -if __name__ == "__main__": - if "CI_RUN" in os.environ and os.environ["CI_RUN"] == "1": - warnings.warn("Skipping SMNG example in CI due to memory limit") - else: - args = parse_args() - - # change the allocator before any allocations are made - from rmm.allocators.torch import rmm_torch_allocator - - torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - - # import ogb here to stop it from creating a context and breaking pytorch/rmm - from ogb.linkproppred import PygLinkPropPredDataset - - data = PygLinkPropPredDataset(args.dataset, root=args.dataset_root) - dataset = data[0] - - splits = data.get_edge_split() - - meta = {} - meta["num_nodes"] = dataset.num_nodes - meta["num_rels"] = dataset.edge_reltype.max() + 1 - - model = RGCNEncoder( - meta["num_nodes"], - hidden_channels=args.hidden_channels, - num_relations=meta["num_rels"], - ) - - print("Data =", data) - if args.n_devices == -1: - world_size = torch.cuda.device_count() - else: - world_size = args.n_devices - print("Using", world_size, "GPUs...") - - uid = cugraph_comms_create_unique_id() - torch.multiprocessing.spawn( - run_train, - (world_size, uid, model, data, meta, splits, args), - nprocs=world_size, - join=True, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/examples/start_dask.sh b/python/cugraph-pyg/cugraph_pyg/examples/start_dask.sh deleted file mode 100755 index 54c82f81298..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/examples/start_dask.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -WORKER_RMM_POOL_SIZE=14G \ -CUDA_VISIBLE_DEVICES=0,1 \ -SCHEDULER_FILE=$(pwd)/scheduler.json \ -../../../../mg_utils/run-dask-process.sh \ - scheduler workers \ - --tcp diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py deleted file mode 100644 index c804b3d1f97..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import warnings - -from cugraph_pyg.loader.node_loader import NodeLoader -from cugraph_pyg.loader.neighbor_loader import NeighborLoader - -from cugraph_pyg.loader.link_loader import LinkLoader -from cugraph_pyg.loader.link_neighbor_loader import LinkNeighborLoader - -from cugraph_pyg.loader.dask_node_loader import DaskNeighborLoader - -from cugraph_pyg.loader.dask_node_loader import BulkSampleLoader - - -def CuGraphNeighborLoader(*args, **kwargs): - warnings.warn( - "CuGraphNeighborLoader has been renamed to DaskNeighborLoader", FutureWarning - ) - return DaskNeighborLoader(*args, **kwargs) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py deleted file mode 100644 index 9b24281b190..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/loader/dask_node_loader.py +++ /dev/null @@ -1,558 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile - -import os -import re -import warnings - -import cupy -import cudf - -from cugraph.gnn import BulkSampler -from cugraph.utilities.utils import import_optional, MissingModule - -from cugraph_pyg.data import DaskGraphStore -from cugraph_pyg.sampler.sampler_utils import ( - _sampler_output_from_sampling_results_heterogeneous, - _sampler_output_from_sampling_results_homogeneous_csr, - _sampler_output_from_sampling_results_homogeneous_coo, -) - -from typing import Union, Tuple, Sequence, List, Dict - -torch_geometric = import_optional("torch_geometric") -torch = import_optional("torch") -InputNodes = ( - Sequence - if isinstance(torch_geometric, MissingModule) - else torch_geometric.typing.InputNodes -) - - -class BulkSampleLoader: - """ - Iterator that executes sampling using Dask and cuGraph and - loads sampled minibatches from disk. - """ - - __ex_parquet_file = re.compile(r"batch=([0-9]+)\-([0-9]+)\.parquet") - - def __init__( - self, - feature_store: DaskGraphStore, - graph_store: DaskGraphStore, - input_nodes: InputNodes = None, - batch_size: int = 0, - *, - shuffle: bool = False, - drop_last: bool = True, - edge_types: Sequence[Tuple[str]] = None, - directory: Union[str, tempfile.TemporaryDirectory] = None, - input_files: List[str] = None, - starting_batch_id: int = 0, - batches_per_partition: int = 100, - # Sampler args - num_neighbors: Union[List[int], Dict[Tuple[str, str, str], List[int]]] = None, - replace: bool = True, - compression: str = "COO", - # Other kwargs for the BulkSampler - **kwargs, - ): - """ - Executes a bulk sampling job immediately upon creation. - Allows iteration over the returned results. - - Parameters - ---------- - feature_store: DaskGraphStore - The feature store containing features for the graph. - - graph_store: DaskGraphStore - The graph store containing the graph structure. - - input_nodes: InputNodes - The input nodes associated with this sampler. - If None, this loader will load batches - from disk rather than performing sampling in memory. 
- - batch_size: int - The number of input nodes per sampling batch. - Generally required unless loading already-sampled - data from disk. - - shuffle: bool (optional, default=False) - Whether to shuffle the input indices. - If True, will shuffle the input indices. - If False, will create batches in the original order. - - edge_types: Sequence[Tuple[str]] (optional, default=None) - The desired edge types for the subgraph. - Defaults to all edges in the graph. - - directory: str (optional, default=new tempdir) - The path of the directory to write samples to. - Defaults to a new generated temporary directory. - - input_files: List[str] (optional, default=None) - The input files to read from the directory containing - samples. This argument is only used when loading - alread-sampled batches from disk. - - starting_batch_id: int (optional, default=0) - The starting id for each batch. Defaults to 0. - - batches_per_partition: int (optional, default=100) - The number of batches in each output partition. - Defaults to 100. Gets passed to the bulk - sampler if there is one; otherwise, this argument - is used to determine which files to read. - - num_neighbors: Union[List[int], - Dict[Tuple[str, str, str], List[int]]] (required) - The number of neighbors to sample for each node in each iteration. - If an entry is set to -1, all neighbors will be included. - In heterogeneous graphs, may also take in a dictionary denoting - the number of neighbors to sample for each individual edge type. - - Note: in cuGraph, only one value of num_neighbors is currently supported. - Passing in a dictionary will result in an exception. - """ - - self.__feature_store = feature_store - self.__graph_store = graph_store - self.__next_batch = -1 - self.__end_exclusive = -1 - self.__batches_per_partition = batches_per_partition - self.__starting_batch_id = starting_batch_id - - self._total_read_time = 0.0 - self._total_convert_time = 0.0 - self._total_feature_time = 0.0 - - if input_nodes is None: - # Will be loading from disk - self.__num_batches = input_nodes - self.__directory = directory - if input_files is None: - if isinstance(self.__directory, str): - self.__input_files = iter(os.listdir(self.__directory)) - else: - self.__input_files = iter(os.listdir(self.__directory.name)) - else: - self.__input_files = iter(input_files) - return - - # To accommodate DLFW/PyG 2.5 - get_input_nodes = torch_geometric.loader.utils.get_input_nodes - get_input_nodes_kwargs = {} - if "input_id" in get_input_nodes.__annotations__: - get_input_nodes_kwargs["input_id"] = None - input_node_info = get_input_nodes( - (feature_store, graph_store), input_nodes, **get_input_nodes_kwargs - ) - - # PyG 2.4 - if len(input_node_info) == 2: - input_type, input_nodes = input_node_info - # PyG 2.5 - elif len(input_node_info) == 3: - input_type, input_nodes, input_id = input_node_info - # Invalid - else: - raise ValueError("Invalid output from get_input_nodes") - - if input_type is not None: - input_nodes = graph_store._get_sample_from_vertex_groups( - {input_type: input_nodes} - ) - - if batch_size is None or batch_size < 1: - raise ValueError("Batch size must be >= 1") - - self.__directory = ( - tempfile.TemporaryDirectory() if directory is None else directory - ) - - if isinstance(num_neighbors, dict): - raise ValueError("num_neighbors dict is currently unsupported!") - - if "renumber" in kwargs: - warnings.warn( - "Setting renumbering manually could result in invalid output," - " please ensure you intended to do this." 
- ) - renumber = kwargs.pop("renumber") - else: - renumber = ( - True - if ( - (len(self.__graph_store.node_types) == 1) - and (len(self.__graph_store.edge_types) == 1) - ) - else False - ) - - bulk_sampler = BulkSampler( - batch_size, - self.__directory - if isinstance(self.__directory, str) - else self.__directory.name, - self.__graph_store._subgraph(edge_types), - fanout_vals=num_neighbors, - with_replacement=replace, - batches_per_partition=self.__batches_per_partition, - renumber=renumber, - use_legacy_names=False, - deduplicate_sources=True, - prior_sources_behavior="exclude", - include_hop_column=(compression == "COO"), - **kwargs, - ) - - # Make sure indices are in cupy - input_nodes = cupy.asarray(input_nodes) - - # Shuffle - if shuffle: - cupy.random.shuffle(input_nodes) - - # Truncate if we can't evenly divide the input array - stop = (len(input_nodes) // batch_size) * batch_size - input_nodes, remainder = cupy.array_split(input_nodes, [stop]) - - # Split into batches - input_nodes = cupy.split(input_nodes, max(len(input_nodes) // batch_size, 1)) - - if not drop_last: - input_nodes.append(remainder) - - self.__num_batches = 0 - for batch_num, batch_i in enumerate(input_nodes): - batch_len = len(batch_i) - if batch_len > 0: - self.__num_batches += 1 - bulk_sampler.add_batches( - cudf.DataFrame( - { - "start": batch_i, - "batch": cupy.full( - batch_len, batch_num + starting_batch_id, dtype="int32" - ), - } - ), - start_col_name="start", - batch_col_name="batch", - ) - - bulk_sampler.flush() - self.__input_files = iter( - os.listdir( - self.__directory - if isinstance(self.__directory, str) - else self.__directory.name - ) - ) - - def __next__(self): - from time import perf_counter - - start_time_read_data = perf_counter() - - # Load the next set of sampling results if necessary - if self.__next_batch >= self.__end_exclusive: - if self.__directory is None: - raise StopIteration - - # Read the next parquet file into memory - dir_path = ( - self.__directory - if isinstance(self.__directory, str) - else self.__directory.name - ) - - # Will raise StopIteration if there are no files left - try: - fname = next(self.__input_files) - except StopIteration as ex: - # Won't delete a non-temp dir (since it would just be deleting a string) - del self.__directory - self.__directory = None - raise StopIteration(ex) - - m = self.__ex_parquet_file.match(fname) - if m is None: - raise ValueError(f"Invalid parquet filename {fname}") - - self.__start_inclusive, end_inclusive = [int(g) for g in m.groups()] - self.__next_batch = self.__start_inclusive - self.__end_exclusive = end_inclusive + 1 - - parquet_path = os.path.join( - dir_path, - fname, - ) - - raw_sample_data = cudf.read_parquet(parquet_path) - - if "map" in raw_sample_data.columns: - if "renumber_map_offsets" not in raw_sample_data.columns: - num_batches = end_inclusive - self.__start_inclusive + 1 - - map_end = raw_sample_data["map"].iloc[num_batches] - - map = torch.as_tensor( - raw_sample_data["map"].iloc[0:map_end], device="cuda" - ) - raw_sample_data.drop("map", axis=1, inplace=True) - - self.__renumber_map_offsets = map[0 : num_batches + 1] - map[0] - self.__renumber_map = map[num_batches + 1 :] - else: - self.__renumber_map = raw_sample_data["map"] - self.__renumber_map_offsets = raw_sample_data[ - "renumber_map_offsets" - ] - raw_sample_data.drop( - columns=["map", "renumber_map_offsets"], inplace=True - ) - - self.__renumber_map.dropna(inplace=True) - self.__renumber_map = torch.as_tensor( - self.__renumber_map, device="cuda" - ) - 
- self.__renumber_map_offsets.dropna(inplace=True) - self.__renumber_map_offsets = torch.as_tensor( - self.__renumber_map_offsets, device="cuda" - ) - - else: - self.__renumber_map = None - - self.__data = raw_sample_data - self.__coo = "majors" in self.__data.columns - if self.__coo: - self.__data.dropna(inplace=True) - - if ( - len(self.__graph_store.edge_types) == 1 - and len(self.__graph_store.node_types) == 1 - ): - if self.__coo: - group_cols = ["batch_id", "hop_id"] - self.__data_index = self.__data.groupby( - group_cols, as_index=True - ).agg({"majors": "max", "minors": "max"}) - self.__data_index.rename( - columns={"majors": "src_max", "minors": "dst_max"}, - inplace=True, - ) - self.__data_index = self.__data_index.to_dict(orient="index") - else: - self.__data_index = None - - self.__label_hop_offsets = self.__data["label_hop_offsets"] - self.__data.drop(columns=["label_hop_offsets"], inplace=True) - self.__label_hop_offsets.dropna(inplace=True) - self.__label_hop_offsets = torch.as_tensor( - self.__label_hop_offsets, device="cuda" - ) - self.__label_hop_offsets -= self.__label_hop_offsets[0].clone() - - self.__major_offsets = self.__data["major_offsets"] - self.__data.drop(columns="major_offsets", inplace=True) - self.__major_offsets.dropna(inplace=True) - self.__major_offsets = torch.as_tensor( - self.__major_offsets, device="cuda" - ) - self.__major_offsets -= self.__major_offsets[0].clone() - - self.__minors = self.__data["minors"] - self.__data.drop(columns="minors", inplace=True) - self.__minors.dropna(inplace=True) - self.__minors = torch.as_tensor(self.__minors, device="cuda") - - num_batches = self.__end_exclusive - self.__start_inclusive - offsets_len = len(self.__label_hop_offsets) - 1 - if offsets_len % num_batches != 0: - raise ValueError("invalid label-hop offsets") - self.__fanout_length = int(offsets_len / num_batches) - - end_time_read_data = perf_counter() - self._total_read_time += end_time_read_data - start_time_read_data - - # Pull the next set of sampling results out of the dataframe in memory - if self.__coo: - f = self.__data["batch_id"] == self.__next_batch - if self.__renumber_map is not None: - i = self.__next_batch - self.__start_inclusive - - # this should avoid d2h copy - current_renumber_map = self.__renumber_map[ - self.__renumber_map_offsets[i] : self.__renumber_map_offsets[i + 1] - ] - - else: - current_renumber_map = None - - start_time_convert = perf_counter() - # Get and return the sampled subgraph - if ( - len(self.__graph_store.edge_types) == 1 - and len(self.__graph_store.node_types) == 1 - ): - if self.__coo: - sampler_output = _sampler_output_from_sampling_results_homogeneous_coo( - self.__data[f], - current_renumber_map, - self.__graph_store, - self.__data_index, - self.__next_batch, - ) - else: - i = (self.__next_batch - self.__start_inclusive) * self.__fanout_length - current_label_hop_offsets = self.__label_hop_offsets[ - i : i + self.__fanout_length + 1 - ] - - current_major_offsets = self.__major_offsets[ - current_label_hop_offsets[0] : (current_label_hop_offsets[-1] + 1) - ] - - current_minors = self.__minors[ - current_major_offsets[0] : current_major_offsets[-1] - ] - - sampler_output = _sampler_output_from_sampling_results_homogeneous_csr( - current_major_offsets, - current_minors, - current_renumber_map, - self.__graph_store, - current_label_hop_offsets, - self.__data_index, - self.__next_batch, - ) - else: - sampler_output = _sampler_output_from_sampling_results_heterogeneous( - self.__data[f], current_renumber_map, 
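The CSR branch above walks two levels of offsets: label_hop_offsets index into major_offsets, and major_offsets index into minors. A small self-contained sketch of that per-batch walk, with invented values, may make the slicing easier to follow:

import torch

num_batches = 2
label_hop_offsets = torch.tensor([0, 2, 3, 5, 6])      # num_batches * fanout_length + 1 entries
major_offsets     = torch.tensor([0, 1, 3, 4, 6, 8, 9])
minors            = torch.arange(9)                    # 9 sampled minor (destination) vertices

fanout_length = (len(label_hop_offsets) - 1) // num_batches   # 2 hops per batch

for batch in range(num_batches):
    i = batch * fanout_length
    lho = label_hop_offsets[i : i + fanout_length + 1]
    cmo = major_offsets[lho[0] : lho[-1] + 1]
    cm  = minors[cmo[0] : cmo[-1]]
    print(batch, lho.tolist(), cmo.tolist(), cm.tolist())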
self.__graph_store - ) - - # Get ready for next iteration - self.__next_batch += 1 - - end_time_convert = perf_counter() - self._total_convert_time += end_time_convert - start_time_convert - - start_time_feature = perf_counter() - # Create a PyG HeteroData object, loading the required features - if self.__graph_store != self.__feature_store: - # TODO Possibly support this if there is an actual use case - raise ValueError("Separate graph and feature stores currently unsupported") - - out = self.__graph_store.filter( - "COO" if self.__coo else "CSC", - sampler_output.node, - sampler_output.row, - sampler_output.col, - sampler_output.edge, - ) - - # Account for CSR format in cuGraph vs. CSC format in PyG - # TODO deprecate and remove this functionality - if self.__coo and self.__graph_store.order == "CSC": - for edge_type in out.edge_index_dict: - out[edge_type].edge_index = out[edge_type].edge_index.flip(dims=[0]) - - out.set_value_dict("num_sampled_nodes", sampler_output.num_sampled_nodes) - out.set_value_dict("num_sampled_edges", sampler_output.num_sampled_edges) - - end_time_feature = perf_counter() - self._total_feature_time = end_time_feature - start_time_feature - - return out - - @property - def _starting_batch_id(self): - return self.__starting_batch_id - - def __iter__(self): - return self - - -class DaskNeighborLoader: - """ - Duck-typed version of the PyG NeighborLoader interface that uses - Dask to sample nodes using the uniform neighbor sampling algorithm. - """ - - def __init__( - self, - data: Union[DaskGraphStore, Tuple[DaskGraphStore, DaskGraphStore]], - input_nodes: Union[InputNodes, int] = None, - batch_size: int = None, - **kwargs, - ): - """ - Constructs a new DaskNeighborLoader object. - - Parameters - ---------- - data: DaskGraphStore or (DaskGraphStore, DaskGraphStore) - The DaskGraphStore or stores where the graph/feature data is held. - - batch_size: int (required) - The number of input nodes in each batch. - - input_nodes: Union[InputNodes, int] (required) - The input nodes associated with this sampler. - - **kwargs: kwargs - Keyword arguments to pass through for sampling. - i.e. "shuffle", "fanout" - See BulkSampleLoader. - """ - - if input_nodes is None: - raise ValueError("input_nodes is required") - if batch_size is None: - raise ValueError("batch_size is required") - - # Allow passing in a feature store and graph store as a tuple, as - # in the standard PyG API. If only one is passed, it is assumed - # it is behaving as both a graph store and a feature store. - if isinstance(data, (list, tuple)): - self.__feature_store, self.__graph_store = data - else: - self.__feature_store = data - self.__graph_store = data - - self.__batch_size = batch_size - self.__input_nodes = input_nodes - self.inner_loader_args = kwargs - - @property - def batch_size(self) -> int: - return self.__batch_size - - def __iter__(self): - self.current_loader = BulkSampleLoader( - self.__feature_store, - self.__graph_store, - self.__input_nodes, - self.__batch_size, - **self.inner_loader_args, - ) - - return self.current_loader diff --git a/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py deleted file mode 100644 index 77e2ac4f99d..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/loader/link_loader.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
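On the CSR-vs-CSC conversion in BulkSampleLoader.__next__ above: flipping dim 0 of a (2, E) edge_index simply swaps the source and destination rows. A tiny standalone illustration (not part of the patch):

import torch

edge_index = torch.tensor([[0, 1, 2],    # sources (row 0)
                           [3, 4, 5]])   # destinations (row 1)
print(edge_index.flip(dims=[0]))
# tensor([[3, 4, 5],
#         [0, 1, 2]])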
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -import cugraph_pyg -from typing import Union, Tuple, Callable, Optional - -from cugraph.utilities.utils import import_optional - -torch_geometric = import_optional("torch_geometric") -torch = import_optional("torch") - - -class LinkLoader: - """ - Duck-typed version of torch_geometric.loader.LinkLoader. - Loads samples from batches of input nodes using a - `~cugraph_pyg.sampler.BaseSampler.sample_from_edges` - function. - """ - - def __init__( - self, - data: Union[ - "torch_geometric.data.Data", - "torch_geometric.data.HeteroData", - Tuple[ - "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" - ], - ], - link_sampler: "cugraph_pyg.sampler.BaseSampler", - edge_label_index: "torch_geometric.typing.InputEdges" = None, - edge_label: "torch_geometric.typing.OptTensor" = None, - edge_label_time: "torch_geometric.typing.OptTensor" = None, - neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"] = None, - neg_sampling_ratio: Optional[Union[int, float]] = None, - transform: Optional[Callable] = None, - transform_sampler_output: Optional[Callable] = None, - filter_per_worker: Optional[bool] = None, - custom_cls: Optional["torch_geometric.data.HeteroData"] = None, - input_id: "torch_geometric.typing.OptTensor" = None, - batch_size: int = 1, # refers to number of edges in batch - shuffle: bool = False, - drop_last: bool = False, - **kwargs, - ): - """ - Parameters - ---------- - data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] - See torch_geometric.loader.NodeLoader. - link_sampler: BaseSampler - See torch_geometric.loader.LinkLoader. - edge_label_index: InputEdges - See torch_geometric.loader.LinkLoader. - edge_label: OptTensor - See torch_geometric.loader.LinkLoader. - edge_label_time: OptTensor - See torch_geometric.loader.LinkLoader. - neg_sampling: Optional[NegativeSampling] - Type of negative sampling to perform, if desired. - See torch_geometric.loader.LinkLoader. - neg_sampling_ratio: Optional[Union[int, float]] - Negative sampling ratio. Affects how many negative - samples are generated. - See torch_geometric.loader.LinkLoader. - transform: Callable (optional, default=None) - This argument currently has no effect. - transform_sampler_output: Callable (optional, default=None) - This argument currently has no effect. - filter_per_worker: bool (optional, default=False) - This argument currently has no effect. - custom_cls: HeteroData - This argument currently has no effect. This loader will - always return a Data or HeteroData object. - input_id: OptTensor - See torch_geometric.loader.LinkLoader. - - """ - if not isinstance(data, (list, tuple)) or not isinstance( - data[1], cugraph_pyg.data.GraphStore - ): - # Will eventually automatically convert these objects to cuGraph objects. 
- raise NotImplementedError("Currently can't accept non-cugraph graphs") - - if not isinstance(link_sampler, cugraph_pyg.sampler.BaseSampler): - raise NotImplementedError("Must provide a cuGraph sampler") - - if edge_label_time is not None: - raise ValueError("Temporal sampling is currently unsupported") - - if filter_per_worker: - warnings.warn("filter_per_worker is currently ignored") - - if custom_cls is not None: - warnings.warn("custom_cls is currently ignored") - - if transform is not None: - warnings.warn("transform is currently ignored.") - - if transform_sampler_output is not None: - warnings.warn("transform_sampler_output is currently ignored.") - - if neg_sampling_ratio is not None: - warnings.warn( - "The 'neg_sampling_ratio' argument is deprecated in PyG" - " and is not supported in cuGraph-PyG." - ) - - neg_sampling = torch_geometric.sampler.NegativeSampling.cast(neg_sampling) - - ( - input_type, - edge_label_index, - ) = torch_geometric.loader.utils.get_edge_label_index( - data, - (None, edge_label_index), - ) - - self.__input_data = torch_geometric.sampler.EdgeSamplerInput( - input_id=torch.arange( - edge_label_index[0].numel(), dtype=torch.int64, device="cuda" - ) - if input_id is None - else input_id, - row=edge_label_index[0], - col=edge_label_index[1], - label=edge_label, - time=edge_label_time, - input_type=input_type, - ) - - # Edge label check from torch_geometric.loader.LinkLoader - if ( - neg_sampling is not None - and neg_sampling.is_binary() - and edge_label is not None - and edge_label.min() == 0 - ): - edge_label = edge_label + 1 - - if ( - neg_sampling is not None - and neg_sampling.is_triplet() - and edge_label is not None - ): - raise ValueError( - "'edge_label' needs to be undefined for " - "'triplet'-based negative sampling. Please use " - "`src_index`, `dst_pos_index` and " - "`neg_pos_index` of the returned mini-batch " - "instead to differentiate between positive and " - "negative samples." - ) - - self.__data = data - - self.__link_sampler = link_sampler - self.__neg_sampling = neg_sampling - - self.__batch_size = batch_size - self.__shuffle = shuffle - self.__drop_last = drop_last - - def __iter__(self): - if self.__shuffle: - perm = torch.randperm(self.__input_data.row.numel()) - else: - perm = torch.arange(self.__input_data.row.numel()) - - if self.__drop_last: - d = perm.numel() % self.__batch_size - perm = perm[:-d] - - input_data = torch_geometric.sampler.EdgeSamplerInput( - input_id=self.__input_data.input_id[perm], - row=self.__input_data.row[perm], - col=self.__input_data.col[perm], - label=None - if self.__input_data.label is None - else self.__input_data.label[perm], - time=None - if self.__input_data.time is None - else self.__input_data.time[perm], - input_type=self.__input_data.input_type, - ) - - return cugraph_pyg.sampler.SampleIterator( - self.__data, - self.__link_sampler.sample_from_edges( - input_data, - neg_sampling=self.__neg_sampling, - ), - ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py deleted file mode 100644 index 080565368c4..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/loader/link_neighbor_loader.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -from typing import Union, Tuple, Optional, Callable, List, Dict - -import cugraph_pyg -from cugraph_pyg.loader import LinkLoader -from cugraph_pyg.sampler import BaseSampler - -from cugraph.gnn import NeighborSampler, DistSampleWriter -from cugraph.utilities.utils import import_optional - -torch_geometric = import_optional("torch_geometric") - - -class LinkNeighborLoader(LinkLoader): - """ - Duck-typed version of torch_geometric.loader.LinkNeighborLoader - - Link loader that implements the neighbor sampling - algorithm used in GraphSAGE. - """ - - def __init__( - self, - data: Union[ - "torch_geometric.data.Data", - "torch_geometric.data.HeteroData", - Tuple[ - "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" - ], - ], - num_neighbors: Union[ - List[int], Dict["torch_geometric.typing.EdgeType", List[int]] - ], - edge_label_index: "torch_geometric.typing.InputEdges" = None, - edge_label: "torch_geometric.typing.OptTensor" = None, - edge_label_time: "torch_geometric.typing.OptTensor" = None, - replace: bool = False, - subgraph_type: Union[ - "torch_geometric.typing.SubgraphType", str - ] = "directional", - disjoint: bool = False, - temporal_strategy: str = "uniform", - neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"] = None, - neg_sampling_ratio: Optional[Union[int, float]] = None, - time_attr: Optional[str] = None, - weight_attr: Optional[str] = None, - transform: Optional[Callable] = None, - transform_sampler_output: Optional[Callable] = None, - is_sorted: bool = False, - filter_per_worker: Optional[bool] = None, - neighbor_sampler: Optional["torch_geometric.sampler.NeighborSampler"] = None, - directed: bool = True, # Deprecated. - batch_size: int = 16, # Refers to number of edges per batch. - directory: Optional[str] = None, - batches_per_partition=256, - format: str = "parquet", - compression: Optional[str] = None, - local_seeds_per_call: Optional[int] = None, - **kwargs, - ): - """ - data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] - See torch_geometric.loader.LinkNeighborLoader. - num_neighbors: List[int] or Dict[EdgeType, List[int]] - Fanout values. - See torch_geometric.loader.LinkNeighborLoader. - edge_label_index: InputEdges - Input edges for sampling. - See torch_geometric.loader.LinkNeighborLoader. - edge_label: OptTensor - Labels for input edges. - See torch_geometric.loader.LinkNeighborLoader. - edge_label_time: OptTensor - Time attribute for input edges. - See torch_geometric.loader.LinkNeighborLoader. - replace: bool (optional, default=False) - Whether to sample with replacement. - See torch_geometric.loader.LinkNeighborLoader. - subgraph_type: Union[SubgraphType, str] (optional, default='directional') - The type of subgraph to return. - Currently only 'directional' is supported. - See torch_geometric.loader.LinkNeighborLoader. - disjoint: bool (optional, default=False) - Whether to perform disjoint sampling. - Currently unsupported. - See torch_geometric.loader.LinkNeighborLoader. - temporal_strategy: str (optional, default='uniform') - Currently only 'uniform' is suppported. 
- See torch_geometric.loader.LinkNeighborLoader. - time_attr: str (optional, default=None) - Used for temporal sampling. - See torch_geometric.loader.LinkNeighborLoader. - weight_attr: str (optional, default=None) - Used for biased sampling. - See torch_geometric.loader.LinkNeighborLoader. - transform: Callable (optional, default=None) - See torch_geometric.loader.LinkNeighborLoader. - transform_sampler_output: Callable (optional, default=None) - See torch_geometric.loader.LinkNeighborLoader. - is_sorted: bool (optional, default=False) - Ignored by cuGraph. - See torch_geometric.loader.LinkNeighborLoader. - filter_per_worker: bool (optional, default=False) - Currently ignored by cuGraph, but this may - change once in-memory sampling is implemented. - See torch_geometric.loader.LinkNeighborLoader. - neighbor_sampler: torch_geometric.sampler.NeighborSampler - (optional, default=None) - Not supported by cuGraph. - See torch_geometric.loader.LinkNeighborLoader. - directed: bool (optional, default=True) - Deprecated. - See torch_geometric.loader.LinkNeighborLoader. - batch_size: int (optional, default=16) - The number of input nodes per output minibatch. - See torch.utils.dataloader. - directory: str (optional, default=None) - The directory where samples will be temporarily stored, - if spilling samples to disk. If None, this loader - will perform buffered in-memory sampling. - If writing to disk, setting this argument - to a tempfile.TemporaryDirectory with a context - manager is a good option but depending on the filesystem, - you may want to choose an alternative location with fast I/O - intead. - See cugraph.gnn.DistSampleWriter. - batches_per_partition: int (optional, default=256) - The number of batches per partition if writing samples to - disk. Manually tuning this parameter is not recommended - but reducing it may help conserve GPU memory. - See cugraph.gnn.DistSampleWriter. - format: str (optional, default='parquet') - If writing samples to disk, they will be written in this - file format. - See cugraph.gnn.DistSampleWriter. - compression: str (optional, default=None) - The compression type to use if writing samples to disk. - If not provided, it is automatically chosen. - local_seeds_per_call: int (optional, default=None) - The number of seeds to process within a single sampling call. - Manually tuning this parameter is not recommended but reducing - it may conserve GPU memory. The total number of seeds processed - per sampling call is equal to the sum of this parameter across - all workers. If not provided, it will be automatically - calculated. - See cugraph.gnn.DistSampler. - **kwargs - Other keyword arguments passed to the superclass. - """ - - subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type) - - if not directed: - subgraph_type = torch_geometric.sampler.base.SubgraphType.induced - warnings.warn( - "The 'directed' argument is deprecated. " - "Use subgraph_type='induced' instead." 
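The directory parameter documented above recommends a tempfile.TemporaryDirectory managed by a context manager when spilling samples to disk. A minimal sketch of that pattern; the loader construction itself is only indicated in a comment because it needs a real cuGraph (FeatureStore, GraphStore) pair:

import tempfile

with tempfile.TemporaryDirectory() as directory:
    # e.g. LinkNeighborLoader(..., directory=directory,
    #                         batches_per_partition=256, format="parquet")
    print("sampled parquet partitions would be written under", directory)
# the directory and its partitions are cleaned up on exit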
- ) - if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional: - raise ValueError("Only directional subgraphs are currently supported") - if disjoint: - raise ValueError("Disjoint sampling is currently unsupported") - if temporal_strategy != "uniform": - warnings.warn("Only the uniform temporal strategy is currently supported") - if neighbor_sampler is not None: - raise ValueError("Passing a neighbor sampler is currently unsupported") - if time_attr is not None: - raise ValueError("Temporal sampling is currently unsupported") - if is_sorted: - warnings.warn("The 'is_sorted' argument is ignored by cuGraph.") - if not isinstance(data, (list, tuple)) or not isinstance( - data[1], cugraph_pyg.data.GraphStore - ): - # Will eventually automatically convert these objects to cuGraph objects. - raise NotImplementedError("Currently can't accept non-cugraph graphs") - - if compression is None: - compression = "CSR" - elif compression not in ["CSR", "COO"]: - raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')") - - writer = ( - None - if directory is None - else DistSampleWriter( - directory=directory, - batches_per_partition=batches_per_partition, - format=format, - ) - ) - - feature_store, graph_store = data - - if weight_attr is not None: - graph_store._set_weight_attr((feature_store, weight_attr)) - - sampler = BaseSampler( - NeighborSampler( - graph_store._graph, - writer, - retain_original_seeds=True, - fanout=num_neighbors, - prior_sources_behavior="exclude", - deduplicate_sources=True, - compression=compression, - compress_per_hop=False, - with_replacement=replace, - local_seeds_per_call=local_seeds_per_call, - biased=(weight_attr is not None), - ), - (feature_store, graph_store), - batch_size=batch_size, - ) - # TODO add heterogeneous support and pass graph_store._vertex_offsets - - super().__init__( - (feature_store, graph_store), - sampler, - edge_label_index=edge_label_index, - edge_label=edge_label, - edge_label_time=edge_label_time, - neg_sampling=neg_sampling, - neg_sampling_ratio=neg_sampling_ratio, - transform=transform, - transform_sampler_output=transform_sampler_output, - filter_per_worker=filter_per_worker, - batch_size=batch_size, - **kwargs, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py deleted file mode 100644 index 1da2c6dc381..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/loader/neighbor_loader.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import warnings - -from typing import Union, Tuple, Optional, Callable, List, Dict - -import cugraph_pyg -from cugraph_pyg.loader import NodeLoader -from cugraph_pyg.sampler import BaseSampler - -from cugraph.gnn import NeighborSampler, DistSampleWriter -from cugraph.utilities.utils import import_optional - -torch_geometric = import_optional("torch_geometric") - - -class NeighborLoader(NodeLoader): - """ - Duck-typed version of torch_geometric.loader.NeighborLoader - - Node loader that implements the neighbor sampling - algorithm used in GraphSAGE. - """ - - def __init__( - self, - data: Union[ - "torch_geometric.data.Data", - "torch_geometric.data.HeteroData", - Tuple[ - "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" - ], - ], - num_neighbors: Union[ - List[int], Dict["torch_geometric.typing.EdgeType", List[int]] - ], - input_nodes: "torch_geometric.typing.InputNodes" = None, - input_time: "torch_geometric.typing.OptTensor" = None, - replace: bool = False, - subgraph_type: Union[ - "torch_geometric.typing.SubgraphType", str - ] = "directional", - disjoint: bool = False, - temporal_strategy: str = "uniform", - time_attr: Optional[str] = None, - weight_attr: Optional[str] = None, - transform: Optional[Callable] = None, - transform_sampler_output: Optional[Callable] = None, - is_sorted: bool = False, - filter_per_worker: Optional[bool] = None, - neighbor_sampler: Optional["torch_geometric.sampler.NeighborSampler"] = None, - directed: bool = True, # Deprecated. - batch_size: int = 16, - directory: Optional[str] = None, - batches_per_partition=256, - format: str = "parquet", - compression: Optional[str] = None, - local_seeds_per_call: Optional[int] = None, - **kwargs, - ): - """ - data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] - See torch_geometric.loader.NeighborLoader. - num_neighbors: List[int] or Dict[EdgeType, List[int]] - Fanout values. - See torch_geometric.loader.NeighborLoader. - input_nodes: InputNodes - Input nodes for sampling. - See torch_geometric.loader.NeighborLoader. - input_time: OptTensor (optional) - See torch_geometric.loader.NeighborLoader. - replace: bool (optional, default=False) - Whether to sample with replacement. - See torch_geometric.loader.NeighborLoader. - subgraph_type: Union[SubgraphType, str] (optional, default='directional') - The type of subgraph to return. - Currently only 'directional' is supported. - See torch_geometric.loader.NeighborLoader. - disjoint: bool (optional, default=False) - Whether to perform disjoint sampling. - Currently unsupported. - See torch_geometric.loader.NeighborLoader. - temporal_strategy: str (optional, default='uniform') - Currently only 'uniform' is suppported. - See torch_geometric.loader.NeighborLoader. - time_attr: str (optional, default=None) - Used for temporal sampling. - See torch_geometric.loader.NeighborLoader. - weight_attr: str (optional, default=None) - Used for biased sampling. - See torch_geometric.loader.NeighborLoader. - transform: Callable (optional, default=None) - See torch_geometric.loader.NeighborLoader. - transform_sampler_output: Callable (optional, default=None) - See torch_geometric.loader.NeighborLoader. - is_sorted: bool (optional, default=False) - Ignored by cuGraph. - See torch_geometric.loader.NeighborLoader. - filter_per_worker: bool (optional, default=False) - Currently ignored by cuGraph, but this may - change once in-memory sampling is implemented. - See torch_geometric.loader.NeighborLoader. 
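For the num_neighbors fanout documented above: each entry caps the neighbors drawn per vertex at that hop, so (ignoring deduplication and replacement effects) a single seed can contribute at most fanout[0] first-hop edges, fanout[0] * fanout[1] second-hop edges, and so on. A quick arithmetic sketch:

fanout = [10, 25]            # e.g. num_neighbors=[10, 25]
frontier, max_edges = 1, 0
for f in fanout:
    max_edges += frontier * f   # edges added at this hop
    frontier *= f               # frontier size entering the next hop
print(max_edges)                # 260 edges per seed, at most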
- neighbor_sampler: torch_geometric.sampler.NeighborSampler - (optional, default=None) - Not supported by cuGraph. - See torch_geometric.loader.NeighborLoader. - directed: bool (optional, default=True) - Deprecated. - See torch_geometric.loader.NeighborLoader. - batch_size: int (optional, default=16) - The number of input nodes per output minibatch. - See torch.utils.dataloader. - directory: str (optional, default=None) - The directory where samples will be temporarily stored, - if spilling samples to disk. If None, this loader - will perform buffered in-memory sampling. - If writing to disk, setting this argument - to a tempfile.TemporaryDirectory with a context - manager is a good option but depending on the filesystem, - you may want to choose an alternative location with fast I/O - intead. - See cugraph.gnn.DistSampleWriter. - batches_per_partition: int (optional, default=256) - The number of batches per partition if writing samples to - disk. Manually tuning this parameter is not recommended - but reducing it may help conserve GPU memory. - See cugraph.gnn.DistSampleWriter. - format: str (optional, default='parquet') - If writing samples to disk, they will be written in this - file format. - See cugraph.gnn.DistSampleWriter. - compression: str (optional, default=None) - The compression type to use if writing samples to disk. - If not provided, it is automatically chosen. - local_seeds_per_call: int (optional, default=None) - The number of seeds to process within a single sampling call. - Manually tuning this parameter is not recommended but reducing - it may conserve GPU memory. The total number of seeds processed - per sampling call is equal to the sum of this parameter across - all workers. If not provided, it will be automatically - calculated. - See cugraph.gnn.DistSampler. - **kwargs - Other keyword arguments passed to the superclass. - """ - - subgraph_type = torch_geometric.sampler.base.SubgraphType(subgraph_type) - - if not directed: - subgraph_type = torch_geometric.sampler.base.SubgraphType.induced - warnings.warn( - "The 'directed' argument is deprecated. " - "Use subgraph_type='induced' instead." - ) - if subgraph_type != torch_geometric.sampler.base.SubgraphType.directional: - raise ValueError("Only directional subgraphs are currently supported") - if disjoint: - raise ValueError("Disjoint sampling is currently unsupported") - if temporal_strategy != "uniform": - warnings.warn("Only the uniform temporal strategy is currently supported") - if neighbor_sampler is not None: - raise ValueError("Passing a neighbor sampler is currently unsupported") - if time_attr is not None: - raise ValueError("Temporal sampling is currently unsupported") - if is_sorted: - warnings.warn("The 'is_sorted' argument is ignored by cuGraph.") - if not isinstance(data, (list, tuple)) or not isinstance( - data[1], cugraph_pyg.data.GraphStore - ): - # Will eventually automatically convert these objects to cuGraph objects. 
- raise NotImplementedError("Currently can't accept non-cugraph graphs") - - if compression is None: - compression = "CSR" - elif compression not in ["CSR", "COO"]: - raise ValueError("Invalid value for compression (expected 'CSR' or 'COO')") - - writer = ( - None - if directory is None - else DistSampleWriter( - directory=directory, - batches_per_partition=batches_per_partition, - format=format, - ) - ) - - feature_store, graph_store = data - - if weight_attr is not None: - graph_store._set_weight_attr((feature_store, weight_attr)) - - sampler = BaseSampler( - NeighborSampler( - graph_store._graph, - writer, - retain_original_seeds=True, - fanout=num_neighbors, - prior_sources_behavior="exclude", - deduplicate_sources=True, - compression=compression, - compress_per_hop=False, - with_replacement=replace, - local_seeds_per_call=local_seeds_per_call, - biased=(weight_attr is not None), - ), - (feature_store, graph_store), - batch_size=batch_size, - ) - # TODO add heterogeneous support and pass graph_store._vertex_offsets - - super().__init__( - (feature_store, graph_store), - sampler, - input_nodes=input_nodes, - input_time=input_time, - transform=transform, - transform_sampler_output=transform_sampler_output, - filter_per_worker=filter_per_worker, - batch_size=batch_size, - **kwargs, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py deleted file mode 100644 index 4b236f75885..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/loader/node_loader.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -import cugraph_pyg -from typing import Union, Tuple, Callable, Optional - -from cugraph.utilities.utils import import_optional - -torch_geometric = import_optional("torch_geometric") -torch = import_optional("torch") - - -class NodeLoader: - """ - Duck-typed version of torch_geometric.loader.NodeLoader. - Loads samples from batches of input nodes using a - `~cugraph_pyg.sampler.BaseSampler.sample_from_nodes` - function. - """ - - def __init__( - self, - data: Union[ - "torch_geometric.data.Data", - "torch_geometric.data.HeteroData", - Tuple[ - "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" - ], - ], - node_sampler: "cugraph_pyg.sampler.BaseSampler", - input_nodes: "torch_geometric.typing.InputNodes" = None, - input_time: "torch_geometric.typing.OptTensor" = None, - transform: Optional[Callable] = None, - transform_sampler_output: Optional[Callable] = None, - filter_per_worker: Optional[bool] = None, - custom_cls: Optional["torch_geometric.data.HeteroData"] = None, - input_id: "torch_geometric.typing.OptTensor" = None, - batch_size: int = 1, - shuffle: bool = False, - drop_last: bool = False, - **kwargs, - ): - """ - Parameters - ---------- - data: Data, HeteroData, or Tuple[FeatureStore, GraphStore] - See torch_geometric.loader.NodeLoader. - node_sampler: BaseSampler - See torch_geometric.loader.NodeLoader. 
- input_nodes: InputNodes - See torch_geometric.loader.NodeLoader. - input_time: OptTensor - See torch_geometric.loader.NodeLoader. - transform: Callable (optional, default=None) - This argument currently has no effect. - transform_sampler_output: Callable (optional, default=None) - This argument currently has no effect. - filter_per_worker: bool (optional, default=False) - This argument currently has no effect. - custom_cls: HeteroData - This argument currently has no effect. This loader will - always return a Data or HeteroData object. - input_id: OptTensor - See torch_geometric.loader.NodeLoader. - - """ - if not isinstance(data, (list, tuple)) or not isinstance( - data[1], cugraph_pyg.data.GraphStore - ): - # Will eventually automatically convert these objects to cuGraph objects. - raise NotImplementedError("Currently can't accept non-cugraph graphs") - - if not isinstance(node_sampler, cugraph_pyg.sampler.BaseSampler): - raise NotImplementedError("Must provide a cuGraph sampler") - - if input_time is not None: - raise ValueError("Temporal sampling is currently unsupported") - - if filter_per_worker: - warnings.warn("filter_per_worker is currently ignored") - - if custom_cls is not None: - warnings.warn("custom_cls is currently ignored") - - if transform is not None: - warnings.warn("transform is currently ignored.") - - if transform_sampler_output is not None: - warnings.warn("transform_sampler_output is currently ignored.") - - ( - input_type, - input_nodes, - input_id, - ) = torch_geometric.loader.utils.get_input_nodes( - data, - input_nodes, - input_id, - ) - - self.__input_data = torch_geometric.sampler.NodeSamplerInput( - input_id=torch.arange(len(input_nodes), dtype=torch.int64, device="cuda") - if input_id is None - else input_id, - node=input_nodes, - time=None, - input_type=input_type, - ) - - self.__data = data - - self.__node_sampler = node_sampler - - self.__batch_size = batch_size - self.__shuffle = shuffle - self.__drop_last = drop_last - - def __iter__(self): - if self.__shuffle: - perm = torch.randperm(self.__input_data.node.numel()) - else: - perm = torch.arange(self.__input_data.node.numel()) - - if self.__drop_last: - d = perm.numel() % self.__batch_size - perm = perm[:-d] - - input_data = torch_geometric.sampler.NodeSamplerInput( - input_id=self.__input_data.input_id[perm], - node=self.__input_data.node[perm], - time=None - if self.__input_data.time is None - else self.__input_data.time[perm], - input_type=self.__input_data.input_type, - ) - - return cugraph_pyg.sampler.SampleIterator( - self.__data, self.__node_sampler.sample_from_nodes(input_data) - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/__init__.py deleted file mode 100644 index 331b49ebec0..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
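NodeLoader.__iter__ above applies a single permutation to both the seed ids and the seed nodes, so the default input_id (an arange) stays aligned with its vertex after shuffling. A small sketch of that invariant (not part of the patch):

import torch

input_nodes = torch.tensor([7, 3, 9, 1])
input_id = torch.arange(input_nodes.numel())   # default ids, as in the deleted code

perm = torch.randperm(input_nodes.numel())
shuffled_nodes = input_nodes[perm]
shuffled_ids = input_id[perm]

# each (id, node) pair stays aligned after shuffling
assert torch.equal(input_nodes[shuffled_ids], shuffled_nodes)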
- -from .conv import * diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py deleted file mode 100644 index bef3a023b93..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .gat_conv import GATConv -from .gatv2_conv import GATv2Conv -from .hetero_gat_conv import HeteroGATConv -from .rgcn_conv import RGCNConv -from .sage_conv import SAGEConv -from .transformer_conv import TransformerConv - -__all__ = [ - "GATConv", - "GATv2Conv", - "HeteroGATConv", - "RGCNConv", - "SAGEConv", - "TransformerConv", -] diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py deleted file mode 100644 index 713448a8203..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -import pylibcugraphops.pytorch - - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - -# A tuple of (row, colptr, num_src_nodes) -CSC = Tuple[torch.Tensor, torch.Tensor, int] - - -class BaseConv(torch.nn.Module): # pragma: no cover - r"""An abstract base class for implementing cugraph-ops message passing layers.""" - - def reset_parameters(self): - r"""Resets all learnable parameters of the module.""" - pass - - @staticmethod - def to_csc( - edge_index: torch.Tensor, - size: Optional[Tuple[int, int]] = None, - edge_attr: Optional[torch.Tensor] = None, - ) -> Union[CSC, Tuple[CSC, torch.Tensor],]: - r"""Returns a CSC representation of an :obj:`edge_index` tensor to be - used as input to cugraph-ops conv layers. - - Args: - edge_index (torch.Tensor): The edge indices. - size ((int, int), optional). The shape of :obj:`edge_index` in each - dimension. (default: :obj:`None`) - edge_attr (torch.Tensor, optional): The edge features. - (default: :obj:`None`) - """ - if size is None: - warnings.warn( - f"Inferring the graph size from 'edge_index' causes " - f"a decline in performance and does not work for " - f"bipartite graphs. To suppress this warning, pass " - f"the 'size' explicitly in '{__name__}.to_csc()'." 
- ) - num_src_nodes = num_dst_nodes = int(edge_index.max()) + 1 - else: - num_src_nodes, num_dst_nodes = size - - row, col = edge_index - col, perm = torch_geometric.utils.index_sort(col, max_value=num_dst_nodes) - row = row[perm] - - colptr = torch_geometric.utils.sparse.index2ptr(col, num_dst_nodes) - - if edge_attr is not None: - return (row, colptr, num_src_nodes), edge_attr[perm] - - return row, colptr, num_src_nodes - - def get_cugraph( - self, - edge_index: Union[torch_geometric.EdgeIndex, CSC], - bipartite: bool = False, - max_num_neighbors: Optional[int] = None, - ) -> Tuple[pylibcugraphops.pytorch.CSC, Optional[torch.Tensor]]: - r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation. - Supports both bipartite and non-bipartite graphs. - - Args: - edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge - indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for - CSC representation. - bipartite (bool): If set to :obj:`True`, will create the bipartite - structure in cugraph-ops. (default: :obj:`False`) - max_num_neighbors (int, optional): The maximum number of neighbors - of a destination node. When enabled, it allows models to use - the message-flow-graph primitives in cugraph-ops. - (default: :obj:`None`) - """ - perm = None - if isinstance(edge_index, torch_geometric.EdgeIndex): - edge_index, perm = edge_index.sort_by("col") - num_src_nodes = edge_index.get_sparse_size(0) - (colptr, row), _ = edge_index.get_csc() - else: - row, colptr, num_src_nodes = edge_index - - if not row.is_cuda: - raise RuntimeError( - f"'{self.__class__.__name__}' requires GPU-based processing " - f"but got CPU tensor." - ) - - if max_num_neighbors is None: - max_num_neighbors = -1 - - return ( - pylibcugraphops.pytorch.CSC( - offsets=colptr, - indices=row, - num_src_nodes=num_src_nodes, - dst_max_in_degree=max_num_neighbors, - is_bipartite=bipartite, - ), - perm, - ) - - def get_typed_cugraph( - self, - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_type: torch.Tensor, - num_edge_types: Optional[int] = None, - bipartite: bool = False, - max_num_neighbors: Optional[int] = None, - ) -> Tuple[pylibcugraphops.pytorch.HeteroCSC, Optional[torch.Tensor]]: - r"""Constructs a typed :obj:`cugraph` graph object from a CSC - representation where each edge corresponds to a given edge type. - Supports both bipartite and non-bipartite graphs. - - Args: - edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge - indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for - CSC representation. - edge_type (torch.Tensor): The edge type. - num_edge_types (int, optional): The maximum number of edge types. - When not given, will be computed on-the-fly, leading to - slightly worse performance. (default: :obj:`None`) - bipartite (bool): If set to :obj:`True`, will create the bipartite - structure in cugraph-ops. (default: :obj:`False`) - max_num_neighbors (int, optional): The maximum number of neighbors - of a destination node. When enabled, it allows models to use - the message-flow-graph primitives in cugraph-ops. 
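to_csc above relies on torch_geometric.utils helpers; the same transform can be sketched with plain torch, which may make the (row, colptr, num_src_nodes) output easier to see. This plain-torch version is an illustration, not the deleted implementation:

import torch

edge_index = torch.tensor([[0, 1, 2, 0],    # sources
                           [1, 0, 1, 2]])   # destinations
num_nodes = 3

# sort edges by destination, then build the column-pointer array
col, perm = torch.sort(edge_index[1], stable=True)
row = edge_index[0][perm]
colptr = torch.zeros(num_nodes + 1, dtype=torch.long)
colptr[1:] = torch.cumsum(torch.bincount(col, minlength=num_nodes), dim=0)

print(row, colptr)   # tensor([1, 0, 2, 0]) tensor([0, 1, 3, 4])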
- (default: :obj:`None`) - """ - if num_edge_types is None: - num_edge_types = int(edge_type.max()) + 1 - - if max_num_neighbors is None: - max_num_neighbors = -1 - - perm = None - if isinstance(edge_index, torch_geometric.EdgeIndex): - edge_index, perm = edge_index.sort_by("col") - edge_type = edge_type[perm] - num_src_nodes = edge_index.get_sparse_size(0) - (colptr, row), _ = edge_index.get_csc() - else: - row, colptr, num_src_nodes = edge_index - edge_type = edge_type.int() - - return ( - pylibcugraphops.pytorch.HeteroCSC( - offsets=colptr, - indices=row, - edge_types=edge_type, - num_src_nodes=num_src_nodes, - num_edge_types=num_edge_types, - dst_max_in_degree=max_num_neighbors, - is_bipartite=bipartite, - ), - perm, - ) - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - ) -> torch.Tensor: - r"""Runs the forward pass of the module. - - Args: - x (torch.Tensor): The node features. - edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge - indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for - CSC representation. - """ - raise NotImplementedError diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py deleted file mode 100644 index 981b1c5b50d..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import mha_gat_n2n - -from .base import BaseConv, CSC - -torch = import_optional("torch") -nn = import_optional("torch.nn") -torch_geometric = import_optional("torch_geometric") - - -class GATConv(BaseConv): - r"""The graph attentional operator from the `"Graph Attention Networks" - `_ paper. - - .. math:: - \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} + - \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j}, - - where the attention coefficients :math:`\alpha_{i,j}` are computed as - - .. math:: - \alpha_{i,j} = - \frac{ - \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} - [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j] - \right)\right)} - {\sum_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} - [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k] - \right)\right)}. - - If the graph has multi-dimensional edge features :math:`\mathbf{e}_{i,j}`, - the attention coefficients :math:`\alpha_{i,j}` are computed as - - .. 
math:: - \alpha_{i,j} = - \frac{ - \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} - [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j - \, \Vert \, \mathbf{\Theta}_{e} \mathbf{e}_{i,j}]\right)\right)} - {\sum_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} - [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k - \, \Vert \, \mathbf{\Theta}_{e} \mathbf{e}_{i,k}]\right)\right)}. - - Args: - in_channels (int or tuple): Size of each input sample, or :obj:`-1` to - derive the size from the first input(s) to the forward method. - A tuple corresponds to the sizes of source and target - dimensionalities. - out_channels (int): Size of each output sample. - heads (int, optional): Number of multi-head-attentions. - (default: :obj:`1`) - concat (bool, optional): If set to :obj:`False`, the multi-head - attentions are averaged instead of concatenated. - (default: :obj:`True`) - negative_slope (float, optional): LeakyReLU angle of the negative - slope. (default: :obj:`0.2`) - edge_dim (int, optional): Edge feature dimensionality (in case - there are any). (default: :obj:`None`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - """ - - def __init__( - self, - in_channels: Union[int, Tuple[int, int]], - out_channels: int, - heads: int = 1, - concat: bool = True, - negative_slope: float = 0.2, - edge_dim: Optional[int] = None, - bias: bool = True, - ): - super().__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.heads = heads - self.concat = concat - self.negative_slope = negative_slope - self.edge_dim = edge_dim - - Linear = torch_geometric.nn.Linear - - if isinstance(in_channels, int): - self.lin = Linear( - in_channels, - heads * out_channels, - bias=False, - weight_initializer="glorot", - ) - else: - self.lin_src = Linear( - in_channels[0], - heads * out_channels, - bias=False, - weight_initializer="glorot", - ) - self.lin_dst = Linear( - in_channels[1], - heads * out_channels, - bias=False, - weight_initializer="glorot", - ) - - if edge_dim is not None: - self.lin_edge = Linear( - edge_dim, - heads * out_channels, - bias=False, - weight_initializer="glorot", - ) - self.att = nn.Parameter(torch.Tensor(3 * heads * out_channels)) - else: - self.register_parameter("lin_edge", None) - self.att = nn.Parameter(torch.Tensor(2 * heads * out_channels)) - - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(heads * out_channels)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_channels)) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self): - if isinstance(self.in_channels, int): - self.lin.reset_parameters() - else: - self.lin_src.reset_parameters() - self.lin_dst.reset_parameters() - - torch_geometric.nn.inits.glorot( - self.att.view(-1, self.heads, self.out_channels) - ) - - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - - torch_geometric.nn.inits.zeros(self.bias) - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_attr: Optional[torch.Tensor] = None, - max_num_neighbors: Optional[int] = None, - deterministic_dgrad: bool = False, - deterministic_wgrad: bool = False, - high_precision_dgrad: bool = False, - high_precision_wgrad: bool = False, - ) -> torch.Tensor: - r"""Runs the forward pass of the module. 
- - Args: - x (torch.Tensor or tuple): The node features. Can be a tuple of - tensors denoting source and destination node features. - edge_index (EdgeIndex or CSC): The edge indices. - edge_attr: (torch.Tensor, optional) The edge features. - max_num_neighbors (int, optional): The maximum number of neighbors - of a destination node. When enabled, it allows models to use - the message-flow-graph primitives in cugraph-ops. - (default: :obj:`None`) - deterministic_dgrad : bool, default=False - Optional flag indicating whether the feature gradients - are computed deterministically using a dedicated workspace buffer. - deterministic_wgrad: bool, default=False - Optional flag indicating whether the weight gradients - are computed deterministically using a dedicated workspace buffer. - high_precision_dgrad: bool, default=False - Optional flag indicating whether gradients for inputs in half precision - are kept in single precision as long as possible and only casted to - the corresponding input type at the very end. - high_precision_wgrad: bool, default=False - Optional flag indicating whether gradients for weights in half precision - are kept in single precision as long as possible and only casted to - the corresponding input type at the very end. - """ - bipartite = not isinstance(x, torch.Tensor) - graph, perm = self.get_cugraph( - edge_index=edge_index, - bipartite=bipartite, - max_num_neighbors=max_num_neighbors, - ) - - if deterministic_dgrad: - graph.add_reverse_graph() - - if edge_attr is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_dim must be set to accept " - f"edge features." - ) - if edge_attr.dim() == 1: - edge_attr = edge_attr.view(-1, 1) - if perm is not None: - edge_attr = edge_attr[perm] - edge_attr = self.lin_edge(edge_attr) - - if bipartite: - if not hasattr(self, "lin_src"): - raise RuntimeError( - f"{self.__class__.__name__}.in_channels must be a pair of " - f"integers to allow bipartite node features, but got " - f"{self.in_channels}." - ) - x_src = self.lin_src(x[0]) - x_dst = self.lin_dst(x[1]) - else: - if not hasattr(self, "lin"): - raise RuntimeError( - f"{self.__class__.__name__}.in_channels is expected to be an " - f"integer, but got {self.in_channels}." - ) - x = self.lin(x) - - out = mha_gat_n2n( - (x_src, x_dst) if bipartite else x, - self.att, - graph, - num_heads=self.heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=edge_attr, - deterministic_dgrad=deterministic_dgrad, - deterministic_wgrad=deterministic_wgrad, - high_precision_dgrad=high_precision_dgrad, - high_precision_wgrad=high_precision_wgrad, - ) - - if self.bias is not None: - out = out + self.bias - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, heads={self.heads})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py deleted file mode 100644 index ebb30de9754..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
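A note on the attention-vector sizing in GATConv.__init__ above: the single learnable vector att spans the concatenation of the projected source and destination features (plus projected edge features when edge_dim is set), which is why it holds 2 or 3 times heads * out_channels entries and is viewed as (-1, heads, out_channels) in reset_parameters. A shape-only sketch:

import torch

heads, out_channels = 4, 8

att = torch.empty(2 * heads * out_channels)        # no edge features
print(att.view(-1, heads, out_channels).shape)     # torch.Size([2, 4, 8])

att_e = torch.empty(3 * heads * out_channels)      # with edge features
print(att_e.view(-1, heads, out_channels).shape)   # torch.Size([3, 4, 8])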
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n - -from .base import BaseConv, CSC - -torch = import_optional("torch") -nn = import_optional("torch.nn") -torch_geometric = import_optional("torch_geometric") - - -class GATv2Conv(BaseConv): - r"""The GATv2 operator from the `"How Attentive are Graph Attention - Networks?" `_ paper, which fixes the - static attention problem of the standard - :class:`~torch_geometric.conv.GATConv` layer. - Since the linear layers in the standard GAT are applied right after each - other, the ranking of attended nodes is unconditioned on the query node. - In contrast, in :class:`GATv2`, every node can attend to any other node. - - .. math:: - \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} + - \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j}, - - where the attention coefficients :math:`\alpha_{i,j}` are computed as - - .. math:: - \alpha_{i,j} = - \frac{ - \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta} - [\mathbf{x}_i \, \Vert \, \mathbf{x}_j] - \right)\right)} - {\sum_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta} - [\mathbf{x}_i \, \Vert \, \mathbf{x}_k] - \right)\right)}. - - If the graph has multi-dimensional edge features :math:`\mathbf{e}_{i,j}`, - the attention coefficients :math:`\alpha_{i,j}` are computed as - - .. math:: - \alpha_{i,j} = - \frac{ - \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta} - [\mathbf{x}_i \, \Vert \, \mathbf{x}_j \, \Vert \, \mathbf{e}_{i,j}] - \right)\right)} - {\sum_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta} - [\mathbf{x}_i \, \Vert \, \mathbf{x}_k \, \Vert \, \mathbf{e}_{i,k}] - \right)\right)}. - - Args: - in_channels (int or tuple): Size of each input sample, or :obj:`-1` to - derive the size from the first input(s) to the forward method. - A tuple corresponds to the sizes of source and target - dimensionalities. - out_channels (int): Size of each output sample. - heads (int, optional): Number of multi-head-attentions. - (default: :obj:`1`) - concat (bool, optional): If set to :obj:`False`, the multi-head - attentions are averaged instead of concatenated. - (default: :obj:`True`) - negative_slope (float, optional): LeakyReLU angle of the negative - slope. (default: :obj:`0.2`) - edge_dim (int, optional): Edge feature dimensionality (in case - there are any). (default: :obj:`None`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - share_weights (bool, optional): If set to :obj:`True`, the same matrix - will be applied to the source and the target node of every edge. 
- (default: :obj:`False`) - """ - - def __init__( - self, - in_channels: Union[int, Tuple[int, int]], - out_channels: int, - heads: int = 1, - concat: bool = True, - negative_slope: float = 0.2, - edge_dim: Optional[int] = None, - bias: bool = True, - share_weights: bool = False, - ): - super().__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.heads = heads - self.concat = concat - self.negative_slope = negative_slope - self.edge_dim = edge_dim - self.share_weights = share_weights - - Linear = torch_geometric.nn.Linear - - if isinstance(in_channels, int): - self.lin_src = Linear( - in_channels, - heads * out_channels, - bias=bias, - weight_initializer="glorot", - ) - - if share_weights: - self.lin_dst = self.lin_src - else: - self.lin_dst = Linear( - in_channels, - heads * out_channels, - bias=bias, - weight_initializer="glorot", - ) - else: - self.lin_src = Linear( - in_channels[0], - heads * out_channels, - bias=bias, - weight_initializer="glorot", - ) - self.lin_dst = Linear( - in_channels[1], - heads * out_channels, - bias=bias, - weight_initializer="glorot", - ) - - self.att = nn.Parameter(torch.Tensor(heads * out_channels)) - - if edge_dim is not None: - self.lin_edge = Linear( - edge_dim, heads * out_channels, bias=False, weight_initializer="glorot" - ) - else: - self.register_parameter("lin_edge", None) - - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(heads * out_channels)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_channels)) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self): - self.lin_src.reset_parameters() - self.lin_dst.reset_parameters() - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - - torch_geometric.nn.inits.glorot( - self.att.view(-1, self.heads, self.out_channels) - ) - - torch_geometric.nn.inits.zeros(self.bias) - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_attr: Optional[torch.Tensor] = None, - deterministic_dgrad: bool = False, - deterministic_wgrad: bool = False, - ) -> torch.Tensor: - r"""Runs the forward pass of the module. - - Args: - x (torch.Tensor or tuple): The node features. Can be a tuple of - tensors denoting source and destination node features. - edge_index (EdgeIndex or CSC): The edge indices. - edge_attr: (torch.Tensor, optional) The edge features. - deterministic_dgrad : bool, default=False - Optional flag indicating whether the feature gradients - are computed deterministically using a dedicated workspace buffer. - deterministic_wgrad: bool, default=False - Optional flag indicating whether the weight gradients - are computed deterministically using a dedicated workspace buffer. - """ - bipartite = not isinstance(x, torch.Tensor) or not self.share_weights - graph, perm = self.get_cugraph(edge_index, bipartite=bipartite) - if deterministic_dgrad: - graph.add_reverse_graph() - - if edge_attr is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_dim must be set to accept " - f"edge features." 
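On the share_weights flag handled in GATv2Conv.__init__ above: when it is set, the destination projection is literally the same module object as the source projection. A sketch with plain torch.nn.Linear standing in for torch_geometric.nn.Linear (an assumption for illustration only):

import torch.nn as nn

in_channels, heads, out_channels = 16, 4, 8

lin_src = nn.Linear(in_channels, heads * out_channels, bias=True)
lin_dst_shared = lin_src                                     # share_weights=True
lin_dst_own = nn.Linear(in_channels, heads * out_channels)   # share_weights=False

print(lin_dst_shared is lin_src, lin_dst_own is lin_src)     # True False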
- ) - if edge_attr.dim() == 1: - edge_attr = edge_attr.view(-1, 1) - if perm is not None: - edge_attr = edge_attr[perm] - edge_attr = self.lin_edge(edge_attr) - - if bipartite: - if isinstance(x, torch.Tensor): - x = (x, x) - x_src = self.lin_src(x[0]) - x_dst = self.lin_dst(x[1]) - else: - x = self.lin_src(x) - - out = mha_gat_v2_n2n( - (x_src, x_dst) if bipartite else x, - self.att, - graph, - num_heads=self.heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=edge_attr, - deterministic_dgrad=deterministic_dgrad, - deterministic_wgrad=deterministic_wgrad, - ) - - if self.bias is not None: - out = out + self.bias - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, heads={self.heads})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py deleted file mode 100644 index a73dd8e57ff..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union -from collections import defaultdict - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import mha_gat_n2n - -from .base import BaseConv -from cugraph_pyg.utils.imports import package_available - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -class HeteroGATConv(BaseConv): - r"""The graph attentional operator on heterogeneous graphs, where a separate - `GATConv` is applied on the homogeneous graph for each edge type. Compared - with directly wrapping `GATConv`s with `HeteroConv`, `HeteroGATConv` fuses - all the linear transformation associated with each node type together into 1 - GEMM call, to improve the performance on GPUs. - - Parameters - ---------- - in_channels : int or Dict[str, int]) - Size of each input sample of every node type. - - out_channels : int - Size of each output sample. - - node_types : List[str] - List of Node types. - - edge_types : List[Tuple[str, str, str]] - List of Edge types. - - heads : int, optional (default=1) - Number of multi-head-attentions. - - concat : bool, optional (default=True): - If set to :obj:`False`, the multi-head attentions are averaged instead - of concatenated. - - negative_slope : float, optional (default=0.2) - LeakyReLU angle of the negative slope. - - bias : bool, optional (default=True) - If set to :obj:`False`, the layer will not learn an additive bias. - - aggr : str, optional (default="sum") - The aggregation scheme to use for grouping node embeddings generated by - different relations. Choose from "sum", "mean", "min", "max". 
- """ - - def __init__( - self, - in_channels: Union[int, dict[str, int]], - out_channels: int, - node_types: list[str], - edge_types: list[tuple[str, str, str]], - heads: int = 1, - concat: bool = True, - negative_slope: float = 0.2, - bias: bool = True, - aggr: str = "sum", - ): - if not package_available("torch_geometric>=2.4.0"): - raise RuntimeError( - f"{self.__class__.__name__} requires torch_geometric>=2.4.0." - ) - - super().__init__() - - if isinstance(in_channels, int): - in_channels = dict.fromkeys(node_types, in_channels) - self.in_channels = in_channels - self.out_channels = out_channels - - self.node_types = node_types - self.edge_types = edge_types - self.num_heads = heads - self.concat_heads = concat - - self.negative_slope = negative_slope - self.aggr = aggr - - self.relations_per_ntype = defaultdict(lambda: ([], [])) - - lin_weights = dict.fromkeys(self.node_types) - attn_weights = dict.fromkeys(self.edge_types) - biases = dict.fromkeys(self.edge_types) - - ParameterDict = torch_geometric.nn.parameter_dict.ParameterDict - - for edge_type in self.edge_types: - src_type, _, dst_type = edge_type - self.relations_per_ntype[src_type][0].append(edge_type) - if src_type != dst_type: - self.relations_per_ntype[dst_type][1].append(edge_type) - - attn_weights[edge_type] = torch.empty( - 2 * self.num_heads * self.out_channels - ) - - if bias and concat: - biases[edge_type] = torch.empty(self.num_heads * out_channels) - elif bias: - biases[edge_type] = torch.empty(out_channels) - else: - biases[edge_type] = None - - for ntype in self.node_types: - n_src_rel = len(self.relations_per_ntype[ntype][0]) - n_dst_rel = len(self.relations_per_ntype[ntype][1]) - n_rel = n_src_rel + n_dst_rel - - lin_weights[ntype] = torch.empty( - (n_rel * self.num_heads * self.out_channels, self.in_channels[ntype]) - ) - - self.lin_weights = ParameterDict(lin_weights) - self.attn_weights = ParameterDict(attn_weights) - - if bias: - self.bias = ParameterDict(biases) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def split_tensors( - self, x_fused_dict: dict[str, torch.Tensor], dim: int - ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: - """Split fused tensors into chunks based on edge types. - - Parameters - ---------- - x_fused_dict : dict[str, torch.Tensor] - A dictionary to hold node feature for each node type. The key is - node type; the value is a fused tensor that account for all - relations for that node type. - - dim : int - Dimension along which to split the fused tensor. - - Returns - ------- - x_src_dict : dict[str, torch.Tensor] - A dictionary to hold source node feature for each relation graph. - - x_dst_dict : dict[str, torch.Tensor] - A dictionary to hold destination node feature for each relation graph. 
- """ - x_src_dict = dict.fromkeys(self.edge_types) - x_dst_dict = dict.fromkeys(self.edge_types) - - for ntype, t in x_fused_dict.items(): - n_src_rel = len(self.relations_per_ntype[ntype][0]) - n_dst_rel = len(self.relations_per_ntype[ntype][1]) - n_rel = n_src_rel + n_dst_rel - t_list = torch.chunk(t, chunks=n_rel, dim=dim) - - for i, src_rel in enumerate(self.relations_per_ntype[ntype][0]): - x_src_dict[src_rel] = t_list[i] - - for i, dst_rel in enumerate(self.relations_per_ntype[ntype][1]): - x_dst_dict[dst_rel] = t_list[i + n_src_rel] - - return x_src_dict, x_dst_dict - - def reset_parameters(self, seed: Optional[int] = None): - if seed is not None: - torch.manual_seed(seed) - - w_src, w_dst = self.split_tensors(self.lin_weights, dim=0) - - for edge_type in self.edge_types: - src_type, _, dst_type = edge_type - - # lin_src - torch_geometric.nn.inits.glorot(w_src[edge_type]) - - # lin_dst - if src_type != dst_type: - torch_geometric.nn.inits.glorot(w_dst[edge_type]) - - # attn_weights - torch_geometric.nn.inits.glorot( - self.attn_weights[edge_type].view(-1, self.num_heads, self.out_channels) - ) - - # bias - if self.bias is not None: - torch_geometric.nn.inits.zeros(self.bias[edge_type]) - - def forward( - self, - x_dict: dict[str, torch.Tensor], - edge_index_dict: dict[tuple[str, str, str], torch.Tensor], - ) -> dict[str, torch.Tensor]: - feat_dict = dict.fromkeys(x_dict.keys()) - - for ntype, x in x_dict.items(): - feat_dict[ntype] = x @ self.lin_weights[ntype].T - - x_src_dict, x_dst_dict = self.split_tensors(feat_dict, dim=1) - - out_dict = defaultdict(list) - - for edge_type, edge_index in edge_index_dict.items(): - src_type, _, dst_type = edge_type - - csc = BaseConv.to_csc( - edge_index, (x_dict[src_type].size(0), x_dict[dst_type].size(0)) - ) - - if src_type == dst_type: - graph, _ = self.get_cugraph( - csc, - bipartite=False, - ) - out = mha_gat_n2n( - x_src_dict[edge_type], - self.attn_weights[edge_type], - graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat_heads, - ) - - else: - graph, _ = self.get_cugraph( - csc, - bipartite=True, - ) - out = mha_gat_n2n( - (x_src_dict[edge_type], x_dst_dict[edge_type]), - self.attn_weights[edge_type], - graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat_heads, - ) - - if self.bias is not None: - out = out + self.bias[edge_type] - - out_dict[dst_type].append(out) - - for key, value in out_dict.items(): - out_dict[key] = torch_geometric.nn.conv.hetero_conv.group(value, self.aggr) - - return out_dict diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py deleted file mode 100644 index 13fa08db5c5..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Optional, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import agg_hg_basis_n2n_post - -from .base import BaseConv, CSC - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -class RGCNConv(BaseConv): # pragma: no cover - r"""The relational graph convolutional operator from the `"Modeling - Relational Data with Graph Convolutional Networks" - `_ paper. - - .. math:: - \mathbf{x}^{\prime}_i = \mathbf{\Theta}_{\textrm{root}} \cdot - \mathbf{x}_i + \sum_{r \in \mathcal{R}} \sum_{j \in \mathcal{N}_r(i)} - \frac{1}{|\mathcal{N}_r(i)|} \mathbf{\Theta}_r \cdot \mathbf{x}_j, - - where :math:`\mathcal{R}` denotes the set of relations, *i.e.* edge types. - Edge type needs to be a one-dimensional :obj:`torch.long` tensor which - stores a relation identifier - :math:`\in \{ 0, \ldots, |\mathcal{R}| - 1\}` for each edge. - - Args: - in_channels (int): Size of each input sample. - out_channels (int): Size of each output sample. - num_relations (int): Number of relations. - num_bases (int, optional): If set, this layer will use the - basis-decomposition regularization scheme where :obj:`num_bases` - denotes the number of bases to use. (default: :obj:`None`) - aggr (str, optional): The aggregation scheme to use - (:obj:`"add"`, :obj:`"mean"`, :obj:`"sum"`). - (default: :obj:`"mean"`) - root_weight (bool, optional): If set to :obj:`False`, the layer will - not add transformed root node features to the output. - (default: :obj:`True`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - num_relations: int, - num_bases: Optional[int] = None, - aggr: str = "mean", - root_weight: bool = True, - bias: bool = True, - ): - super().__init__() - - if aggr not in ["mean", "sum", "add"]: - raise ValueError( - f"Aggregation function must be chosen from 'mean', 'sum' or " - f"'add', but got '{aggr}'." 
- ) - - self.in_channels = in_channels - self.out_channels = out_channels - self.num_relations = num_relations - self.num_bases = num_bases - self.aggr = aggr - self.root_weight = root_weight - - dim_root_weight = 1 if root_weight else 0 - - if num_bases is not None: - self.weight = torch.nn.Parameter( - torch.empty(num_bases + dim_root_weight, in_channels, out_channels) - ) - self.comp = torch.nn.Parameter(torch.empty(num_relations, num_bases)) - else: - self.weight = torch.nn.Parameter( - torch.empty(num_relations + dim_root_weight, in_channels, out_channels) - ) - self.register_parameter("comp", None) - - if bias: - self.bias = torch.nn.Parameter(torch.empty(out_channels)) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self): - end = -1 if self.root_weight else None - torch_geometric.nn.inits.glorot(self.weight[:end]) - torch_geometric.nn.inits.glorot(self.comp) - if self.root_weight: - torch_geometric.nn.inits.glorot(self.weight[-1]) - torch_geometric.nn.inits.zeros(self.bias) - - def forward( - self, - x: torch.Tensor, - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_type: torch.Tensor, - max_num_neighbors: Optional[int] = None, - ) -> torch.Tensor: - - graph, _ = self.get_typed_cugraph( - edge_index, - edge_type, - self.num_relations, - max_num_neighbors=max_num_neighbors, - ) - - out = agg_hg_basis_n2n_post( - x, - self.comp, - graph, - concat_own=self.root_weight, - norm_by_out_degree=bool(self.aggr == "mean"), - ) - - out = out @ self.weight.view(-1, self.out_channels) - - if self.bias is not None: - out = out + self.bias - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, num_relations={self.num_relations})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py deleted file mode 100644 index 65dc99d8988..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import agg_concat_n2n - -from .base import BaseConv, CSC - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -class SAGEConv(BaseConv): - r"""The GraphSAGE operator from the `"Inductive Representation Learning on - Large Graphs" `_ paper. - - .. math:: - \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + \mathbf{W}_2 \cdot - \mathrm{mean}_{j \in \mathcal{N(i)}} \mathbf{x}_j - - If :obj:`project = True`, then :math:`\mathbf{x}_j` will first get - projected via - - .. math:: - \mathbf{x}_j \leftarrow \sigma ( \mathbf{W}_3 \mathbf{x}_j + - \mathbf{b}) - - as described in Eq. (3) of the paper. - - Args: - in_channels (int or tuple): Size of each input sample. 
A tuple - corresponds to the sizes of source and target dimensionalities. - out_channels (int): Size of each output sample. - aggr (str or Aggregation, optional): The aggregation scheme to use. - Choose from :obj:`"mean"`, :obj:`"sum"`, :obj:`"min"` or - :obj:`"max"`. (default: :obj:`"mean"`) - normalize (bool, optional): If set to :obj:`True`, output features - will be :math:`\ell_2`-normalized, *i.e.*, - :math:`\frac{\mathbf{h}_i^{k+1}} - {\| \mathbf{h}_i^{k+1} \|_2}`. - (default: :obj:`False`) - root_weight (bool, optional): If set to :obj:`False`, the layer will - not add transformed root node features to the output. - (default: :obj:`True`) - project (bool, optional): If set to :obj:`True`, the layer will apply a - linear transformation followed by an activation function before - aggregation (as described in Eq. (3) of the paper). - (default: :obj:`False`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - """ - - def __init__( - self, - in_channels: Union[int, Tuple[int, int]], - out_channels: int, - aggr: str = "mean", - normalize: bool = False, - root_weight: bool = True, - project: bool = False, - bias: bool = True, - ): - super().__init__() - - if aggr not in ["mean", "sum", "min", "max"]: - raise ValueError( - f"Aggregation function must be chosen from 'mean'," - f" 'sum', 'min' or 'max', but got '{aggr}'." - ) - - self.in_channels = in_channels - self.out_channels = out_channels - self.aggr = aggr - self.normalize = normalize - self.root_weight = root_weight - self.project = project - - if isinstance(in_channels, int): - self.in_channels_src = self.in_channels_dst = in_channels - else: - self.in_channels_src, self.in_channels_dst = in_channels - - if self.project: - self.pre_lin = torch_geometric.nn.Linear( - self.in_channels_src, self.in_channels_src, bias=True - ) - - if self.root_weight: - self.lin = torch_geometric.nn.Linear( - self.in_channels_src + self.in_channels_dst, out_channels, bias=bias - ) - else: - self.lin = torch_geometric.nn.Linear( - self.in_channels_src, out_channels, bias=bias - ) - - self.reset_parameters() - - def reset_parameters(self): - if self.project: - self.pre_lin.reset_parameters() - self.lin.reset_parameters() - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - max_num_neighbors: Optional[int] = None, - ) -> torch.Tensor: - bipartite = isinstance(x, Tuple) - graph, _ = self.get_cugraph( - edge_index=edge_index, - bipartite=bipartite, - max_num_neighbors=max_num_neighbors, - ) - - if self.project: - if bipartite: - x = (self.pre_lin(x[0]).relu(), x[1]) - else: - x = self.pre_lin(x).relu() - - out = agg_concat_n2n(x, graph, self.aggr) - - if self.root_weight: - out = self.lin(out) - else: - out = self.lin(out[:, : self.in_channels_src]) - - if self.normalize: - out = torch.nn.functional.normalize(out, p=2.0, dim=-1) - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, aggr={self.aggr})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py deleted file mode 100644 index e184ee0e893..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import mha_simple_n2n - -from .base import BaseConv, CSC - -torch = import_optional("torch") -nn = import_optional("torch.nn") -torch_geometric = import_optional("torch_geometric") - - -class TransformerConv(BaseConv): - r"""The graph transformer operator from the `"Masked Label Prediction: - Unified Message Passing Model for Semi-Supervised Classification" - `_ paper. - - .. math:: - \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + - \sum_{j \in \mathcal{N}(i)} \alpha_{i,j} \mathbf{W}_2 \mathbf{x}_{j}, - - where the attention coefficients :math:`\alpha_{i,j}` are computed via - multi-head dot product attention: - - .. math:: - \alpha_{i,j} = \textrm{softmax} \left( - \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top} (\mathbf{W}_4\mathbf{x}_j)} - {\sqrt{d}} \right) - - Args: - in_channels (int or tuple): Size of each input sample, or :obj:`-1` to - derive the size from the first input(s) to the forward method. - A tuple corresponds to the sizes of source and target - dimensionalities. - out_channels (int): Size of each output sample. - heads (int, optional): Number of multi-head-attentions. - (default: :obj:`1`) - concat (bool, optional): If set to :obj:`False`, the multi-head - attentions are averaged instead of concatenated. - (default: :obj:`True`) - beta (bool, optional): If set, will combine aggregation and - skip information via - - .. math:: - \mathbf{x}^{\prime}_i = \beta_i \mathbf{W}_1 \mathbf{x}_i + - (1 - \beta_i) \underbrace{\left(\sum_{j \in \mathcal{N}(i)} - \alpha_{i,j} \mathbf{W}_2 \vec{x}_j \right)}_{=\mathbf{m}_i} - - with :math:`\beta_i = \textrm{sigmoid}(\mathbf{w}_5^{\top} - [ \mathbf{W}_1 \mathbf{x}_i, \mathbf{m}_i, \mathbf{W}_1 - \mathbf{x}_i - \mathbf{m}_i ])` (default: :obj:`False`) - edge_dim (int, optional): Edge feature dimensionality (in case - there are any). Edge features are added to the keys after - linear transformation, that is, prior to computing the - attention dot product. They are also added to final values - after the same linear transformation. The model is: - - .. math:: - \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + - \sum_{j \in \mathcal{N}(i)} \alpha_{i,j} \left( - \mathbf{W}_2 \mathbf{x}_{j} + \mathbf{W}_6 \mathbf{e}_{ij} - \right), - - where the attention coefficients :math:`\alpha_{i,j}` are now - computed via: - - .. math:: - \alpha_{i,j} = \textrm{softmax} \left( - \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top} - (\mathbf{W}_4\mathbf{x}_j + \mathbf{W}_6 \mathbf{e}_{ij})} - {\sqrt{d}} \right) - - (default :obj:`None`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - root_weight (bool, optional): If set to :obj:`False`, the layer will - not add the transformed root node features to the output and the - option :attr:`beta` is set to :obj:`False`. 
(default: :obj:`True`) - """ - - def __init__( - self, - in_channels: Union[int, Tuple[int, int]], - out_channels: int, - heads: int = 1, - concat: bool = True, - beta: bool = False, - edge_dim: Optional[int] = None, - bias: bool = True, - root_weight: bool = True, - ): - super().__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.heads = heads - self.beta = beta and root_weight - self.root_weight = root_weight - self.concat = concat - self.edge_dim = edge_dim - - if isinstance(in_channels, int): - in_channels = (in_channels, in_channels) - - Linear = torch_geometric.nn.Linear - - self.lin_key = Linear(in_channels[0], heads * out_channels) - self.lin_query = Linear(in_channels[1], heads * out_channels) - self.lin_value = Linear(in_channels[0], heads * out_channels) - if edge_dim is not None: - self.lin_edge = Linear(edge_dim, heads * out_channels, bias=False) - else: - self.lin_edge = self.register_parameter("lin_edge", None) - - if concat: - self.lin_skip = Linear(in_channels[1], heads * out_channels, bias=bias) - if self.beta: - self.lin_beta = Linear(3 * heads * out_channels, 1, bias=False) - else: - self.lin_beta = self.register_parameter("lin_beta", None) - else: - self.lin_skip = Linear(in_channels[1], out_channels, bias=bias) - if self.beta: - self.lin_beta = Linear(3 * out_channels, 1, bias=False) - else: - self.lin_beta = self.register_parameter("lin_beta", None) - - self.reset_parameters() - - def reset_parameters(self): - self.lin_key.reset_parameters() - self.lin_query.reset_parameters() - self.lin_value.reset_parameters() - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - self.lin_skip.reset_parameters() - if self.lin_beta is not None: - self.lin_beta.reset_parameters() - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_attr: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r"""Runs the forward pass of the module. - - Args: - x (torch.Tensor or tuple): The node features. Can be a tuple of - tensors denoting source and destination node features. - edge_index (EdgeIndex or CSC): The edge indices. - edge_attr: (torch.Tensor, optional) The edge features. - """ - bipartite = True - graph, perm = self.get_cugraph(edge_index=edge_index, bipartite=bipartite) - - if isinstance(x, torch.Tensor): - x = (x, x) - - query = self.lin_query(x[1]) - key = self.lin_key(x[0]) - value = self.lin_value(x[0]) - - if edge_attr is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_dim must be set to accept " - f"edge features." 
- ) - if perm is not None: - edge_attr = edge_attr[perm] - edge_attr = self.lin_edge(edge_attr) - - out = mha_simple_n2n( - key, - query, - value, - graph, - self.heads, - self.concat, - edge_emb=edge_attr, - norm_by_dim=True, - score_bias=None, - ) - - if self.root_weight: - x_r = self.lin_skip(x[1]) - if self.lin_beta is not None: - beta = self.lin_beta(torch.cat([out, x_r, out - x_r], dim=-1)) - beta = beta.sigmoid() - out = beta * x_r + (1 - beta) * out - else: - out = out + x_r - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, heads={self.heads})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py b/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py deleted file mode 100644 index 34fe9c4463e..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/sampler/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph_pyg.sampler.sampler import BaseSampler, SampleIterator diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py deleted file mode 100644 index bc3d4fd8d3c..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler.py +++ /dev/null @@ -1,540 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Iterator, Union, Dict, Tuple - -from cugraph.utilities.utils import import_optional -from cugraph.gnn import DistSampler - -from .sampler_utils import filter_cugraph_pyg_store, neg_sample, neg_cat - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -class SampleIterator: - """ - Iterator that combines output graphs with their - features to produce final output minibatches - that can be fed into a GNN model. - """ - - def __init__( - self, - data: Tuple[ - "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" - ], - output_iter: Iterator[ - Union[ - "torch_geometric.sampler.HeteroSamplerOutput", - "torch_geometric.sampler.SamplerOutput", - ] - ], - ): - """ - Constructs a new SampleIterator - - Parameters - ---------- - data: Tuple[torch_geometric.data.FeatureStore, torch_geometric.data.GraphStore] - The original graph that samples were generated from, as a - FeatureStore/GraphStore tuple. 
- output_iter: Iterator[Union["torch_geometric.sampler.HeteroSamplerOutput", - "torch_geometric.sampler.SamplerOutput"]] - An iterator over outputted sampling results. - """ - self.__feature_store, self.__graph_store = data - self.__output_iter = output_iter - - def __next__(self): - next_sample = next(self.__output_iter) - if isinstance(next_sample, torch_geometric.sampler.SamplerOutput): - sz = next_sample.edge.numel() - if sz == next_sample.col.numel() and ( - next_sample.node.numel() > next_sample.col[-1] - ): - # This will only trigger on very small batches and will have minimal - # performance impact. If COO output is removed, then this condition - # can be avoided. - col = next_sample.col - else: - col = torch_geometric.edge_index.ptr2index( - next_sample.col, next_sample.edge.numel() - ) - - data = filter_cugraph_pyg_store( - self.__feature_store, - self.__graph_store, - next_sample.node, - next_sample.row, - col, - next_sample.edge, - None, - ) - - """ - # TODO Re-enable this once PyG resolves - # the issue with edge features (9566) - data = torch_geometric.loader.utils.filter_custom_store( - self.__feature_store, - self.__graph_store, - next_sample.node, - next_sample.row, - col, - next_sample.edge, - None, - ) - """ - - if "n_id" not in data: - data.n_id = next_sample.node - if next_sample.edge is not None and "e_id" not in data: - edge = next_sample.edge.to(torch.long) - data.e_id = edge - - data.batch = next_sample.batch - data.num_sampled_nodes = next_sample.num_sampled_nodes - data.num_sampled_edges = next_sample.num_sampled_edges - - data.input_id = next_sample.metadata[0] - data.batch_size = data.input_id.size(0) - - if len(next_sample.metadata) == 2: - data.seed_time = next_sample.metadata[1] - elif len(next_sample.metadata) == 4: - ( - data.edge_label_index, - data.edge_label, - data.seed_time, - ) = next_sample.metadata[1:] - else: - raise ValueError("Invalid metadata") - - elif isinstance(next_sample, torch_geometric.sampler.HeteroSamplerOutput): - col = {} - for edge_type, col_idx in next_sample.col: - sz = next_sample.edge[edge_type].numel() - if sz == col_idx.numel(): - col[edge_type] = col_idx - else: - col[edge_type] = torch_geometric.edge_index.ptr2index(col_idx, sz) - - data = torch_geometric.loader.utils.filter_custom_hetero_store( - self.__feature_store, - self.__graph_store, - next_sample.node, - next_sample.row, - col, - next_sample.edge, - None, - ) - - for key, node in next_sample.node.items(): - if "n_id" not in data[key]: - data[key].n_id = node - - for key, edge in (next_sample.edge or {}).items(): - if edge is not None and "e_id" not in data[key]: - edge = edge.to(torch.long) - data[key].e_id = edge - - data.set_value_dict("batch", next_sample.batch) - data.set_value_dict("num_sampled_nodes", next_sample.num_sampled_nodes) - data.set_value_dict("num_sampled_edges", next_sample.num_sampled_edges) - - # TODO figure out how to set input_id for heterogeneous output - else: - raise ValueError("Invalid output type") - - return data - - def __iter__(self): - return self - - -class SampleReader: - """ - Iterator that processes results from the cuGraph distributed sampler. - """ - - def __init__( - self, base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] - ): - """ - Constructs a new SampleReader. - - Parameters - ---------- - base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] - The reader responsible for loading saved samples produced by - the cuGraph distributed sampler. 
- """ - self.__base_reader = base_reader - self.__num_samples_remaining = 0 - self.__index = 0 - - def __next__(self): - if self.__num_samples_remaining == 0: - # raw_sample_data is already a dict of tensors - self.__raw_sample_data, start_inclusive, end_inclusive = next( - self.__base_reader - ) - - self.__raw_sample_data["input_offsets"] -= self.__raw_sample_data[ - "input_offsets" - ][0].clone() - self.__raw_sample_data["label_hop_offsets"] -= self.__raw_sample_data[ - "label_hop_offsets" - ][0].clone() - self.__raw_sample_data["renumber_map_offsets"] -= self.__raw_sample_data[ - "renumber_map_offsets" - ][0].clone() - if "major_offsets" in self.__raw_sample_data: - self.__raw_sample_data["major_offsets"] -= self.__raw_sample_data[ - "major_offsets" - ][0].clone() - - self.__num_samples_remaining = end_inclusive - start_inclusive + 1 - self.__index = 0 - - out = self._decode(self.__raw_sample_data, self.__index) - self.__index += 1 - self.__num_samples_remaining -= 1 - return out - - def __iter__(self): - return self - - -class HomogeneousSampleReader(SampleReader): - """ - Subclass of SampleReader that reads homogeneous output samples - produced by the cuGraph distributed sampler. - """ - - def __init__( - self, base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] - ): - """ - Constructs a new HomogeneousSampleReader - - Parameters - ---------- - base_reader: Iterator[Tuple[Dict[str, "torch.Tensor"], int, int]] - The iterator responsible for loading saved samples produced by - the cuGraph distributed sampler. - """ - super().__init__(base_reader) - - def __decode_csc(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): - fanout_length = (raw_sample_data["label_hop_offsets"].numel() - 1) // ( - raw_sample_data["renumber_map_offsets"].numel() - 1 - ) - - major_offsets_start_incl = raw_sample_data["label_hop_offsets"][ - index * fanout_length - ] - major_offsets_end_incl = raw_sample_data["label_hop_offsets"][ - (index + 1) * fanout_length - ] - - major_offsets = raw_sample_data["major_offsets"][ - major_offsets_start_incl : major_offsets_end_incl + 1 - ].clone() - minors = raw_sample_data["minors"][major_offsets[0] : major_offsets[-1]] - edge_id = raw_sample_data["edge_id"][major_offsets[0] : major_offsets[-1]] - # don't retrieve edge type for a homogeneous graph - - major_offsets -= major_offsets[0].clone() - - renumber_map_start = raw_sample_data["renumber_map_offsets"][index] - renumber_map_end = raw_sample_data["renumber_map_offsets"][index + 1] - - renumber_map = raw_sample_data["map"][renumber_map_start:renumber_map_end] - - current_label_hop_offsets = raw_sample_data["label_hop_offsets"][ - index * fanout_length : (index + 1) * fanout_length + 1 - ].clone() - current_label_hop_offsets -= current_label_hop_offsets[0].clone() - - num_sampled_edges = major_offsets[current_label_hop_offsets].diff() - - num_sampled_nodes_hops = torch.tensor( - [ - minors[: num_sampled_edges[:i].sum()].max() + 1 - for i in range(1, fanout_length + 1) - ], - device="cpu", - ) - - num_seeds = ( - torch.searchsorted(major_offsets, num_sampled_edges[0]).reshape((1,)).cpu() - ) - num_sampled_nodes = torch.concat( - [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)] - ) - - input_index = raw_sample_data["input_index"][ - raw_sample_data["input_offsets"][index] : raw_sample_data["input_offsets"][ - index + 1 - ] - ] - - num_seeds = input_index.numel() - input_index = input_index[input_index >= 0] - - num_pos = input_index.numel() - num_neg = num_seeds - num_pos - if num_neg > 0: 
- edge_label = torch.concat( - [ - torch.full((num_pos,), 1.0), - torch.full((num_neg,), 0.0), - ] - ) - else: - edge_label = None - - edge_inverse = ( - ( - raw_sample_data["edge_inverse"][ - (raw_sample_data["input_offsets"][index] * 2) : ( - raw_sample_data["input_offsets"][index + 1] * 2 - ) - ] - ) - if "edge_inverse" in raw_sample_data - else None - ) - - if edge_inverse is None: - metadata = ( - input_index, - None, # TODO this will eventually include time - ) - else: - metadata = ( - input_index, - edge_inverse.view(2, -1), - edge_label, - None, # TODO this will eventually include time - ) - - return torch_geometric.sampler.SamplerOutput( - node=renumber_map.cpu(), - row=minors, - col=major_offsets, - edge=edge_id.cpu(), - batch=renumber_map[:num_seeds], - num_sampled_nodes=num_sampled_nodes.cpu(), - num_sampled_edges=num_sampled_edges.cpu(), - metadata=metadata, - ) - - def __decode_coo(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): - fanout_length = (raw_sample_data["label_hop_offsets"].numel() - 1) // ( - raw_sample_data["renumber_map_offsets"].numel() - 1 - ) - - major_minor_start = raw_sample_data["label_hop_offsets"][index * fanout_length] - ix_end = (index + 1) * fanout_length - if ix_end == raw_sample_data["label_hop_offsets"].numel(): - major_minor_end = raw_sample_data["majors"].numel() - else: - major_minor_end = raw_sample_data["label_hop_offsets"][ix_end] - - majors = raw_sample_data["majors"][major_minor_start:major_minor_end] - minors = raw_sample_data["minors"][major_minor_start:major_minor_end] - edge_id = raw_sample_data["edge_id"][major_minor_start:major_minor_end] - # don't retrieve edge type for a homogeneous graph - - renumber_map_start = raw_sample_data["renumber_map_offsets"][index] - renumber_map_end = raw_sample_data["renumber_map_offsets"][index + 1] - - renumber_map = raw_sample_data["map"][renumber_map_start:renumber_map_end] - - num_sampled_edges = ( - raw_sample_data["label_hop_offsets"][ - index * fanout_length : (index + 1) * fanout_length + 1 - ] - .diff() - .cpu() - ) - - num_seeds = (majors[: num_sampled_edges[0]].max() + 1).reshape((1,)).cpu() - num_sampled_nodes_hops = torch.tensor( - [ - minors[: num_sampled_edges[:i].sum()].max() + 1 - for i in range(1, fanout_length + 1) - ], - device="cpu", - ) - - num_sampled_nodes = torch.concat( - [num_seeds, num_sampled_nodes_hops.diff(prepend=num_seeds)] - ) - - input_index = raw_sample_data["input_index"][ - raw_sample_data["input_offsets"][index] : raw_sample_data["input_offsets"][ - index + 1 - ] - ] - - edge_inverse = ( - ( - raw_sample_data["edge_inverse"][ - (raw_sample_data["input_offsets"][index] * 2) : ( - raw_sample_data["input_offsets"][index + 1] * 2 - ) - ] - ) - if "edge_inverse" in raw_sample_data - else None - ) - - if edge_inverse is None: - metadata = ( - input_index, - None, # TODO this will eventually include time - ) - else: - metadata = ( - input_index, - edge_inverse.view(2, -1), - None, - None, # TODO this will eventually include time - ) - - return torch_geometric.sampler.SamplerOutput( - node=renumber_map.cpu(), - row=minors, - col=majors, - edge=edge_id, - batch=renumber_map[:num_seeds], - num_sampled_nodes=num_sampled_nodes, - num_sampled_edges=num_sampled_edges, - metadata=metadata, - ) - - def _decode(self, raw_sample_data: Dict[str, "torch.Tensor"], index: int): - if "major_offsets" in raw_sample_data: - return self.__decode_csc(raw_sample_data, index) - else: - return self.__decode_coo(raw_sample_data, index) - - -class BaseSampler: - def __init__( - 
self, - sampler: DistSampler, - data: Tuple[ - "torch_geometric.data.FeatureStore", "torch_geometric.data.GraphStore" - ], - batch_size: int = 16, - ): - self.__sampler = sampler - self.__feature_store, self.__graph_store = data - self.__batch_size = batch_size - - def sample_from_nodes( - self, index: "torch_geometric.sampler.NodeSamplerInput", **kwargs - ) -> Iterator[ - Union[ - "torch_geometric.sampler.HeteroSamplerOutput", - "torch_geometric.sampler.SamplerOutput", - ] - ]: - reader = self.__sampler.sample_from_nodes( - index.node, batch_size=self.__batch_size, input_id=index.input_id, **kwargs - ) - - edge_attrs = self.__graph_store.get_all_edge_attrs() - if ( - len(edge_attrs) == 1 - and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2] - ): - return HomogeneousSampleReader(reader) - else: - # TODO implement heterogeneous sampling - raise NotImplementedError( - "Sampling heterogeneous graphs is currently" - " unsupported in the non-dask API" - ) - - def sample_from_edges( - self, - index: "torch_geometric.sampler.EdgeSamplerInput", - neg_sampling: Optional["torch_geometric.sampler.NegativeSampling"], - **kwargs, - ) -> Iterator[ - Union[ - "torch_geometric.sampler.HeteroSamplerOutput", - "torch_geometric.sampler.SamplerOutput", - ] - ]: - src = index.row - dst = index.col - input_id = index.input_id - neg_batch_size = 0 - if neg_sampling: - # Sample every negative subset at once. - # TODO handle temporal sampling (node_time) - src_neg, dst_neg = neg_sample( - self.__graph_store, - index.row, - index.col, - self.__batch_size, - neg_sampling, - None, # src_time, - None, # src_node_time, - ) - if neg_sampling.is_binary(): - src, _ = neg_cat(src.cuda(), src_neg, self.__batch_size) - else: - # triplet, cat dst to src so length is the same; will - # result in the same set of unique vertices - src, _ = neg_cat(src.cuda(), dst_neg, self.__batch_size) - dst, neg_batch_size = neg_cat(dst.cuda(), dst_neg, self.__batch_size) - - # Concatenate -1s so the input id tensor lines up and can - # be processed by the dist sampler. - # When loading the output batch, '-1' will be dropped. - input_id, _ = neg_cat( - input_id, - torch.full( - (dst_neg.numel(),), -1, dtype=torch.int64, device=input_id.device - ), - self.__batch_size, - ) - - # TODO for temporal sampling, node times have to be - # adjusted here. - reader = self.__sampler.sample_from_edges( - torch.stack([src, dst]), # reverse of usual convention - input_id=input_id, - batch_size=self.__batch_size + neg_batch_size, - **kwargs, - ) - - edge_attrs = self.__graph_store.get_all_edge_attrs() - if ( - len(edge_attrs) == 1 - and edge_attrs[0].edge_type[0] == edge_attrs[0].edge_type[2] - ): - return HomogeneousSampleReader(reader) - else: - # TODO implement heterogeneous sampling - raise NotImplementedError( - "Sampling heterogeneous graphs is currently" - " unsupported in the non-dask API" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py deleted file mode 100644 index b3d56ef9992..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/sampler/sampler_utils.py +++ /dev/null @@ -1,531 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Sequence, Dict, Tuple - -from math import ceil - -from cugraph_pyg.data import GraphStore, DaskGraphStore - -from cugraph.utilities.utils import import_optional -import cudf -import cupy -import pylibcugraph - -dask_cudf = import_optional("dask_cudf") -torch_geometric = import_optional("torch_geometric") - -torch = import_optional("torch") -HeteroSamplerOutput = torch_geometric.sampler.base.HeteroSamplerOutput - - -def _get_unique_nodes( - sampling_results: cudf.DataFrame, - graph_store: DaskGraphStore, - node_type: str, - node_position: str, -) -> int: - """ - Counts the number of unique nodes of a given node type. - - Parameters - ---------- - sampling_results: cudf.DataFrame - The dataframe containing sampling results or filtered sampling results - (i.e. sampling results for hop 2) - graph_store: DaskGraphStore - The graph store containing the structure of the sampled graph. - node_type: str - The node type to count the number of unique nodes of. - node_position: str ('src' or 'dst') - Whether to examine source or destination nodes. - - Returns - ------- - cudf.Series - The unique nodes of the given node type. - """ - if node_position == "src": - edge_index = "majors" - edge_sel = 0 - elif node_position == "dst": - edge_index = "minors" - edge_sel = -1 - else: - raise ValueError(f"Illegal value {node_position} for node_position") - - etypes = [ - graph_store.canonical_edge_type_to_numeric(et) - for et in graph_store.edge_types - if et[edge_sel] == node_type - ] - if len(etypes) > 0: - f = sampling_results.edge_type == etypes[0] - for et in etypes[1:]: - f |= sampling_results.edge_type == et - - sampling_results_node = sampling_results[f] - else: - return cudf.Series([], dtype="int64") - - return sampling_results_node[edge_index] - - -def _sampler_output_from_sampling_results_homogeneous_coo( - sampling_results: cudf.DataFrame, - renumber_map: torch.Tensor, - graph_store: DaskGraphStore, - data_index: Dict[Tuple[int, int], Dict[str, int]], - batch_id: int, - metadata: Sequence = None, -) -> HeteroSamplerOutput: - """ - Parameters - ---------- - sampling_results: cudf.DataFrame - The dataframe containing sampling results. - renumber_map: torch.Tensor - The tensor containing the renumber map, or None if there - is no renumber map. - graph_store: DaskGraphStore - The graph store containing the structure of the sampled graph. - data_index: Dict[Tuple[int, int], Dict[str, int]] - Dictionary where keys are the batch id and hop id, - and values are dictionaries containing the max src - and max dst node ids for the batch and hop. - batch_id: int - The current batch id, whose samples are being retrieved - from the sampling results and data index. - metadata: Tensor - The metadata for the sampled batch. 
- - Returns - ------- - HeteroSamplerOutput - """ - - if len(graph_store.edge_types) > 1 or len(graph_store.node_types) > 1: - raise ValueError("Graph is heterogeneous") - - hops = torch.arange( - sampling_results.hop_id.iloc[len(sampling_results) - 1] + 1, device="cuda" - ) - hops = torch.searchsorted( - torch.as_tensor(sampling_results.hop_id, device="cuda"), hops - ) - - node_type = graph_store.node_types[0] - edge_type = graph_store.edge_types[0] - - num_nodes_per_hop_dict = {node_type: torch.zeros(len(hops) + 1, dtype=torch.int64)} - num_edges_per_hop_dict = {edge_type: torch.zeros(len(hops), dtype=torch.int64)} - - if renumber_map is None: - raise ValueError("Renumbered input is expected for homogeneous graphs") - - noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")} - - row_dict = { - edge_type: torch.as_tensor(sampling_results.majors, device="cuda"), - } - - col_dict = { - edge_type: torch.as_tensor(sampling_results.minors, device="cuda"), - } - - num_nodes_per_hop_dict[node_type][0] = data_index[batch_id, 0]["src_max"] + 1 - for hop in range(len(hops)): - hop_ix_start = hops[hop] - hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results) - - if num_nodes_per_hop_dict[node_type][hop] > 0: - max_id_hop = data_index[batch_id, hop]["dst_max"] - max_id_prev_hop = ( - data_index[batch_id, hop - 1]["dst_max"] - if hop > 0 - else data_index[batch_id, 0]["src_max"] - ) - - if max_id_hop > max_id_prev_hop: - num_nodes_per_hop_dict[node_type][hop + 1] = ( - max_id_hop - max_id_prev_hop - ) - else: - num_nodes_per_hop_dict[node_type][hop + 1] = 0 - # will default to 0 if the previous hop was 0, since this is a PyG requirement - - num_edges_per_hop_dict[edge_type][hop] = hop_ix_end - hop_ix_start - - if HeteroSamplerOutput is None: - raise ImportError("Error importing from pyg") - - return HeteroSamplerOutput( - node=noi_index, - row=row_dict, - col=col_dict, - edge=None, - num_sampled_nodes={k: t.tolist() for k, t in num_nodes_per_hop_dict.items()}, - num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()}, - metadata=metadata, - ) - - -def _sampler_output_from_sampling_results_homogeneous_csr( - major_offsets: torch.Tensor, - minors: torch.Tensor, - renumber_map: torch.Tensor, - graph_store: DaskGraphStore, - label_hop_offsets: torch.Tensor, - batch_id: int, - metadata: Sequence = None, -) -> HeteroSamplerOutput: - """ - Parameters - ---------- - major_offsets: torch.Tensor - The major offsets for the CSC/CSR matrix ("row pointer") - minors: torch.Tensor - The minors for the CSC/CSR matrix ("col index") - renumber_map: torch.Tensor - The tensor containing the renumber map. - Required. - graph_store: DaskGraphStore - The graph store containing the structure of the sampled graph. - label_hop_offsets: torch.Tensor - The tensor containing the label-hop offsets. - batch_id: int - The current batch id, whose samples are being retrieved - from the sampling results and data index. - metadata: Tensor - The metadata for the sampled batch. 
- - Returns - ------- - HeteroSamplerOutput - """ - - if len(graph_store.edge_types) > 1 or len(graph_store.node_types) > 1: - raise ValueError("Graph is heterogeneous") - - if renumber_map is None: - raise ValueError("Renumbered input is expected for homogeneous graphs") - node_type = graph_store.node_types[0] - edge_type = graph_store.edge_types[0] - - major_offsets = major_offsets.clone() - major_offsets[0] - label_hop_offsets = label_hop_offsets.clone() - label_hop_offsets[0] - - num_edges_per_hop_dict = { - edge_type: major_offsets[label_hop_offsets].diff().tolist() - } - - label_hop_offsets = label_hop_offsets.cpu() - num_nodes_per_hop_dict = { - node_type: torch.concat( - [ - label_hop_offsets.diff(), - (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,)), - ] - ).tolist() - } - - noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")} - - col_dict = { - edge_type: major_offsets, - } - - row_dict = { - edge_type: minors, - } - - if HeteroSamplerOutput is None: - raise ImportError("Error importing from pyg") - - return HeteroSamplerOutput( - node=noi_index, - row=row_dict, - col=col_dict, - edge=None, - num_sampled_nodes=num_nodes_per_hop_dict, - num_sampled_edges=num_edges_per_hop_dict, - metadata=metadata, - ) - - -def _sampler_output_from_sampling_results_heterogeneous( - sampling_results: cudf.DataFrame, - renumber_map: cudf.Series, - graph_store: DaskGraphStore, - metadata: Sequence = None, -) -> HeteroSamplerOutput: - """ - Parameters - ---------- - sampling_results: cudf.DataFrame - The dataframe containing sampling results. - renumber_map: cudf.Series - The series containing the renumber map, or None if there - is no renumber map. - graph_store: DaskGraphStore - The graph store containing the structure of the sampled graph. - metadata: Tensor - The metadata for the sampled batch. - - Returns - ------- - HeteroSamplerOutput - """ - - hops = torch.arange(sampling_results.hop_id.max() + 1, device="cuda") - hops = torch.searchsorted( - torch.as_tensor(sampling_results.hop_id, device="cuda"), hops - ) - - num_nodes_per_hop_dict = {} - num_edges_per_hop_dict = {} - - # Fill out hop 0 in num_nodes_per_hop_dict, which is based on src instead of dst - sampling_results_hop_0 = sampling_results.iloc[ - 0 : (hops[1] if len(hops) > 1 else len(sampling_results)) - ] - - for node_type in graph_store.node_types: - num_unique_nodes = _get_unique_nodes( - sampling_results_hop_0, graph_store, node_type, "src" - ).nunique() - - if num_unique_nodes > 0: - num_nodes_per_hop_dict[node_type] = torch.zeros( - len(hops) + 1, dtype=torch.int64 - ) - num_nodes_per_hop_dict[node_type][0] = num_unique_nodes - - if renumber_map is not None: - raise ValueError( - "Precomputing the renumber map is currently " - "unsupported for heterogeneous graphs." - ) - - # Calculate nodes of interest based on unique nodes in order of appearance - # Use hop 0 sources since those are the only ones not included in destinations - # Use torch.concat based on benchmark performance (vs. 
cudf.concat) - - if sampling_results_hop_0 is None: - sampling_results_hop_0 = sampling_results.iloc[ - 0 : (hops[1] if len(hops) > 1 else len(sampling_results)) - ] - - nodes_of_interest = ( - cudf.Series( - torch.concat( - [ - torch.as_tensor(sampling_results_hop_0.majors, device="cuda"), - torch.as_tensor(sampling_results.minors, device="cuda"), - ] - ), - name="nodes_of_interest", - ) - .drop_duplicates() - .sort_index() - ) - - # Get the grouped node index (for creating the renumbered grouped edge index) - noi_index = graph_store._get_vertex_groups_from_sample( - torch.as_tensor(nodes_of_interest, device="cuda") - ) - del nodes_of_interest - - # Get the new edge index (by type as expected for HeteroData) - # FIXME handle edge ids/types after the C++ updates - row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample( - sampling_results, noi_index - ) - - for hop in range(len(hops)): - hop_ix_start = hops[hop] - hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results) - sampling_results_to_hop = sampling_results.iloc[0:hop_ix_end] - - for node_type in graph_store.node_types: - unique_nodes_hop = _get_unique_nodes( - sampling_results_to_hop, graph_store, node_type, "dst" - ) - - unique_nodes_0 = _get_unique_nodes( - sampling_results_hop_0, graph_store, node_type, "src" - ) - - num_unique_nodes = cudf.concat([unique_nodes_0, unique_nodes_hop]).nunique() - - if num_unique_nodes > 0: - if node_type not in num_nodes_per_hop_dict: - num_nodes_per_hop_dict[node_type] = torch.zeros( - len(hops) + 1, dtype=torch.int64 - ) - num_nodes_per_hop_dict[node_type][hop + 1] = num_unique_nodes - int( - num_nodes_per_hop_dict[node_type][: hop + 1].sum(0) - ) - - numeric_etypes, counts = torch.unique( - torch.as_tensor( - sampling_results.iloc[hop_ix_start:hop_ix_end].edge_type, - device="cuda", - ), - return_counts=True, - ) - numeric_etypes = list(numeric_etypes) - counts = list(counts) - for num_etype, count in zip(numeric_etypes, counts): - can_etype = graph_store.numeric_edge_type_to_canonical(num_etype) - if can_etype not in num_edges_per_hop_dict: - num_edges_per_hop_dict[can_etype] = torch.zeros( - len(hops), dtype=torch.int64 - ) - num_edges_per_hop_dict[can_etype][hop] = count - - if HeteroSamplerOutput is None: - raise ImportError("Error importing from pyg") - - return HeteroSamplerOutput( - node=noi_index, - row=row_dict, - col=col_dict, - edge=None, - num_sampled_nodes={k: t.tolist() for k, t in num_nodes_per_hop_dict.items()}, - num_sampled_edges={k: t.tolist() for k, t in num_edges_per_hop_dict.items()}, - metadata=metadata, - ) - - -def filter_cugraph_pyg_store( - feature_store, - graph_store, - node, - row, - col, - edge, - clx, -) -> "torch_geometric.data.Data": - data = torch_geometric.data.Data() - - data.edge_index = torch.stack([row, col], dim=0) - - required_attrs = [] - for attr in feature_store.get_all_tensor_attrs(): - attr.index = edge if isinstance(attr.group_name, tuple) else node - required_attrs.append(attr) - data.num_nodes = attr.index.size(0) - - tensors = feature_store.multi_get_tensor(required_attrs) - for i, attr in enumerate(required_attrs): - data[attr.attr_name] = tensors[i] - - return data - - -def neg_sample( - graph_store: GraphStore, - seed_src: "torch.Tensor", - seed_dst: "torch.Tensor", - batch_size: int, - neg_sampling: "torch_geometric.sampler.NegativeSampling", - time: "torch.Tensor", - node_time: "torch.Tensor", -) -> Tuple["torch.Tensor", "torch.Tensor"]: - try: - # Compatibility for PyG 2.5 - src_weight = 
neg_sampling.src_weight - dst_weight = neg_sampling.dst_weight - except AttributeError: - src_weight = neg_sampling.weight - dst_weight = neg_sampling.weight - unweighted = src_weight is None and dst_weight is None - - # Require at least one negative edge per batch - num_neg = max( - int(ceil(neg_sampling.amount * seed_src.numel())), - int(ceil(seed_src.numel() / batch_size)), - ) - - if graph_store.is_multi_gpu: - num_neg_global = torch.tensor([num_neg], device="cuda") - torch.distributed.all_reduce(num_neg_global, op=torch.distributed.ReduceOp.SUM) - num_neg = int(num_neg_global) - else: - num_neg_global = num_neg - - if node_time is None: - result_dict = pylibcugraph.negative_sampling( - graph_store._resource_handle, - graph_store._graph, - num_neg_global, - vertices=None - if unweighted - else cupy.arange(src_weight.numel(), dtype="int64"), - src_bias=None if src_weight is None else cupy.asarray(src_weight), - dst_bias=None if dst_weight is None else cupy.asarray(dst_weight), - remove_duplicates=False, - remove_false_negatives=False, - exact_number_of_samples=True, - do_expensive_check=False, - ) - - src_neg = torch.as_tensor(result_dict["sources"], device="cuda")[:num_neg] - dst_neg = torch.as_tensor(result_dict["destinations"], device="cuda")[:num_neg] - - # TODO modifiy the C API so this condition is impossible - if src_neg.numel() < num_neg: - num_gen = num_neg - src_neg.numel() - src_neg = torch.concat( - [ - src_neg, - torch.randint( - 0, src_neg.max(), (num_gen,), device="cuda", dtype=torch.int64 - ), - ] - ) - dst_neg = torch.concat( - [ - dst_neg, - torch.randint( - 0, dst_neg.max(), (num_gen,), device="cuda", dtype=torch.int64 - ), - ] - ) - return src_neg, dst_neg - raise NotImplementedError( - "Temporal negative sampling is currently unimplemented in cuGraph-PyG" - ) - - -def neg_cat( - seed_pos: "torch.Tensor", seed_neg: "torch.Tensor", pos_batch_size: int -) -> Tuple["torch.Tensor", int]: - num_seeds = seed_pos.numel() - num_batches = int(ceil(num_seeds / pos_batch_size)) - neg_batch_size = int(ceil(seed_neg.numel() / num_batches)) - - batch_pos_offsets = torch.full((num_batches,), pos_batch_size).cumsum(-1)[:-1] - seed_pos_splits = torch.tensor_split(seed_pos, batch_pos_offsets) - - batch_neg_offsets = torch.full((num_batches,), neg_batch_size).cumsum(-1)[:-1] - seed_neg_splits = torch.tensor_split(seed_neg, batch_neg_offsets) - - return ( - torch.concatenate( - [torch.concatenate(s) for s in zip(seed_pos_splits, seed_neg_splits)] - ), - neg_batch_size, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py deleted file mode 100644 index 30994289f9c..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py +++ /dev/null @@ -1,315 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
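
For reference, the neg_cat() helper removed above interleaves positive and negative seeds batch-by-batch, so that each downstream loader batch carries both its positive edges and the extra negatives. A minimal illustrative sketch of that effect with concrete numbers, assuming only PyTorch (editorial aside, not part of the diff itself):

import torch

seed_pos = torch.arange(6)                 # 6 positive seeds, pos_batch_size=3 -> 2 batches
seed_neg = torch.tensor([10, 11, 12, 13])  # 4 negative seeds -> ceil(4 / 2) = 2 per batch

# neg_cat(seed_pos, seed_neg, pos_batch_size=3) would return this interleaving together
# with neg_batch_size == 2, so batches are then read with batch_size == 3 + 2 == 5.
mixed = torch.cat([seed_pos[0:3], seed_neg[0:2], seed_pos[3:6], seed_neg[2:4]])
# mixed == tensor([ 0,  1,  2, 10, 11,  3,  4,  5, 12, 13])
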
- -import os -import pytest - -from dask_cuda.initialize import initialize as dask_initialize -from dask_cuda import LocalCUDACluster -from dask.distributed import Client -from cugraph.dask.comms import comms as Comms -from cugraph.dask.common.mg_utils import get_visible_devices -from cugraph.testing.mg_utils import stop_dask_client - -import torch -import numpy as np -from cugraph.gnn import FeatureStore -from cugraph.datasets import karate - -import tempfile - -# module-wide fixtures - -# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" -# fixture will be available automatically. Check that this fixture is available -# by trying to import rapids_pytest_benchmark, and if that fails, set -# "gpubenchmark" to the standard "benchmark" fixture provided by -# pytest-benchmark. -try: - import rapids_pytest_benchmark # noqa: F401 -except ImportError: - import pytest_benchmark - - gpubenchmark = pytest_benchmark.plugin.benchmark - - -@pytest.fixture(scope="module") -def dask_client(): - dask_scheduler_file = os.environ.get("SCHEDULER_FILE") - cuda_visible_devices = get_visible_devices() - - if dask_scheduler_file is not None: - dask_initialize() - dask_client = Client(scheduler_file=dask_scheduler_file) - else: - # The tempdir created by tempdir_object should be cleaned up once - # tempdir_object goes out-of-scope and is deleted. - tempdir_object = tempfile.TemporaryDirectory() - cluster = LocalCUDACluster( - local_directory=tempdir_object.name, - protocol="tcp", - CUDA_VISIBLE_DEVICES=cuda_visible_devices, - ) - - dask_client = Client(cluster) - dask_client.wait_for_workers(len(cuda_visible_devices)) - - if not Comms.is_initialized(): - Comms.initialize(p2p=True) - - yield dask_client - - stop_dask_client(dask_client) - print("\ndask_client fixture: client.close() called") - - -@pytest.fixture -def karate_gnn(): - el = karate.get_edgelist().reset_index(drop=True) - el.src = el.src.astype("int64") - el.dst = el.dst.astype("int64") - all_vertices = np.array_split(np.arange(34), 2) - - F = FeatureStore(backend="torch") - F.add_data( - torch.arange(len(all_vertices[0]), dtype=torch.float32) * 31, - "type0", - "prop0", - ) - F.add_data( - torch.arange(len(all_vertices[1]), dtype=torch.float32) * 41, - "type1", - "prop0", - ) - - N = { - "type0": len(all_vertices[0]), - "type1": len(all_vertices[1]), - } - - offsets = {"type0": 0, "type1": N["type0"]} - - G = { - ("type0", "et01", "type1"): el[ - el.src.isin(all_vertices[0]) & el.dst.isin(all_vertices[1]) - ].reset_index(drop=True), - ("type1", "et10", "type0"): el[ - el.src.isin(all_vertices[1]) & el.dst.isin(all_vertices[0]) - ].reset_index(drop=True), - ("type0", "et00", "type0"): el[ - el.src.isin(all_vertices[0]) & el.dst.isin(all_vertices[0]) - ], - ("type1", "et11", "type1"): el[ - el.src.isin(all_vertices[1]) & el.dst.isin(all_vertices[1]) - ].reset_index(drop=True), - } - - G = { - (src_type, edge_type, dst_type): ( - torch.tensor(elx["src"].values_host - offsets[src_type]), - torch.tensor(elx["dst"].values_host - offsets[dst_type]), - ) - for (src_type, edge_type, dst_type), elx in G.items() - } - - return F, G, N - - -@pytest.fixture -def basic_graph_1(): - G = { - ("vt1", "pig", "vt1"): [ - torch.tensor([0, 0, 1, 2, 2, 3]), - torch.tensor([1, 2, 4, 3, 4, 1]), - ] - } - - N = {"vt1": 5} - - F = FeatureStore() - F.add_data( - torch.tensor([100, 200, 300, 400, 500]), type_name="vt1", feat_name="prop1" - ) - - F.add_data(torch.tensor([5, 4, 3, 2, 1]), type_name="vt1", feat_name="prop2") - - return F, G, N - - 
-@pytest.fixture -def multi_edge_graph_1(): - G = { - ("vt1", "pig", "vt1"): [torch.tensor([0, 2, 3, 1]), torch.tensor([1, 3, 1, 4])], - ("vt1", "dog", "vt1"): [torch.tensor([0, 3, 4]), torch.tensor([2, 2, 3])], - ("vt1", "cat", "vt1"): [ - torch.tensor([1, 2, 2]), - torch.tensor([4, 3, 4]), - ], - } - - N = {"vt1": 5} - - F = FeatureStore() - F.add_data( - torch.tensor([100, 200, 300, 400, 500]), type_name="vt1", feat_name="prop1" - ) - - F.add_data(torch.tensor([5, 4, 3, 2, 1]), type_name="vt1", feat_name="prop2") - - return F, G, N - - -@pytest.fixture -def multi_edge_multi_vertex_graph_1(): - - G = { - ("brown", "horse", "brown"): [ - torch.tensor([0, 0]), - torch.tensor([1, 2]), - ], - ("brown", "tortoise", "black"): [ - torch.tensor([1, 1, 2]), - torch.tensor([1, 0, 1]), - ], - ("brown", "mongoose", "black"): [ - torch.tensor([2, 1]), - torch.tensor([0, 1]), - ], - ("black", "cow", "brown"): [ - torch.tensor([0, 0]), - torch.tensor([1, 2]), - ], - ("black", "snake", "black"): [ - torch.tensor([1]), - torch.tensor([0]), - ], - } - - N = {"brown": 3, "black": 2} - - F = FeatureStore() - F.add_data(torch.tensor([100, 200, 300]), type_name="brown", feat_name="prop1") - - F.add_data(torch.tensor([400, 500]), type_name="black", feat_name="prop1") - - F.add_data(torch.tensor([5, 4, 3]), type_name="brown", feat_name="prop2") - - F.add_data(torch.tensor([2, 1]), type_name="black", feat_name="prop2") - - return F, G, N - - -@pytest.fixture -def multi_edge_multi_vertex_no_graph_1(): - G = { - ("brown", "horse", "brown"): 2, - ("brown", "tortoise", "black"): 3, - ("brown", "mongoose", "black"): 3, - ("black", "cow", "brown"): 3, - ("black", "snake", "black"): 1, - } - - N = {"brown": 3, "black": 2} - - F = FeatureStore() - F.add_data(np.array([100, 200, 300]), type_name="brown", feat_name="prop1") - - F.add_data(np.array([400, 500]), type_name="black", feat_name="prop1") - - F.add_data(np.array([5, 4, 3]), type_name="brown", feat_name="prop2") - - F.add_data(np.array([2, 1]), type_name="black", feat_name="prop2") - - return F, G, N - - -@pytest.fixture -def abc_graph(): - N = { - "A": 2, # 0, 1 - "B": 3, # 2, 3, 4 - "C": 4, # 5, 6, 7, 8 - } - - G = { - # (0->2, 0->3, 1->3) - ("A", "ab", "B"): [ - torch.tensor([0, 0, 1], dtype=torch.int64), - torch.tensor([0, 1, 1], dtype=torch.int64), - ], - # (2->0, 2->1, 3->1, 4->0) - ("B", "ba", "A"): [ - torch.tensor([0, 0, 1, 2], dtype=torch.int64), - torch.tensor([0, 1, 1, 0], dtype=torch.int64), - ], - # (2->6, 2->8, 3->5, 3->7, 4->5, 4->8) - ("B", "bc", "C"): [ - torch.tensor([0, 0, 1, 1, 2, 2], dtype=torch.int64), - torch.tensor([1, 3, 0, 2, 0, 3], dtype=torch.int64), - ], - } - - F = FeatureStore() - F.add_data( - torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1" - ) - - return F, G, N - - -@pytest.fixture -def basic_pyg_graph_1(): - edge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]]) - size = (4, 4) - return edge_index, size - - -@pytest.fixture -def basic_pyg_graph_2(): - edge_index = torch.tensor( - [ - [0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9], - [1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0], - ] - ) - size = (10, 10) - return edge_index, size - - -@pytest.fixture -def sample_pyg_hetero_data(): - torch.manual_seed(12345) - raw_data_dict = { - "v0": torch.randn(6, 3), - "v1": torch.randn(7, 2), - "v2": torch.randn(5, 4), - ("v2", "e0", "v1"): torch.tensor([[0, 2, 2, 4, 4], [4, 3, 6, 0, 1]]), - ("v1", "e1", "v1"): torch.tensor( - [[0, 2, 2, 2, 3, 5, 5], [4, 0, 4, 5, 3, 0, 1]] - ), - ("v0", "e2", "v0"): 
torch.tensor([[0, 2, 2, 3, 5, 5], [1, 1, 5, 1, 1, 2]]), - ("v1", "e3", "v2"): torch.tensor( - [[0, 1, 1, 2, 4, 5, 6], [1, 2, 3, 1, 2, 2, 2]] - ), - ("v0", "e4", "v2"): torch.tensor([[1, 1, 3, 3, 4, 4], [1, 4, 1, 4, 0, 3]]), - } - - # create a nested dictionary to facilitate PyG's HeteroData construction - hetero_data_dict = {} - for key, value in raw_data_dict.items(): - if isinstance(key, tuple): - hetero_data_dict[key] = {"edge_index": value} - else: - hetero_data_dict[key] = {"x": value} - - return hetero_data_dict diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py deleted file mode 100644 index 0a997a960b8..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cugraph -from cugraph_pyg.data.dask_graph_store import ( - CuGraphTensorAttr, - CuGraphEdgeAttr, - EdgeLayout, -) -from cugraph_pyg.data import DaskGraphStore - -import cudf -import cupy -import numpy as np - -from cugraph.utilities.utils import import_optional, MissingModule - -import pytest - - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_tensor_attr(): - ta = CuGraphTensorAttr("group0", "property1") - assert not ta.is_fully_specified() - assert not ta.is_set("index") - - ta.fully_specify() - assert ta.is_fully_specified() - - other_ta = CuGraphTensorAttr(index=[1, 2, 3]) - ta.update(other_ta) - assert ta.index == [1, 2, 3] - - casted_ta1 = CuGraphTensorAttr.cast(ta) - assert casted_ta1 == ta - - casted_ta2 = CuGraphTensorAttr.cast(index=[1, 2, 3]) - assert casted_ta2.index == [1, 2, 3] - assert not casted_ta2.is_fully_specified() - - casted_ta3 = CuGraphTensorAttr.cast( - "group2", - "property2", - [1, 2, 3], - ) - assert casted_ta3.group_name == "group2" - assert casted_ta3.attr_name == "property2" - assert casted_ta3.index == [1, 2, 3] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_edge_attr(): - ea = CuGraphEdgeAttr("type0", EdgeLayout.COO, False, 10) - assert ea.edge_type == "type0" - assert ea.layout == EdgeLayout.COO - assert not ea.is_sorted - assert ea.size == 10 - - ea = CuGraphEdgeAttr(edge_type="type1", layout="csr", is_sorted=True) - assert ea.size is None - - ea = CuGraphEdgeAttr.cast("type0", EdgeLayout.COO, False, 10) - assert ea.edge_type == "type0" - assert ea.layout == EdgeLayout.COO - assert not ea.is_sorted - assert ea.size == 10 - - -@pytest.fixture( - params=[ - "basic_graph_1", - "multi_edge_graph_1", - "multi_edge_multi_vertex_graph_1", - ] -) -def graph(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture(params=["basic_graph_1", "multi_edge_graph_1"]) -def single_vertex_graph(request): - return 
request.getfixturevalue(request.param) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.parametrize("edge_index_type", ["numpy", "torch-cpu", "torch-gpu", "cudf"]) -@pytest.mark.sg -def test_get_edge_index(graph, edge_index_type): - F, G, N = graph - if "torch" in edge_index_type: - if edge_index_type == "torch-cpu": - device = "cpu" - else: - device = "cuda" - for et in list(G.keys()): - G[et][0] = torch.as_tensor(G[et][0], device=device) - G[et][1] = torch.as_tensor(G[et][1], device=device) - elif edge_index_type == "cudf": - for et in list(G.keys()): - G[et][0] = cudf.Series(G[et][0]) - G[et][1] = cudf.Series(G[et][1]) - - cugraph_store = DaskGraphStore(F, G, N, order="CSC") - - for pyg_can_edge_type in G: - src, dst = cugraph_store.get_edge_index( - edge_type=pyg_can_edge_type, layout="coo", is_sorted=False - ) - - if edge_index_type == "cudf": - assert G[pyg_can_edge_type][0].values_host.tolist() == src.tolist() - assert G[pyg_can_edge_type][1].values_host.tolist() == dst.tolist() - else: - assert G[pyg_can_edge_type][0].tolist() == src.tolist() - assert G[pyg_can_edge_type][1].tolist() == dst.tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_edge_types(graph): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N) - - eta = cugraph_store._edge_types_to_attrs - assert eta.keys() == G.keys() - - for attr_name, attr_repr in eta.items(): - src_size = N[attr_name[0]] - dst_size = N[attr_name[-1]] - assert src_size == attr_repr.size[0] - assert dst_size == attr_repr.size[-1] - assert attr_name == attr_repr.edge_type - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_get_subgraph(graph): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N) - - if len(G.keys()) > 1: - for edge_type in G.keys(): - # Subgraphing is not implemented yet and should raise an error - with pytest.raises(ValueError): - sg = cugraph_store._subgraph([edge_type]) - - sg = cugraph_store._subgraph(list(G.keys())) - assert isinstance(sg, cugraph.MultiGraph) - - num_edges = sum([len(v[0]) for v in G.values()]) - assert sg.number_of_edges() == num_edges - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_renumber_vertices_basic(single_vertex_graph): - F, G, N = single_vertex_graph - cugraph_store = DaskGraphStore(F, G, N) - - nodes_of_interest = torch.as_tensor( - cupy.random.randint(0, sum(N.values()), 3), device="cuda" - ) - - index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest) - assert index["vt1"].tolist() == nodes_of_interest.tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph_1): - F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = DaskGraphStore(F, G, N) - - nodes_of_interest = torch.as_tensor( - cupy.random.randint(0, sum(N.values()), 3), device="cuda" - ).unique() - - index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest) - - black_nodes = nodes_of_interest[nodes_of_interest <= 1] - brown_nodes = nodes_of_interest[nodes_of_interest > 1] - 2 - - if len(black_nodes) > 0: - assert index["black"].tolist() == sorted(black_nodes.tolist()) - if len(brown_nodes) > 0: - assert index["brown"].tolist() == sorted(brown_nodes.tolist()) - - 
-@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_renumber_edges(abc_graph): - F, G, N = abc_graph - - graph_store = DaskGraphStore(F, G, N, order="CSR") - - # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] - mock_sampling_results = cudf.DataFrame( - { - "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), - "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), - "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"), - "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"), - } - ) - - mock_noi_index = { - "A": torch.tensor([0, 1], device="cuda"), - "B": torch.tensor([0, 1], device="cuda"), - "C": torch.tensor([3, 2, 0], device="cuda"), - } - - row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample( - mock_sampling_results, mock_noi_index - ) - - assert len(row_dict) == 3 - assert len(col_dict) == 3 - assert row_dict[("A", "ab", "B")].tolist() == [0, 0, 1, 1] - assert col_dict[("A", "ab", "B")].tolist() == [0, 1, 1, 1] - assert row_dict[("B", "bc", "C")].tolist() == [0, 1, 1, 1] - assert col_dict[("B", "bc", "C")].tolist() == [0, 1, 2, 1] - assert row_dict[("B", "ba", "A")].tolist() == [1, 1] - assert col_dict[("B", "ba", "A")].tolist() == [1, 1] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_get_tensor(graph): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N) - - for feature_name, feature_on_types in F.get_feature_list().items(): - for type_name in feature_on_types: - v_ids = np.arange(N[type_name]) - base_series = F.get_data( - v_ids, - type_name=type_name, - feat_name=feature_name, - ).tolist() - - tsr = cugraph_store.get_tensor( - type_name, feature_name, v_ids, None, cupy.int64 - ).tolist() - - assert tsr == base_series - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_get_tensor_empty_idx(karate_gnn): - F, G, N = karate_gnn - cugraph_store = DaskGraphStore(F, G, N) - - t = cugraph_store.get_tensor( - CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None) - ) - assert t.tolist() == (torch.arange(17, dtype=torch.float32) * 31).tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_multi_get_tensor(graph): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N) - - for vertex_type in sorted(N.keys()): - v_ids = np.arange(N[vertex_type]) - feat_names = list(F.get_feature_list().keys()) - base_series = None - for feat_name in feat_names: - if base_series is None: - base_series = F.get_data(v_ids, vertex_type, feat_name) - else: - base_series = np.stack( - [base_series, F.get_data(v_ids, vertex_type, feat_name)] - ) - - tsr = cugraph_store.multi_get_tensor( - [ - CuGraphTensorAttr(vertex_type, feat_name, v_ids) - for feat_name in feat_names - ] - ) - - assert torch.stack(tsr).tolist() == base_series.tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_get_all_tensor_attrs(graph): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N) - - tensor_attrs = [] - for vertex_type in sorted(N.keys()): - for prop in ["prop1", "prop2"]: - tensor_attrs.append( - CuGraphTensorAttr( - vertex_type, - prop, - properties=None, - dtype=F.get_data([0], vertex_type, "prop1").dtype, - ) - ) - - for t in tensor_attrs: - print(t) - - print("\n\n") - - for t 
in cugraph_store.get_all_tensor_attrs(): - print(t) - - assert sorted(tensor_attrs, key=lambda a: (a.group_name, a.attr_name)) == sorted( - cugraph_store.get_all_tensor_attrs(), key=lambda a: (a.group_name, a.attr_name) - ) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_get_tensor_from_tensor_attrs(graph): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N) - - tensor_attrs = cugraph_store.get_all_tensor_attrs() - for tensor_attr in tensor_attrs: - v_ids = np.arange(N[tensor_attr.group_name]) - data = F.get_data(v_ids, tensor_attr.group_name, tensor_attr.attr_name) - - tensor_attr.index = v_ids - assert cugraph_store.get_tensor(tensor_attr).tolist() == data.tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_get_tensor_size(graph): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N) - - tensor_attrs = cugraph_store.get_all_tensor_attrs() - for tensor_attr in tensor_attrs: - sz = N[tensor_attr.group_name] - - tensor_attr.index = np.arange(sz) - assert cugraph_store.get_tensor_size(tensor_attr) == torch.Size((sz,)) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif( - isinstance(torch_geometric, MissingModule), reason="pyg not available" -) -@pytest.mark.sg -def test_get_input_nodes(karate_gnn): - F, G, N = karate_gnn - cugraph_store = DaskGraphStore(F, G, N) - - input_node_info = torch_geometric.loader.utils.get_input_nodes( - (cugraph_store, cugraph_store), "type0" - ) - - # PyG 2.4 - if len(input_node_info) == 2: - node_type, input_nodes = input_node_info - # PyG 2.5 - elif len(input_node_info) == 3: - node_type, input_nodes, input_id = input_node_info - # Invalid - else: - raise ValueError("Invalid output from get_input_nodes") - - assert node_type == "type0" - assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist() - - -@pytest.mark.sg -def test_serialize(multi_edge_multi_vertex_no_graph_1): - import pickle - - F, G, N = multi_edge_multi_vertex_no_graph_1 - cugraph_store = DaskGraphStore(F, G, N) - - cugraph_store_copy = pickle.loads(pickle.dumps(cugraph_store)) - - for tensor_attr in cugraph_store.get_all_tensor_attrs(): - sz = cugraph_store.get_tensor_size(tensor_attr)[0] - tensor_attr.index = np.arange(sz) - assert ( - cugraph_store.get_tensor(tensor_attr).tolist() - == cugraph_store_copy.get_tensor(tensor_attr).tolist() - ) - - # Currently does not store edgelist properly for SG - """ - for edge_attr in cugraph_store.get_all_edge_attrs(): - assert cugraph_store.get_edge_index(edge_attr) \ - == cugraph_store_copy.get_edge_index(edge_attr) - """ diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py deleted file mode 100644 index 65cb8984586..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_dask_graph_store_mg.py +++ /dev/null @@ -1,424 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import cugraph -from cugraph_pyg.data.dask_graph_store import ( - CuGraphTensorAttr, - CuGraphEdgeAttr, - EdgeLayout, -) -from cugraph_pyg.data import DaskGraphStore - -import cudf -import dask_cudf -import cupy -import numpy as np - -from cugraph.utilities.utils import import_optional, MissingModule - -import pytest - - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -def test_tensor_attr(): - ta = CuGraphTensorAttr("group0", "property1") - assert not ta.is_fully_specified() - assert not ta.is_set("index") - - ta.fully_specify() - assert ta.is_fully_specified() - - other_ta = CuGraphTensorAttr(index=[1, 2, 3]) - ta.update(other_ta) - assert ta.index == [1, 2, 3] - - casted_ta1 = CuGraphTensorAttr.cast(ta) - assert casted_ta1 == ta - - casted_ta2 = CuGraphTensorAttr.cast(index=[1, 2, 3]) - assert casted_ta2.index == [1, 2, 3] - assert not casted_ta2.is_fully_specified() - - casted_ta3 = CuGraphTensorAttr.cast( - "group2", - "property2", - [1, 2, 3], - ) - assert casted_ta3.group_name == "group2" - assert casted_ta3.attr_name == "property2" - assert casted_ta3.index == [1, 2, 3] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -def test_edge_attr(): - ea = CuGraphEdgeAttr("type0", EdgeLayout.COO, False, 10) - assert ea.edge_type == "type0" - assert ea.layout == EdgeLayout.COO - assert not ea.is_sorted - assert ea.size == 10 - - ea = CuGraphEdgeAttr(edge_type="type1", layout="csr", is_sorted=True) - assert ea.size is None - - ea = CuGraphEdgeAttr.cast("type0", EdgeLayout.COO, False, 10) - assert ea.edge_type == "type0" - assert ea.layout == EdgeLayout.COO - assert not ea.is_sorted - assert ea.size == 10 - - -@pytest.fixture( - params=[ - "basic_graph_1", - "multi_edge_graph_1", - "multi_edge_multi_vertex_graph_1", - ] -) -def graph(request): - return request.getfixturevalue(request.param) - - -@pytest.fixture(params=["basic_graph_1", "multi_edge_graph_1"]) -def single_vertex_graph(request): - return request.getfixturevalue(request.param) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.parametrize( - "edge_index_type", ["numpy", "torch-cpu", "torch-gpu", "cudf", "dask-cudf"] -) -@pytest.mark.mg -def test_get_edge_index(graph, edge_index_type, dask_client): - F, G, N = graph - if "torch" in edge_index_type: - if edge_index_type == "torch-cpu": - device = "cpu" - else: - device = "cuda" - for et in list(G.keys()): - G[et][0] = torch.as_tensor(G[et][0], device=device) - G[et][1] = torch.as_tensor(G[et][1], device=device) - elif edge_index_type == "cudf": - for et in list(G.keys()): - G[et][0] = cudf.Series(G[et][0]) - G[et][1] = cudf.Series(G[et][1]) - elif edge_index_type == "dask-cudf": - for et in list(G.keys()): - G[et][0] = dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=1) - G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=1) - - cugraph_store = DaskGraphStore(F, G, N, order="CSC", multi_gpu=True) - - for pyg_can_edge_type in G: - src, dst = cugraph_store.get_edge_index( - edge_type=pyg_can_edge_type, layout="coo", is_sorted=False - ) - - if edge_index_type == "cudf": - assert G[pyg_can_edge_type][0].values_host.tolist() == src.tolist() - assert G[pyg_can_edge_type][1].values_host.tolist() == dst.tolist() - elif edge_index_type == 
"dask-cudf": - assert ( - G[pyg_can_edge_type][0].compute().values_host.tolist() == src.tolist() - ) - assert ( - G[pyg_can_edge_type][1].compute().values_host.tolist() == dst.tolist() - ) - else: - assert G[pyg_can_edge_type][0].tolist() == src.tolist() - assert G[pyg_can_edge_type][1].tolist() == dst.tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_edge_types(graph, dask_client): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - eta = cugraph_store._edge_types_to_attrs - assert eta.keys() == G.keys() - - for attr_name, attr_repr in eta.items(): - src_size = N[attr_name[0]] - dst_size = N[attr_name[-1]] - assert src_size == attr_repr.size[0] - assert dst_size == attr_repr.size[-1] - assert attr_name == attr_repr.edge_type - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_get_subgraph(graph, dask_client): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - if len(G.keys()) > 1: - for edge_type in G.keys(): - # Subgraphing is not implemented yet and should raise an error - with pytest.raises(ValueError): - sg = cugraph_store._subgraph([edge_type]) - - sg = cugraph_store._subgraph(list(G.keys())) - assert isinstance(sg, cugraph.MultiGraph) - - num_edges = sum([len(v[0]) for v in G.values()]) - assert sg.number_of_edges() == num_edges - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_renumber_vertices_basic(single_vertex_graph, dask_client): - F, G, N = single_vertex_graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - nodes_of_interest = torch.as_tensor( - cupy.random.randint(0, sum(N.values()), 3), device="cuda" - ) - - index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest) - assert index["vt1"].tolist() == nodes_of_interest.tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_renumber_vertices_multi_edge_multi_vertex( - multi_edge_multi_vertex_graph_1, dask_client -): - F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - nodes_of_interest = torch.as_tensor( - cupy.random.randint(0, sum(N.values()), 3), device="cuda" - ).unique() - - index = cugraph_store._get_vertex_groups_from_sample(nodes_of_interest) - - black_nodes = nodes_of_interest[nodes_of_interest <= 1] - brown_nodes = nodes_of_interest[nodes_of_interest > 1] - 2 - - if len(black_nodes) > 0: - assert index["black"].tolist() == sorted(black_nodes.tolist()) - if len(brown_nodes) > 0: - assert index["brown"].tolist() == sorted(brown_nodes.tolist()) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_renumber_edges(abc_graph, dask_client): - F, G, N = abc_graph - - graph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") - - # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] - mock_sampling_results = cudf.DataFrame( - { - "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), - "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), - "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"), - "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"), - } - ) - - mock_noi_index = { - "A": torch.tensor([0, 1], device="cuda"), - "B": torch.tensor([0, 1], device="cuda"), - "C": 
torch.tensor([3, 2, 0], device="cuda"), - } - - row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample( - mock_sampling_results, mock_noi_index - ) - - assert len(row_dict) == 3 - assert len(col_dict) == 3 - assert row_dict[("A", "ab", "B")].tolist() == [0, 0, 1, 1] - assert col_dict[("A", "ab", "B")].tolist() == [0, 1, 1, 1] - assert row_dict[("B", "bc", "C")].tolist() == [0, 1, 1, 1] - assert col_dict[("B", "bc", "C")].tolist() == [0, 1, 2, 1] - assert row_dict[("B", "ba", "A")].tolist() == [1, 1] - assert col_dict[("B", "ba", "A")].tolist() == [1, 1] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_get_tensor(graph, dask_client): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - for feature_name, feature_on_types in F.get_feature_list().items(): - for type_name in feature_on_types: - v_ids = np.arange(N[type_name]) - base_series = F.get_data( - v_ids, - type_name=type_name, - feat_name=feature_name, - ).tolist() - - tsr = cugraph_store.get_tensor( - type_name, feature_name, v_ids, None, cupy.int64 - ).tolist() - - assert tsr == base_series - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_get_tensor_empty_idx(karate_gnn, dask_client): - F, G, N = karate_gnn - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - t = cugraph_store.get_tensor( - CuGraphTensorAttr(group_name="type0", attr_name="prop0", index=None) - ) - assert t.tolist() == (torch.arange(17, dtype=torch.float32) * 31).tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_multi_get_tensor(graph, dask_client): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - for vertex_type in sorted(N.keys()): - v_ids = np.arange(N[vertex_type]) - feat_names = list(F.get_feature_list().keys()) - base_series = None - for feat_name in feat_names: - if base_series is None: - base_series = F.get_data(v_ids, vertex_type, feat_name) - else: - base_series = np.stack( - [base_series, F.get_data(v_ids, vertex_type, feat_name)] - ) - - tsr = cugraph_store.multi_get_tensor( - [ - CuGraphTensorAttr(vertex_type, feat_name, v_ids) - for feat_name in feat_names - ] - ) - - assert torch.stack(tsr).tolist() == base_series.tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_get_all_tensor_attrs(graph, dask_client): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - tensor_attrs = [] - for vertex_type in sorted(N.keys()): - for prop in ["prop1", "prop2"]: - tensor_attrs.append( - CuGraphTensorAttr( - vertex_type, - prop, - properties=None, - dtype=F.get_data([0], vertex_type, "prop1").dtype, - ) - ) - - assert sorted(tensor_attrs, key=lambda a: (a.group_name, a.attr_name)) == sorted( - cugraph_store.get_all_tensor_attrs(), key=lambda a: (a.group_name, a.attr_name) - ) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_get_tensor_from_tensor_attrs(graph, dask_client): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - tensor_attrs = cugraph_store.get_all_tensor_attrs() - for tensor_attr in tensor_attrs: - v_ids = np.arange(N[tensor_attr.group_name]) - data = F.get_data(v_ids, tensor_attr.group_name, tensor_attr.attr_name) - - tensor_attr.index = v_ids - assert 
cugraph_store.get_tensor(tensor_attr).tolist() == data.tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_get_tensor_size(graph, dask_client): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - tensor_attrs = cugraph_store.get_all_tensor_attrs() - for tensor_attr in tensor_attrs: - sz = N[tensor_attr.group_name] - - tensor_attr.index = np.arange(sz) - assert cugraph_store.get_tensor_size(tensor_attr) == torch.Size((sz,)) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif( - isinstance(torch_geometric, MissingModule), reason="pyg not available" -) -@pytest.mark.mg -def test_get_input_nodes(karate_gnn, dask_client): - F, G, N = karate_gnn - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - - nodes = torch_geometric.loader.utils.get_input_nodes( - (cugraph_store, cugraph_store), "type0" - ) - - if len(nodes) == 2: - node_type, input_nodes = nodes - else: - node_type, input_nodes, _ = nodes - - assert node_type == "type0" - assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_mg_frame_handle(graph, dask_client): - F, G, N = graph - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True) - assert isinstance(cugraph_store._DaskGraphStore__graph._plc_graph, dict) - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_cugraph_loader_large_index(dask_client): - large_index = ( - np.random.randint(0, 1_000_000, (100_000_000,)), - np.random.randint(0, 1_000_000, (100_000_000,)), - ) - - large_features = np.random.randint(0, 50, (1_000_000,)) - F = cugraph.gnn.FeatureStore(backend="torch") - F.add_data(large_features, "N", "f") - - store = DaskGraphStore( - F, - {("N", "e", "N"): large_index}, - {"N": 1_000_000}, - multi_gpu=True, - ) - - graph = store._subgraph() - assert isinstance(graph, cugraph.Graph) - - el = graph.view_edge_list().compute() - assert (el["src"].values_host - large_index[0]).sum() == 0 - assert (el["dst"].values_host - large_index[1]).sum() == 0 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py deleted file mode 100644 index ab5f1e217bb..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from cugraph.utilities.utils import import_optional, MissingModule - -from cugraph_pyg.data import TensorDictFeatureStore - -torch = import_optional("torch") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_tensordict_feature_store_basic_api(): - feature_store = TensorDictFeatureStore() - - node_features_0 = torch.randint(128, (100, 1000)) - node_features_1 = torch.randint(256, (100, 10)) - - other_features = torch.randint(1024, (10, 5)) - - feature_store["node", "feat0"] = node_features_0 - feature_store["node", "feat1"] = node_features_1 - feature_store["other", "feat"] = other_features - - assert (feature_store["node"]["feat0"][:] == node_features_0).all() - assert (feature_store["node"]["feat1"][:] == node_features_1).all() - assert (feature_store["other"]["feat"][:] == other_features).all() - - assert len(feature_store.get_all_tensor_attrs()) == 3 - - del feature_store["node", "feat0"] - assert len(feature_store.get_all_tensor_attrs()) == 2 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py deleted file mode 100644 index f1f514560c8..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_feature_store_mg.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import pytest - -from cugraph.utilities.utils import import_optional, MissingModule - -from cugraph_pyg.data import TensorDictFeatureStore, WholeFeatureStore - -torch = import_optional("torch") -pylibwholegraph = import_optional("pylibwholegraph") - - -def run_test_wholegraph_feature_store_basic_api(rank, world_size, dtype): - if dtype == "float32": - torch_dtype = torch.float32 - elif dtype == "int64": - torch_dtype = torch.int64 - - torch.cuda.set_device(rank) - - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) - - pylibwholegraph.torch.initialize.init( - rank, - world_size, - rank, - world_size, - ) - - features = torch.arange(0, world_size * 2000) - features = features.reshape((features.numel() // 100, 100)).to(torch_dtype) - - tensordict_store = TensorDictFeatureStore() - tensordict_store["node", "fea"] = features - - whole_store = WholeFeatureStore() - whole_store["node", "fea"] = torch.tensor_split(features, world_size)[rank] - - ix = torch.arange(features.shape[0]) - assert ( - whole_store["node", "fea"][ix].cpu() == tensordict_store["node", "fea"][ix] - ).all() - - label = torch.arange(0, features.shape[0]).reshape((features.shape[0], 1)) - tensordict_store["node", "label"] = label - whole_store["node", "label"] = torch.tensor_split(label, world_size)[rank] - - assert ( - whole_store["node", "fea"][ix].cpu() == tensordict_store["node", "fea"][ix] - ).all() - - pylibwholegraph.torch.initialize.finalize() - - -@pytest.mark.skipif( - isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" -) -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.parametrize("dtype", ["float32", "int64"]) -@pytest.mark.mg -def test_wholegraph_feature_store_basic_api(dtype): - world_size = torch.cuda.device_count() - torch.multiprocessing.spawn( - run_test_wholegraph_feature_store_basic_api, - args=( - world_size, - dtype, - ), - nprocs=world_size, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py deleted file mode 100644 index a8b93665aad..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from cugraph.datasets import karate -from cugraph.utilities.utils import import_optional, MissingModule - -from cugraph_pyg.data import GraphStore - -torch = import_optional("torch") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_graph_store_basic_api(): - df = karate.get_edgelist() - src = torch.as_tensor(df["src"], device="cuda") - dst = torch.as_tensor(df["dst"], device="cuda") - - ei = torch.stack([dst, src]) - - graph_store = GraphStore() - graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo") - - rei = graph_store.get_edge_index(("person", "knows", "person"), "coo") - - assert (ei == rei).all() - - edge_attrs = graph_store.get_all_edge_attrs() - assert len(edge_attrs) == 1 - - graph_store.remove_edge_index(("person", "knows", "person"), "coo") - edge_attrs = graph_store.get_all_edge_attrs() - assert len(edge_attrs) == 0 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py deleted file mode 100644 index 14540b7e17d..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/data/test_graph_store_mg.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph.datasets import karate -from cugraph.utilities.utils import import_optional, MissingModule - -from cugraph_pyg.data import GraphStore - -torch = import_optional("torch") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_graph_store_basic_api_mg(): - df = karate.get_edgelist() - src = torch.as_tensor(df["src"], device="cuda") - dst = torch.as_tensor(df["dst"], device="cuda") - - ei = torch.stack([dst, src]) - - graph_store = GraphStore(is_multi_gpu=True) - graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo") - - rei = graph_store.get_edge_index(("person", "knows", "person"), "coo") - - assert (ei == rei).all() - - edge_attrs = graph_store.get_all_edge_attrs() - assert len(edge_attrs) == 1 - - graph_store.remove_edge_index(("person", "knows", "person"), "coo") - edge_attrs = graph_store.get_all_edge_attrs() - assert len(edge_attrs) == 0 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py deleted file mode 100644 index 34ef6a59511..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader.py +++ /dev/null @@ -1,543 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -import tempfile -import os - -import cudf -import cupy -import numpy as np - -from cugraph_pyg.loader import DaskNeighborLoader -from cugraph_pyg.loader import BulkSampleLoader -from cugraph_pyg.data import DaskGraphStore -from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv - -from cugraph.gnn import FeatureStore -from cugraph.utilities.utils import import_optional, MissingModule - -from typing import Dict, Tuple - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - -trim_to_layer = import_optional("torch_geometric.utils.trim_to_layer") -if isinstance(trim_to_layer, MissingModule): - trim_to_layer = import_optional("torch_geometric.utils._trim_to_layer") - - -try: - import torch_sparse # noqa: F401 - - HAS_TORCH_SPARSE = True -except: # noqa: E722 - HAS_TORCH_SPARSE = False - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_cugraph_loader_basic( - karate_gnn: Tuple[ - FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] - ] -): - F, G, N = karate_gnn - cugraph_store = DaskGraphStore(F, G, N, order="CSR") - loader = DaskNeighborLoader( - (cugraph_store, cugraph_store), - torch.arange(N["type0"] + N["type1"], dtype=torch.int64), - 10, - num_neighbors=[4, 4], - random_state=62, - replace=False, - ) - - samples = [s for s in loader] - - assert len(samples) == 3 - for sample in samples: - if "type0" in sample: - for prop in sample["type0"]["prop0"].tolist(): - assert prop % 31 == 0 - - if "type1" in sample: - for prop in sample["type1"]["prop0"].tolist(): - assert prop % 41 == 0 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_cugraph_loader_hetero( - karate_gnn: Tuple[ - FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] - ] -): - F, G, N = karate_gnn - cugraph_store = DaskGraphStore(F, G, N, order="CSR") - loader = DaskNeighborLoader( - (cugraph_store, cugraph_store), - input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")), - batch_size=2, - num_neighbors=[4, 4], - random_state=62, - replace=False, - ) - - samples = [s for s in loader] - - assert len(samples) == 2 - for sample in samples: - if "type0" in sample: - for prop in sample["type0"]["prop0"].tolist(): - assert prop % 31 == 0 - - if "type1" in sample: - for prop in sample["type1"]["prop0"].tolist(): - assert prop % 41 == 0 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_cugraph_loader_from_disk(): - m = [2, 9, 99, 82, 9, 3, 18, 1, 12] - n = torch.arange(1, 1 + len(m), dtype=torch.int32) - x = torch.zeros(256, dtype=torch.int32) - x[torch.tensor(m, dtype=torch.int32)] = n - F = FeatureStore() - F.add_data(x, "t0", "x") - - G = {("t0", "knows", "t0"): 9080} - N = {"t0": 256} - - cugraph_store = DaskGraphStore(F, G, N, order="CSR") - - bogus_samples = cudf.DataFrame( - { - "majors": [0, 1, 2, 3, 4, 5, 6, 6], - "minors": [5, 4, 3, 2, 2, 6, 5, 2], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 
10, 15, 20, 25, 30, 35, 40], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), - } - ) - map = cudf.Series(m, name="map") - bogus_samples = bogus_samples.join(map, how="outer").sort_index() - - tempdir = tempfile.TemporaryDirectory() - for s in range(256): - bogus_samples["batch_id"] = cupy.int32(s) - bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet")) - - loader = BulkSampleLoader( - feature_store=cugraph_store, - graph_store=cugraph_store, - directory=tempdir, - ) - - num_samples = 0 - for sample in loader: - num_samples += 1 - assert sample["t0"]["num_nodes"] == 7 - # correct vertex order is [0, 1, 2, 5, 4, 3, 6]; x = [1, 2, 3, 6, 5, 4, 7] - assert sample["t0"]["x"].tolist() == [3, 4, 5, 6, 7, 8, 9] - - edge_index = sample[("t0", "knows", "t0")]["edge_index"] - assert list(edge_index.shape) == [2, 8] - - assert ( - edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist() - ) - assert ( - edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist() - ) - - assert num_samples == 256 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_cugraph_loader_from_disk_subset(): - m = [2, 9, 99, 82, 9, 3, 18, 1, 12] - n = torch.arange(1, 1 + len(m), dtype=torch.int32) - x = torch.zeros(256, dtype=torch.int32) - x[torch.tensor(m, dtype=torch.int32)] = n - F = FeatureStore() - F.add_data(x, "t0", "x") - - G = {("t0", "knows", "t0"): 9080} - N = {"t0": 256} - - cugraph_store = DaskGraphStore(F, G, N, order="CSR") - - bogus_samples = cudf.DataFrame( - { - "majors": [0, 1, 2, 3, 4, 5, 6, 6], - "minors": [5, 4, 3, 2, 2, 6, 5, 2], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), - } - ) - map = cudf.Series(m, name="map") - bogus_samples = bogus_samples.join(map, how="outer").sort_index() - - tempdir = tempfile.TemporaryDirectory() - for s in range(256): - bogus_samples["batch_id"] = cupy.int32(s) - bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet")) - - loader = BulkSampleLoader( - feature_store=cugraph_store, - graph_store=cugraph_store, - directory=tempdir, - input_files=list(os.listdir(tempdir.name))[100:200], - ) - - num_samples = 0 - for sample in loader: - num_samples += 1 - assert sample["t0"]["num_nodes"] == 7 - # correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6] - assert sample["t0"]["x"].tolist() == [3, 4, 5, 6, 7, 8, 9] - - edge_index = sample[("t0", "knows", "t0")]["edge_index"] - assert list(edge_index.shape) == [2, 8] - - assert ( - edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist() - ) - assert ( - edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist() - ) - - assert num_samples == 100 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available") -@pytest.mark.sg -def test_cugraph_loader_from_disk_subset_csr(): - m = [2, 9, 99, 82, 11, 13] - n = torch.arange(1, 1 + len(m), dtype=torch.int32) - x = torch.zeros(256, dtype=torch.int32) - x[torch.tensor(m, dtype=torch.int32)] = n - F = FeatureStore() - F.add_data(x, "t0", "x") - - G = {("t0", "knows", "t0"): 9080} - N = {"t0": 256} - - cugraph_store = DaskGraphStore(F, G, N) - - bogus_samples = cudf.DataFrame( - { - "major_offsets": [0, 3, 5, 7, 8, None, None, None], - 
"minors": [1, 2, 3, 0, 3, 4, 5, 1], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], - "label_hop_offsets": cudf.Series( - [0, 1, 4, None, None, None, None, None], dtype="int32" - ), - "renumber_map_offsets": cudf.Series([0, 6], dtype="int32"), - } - ) - map = cudf.Series(m, name="map") - bogus_samples["map"] = map - - tempdir = tempfile.TemporaryDirectory() - for s in range(256): - # offset the offsets - bogus_samples["batch_id"] = cupy.int32(s) - bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet")) - - loader = BulkSampleLoader( - feature_store=cugraph_store, - graph_store=cugraph_store, - directory=tempdir, - input_files=list(os.listdir(tempdir.name))[100:200], - ) - - num_samples = 0 - for sample in loader: - num_samples += 1 - assert sample["t0"]["num_nodes"] == 6 - - assert sample["t0"]["x"].tolist() == [1, 2, 3, 4, 5, 6] - - edge_index = sample[("t0", "knows", "t0")]["adj_t"] - assert edge_index.size(0) == 4 - assert edge_index.size(1) == 6 - - colptr, row, _ = edge_index.csr() - - assert ( - colptr.tolist() == bogus_samples.major_offsets.dropna().values_host.tolist() - ) - assert row.tolist() == bogus_samples.minors.dropna().values_host.tolist() - - assert sample["t0"]["num_sampled_nodes"] == [1, 3, 2] - assert sample["t0", "knows", "t0"]["num_sampled_edges"] == [3, 5] - - assert num_samples == 100 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_cugraph_loader_e2e_coo(): - m = [2, 9, 99, 82, 9, 3, 18, 1, 12] - x = torch.randint(3000, (256, 256)).to(torch.float32) - F = FeatureStore() - F.add_data(x, "t0", "x") - - G = {("t0", "knows", "t0"): 9999} - N = {"t0": 256} - - cugraph_store = DaskGraphStore(F, G, N, order="CSR") - - bogus_samples = cudf.DataFrame( - { - "majors": [0, 1, 2, 3, 4, 5, 6, 6], - "minors": [5, 4, 3, 2, 2, 6, 5, 2], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), - } - ) - map = cudf.Series(m, name="map") - bogus_samples = bogus_samples.join(map, how="outer").sort_index() - - tempdir = tempfile.TemporaryDirectory() - for s in range(256): - bogus_samples["batch_id"] = cupy.int32(s) - bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet")) - - loader = BulkSampleLoader( - feature_store=cugraph_store, - graph_store=cugraph_store, - directory=tempdir, - input_files=list(os.listdir(tempdir.name))[100:200], - ) - - convs = [ - torch_geometric.nn.SAGEConv(256, 64, aggr="mean").cuda(), - torch_geometric.nn.SAGEConv(64, 8, aggr="mean").cuda(), - torch_geometric.nn.SAGEConv(8, 1, aggr="mean").cuda(), - ] - - trim = trim_to_layer.TrimToLayer() - relu = torch.nn.functional.relu - dropout = torch.nn.functional.dropout - - for hetero_data in loader: - ei = hetero_data["t0", "knows", "t0"]["edge_index"] - x = hetero_data["t0"]["x"].cuda() - num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"] - num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"] - - for i in range(len(convs)): - x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) - - s = x.shape[0] - - x = convs[i](x, ei, size=(s, s)) - x = relu(x) - x = dropout(x, p=0.5) - - x = x.narrow(dim=0, start=0, length=x.shape[0] - num_sampled_nodes[1]) - - assert list(x.shape) == [3, 1] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") 
-@pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available") -@pytest.mark.parametrize("framework", ["pyg", "cugraph-ops"]) -@pytest.mark.sg -def test_cugraph_loader_e2e_csc(framework: str): - m = [2, 9, 99, 82, 9, 3, 18, 1, 12] - x = torch.randint(3000, (256, 256)).to(torch.float32) - F = FeatureStore() - F.add_data(x, "t0", "x") - - G = {("t0", "knows", "t0"): 9999} - N = {"t0": 256} - - cugraph_store = DaskGraphStore(F, G, N) - - bogus_samples = cudf.DataFrame( - { - "major_offsets": [0, 3, 5, 7, 8, None, None, None], - "minors": [1, 2, 3, 0, 3, 4, 5, 1], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], - "label_hop_offsets": cudf.Series( - [0, 1, 4, None, None, None, None, None], dtype="int32" - ), - "renumber_map_offsets": cudf.Series([0, 6], dtype="int32"), - } - ) - map = cudf.Series(m, name="map") - bogus_samples = bogus_samples.join(map, how="outer").sort_index() - - tempdir = tempfile.TemporaryDirectory() - for s in range(256): - bogus_samples["batch_id"] = cupy.int32(s) - bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet")) - - loader = BulkSampleLoader( - feature_store=cugraph_store, - graph_store=cugraph_store, - directory=tempdir, - input_files=list(os.listdir(tempdir.name))[100:200], - ) - - if framework == "pyg": - convs = [ - torch_geometric.nn.SAGEConv(256, 64, aggr="mean").cuda(), - torch_geometric.nn.SAGEConv(64, 1, aggr="mean").cuda(), - ] - else: - convs = [ - CuGraphSAGEConv(256, 64, aggr="mean").cuda(), - CuGraphSAGEConv(64, 1, aggr="mean").cuda(), - ] - - trim = trim_to_layer.TrimToLayer() - relu = torch.nn.functional.relu - dropout = torch.nn.functional.dropout - - for hetero_data in loader: - x = hetero_data["t0"]["x"].cuda() - - if framework == "pyg": - ei = hetero_data["t0", "knows", "t0"]["adj_t"].coo() - ei = torch.stack((ei[0], ei[1])) - else: - ei = hetero_data["t0", "knows", "t0"]["adj_t"].csr() - ei = [ei[1], ei[0], x.shape[0]] - - num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"] - num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"] - - s = x.shape[0] - for i in range(len(convs)): - if framework == "pyg": - x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) - else: - if i > 0: - x = x.narrow( - dim=0, - start=0, - length=s - num_sampled_nodes[-i], - ) - - ei[0] = ei[0].narrow( - dim=0, - start=0, - length=ei[0].size(0) - num_sampled_edges[-i], - ) - ei[1] = ei[1].narrow( - dim=0, start=0, length=ei[1].size(0) - num_sampled_nodes[-i] - ) - ei[2] = x.size(0) - - s = x.shape[0] - - if framework == "pyg": - x = convs[i](x, ei, size=(s, s)) - else: - x = convs[i](x, ei) - x = relu(x) - x = dropout(x, p=0.5) - - x = x.narrow(dim=0, start=0, length=s - num_sampled_nodes[1]) - - assert list(x.shape) == [1, 1] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.parametrize("drop_last", [True, False]) -@pytest.mark.sg -def test_drop_last(drop_last): - N = {"N": 10} - G = { - ("N", "e", "N"): torch.stack( - [torch.tensor([0, 1, 2, 3, 4]), torch.tensor([5, 6, 7, 8, 9])] - ) - } - F = FeatureStore(backend="torch") - F.add_data(torch.arange(10), "N", "z") - - store = DaskGraphStore(F, G, N) - with tempfile.TemporaryDirectory() as dir: - loader = DaskNeighborLoader( - (store, store), - input_nodes=torch.tensor([0, 1, 2, 3, 4]), - num_neighbors=[1], - batch_size=2, - shuffle=False, - drop_last=drop_last, - batches_per_partition=1, - directory=dir, - ) - - t = 
torch.tensor([]) - for batch in loader: - t = torch.concat([t, batch["N"].z]) - - t = t.tolist() - - files = os.listdir(dir) - assert len(files) == 2 if drop_last else 3 - assert "batch=0-0.parquet" in files - assert "batch=1-1.parquet" in files - if not drop_last: - assert "batch=2-2.parquet" in files - - -@pytest.mark.parametrize("directory", ["local", "temp"]) -@pytest.mark.sg -def test_load_directory( - karate_gnn: Tuple[ - FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] - ], - directory: str, -): - if directory == "local": - local_dir = tempfile.TemporaryDirectory(dir=".") - - cugraph_store = DaskGraphStore(*karate_gnn) - cugraph_loader = DaskNeighborLoader( - (cugraph_store, cugraph_store), - torch.arange(8, dtype=torch.int64), - 2, - num_neighbors=[8, 4, 2], - random_state=62, - replace=False, - directory=None if directory == "temp" else local_dir.name, - batches_per_partition=1, - ) - - it = iter(cugraph_loader) - next_batch = next(it) - assert next_batch is not None - - if directory == "local": - assert len(os.listdir(local_dir.name)) == 4 - - count = 1 - while next(it, None) is not None: - count += 1 - - assert count == 4 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py deleted file mode 100644 index 9e8a85a5b67..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_dask_neighbor_loader_mg.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
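For reference, the single-GPU Dask loader tests removed above all follow the same store-and-loader pattern: build a FeatureStore and a DaskGraphStore, then iterate a DaskNeighborLoader over a set of input nodes. A minimal sketch of that pattern, assuming a working GPU environment with cugraph-pyg and its Dask dependencies installed; the graph and feature sizes mirror the fixtures used in the deleted test_drop_last test and are illustrative only:

import torch

from cugraph.gnn import FeatureStore
from cugraph_pyg.data import DaskGraphStore
from cugraph_pyg.loader import DaskNeighborLoader

# A tiny homogeneous graph: 10 nodes of type "N" and 5 edges of type ("N", "e", "N").
F = FeatureStore(backend="torch")
F.add_data(torch.arange(10), "N", "z")
G = {
    ("N", "e", "N"): torch.stack(
        [torch.tensor([0, 1, 2, 3, 4]), torch.tensor([5, 6, 7, 8, 9])]
    )
}
N = {"N": 10}

store = DaskGraphStore(F, G, N)
loader = DaskNeighborLoader(
    (store, store),                              # (feature store, graph store) pair
    input_nodes=torch.tensor([0, 1, 2, 3, 4]),   # seed nodes to sample from
    num_neighbors=[1],                           # one hop, one neighbor per seed
    batch_size=2,
)

for batch in loader:
    print(batch["N"].z)                          # features gathered for the sampled nodes

In the deleted tests each batch is also materialized under the loader's working directory as a batch=<i>-<i>.parquet file, which is what test_drop_last and test_load_directory assert on.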
- -import pytest - -from cugraph_pyg.loader import DaskNeighborLoader -from cugraph_pyg.data import DaskGraphStore -from cugraph.utilities.utils import import_optional, MissingModule - -torch = import_optional("torch") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_cugraph_loader_basic(dask_client, karate_gnn): - F, G, N = karate_gnn - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") - loader = DaskNeighborLoader( - (cugraph_store, cugraph_store), - torch.arange(N["type0"] + N["type1"], dtype=torch.int64), - 10, - num_neighbors=[4, 4], - random_state=62, - replace=False, - ) - - assert isinstance(cugraph_store._subgraph()._plc_graph, dict) - - samples = [s for s in loader] - - assert len(samples) == 3 - for sample in samples: - if "type0" in sample: - for prop in sample["type0"]["prop0"].tolist(): - assert prop % 31 == 0 - - if "type1" in sample: - for prop in sample["type1"]["prop0"].tolist(): - assert prop % 41 == 0 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_cugraph_loader_hetero(dask_client, karate_gnn): - F, G, N = karate_gnn - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") - loader = DaskNeighborLoader( - (cugraph_store, cugraph_store), - input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")), - batch_size=2, - num_neighbors=[4, 4], - random_state=62, - replace=False, - ) - - samples = [s for s in loader] - - assert len(samples) == 2 - for sample in samples: - print(sample) - if "type0" in sample: - for prop in sample["type0"]["prop0"].tolist(): - assert prop % 31 == 0 - - if "type1" in sample: - for prop in sample["type1"]["prop0"].tolist(): - assert prop % 41 == 0 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py deleted file mode 100644 index 8ee18a826f7..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph.datasets import karate -from cugraph.utilities.utils import import_optional, MissingModule - -import cugraph_pyg -from cugraph_pyg.data import TensorDictFeatureStore, GraphStore -from cugraph_pyg.loader import NeighborLoader - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_neighbor_loader(): - """ - Basic e2e test that covers loading and sampling. 
- """ - - df = karate.get_edgelist() - src = torch.as_tensor(df["src"], device="cuda") - dst = torch.as_tensor(df["dst"], device="cuda") - - ei = torch.stack([dst, src]) - - graph_store = GraphStore() - graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo") - - feature_store = TensorDictFeatureStore() - feature_store["person", "feat"] = torch.randint(128, (34, 16)) - - loader = NeighborLoader( - (feature_store, graph_store), - [5, 5], - input_nodes=torch.arange(34), - ) - - for batch in loader: - assert isinstance(batch, torch_geometric.data.Data) - assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_neighbor_loader_biased(): - eix = torch.tensor( - [ - [3, 4, 5], - [0, 1, 2], - ] - ) - - graph_store = GraphStore() - graph_store.put_edge_index(eix, ("person", "knows", "person"), "coo") - - feature_store = TensorDictFeatureStore() - feature_store["person", "feat"] = torch.randint(128, (6, 12)) - feature_store[("person", "knows", "person"), "bias"] = torch.tensor( - [0, 12, 14], dtype=torch.float32 - ) - - loader = NeighborLoader( - (feature_store, graph_store), - [1], - input_nodes=torch.tensor([0, 1, 2], dtype=torch.int64), - batch_size=3, - weight_attr="bias", - ) - - out = list(iter(loader)) - assert len(out) == 1 - out = out[0] - - assert out.edge_index.shape[1] == 2 - assert (out.edge_index.cpu() == torch.tensor([[3, 4], [1, 2]])).all() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -@pytest.mark.parametrize("num_nodes", [10, 25]) -@pytest.mark.parametrize("num_edges", [64, 128]) -@pytest.mark.parametrize("batch_size", [2, 4]) -@pytest.mark.parametrize("select_edges", [16, 32]) -@pytest.mark.parametrize("depth", [1, 3]) -@pytest.mark.parametrize("num_neighbors", [1, 4]) -def test_link_neighbor_loader_basic( - num_nodes, num_edges, batch_size, select_edges, num_neighbors, depth -): - graph_store = GraphStore() - feature_store = TensorDictFeatureStore() - - eix = torch.randperm(num_edges)[:select_edges] - graph_store[("n", "e", "n"), "coo"] = torch.stack( - [ - torch.randint(0, num_nodes, (num_edges,)), - torch.randint(0, num_nodes, (num_edges,)), - ] - ) - - elx = graph_store[("n", "e", "n"), "coo"][:, eix] - loader = cugraph_pyg.loader.LinkNeighborLoader( - (feature_store, graph_store), - num_neighbors=[num_neighbors] * depth, - edge_label_index=elx, - batch_size=batch_size, - shuffle=False, - ) - - elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) - for i, batch in enumerate(loader): - assert ( - batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size) - ).all() - assert (elx[i] == batch.n_id[batch.edge_label_index.cpu()]).all() - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -@pytest.mark.parametrize("batch_size", [1, 2]) -def test_link_neighbor_loader_negative_sampling_basic(batch_size): - num_edges = 62 - num_nodes = 19 - select_edges = 17 - - graph_store = GraphStore() - feature_store = TensorDictFeatureStore() - - eix = torch.randperm(num_edges)[:select_edges] - graph_store[("n", "e", "n"), "coo"] = torch.stack( - [ - torch.randint(0, num_nodes, (num_edges,)), - torch.randint(0, num_nodes, (num_edges,)), - ] - ) - - elx = graph_store[("n", "e", "n"), "coo"][:, eix] - loader = cugraph_pyg.loader.LinkNeighborLoader( - (feature_store, graph_store), - num_neighbors=[3, 3, 3], - 
edge_label_index=elx, - batch_size=batch_size, - neg_sampling="binary", - shuffle=False, - ) - - elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) - for i, batch in enumerate(loader): - assert batch.edge_label[0] == 1.0 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -@pytest.mark.parametrize("batch_size", [1, 2]) -def test_link_neighbor_loader_negative_sampling_uneven(batch_size): - num_edges = 62 - num_nodes = 19 - select_edges = 17 - - graph_store = GraphStore() - feature_store = TensorDictFeatureStore() - - eix = torch.randperm(num_edges)[:select_edges] - graph_store[("n", "e", "n"), "coo"] = torch.stack( - [ - torch.randint(0, num_nodes, (num_edges,)), - torch.randint(0, num_nodes, (num_edges,)), - ] - ) - - elx = graph_store[("n", "e", "n"), "coo"][:, eix] - loader = cugraph_pyg.loader.LinkNeighborLoader( - (feature_store, graph_store), - num_neighbors=[3, 3, 3], - edge_label_index=elx, - batch_size=batch_size, - neg_sampling=torch_geometric.sampler.NegativeSampling("binary", amount=0.1), - shuffle=False, - ) - - elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) - for i, batch in enumerate(loader): - assert batch.edge_label[0] == 1.0 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py deleted file mode 100644 index d1dee01a508..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/loader/test_neighbor_loader_mg.py +++ /dev/null @@ -1,364 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -import os - -from cugraph.datasets import karate -from cugraph.utilities.utils import import_optional, MissingModule - -from cugraph_pyg.data import TensorDictFeatureStore, GraphStore -from cugraph_pyg.loader import NeighborLoader, LinkNeighborLoader - -from cugraph.gnn import ( - cugraph_comms_init, - cugraph_comms_shutdown, - cugraph_comms_create_unique_id, -) - -os.environ["RAPIDS_NO_INITIALIZE"] = "1" - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -def init_pytorch_worker(rank, world_size, cugraph_id): - import rmm - - rmm.reinitialize( - devices=rank, - pool_allocator=False, - ) - - import cupy - - cupy.cuda.Device(rank).use() - from rmm.allocators.cupy import rmm_cupy_allocator - - cupy.cuda.set_allocator(rmm_cupy_allocator) - - from cugraph.testing.mg_utils import enable_spilling - - enable_spilling() - - torch.cuda.set_device(rank) - - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = "12355" - torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) - - cugraph_comms_init(rank=rank, world_size=world_size, uid=cugraph_id, device=rank) - - -def run_test_neighbor_loader_mg(rank, uid, world_size, specify_size): - """ - Basic e2e test that covers loading and sampling. 
- """ - init_pytorch_worker(rank, world_size, uid) - - df = karate.get_edgelist() - src = torch.as_tensor(df["src"], device="cuda") - dst = torch.as_tensor(df["dst"], device="cuda") - - ei = torch.stack([dst, src]) - ei = torch.tensor_split(ei.clone(), world_size, axis=1)[rank] - - sz = (34, 34) if specify_size else None - graph_store = GraphStore(is_multi_gpu=True) - graph_store.put_edge_index(ei, ("person", "knows", "person"), "coo", False, sz) - - feature_store = TensorDictFeatureStore() - feature_store["person", "feat"] = torch.randint(128, (34, 16)) - - ix_train = torch.tensor_split(torch.arange(34), world_size, axis=0)[rank] - - loader = NeighborLoader( - (feature_store, graph_store), - [5, 5], - input_nodes=ix_train, - ) - - for batch in loader: - assert isinstance(batch, torch_geometric.data.Data) - assert (feature_store["person", "feat"][batch.n_id] == batch.feat).all() - - cugraph_comms_shutdown() - - -@pytest.mark.skip(reason="deleteme") -@pytest.mark.parametrize("specify_size", [True, False]) -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_neighbor_loader_mg(specify_size): - uid = cugraph_comms_create_unique_id() - world_size = torch.cuda.device_count() - - torch.multiprocessing.spawn( - run_test_neighbor_loader_mg, - args=( - uid, - world_size, - specify_size, - ), - nprocs=world_size, - ) - - -def run_test_neighbor_loader_biased_mg(rank, uid, world_size): - init_pytorch_worker(rank, world_size, uid) - - eix = torch.stack( - [ - torch.arange( - 3 * (world_size + rank), - 3 * (world_size + rank + 1), - dtype=torch.int64, - device="cuda", - ), - torch.arange(3 * rank, 3 * (rank + 1), dtype=torch.int64, device="cuda"), - ] - ) - - graph_store = GraphStore(is_multi_gpu=True) - graph_store.put_edge_index(eix, ("person", "knows", "person"), "coo") - - feature_store = TensorDictFeatureStore() - feature_store["person", "feat"] = torch.randint(128, (6 * world_size, 12)) - feature_store[("person", "knows", "person"), "bias"] = torch.concat( - [torch.tensor([0, 1, 1], dtype=torch.float32) for _ in range(world_size)] - ) - - loader = NeighborLoader( - (feature_store, graph_store), - [1], - input_nodes=torch.arange( - 3 * rank, 3 * (rank + 1), dtype=torch.int64, device="cuda" - ), - batch_size=3, - weight_attr="bias", - ) - - out = list(iter(loader)) - assert len(out) == 1 - out = out[0] - - assert ( - out.edge_index.cpu() - == torch.tensor( - [ - [3, 4], - [1, 2], - ] - ) - ).all() - - cugraph_comms_shutdown() - - -@pytest.mark.skip(reason="deleteme") -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_neighbor_loader_biased_mg(): - uid = cugraph_comms_create_unique_id() - world_size = torch.cuda.device_count() - - torch.multiprocessing.spawn( - run_test_neighbor_loader_biased_mg, - args=( - uid, - world_size, - ), - nprocs=world_size, - ) - - -def run_test_link_neighbor_loader_basic_mg( - rank, - uid, - world_size, - num_nodes: int, - num_edges: int, - select_edges: int, - batch_size: int, - num_neighbors: int, - depth: int, -): - init_pytorch_worker(rank, world_size, uid) - - graph_store = GraphStore(is_multi_gpu=True) - feature_store = TensorDictFeatureStore() - - eix = torch.randperm(num_edges)[:select_edges] - graph_store[("n", "e", "n"), "coo"] = torch.stack( - [ - torch.randint(0, num_nodes, (num_edges,)), - torch.randint(0, num_nodes, (num_edges,)), - ] - ) - - elx = graph_store[("n", "e", "n"), "coo"][:, eix] - loader = LinkNeighborLoader( - (feature_store, 
graph_store), - num_neighbors=[num_neighbors] * depth, - edge_label_index=elx, - batch_size=batch_size, - shuffle=False, - ) - - elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) - for i, batch in enumerate(loader): - assert ( - batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size) - ).all() - assert (elx[i] == batch.n_id[batch.edge_label_index.cpu()]).all() - - cugraph_comms_shutdown() - - -@pytest.mark.skip(reason="deleteme") -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -@pytest.mark.parametrize("select_edges", [64, 128]) -@pytest.mark.parametrize("batch_size", [2, 4]) -@pytest.mark.parametrize("depth", [1, 3]) -def test_link_neighbor_loader_basic_mg(select_edges, batch_size, depth): - num_nodes = 25 - num_edges = 128 - num_neighbors = 2 - - uid = cugraph_comms_create_unique_id() - world_size = torch.cuda.device_count() - - torch.multiprocessing.spawn( - run_test_link_neighbor_loader_basic_mg, - args=( - uid, - world_size, - num_nodes, - num_edges, - select_edges, - batch_size, - num_neighbors, - depth, - ), - nprocs=world_size, - ) - - -def run_test_link_neighbor_loader_uneven_mg(rank, uid, world_size, edge_index): - init_pytorch_worker(rank, world_size, uid) - - graph_store = GraphStore(is_multi_gpu=True) - feature_store = TensorDictFeatureStore() - - batch_size = 1 - graph_store[("n", "e", "n"), "coo"] = torch.tensor_split( - edge_index, world_size, dim=-1 - )[rank] - - elx = graph_store[("n", "e", "n"), "coo"] # select all edges on each worker - loader = LinkNeighborLoader( - (feature_store, graph_store), - num_neighbors=[2, 2, 2], - edge_label_index=elx, - batch_size=batch_size, - shuffle=False, - ) - - for i, batch in enumerate(loader): - assert ( - batch.input_id.cpu() == torch.arange(i * batch_size, (i + 1) * batch_size) - ).all() - - assert (elx[:, [i]] == batch.n_id[batch.edge_label_index.cpu()]).all() - - cugraph_comms_shutdown() - - -@pytest.mark.skip(reason="deleteme") -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_link_neighbor_loader_uneven_mg(): - edge_index = torch.tensor( - [ - [0, 1, 3, 4, 7], - [1, 0, 8, 9, 12], - ] - ) - - uid = cugraph_comms_create_unique_id() - world_size = torch.cuda.device_count() - - torch.multiprocessing.spawn( - run_test_link_neighbor_loader_uneven_mg, - args=( - uid, - world_size, - edge_index, - ), - nprocs=world_size, - ) - - -def run_test_link_neighbor_loader_negative_sampling_basic_mg( - rank, world_size, uid, batch_size -): - num_edges = 62 * world_size - num_nodes = 19 * world_size - select_edges = 17 - - init_pytorch_worker(rank, world_size, uid) - - graph_store = GraphStore(is_multi_gpu=True) - feature_store = TensorDictFeatureStore() - - eix = torch.randperm(num_edges)[:select_edges] - graph_store[("n", "e", "n"), "coo"] = torch.stack( - [ - torch.randint(0, num_nodes, (num_edges,)), - torch.randint(0, num_nodes, (num_edges,)), - ] - ) - - elx = graph_store[("n", "e", "n"), "coo"][:, eix] - loader = LinkNeighborLoader( - (feature_store, graph_store), - num_neighbors=[3, 3, 3], - edge_label_index=elx, - batch_size=batch_size, - neg_sampling="binary", - shuffle=False, - ) - - elx = torch.tensor_split(elx, eix.numel() // batch_size, dim=1) - for i, batch in enumerate(loader): - assert batch.edge_label[0] == 1.0 - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -@pytest.mark.parametrize("batch_size", [1, 2]) -def 
test_link_neighbor_loader_negative_sampling_basic_mg(batch_size): - uid = cugraph_comms_create_unique_id() - world_size = torch.cuda.device_count() - - torch.multiprocessing.spawn( - run_test_link_neighbor_loader_negative_sampling_basic_mg, - args=( - world_size, - uid, - batch_size, - ), - nprocs=world_size, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py deleted file mode 100644 index 92d216fefa3..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_pyg.nn import GATConv as CuGraphGATConv -from cugraph_pyg.utils.imports import package_available - -ATOL = 1e-6 - - -@pytest.mark.skipif( - package_available("torch_geometric<2.5"), reason="Test requires pyg>=2.5" -) -@pytest.mark.parametrize("use_edge_index", [True, False]) -@pytest.mark.parametrize("bias", [True, False]) -@pytest.mark.parametrize("bipartite", [True, False]) -@pytest.mark.parametrize("concat", [True, False]) -@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) -@pytest.mark.parametrize("max_num_neighbors", [8, None]) -@pytest.mark.parametrize("use_edge_attr", [True, False]) -@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -@pytest.mark.sg -def test_gat_conv_equality( - use_edge_index, - bias, - bipartite, - concat, - heads, - max_num_neighbors, - use_edge_attr, - graph, - request, -): - import torch - from torch_geometric import EdgeIndex - from torch_geometric.nn import GATConv - - torch.manual_seed(12345) - edge_index, size = request.getfixturevalue(graph) - edge_index = edge_index.cuda() - - if bipartite: - in_channels = (5, 3) - x = ( - torch.rand(size[0], in_channels[0]).cuda(), - torch.rand(size[1], in_channels[1]).cuda(), - ) - else: - in_channels = 5 - x = torch.rand(size[0], in_channels).cuda() - out_channels = 2 - - if use_edge_attr: - edge_dim = 3 - edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() - else: - edge_dim = edge_attr = None - - if use_edge_index: - csc = EdgeIndex(edge_index, sparse_size=size) - else: - if use_edge_attr: - csc, edge_attr_perm = CuGraphGATConv.to_csc( - edge_index, size, edge_attr=edge_attr - ) - else: - csc = CuGraphGATConv.to_csc(edge_index, size) - edge_attr_perm = None - - kwargs = dict(bias=bias, concat=concat, edge_dim=edge_dim) - - conv1 = GATConv( - in_channels, out_channels, heads, add_self_loops=False, **kwargs - ).cuda() - conv2 = CuGraphGATConv(in_channels, out_channels, heads, **kwargs).cuda() - - out_dim = heads * out_channels - with torch.no_grad(): - if bipartite: - conv2.lin_src.weight.copy_(conv1.lin_src.weight) - conv2.lin_dst.weight.copy_(conv1.lin_dst.weight) - else: - conv2.lin.weight.copy_(conv1.lin.weight) - - conv2.att[:out_dim].copy_(conv1.att_src.flatten()) - conv2.att[out_dim : 2 * out_dim].copy_(conv1.att_dst.flatten()) - if use_edge_attr: - conv2.att[2 * 
out_dim :].copy_(conv1.att_edge.flatten()) - conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) - - out1 = conv1(x, edge_index, edge_attr=edge_attr) - if use_edge_index: - out2 = conv2(x, csc, edge_attr=edge_attr, max_num_neighbors=max_num_neighbors) - else: - out2 = conv2( - x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors - ) - assert torch.allclose(out1, out2, atol=ATOL) - - grad_output = torch.rand_like(out1) - out1.backward(grad_output) - out2.backward(grad_output) - - if bipartite: - assert torch.allclose( - conv1.lin_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL - ) - assert torch.allclose( - conv1.lin_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL - ) - else: - assert torch.allclose(conv1.lin.weight.grad, conv2.lin.weight.grad, atol=ATOL) - - assert torch.allclose( - conv1.att_src.grad.flatten(), conv2.att.grad[:out_dim], atol=ATOL - ) - assert torch.allclose( - conv1.att_dst.grad.flatten(), conv2.att.grad[out_dim : 2 * out_dim], atol=ATOL - ) - - if use_edge_attr: - assert torch.allclose( - conv1.att_edge.grad.flatten(), conv2.att.grad[2 * out_dim :], atol=ATOL - ) - assert torch.allclose( - conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL - ) - - if bias: - assert torch.allclose(conv1.bias.grad, conv2.bias.grad, atol=ATOL) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py deleted file mode 100644 index 2e221922add..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
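For reference, the convolution equality tests removed above (and the GATv2 variant that follows) exercise two call paths: passing a torch_geometric EdgeIndex directly, or pre-converting the COO edge_index with the layer's to_csc() helper and calling the layer on (x, csc). A minimal sketch of the second path, assuming a CUDA device and the cugraph-ops backend these layers require; the node counts, channel sizes, and edges below are made up for illustration:

import torch

from cugraph_pyg.nn import GATConv as CuGraphGATConv

# A small directed graph on 10 nodes with 4 edges, kept on the GPU.
size = (10, 10)                                  # (num_src_nodes, num_dst_nodes)
edge_index = torch.tensor([[0, 1, 2, 3],
                           [1, 2, 3, 4]], device="cuda")
x = torch.rand(size[0], 5, device="cuda")        # 5 input channels per node

# Convert COO -> CSC once, then reuse it across forward passes.
csc = CuGraphGATConv.to_csc(edge_index, size)

conv = CuGraphGATConv(5, 2, heads=2, concat=True).cuda()
out = conv(x, csc)                               # heads * out_channels = 4 features
                                                 # per destination node (concat=True)

The deleted tests then copy the weights of a reference torch_geometric.nn.GATConv into the cuGraph layer (the lin weights plus the flattened att_src/att_dst vectors) and assert that forward outputs and gradients agree within ATOL.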
- -import pytest - -from cugraph_pyg.nn import GATv2Conv as CuGraphGATv2Conv - -ATOL = 1e-6 - - -@pytest.mark.parametrize("use_edge_index", [True, False]) -@pytest.mark.parametrize("bipartite", [True, False]) -@pytest.mark.parametrize("concat", [True, False]) -@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) -@pytest.mark.parametrize("use_edge_attr", [True, False]) -@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -@pytest.mark.sg -def test_gatv2_conv_equality( - use_edge_index, bipartite, concat, heads, use_edge_attr, graph, request -): - pytest.importorskip("torch_geometric", reason="PyG not available") - import torch - from torch_geometric import EdgeIndex - from torch_geometric.nn import GATv2Conv - - torch.manual_seed(12345) - edge_index, size = request.getfixturevalue(graph) - edge_index = edge_index.cuda() - - if bipartite: - in_channels = (5, 3) - x = ( - torch.rand(size[0], in_channels[0]).cuda(), - torch.rand(size[1], in_channels[1]).cuda(), - ) - else: - in_channels = 5 - x = torch.rand(size[0], in_channels).cuda() - out_channels = 2 - - if use_edge_attr: - edge_dim = 3 - edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() - else: - edge_dim = edge_attr = None - - if use_edge_index: - csc = EdgeIndex(edge_index, sparse_size=size) - else: - if use_edge_attr: - csc, edge_attr_perm = CuGraphGATv2Conv.to_csc( - edge_index, size, edge_attr=edge_attr - ) - else: - csc = CuGraphGATv2Conv.to_csc(edge_index, size) - edge_attr_perm = None - - kwargs = dict(bias=False, concat=concat, edge_dim=edge_dim) - - conv1 = GATv2Conv( - in_channels, out_channels, heads, add_self_loops=False, **kwargs - ).cuda() - conv2 = CuGraphGATv2Conv(in_channels, out_channels, heads, **kwargs).cuda() - - with torch.no_grad(): - conv2.lin_src.weight.copy_(conv1.lin_l.weight) - conv2.lin_dst.weight.copy_(conv1.lin_r.weight) - conv2.att.copy_(conv1.att.flatten()) - if use_edge_attr: - conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) - - out1 = conv1(x, edge_index, edge_attr=edge_attr) - if use_edge_index: - out2 = conv2(x, csc, edge_attr=edge_attr) - else: - out2 = conv2(x, csc, edge_attr=edge_attr_perm) - assert torch.allclose(out1, out2, atol=ATOL) - - grad_output = torch.rand_like(out1) - out1.backward(grad_output) - out2.backward(grad_output) - - assert torch.allclose(conv1.lin_l.weight.grad, conv2.lin_src.weight.grad, atol=ATOL) - assert torch.allclose(conv1.lin_r.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL) - - assert torch.allclose(conv1.att.grad.flatten(), conv2.att.grad, atol=ATOL) - - if use_edge_attr: - assert torch.allclose( - conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL - ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py deleted file mode 100644 index f182869002a..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_pyg.nn import HeteroGATConv as CuGraphHeteroGATConv -from cugraph_pyg.utils.imports import package_available - -ATOL = 1e-6 - - -@pytest.mark.cugraph_ops -@pytest.mark.skipif( - package_available("torch_geometric<2.4"), reason="Test requires pyg>=2.4" -) -@pytest.mark.parametrize("heads", [1, 3, 10]) -@pytest.mark.parametrize("aggr", ["sum", "mean"]) -@pytest.mark.sg -def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): - import torch - from torch_geometric.data import HeteroData - from torch_geometric.nn import HeteroConv, GATConv - - device = torch.device("cuda") - data = HeteroData(sample_pyg_hetero_data).to(device) - - in_channels_dict = {k: v.size(1) for k, v in data.x_dict.items()} - out_channels = 2 - - convs_dict = {} - kwargs1 = dict(heads=heads, add_self_loops=False, bias=False) - for edge_type in data.edge_types: - src_t, _, dst_t = edge_type - in_channels_src, in_channels_dst = data.x_dict[src_t].size(-1), data.x_dict[ - dst_t - ].size(-1) - if src_t == dst_t: - convs_dict[edge_type] = GATConv(in_channels_src, out_channels, **kwargs1) - else: - convs_dict[edge_type] = GATConv( - (in_channels_src, in_channels_dst), out_channels, **kwargs1 - ) - - conv1 = HeteroConv(convs_dict, aggr=aggr).to(device) - kwargs2 = dict( - heads=heads, - aggr=aggr, - node_types=data.node_types, - edge_types=data.edge_types, - bias=False, - ) - conv2 = CuGraphHeteroGATConv(in_channels_dict, out_channels, **kwargs2).to(device) - - # copy over linear and attention weights - w_src, w_dst = conv2.split_tensors(conv2.lin_weights, dim=0) - with torch.no_grad(): - for edge_type in conv2.edge_types: - src_t, _, dst_t = edge_type - if src_t == dst_t: - w_src[edge_type].copy_(conv1.convs[edge_type].lin.weight) - else: - w_src[edge_type].copy_(conv1.convs[edge_type].lin_src.weight) - if w_dst[edge_type] is not None: - w_dst[edge_type].copy_(conv1.convs[edge_type].lin_dst.weight) - - conv2.attn_weights[edge_type][: heads * out_channels].copy_( - conv1.convs[edge_type].att_src.flatten() - ) - conv2.attn_weights[edge_type][heads * out_channels :].copy_( - conv1.convs[edge_type].att_dst.flatten() - ) - - out1 = conv1(data.x_dict, data.edge_index_dict) - out2 = conv2(data.x_dict, data.edge_index_dict) - - for node_type in data.node_types: - assert torch.allclose(out1[node_type], out2[node_type], atol=ATOL) - - loss1 = 0 - loss2 = 0 - for node_type in data.node_types: - loss1 += out1[node_type].mean() - loss2 += out2[node_type].mean() - - loss1.backward() - loss2.backward() - - # check gradient w.r.t attention weights - out_dim = heads * out_channels - for edge_type in conv2.edge_types: - assert torch.allclose( - conv1.convs[edge_type].att_src.grad.flatten(), - conv2.attn_weights[edge_type].grad[:out_dim], - atol=ATOL, - ) - assert torch.allclose( - conv1.convs[edge_type].att_dst.grad.flatten(), - conv2.attn_weights[edge_type].grad[out_dim:], - atol=ATOL, - ) - - # check gradient w.r.t linear weights - grad_lin_weights_ref = dict.fromkeys(out1.keys()) - for node_t, (rels_as_src, rels_as_dst) in conv2.relations_per_ntype.items(): - grad_list = [] - for rel_t in rels_as_src: - src_type, _, dst_type = rel_t - if src_type == dst_type: - grad_list.append(conv1.convs[rel_t].lin.weight.grad.clone()) - else: - grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone()) - for rel_t in rels_as_dst: - grad_list.append(conv1.convs[rel_t].lin_dst.weight.grad.clone()) - 
assert len(grad_list) > 0 - grad_lin_weights_ref[node_t] = torch.vstack(grad_list) - - for node_type in conv2.lin_weights: - assert torch.allclose( - grad_lin_weights_ref[node_type], - conv2.lin_weights[node_type].grad, - atol=ATOL, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py deleted file mode 100644 index 8b06cb2e180..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_pyg.nn import RGCNConv as CuGraphRGCNConv - -ATOL = 1e-6 - - -@pytest.mark.parametrize("use_edge_index", [True, False]) -@pytest.mark.parametrize("aggr", ["add", "sum", "mean"]) -@pytest.mark.parametrize("bias", [True, False]) -@pytest.mark.parametrize("max_num_neighbors", [8, None]) -@pytest.mark.parametrize("num_bases", [1, 2, None]) -@pytest.mark.parametrize("root_weight", [True, False]) -@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -@pytest.mark.sg -def test_rgcn_conv_equality( - use_edge_index, - aggr, - bias, - max_num_neighbors, - num_bases, - root_weight, - graph, - request, -): - pytest.importorskip("torch_geometric", reason="PyG not available") - import torch - from torch_geometric import EdgeIndex - from torch_geometric.nn import FastRGCNConv as RGCNConv - - torch.manual_seed(12345) - in_channels, out_channels, num_relations = (4, 2, 3) - kwargs = dict(aggr=aggr, bias=bias, num_bases=num_bases, root_weight=root_weight) - - edge_index, size = request.getfixturevalue(graph) - edge_index = edge_index.cuda() - edge_type = torch.randint(num_relations, (edge_index.size(1),)).cuda() - - if use_edge_index: - csc = EdgeIndex(edge_index, sparse_size=size) - else: - csc, edge_type_perm = CuGraphRGCNConv.to_csc(edge_index, size, edge_type) - - x = torch.rand(size[0], in_channels, device="cuda") - - conv1 = RGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda() - conv2 = CuGraphRGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda() - - with torch.no_grad(): - if root_weight: - conv2.weight[:-1].copy_(conv1.weight) - conv2.weight[-1].copy_(conv1.root) - else: - conv2.weight.copy_(conv1.weight) - if num_bases is not None: - conv2.comp.copy_(conv1.comp) - - out1 = conv1(x, edge_index, edge_type) - if use_edge_index: - out2 = conv2(x, csc, edge_type) - else: - out2 = conv2(x, csc, edge_type_perm, max_num_neighbors=max_num_neighbors) - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out = torch.rand_like(out1) - out1.backward(grad_out) - out2.backward(grad_out) - - if root_weight: - assert torch.allclose(conv1.weight.grad, conv2.weight.grad[:-1], atol=ATOL) - assert torch.allclose(conv1.root.grad, conv2.weight.grad[-1], atol=ATOL) - else: - assert torch.allclose(conv1.weight.grad, conv2.weight.grad, atol=ATOL) - - if num_bases is not None: - assert torch.allclose(conv1.comp.grad, conv2.comp.grad, 
atol=ATOL) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py deleted file mode 100644 index 878ceff632a..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv - -ATOL = 1e-6 - - -@pytest.mark.parametrize("use_edge_index", [True, False]) -@pytest.mark.parametrize("aggr", ["sum", "mean", "min", "max"]) -@pytest.mark.parametrize("bias", [True, False]) -@pytest.mark.parametrize("bipartite", [True, False]) -@pytest.mark.parametrize("max_num_neighbors", [8, None]) -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("root_weight", [True, False]) -@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -@pytest.mark.sg -def test_sage_conv_equality( - use_edge_index, - aggr, - bias, - bipartite, - max_num_neighbors, - normalize, - root_weight, - graph, - request, -): - pytest.importorskip("torch_geometric", reason="PyG not available") - import torch - from torch_geometric import EdgeIndex - from torch_geometric.nn import SAGEConv - - torch.manual_seed(12345) - edge_index, size = request.getfixturevalue(graph) - edge_index = edge_index.cuda() - - if use_edge_index: - csc = EdgeIndex(edge_index, sparse_size=size) - else: - csc = CuGraphSAGEConv.to_csc(edge_index, size) - - if bipartite: - in_channels = (7, 3) - x = ( - torch.rand(size[0], in_channels[0]).cuda(), - torch.rand(size[1], in_channels[1]).cuda(), - ) - else: - in_channels = 5 - x = torch.rand(size[0], in_channels).cuda() - out_channels = 4 - - kwargs = dict(aggr=aggr, bias=bias, normalize=normalize, root_weight=root_weight) - - conv1 = SAGEConv(in_channels, out_channels, **kwargs).cuda() - conv2 = CuGraphSAGEConv(in_channels, out_channels, **kwargs).cuda() - - in_channels_src = conv2.in_channels_src - with torch.no_grad(): - conv2.lin.weight[:, :in_channels_src].copy_(conv1.lin_l.weight) - if root_weight: - conv2.lin.weight[:, in_channels_src:].copy_(conv1.lin_r.weight) - if bias: - conv2.lin.bias.copy_(conv1.lin_l.bias) - - out1 = conv1(x, edge_index) - out2 = conv2(x, csc, max_num_neighbors=max_num_neighbors) - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out = torch.rand_like(out1) - out1.backward(grad_out) - out2.backward(grad_out) - - assert torch.allclose( - conv1.lin_l.weight.grad, - conv2.lin.weight.grad[:, :in_channels_src], - atol=ATOL, - ) - - if root_weight: - assert torch.allclose( - conv1.lin_r.weight.grad, - conv2.lin.weight.grad[:, in_channels_src:], - atol=ATOL, - ) - - if bias: - assert torch.allclose( - conv1.lin_l.bias.grad, - conv2.lin.bias.grad, - atol=ATOL, - ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py deleted file mode 100644 index d207a4d7947..00000000000 --- 
a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_pyg.nn import TransformerConv as CuGraphTransformerConv - -ATOL = 1e-6 - - -@pytest.mark.parametrize("use_edge_index", [True, False]) -@pytest.mark.parametrize("use_edge_attr", [True, False]) -@pytest.mark.parametrize("bipartite", [True, False]) -@pytest.mark.parametrize("concat", [True, False]) -@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) -@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -@pytest.mark.sg -def test_transformer_conv_equality( - use_edge_index, use_edge_attr, bipartite, concat, heads, graph, request -): - pytest.importorskip("torch_geometric", reason="PyG not available") - import torch - from torch_geometric import EdgeIndex - from torch_geometric.nn import TransformerConv - - torch.manual_seed(12345) - edge_index, size = request.getfixturevalue(graph) - edge_index = edge_index.cuda() - - if bipartite: - in_channels = (5, 3) - x = ( - torch.rand(size[0], in_channels[0], device="cuda"), - torch.rand(size[1], in_channels[1], device="cuda"), - ) - else: - in_channels = 5 - x = torch.rand(size[0], in_channels, device="cuda") - out_channels = 2 - - if use_edge_attr: - edge_dim = 3 - edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() - else: - edge_dim = edge_attr = None - - if use_edge_index: - csc = EdgeIndex(edge_index, sparse_size=size) - else: - if use_edge_attr: - csc, edge_attr_perm = CuGraphTransformerConv.to_csc( - edge_index, size, edge_attr=edge_attr - ) - else: - csc = CuGraphTransformerConv.to_csc(edge_index, size) - edge_attr_perm = None - - kwargs = dict(concat=concat, bias=False, edge_dim=edge_dim, root_weight=False) - - conv1 = TransformerConv(in_channels, out_channels, heads, **kwargs).cuda() - conv2 = CuGraphTransformerConv(in_channels, out_channels, heads, **kwargs).cuda() - - with torch.no_grad(): - conv2.lin_query.weight.copy_(conv1.lin_query.weight) - conv2.lin_key.weight.copy_(conv1.lin_key.weight) - conv2.lin_value.weight.copy_(conv1.lin_value.weight) - conv2.lin_query.bias.copy_(conv1.lin_query.bias) - conv2.lin_key.bias.copy_(conv1.lin_key.bias) - conv2.lin_value.bias.copy_(conv1.lin_value.bias) - if use_edge_attr: - conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) - - out1 = conv1(x, edge_index, edge_attr=edge_attr) - if use_edge_index: - out2 = conv2(x, csc, edge_attr=edge_attr) - else: - out2 = conv2(x, csc, edge_attr=edge_attr_perm) - - assert torch.allclose(out1, out2, atol=ATOL) - - grad_output = torch.rand_like(out1) - out1.backward(grad_output) - out2.backward(grad_output) - - assert torch.allclose( - conv1.lin_query.weight.grad, conv2.lin_query.weight.grad, atol=ATOL - ) - assert torch.allclose( - conv1.lin_key.weight.grad, conv2.lin_key.weight.grad, atol=ATOL - ) - assert torch.allclose( - conv1.lin_value.weight.grad, conv2.lin_value.weight.grad, atol=ATOL - ) - assert 
torch.allclose( - conv1.lin_query.bias.grad, conv2.lin_query.bias.grad, atol=ATOL - ) - assert torch.allclose(conv1.lin_key.bias.grad, conv2.lin_key.bias.grad, atol=ATOL) - assert torch.allclose( - conv1.lin_value.bias.grad, conv2.lin_value.bias.grad, atol=ATOL - ) - - if use_edge_attr: - assert torch.allclose( - conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL - ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini b/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py deleted file mode 100644 index 7659fdc386f..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import cupy - -import pytest - -from cugraph_pyg.data import DaskGraphStore -from cugraph_pyg.sampler.sampler_utils import ( - _sampler_output_from_sampling_results_heterogeneous, -) - -from cugraph.utilities.utils import import_optional, MissingModule -from cugraph import uniform_neighbor_sample - -torch = import_optional("torch") - - -@pytest.mark.cugraph_ops -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_neighbor_sample(basic_graph_1): - F, G, N = basic_graph_1 - cugraph_store = DaskGraphStore(F, G, N, order="CSR") - - batches = cudf.DataFrame( - { - "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"), - "batch": cudf.Series(cupy.zeros(5, dtype="int32")), - } - ) - - sampling_results = uniform_neighbor_sample( - cugraph_store._subgraph(), - batches, - fanout_vals=[-1], - with_replacement=False, - with_edge_properties=True, - with_batch_ids=True, - random_state=62, - return_offsets=False, - use_legacy_names=False, - ).sort_values(by=["majors", "minors"]) - - out = _sampler_output_from_sampling_results_heterogeneous( - sampling_results=sampling_results, - renumber_map=None, - graph_store=cugraph_store, - metadata=torch.arange(6, dtype=torch.int64), - ) - - noi_groups = out.node - row_dict = out.row - col_dict = out.col - metadata = out.metadata - - assert metadata.tolist() == list(range(6)) - - for node_type, node_ids in noi_groups.items(): - actual_vertex_ids = torch.arange(N[node_type]) - - assert sorted(node_ids.tolist()) == actual_vertex_ids.tolist() - - assert ( - row_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][0].tolist() - ) - assert ( - col_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][1].tolist() - ) - - # check the hop dictionaries - assert len(out.num_sampled_nodes) == 1 - assert out.num_sampled_nodes["vt1"] == [4, 1] - - assert len(out.num_sampled_edges) == 1 - 
assert out.num_sampled_edges[("vt1", "pig", "vt1")] == [6] - - -@pytest.mark.cugraph_ops -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): - F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = DaskGraphStore(F, G, N, order="CSR") - - batches = cudf.DataFrame( - { - "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"), - "batch": cudf.Series(cupy.zeros(5, dtype="int32")), - } - ) - - sampling_results = uniform_neighbor_sample( - cugraph_store._subgraph(), - batches, - fanout_vals=[-1], - with_replacement=False, - with_edge_properties=True, - random_state=62, - return_offsets=False, - with_batch_ids=True, - use_legacy_names=False, - ).sort_values(by=["majors", "minors"]) - - out = _sampler_output_from_sampling_results_heterogeneous( - sampling_results=sampling_results, - renumber_map=None, - graph_store=cugraph_store, - metadata=torch.arange(6, dtype=torch.int64), - ) - - noi_groups = out.node - row_dict = out.row - col_dict = out.col - metadata = out.metadata - - assert metadata.tolist() == list(range(6)) - - for node_type, node_ids in noi_groups.items(): - actual_vertex_ids = torch.arange(N[node_type]) - - assert node_ids.tolist() == sorted(actual_vertex_ids.tolist()) - - for edge_type, ei in G.items(): - assert sorted(row_dict[edge_type].tolist()) == sorted(ei[0].tolist()) - assert sorted(col_dict[edge_type].tolist()) == sorted(ei[1].tolist()) - - # check the hop dictionaries - assert len(out.num_sampled_nodes) == 2 - assert out.num_sampled_nodes["black"] == [2, 0] - assert out.num_sampled_nodes["brown"] == [3, 0] - - assert len(out.num_sampled_edges) == 5 - assert out.num_sampled_edges[("brown", "horse", "brown")] == [2] - assert out.num_sampled_edges[("brown", "tortoise", "black")] == [3] - assert out.num_sampled_edges[("brown", "mongoose", "black")] == [2] - assert out.num_sampled_edges[("black", "cow", "brown")] == [2] - assert out.num_sampled_edges[("black", "snake", "black")] == [1] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.sg -def test_neighbor_sample_mock_sampling_results(abc_graph): - F, G, N = abc_graph - - graph_store = DaskGraphStore(F, G, N, order="CSR") - - # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] - mock_sampling_results = cudf.DataFrame( - { - "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), - "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), - "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"), - "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"), - } - ) - - out = _sampler_output_from_sampling_results_heterogeneous( - mock_sampling_results, None, graph_store, None - ) - - assert out.metadata is None - assert len(out.node) == 3 - assert out.node["A"].tolist() == [0, 1] - assert out.node["B"].tolist() == [0, 1] - assert out.node["C"].tolist() == [3, 2, 0] - - assert len(out.row) == 3 - assert len(out.col) == 3 - assert out.row[("A", "ab", "B")].tolist() == [0, 0, 1, 1] - assert out.col[("A", "ab", "B")].tolist() == [0, 1, 1, 1] - assert out.row[("B", "bc", "C")].tolist() == [0, 1, 1, 1] - assert out.col[("B", "bc", "C")].tolist() == [0, 1, 2, 1] - assert out.row[("B", "ba", "A")].tolist() == [1, 1] - assert out.col[("B", "ba", "A")].tolist() == [1, 1] - - assert len(out.num_sampled_nodes) == 3 - assert out.num_sampled_nodes["A"] == [2, 0, 0, 0, 0] - assert out.num_sampled_nodes["B"] == [0, 
2, 0, 0, 0] - assert out.num_sampled_nodes["C"] == [0, 0, 2, 0, 1] - - assert len(out.num_sampled_edges) == 3 - assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0] - assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1] - assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py b/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py deleted file mode 100644 index 91e0668b3c1..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/sampler/test_sampler_utils_mg.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import cupy - -import pytest - -from cugraph_pyg.data import DaskGraphStore -from cugraph_pyg.sampler.sampler_utils import ( - _sampler_output_from_sampling_results_heterogeneous, -) - -from cugraph.gnn import FeatureStore - -from cugraph.utilities.utils import import_optional, MissingModule -from cugraph.dask import uniform_neighbor_sample - -torch = import_optional("torch") - - -@pytest.mark.cugraph_ops -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_neighbor_sample(dask_client, basic_graph_1): - F, G, N = basic_graph_1 - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") - - batches = cudf.DataFrame( - { - "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"), - "batch": cudf.Series(cupy.zeros(5, dtype="int32")), - } - ) - - sampling_results = ( - uniform_neighbor_sample( - cugraph_store._subgraph(), - batches, - with_batch_ids=True, - fanout_vals=[-1], - with_replacement=False, - with_edge_properties=True, - random_state=62, - return_offsets=False, - return_hops=True, - use_legacy_names=False, - ) - .compute() - .sort_values(by=["majors", "minors"]) - ) - - out = _sampler_output_from_sampling_results_heterogeneous( - sampling_results=sampling_results, - renumber_map=None, - graph_store=cugraph_store, - metadata=torch.arange(6, dtype=torch.int64), - ) - - noi_groups = out.node - row_dict = out.row - col_dict = out.col - metadata = out.metadata - - assert metadata.tolist() == list(range(6)) - - for node_type, node_ids in noi_groups.items(): - actual_vertex_ids = torch.arange(N[node_type]) - - assert sorted(node_ids.tolist()) == actual_vertex_ids.tolist() - - assert ( - row_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][0].tolist() - ) - assert ( - col_dict[("vt1", "pig", "vt1")].tolist() == G[("vt1", "pig", "vt1")][1].tolist() - ) - - # check the hop dictionaries - assert len(out.num_sampled_nodes) == 1 - assert out.num_sampled_nodes["vt1"] == [4, 1] - - assert len(out.num_sampled_edges) == 1 - assert out.num_sampled_edges[("vt1", "pig", "vt1")] == [6] - - -@pytest.mark.cugraph_ops -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.skip(reason="broken") -@pytest.mark.mg -def test_neighbor_sample_multi_vertex(dask_client, 
multi_edge_multi_vertex_graph_1): - F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") - - batches = cudf.DataFrame( - { - "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"), - "batches": cudf.Series(cupy.zeros(5, dtype="int32")), - } - ) - - sampling_results = ( - uniform_neighbor_sample( - cugraph_store._subgraph(), - batches, - fanout_vals=[-1], - with_replacement=False, - with_edge_properties=True, - random_state=62, - return_offsets=False, - with_batch_ids=True, - use_legacy_names=False, - ) - .sort_values(by=["majors", "minors"]) - .compute() - ) - - out = _sampler_output_from_sampling_results_heterogeneous( - sampling_results=sampling_results, - renumber_map=None, - graph_store=cugraph_store, - metadata=torch.arange(6, dtype=torch.int64), - ) - - noi_groups = out.node - row_dict = out.row - col_dict = out.col - metadata = out.metadata - - assert metadata.tolist() == list(range(6)) - - for node_type, node_ids in noi_groups.items(): - actual_vertex_ids = torch.arange(N[node_type]) - - assert node_ids.tolist() == sorted(actual_vertex_ids.tolist()) - - for edge_type, ei in G.items(): - assert sorted(row_dict[edge_type].tolist()) == sorted(ei[0].tolist()) - assert sorted(col_dict[edge_type].tolist()) == sorted(ei[1].tolist()) - - # check the hop dictionaries - assert len(out.num_sampled_nodes) == 2 - assert out.num_sampled_nodes["black"].tolist() == [2, 0] - assert out.num_sampled_nodes["brown"].tolist() == [3, 0] - - assert len(out.num_sampled_edges) == 5 - assert out.num_sampled_edges[("brown", "horse", "brown")].tolist() == [2] - assert out.num_sampled_edges[("brown", "tortoise", "black")].tolist() == [3] - assert out.num_sampled_edges[("brown", "mongoose", "black")].tolist() == [2] - assert out.num_sampled_edges[("black", "cow", "brown")].tolist() == [2] - assert out.num_sampled_edges[("black", "snake", "black")].tolist() == [1] - - -@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -@pytest.mark.mg -def test_neighbor_sample_mock_sampling_results(dask_client): - N = { - "A": 2, # 0, 1 - "B": 3, # 2, 3, 4 - "C": 4, # 5, 6, 7, 8 - } - - G = { - # (0->2, 0->3, 1->3) - ("A", "ab", "B"): [ - torch.tensor([0, 0, 1], dtype=torch.int64), - torch.tensor([0, 1, 1], dtype=torch.int64), - ], - # (2->0, 2->1, 3->1, 4->0) - ("B", "ba", "A"): [ - torch.tensor([0, 0, 1, 2], dtype=torch.int64), - torch.tensor([0, 1, 1, 0], dtype=torch.int64), - ], - # (2->6, 2->8, 3->5, 3->7, 4->5, 4->8) - ("B", "bc", "C"): [ - torch.tensor([0, 0, 1, 1, 2, 2], dtype=torch.int64), - torch.tensor([1, 3, 0, 2, 0, 3], dtype=torch.int64), - ], - } - - F = FeatureStore() - F.add_data( - torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1" - ) - - graph_store = DaskGraphStore(F, G, N, multi_gpu=True, order="CSR") - - # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] - mock_sampling_results = cudf.DataFrame( - { - "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), - "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), - "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"), - "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"), - } - ) - - out = _sampler_output_from_sampling_results_heterogeneous( - mock_sampling_results, None, graph_store, None - ) - - assert out.metadata is None - assert len(out.node) == 3 - assert out.node["A"].tolist() == [0, 1] - assert out.node["B"].tolist() == [0, 1] - assert out.node["C"].tolist() == 
[3, 2, 0] - - assert len(out.row) == 3 - assert len(out.col) == 3 - assert out.row[("A", "ab", "B")].tolist() == [0, 0, 1, 1] - assert out.col[("A", "ab", "B")].tolist() == [0, 1, 1, 1] - assert out.row[("B", "bc", "C")].tolist() == [0, 1, 1, 1] - assert out.col[("B", "bc", "C")].tolist() == [0, 1, 2, 1] - assert out.row[("B", "ba", "A")].tolist() == [1, 1] - assert out.col[("B", "ba", "A")].tolist() == [1, 1] - - assert len(out.num_sampled_nodes) == 3 - assert out.num_sampled_nodes["A"] == [2, 0, 0, 0, 0] - assert out.num_sampled_nodes["B"] == [0, 2, 0, 0, 0] - assert out.num_sampled_nodes["C"] == [0, 0, 2, 0, 1] - - assert len(out.num_sampled_edges) == 3 - assert out.num_sampled_edges[("A", "ab", "B")] == [3, 0, 1, 0] - assert out.num_sampled_edges[("B", "ba", "A")] == [0, 1, 0, 1] - assert out.num_sampled_edges[("B", "bc", "C")] == [0, 2, 0, 2] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_version.py b/python/cugraph-pyg/cugraph_pyg/tests/test_version.py deleted file mode 100644 index 4ea0f9875f5..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_version.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -import cugraph_pyg - - -def test_version_constants_are_populated(): - # __git_commit__ will only be non-empty in a built distribution - assert isinstance(cugraph_pyg.__git_commit__, str) - - # __version__ should always be non-empty - assert isinstance(cugraph_pyg.__version__, str) - assert len(cugraph_pyg.__version__) > 0 diff --git a/python/cugraph-pyg/cugraph_pyg/utils/__init__.py b/python/cugraph-pyg/cugraph_pyg/utils/__init__.py deleted file mode 100644 index aeae6078111..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/utils/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph-pyg/cugraph_pyg/utils/imports.py b/python/cugraph-pyg/cugraph_pyg/utils/imports.py deleted file mode 100644 index 1cc865a1f35..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/utils/imports.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from packaging.requirements import Requirement -from importlib import import_module - - -def package_available(requirement: str) -> bool: - """Check if a package is installed and meets the version requirement.""" - req = Requirement(requirement) - try: - pkg = import_module(req.name) - except ImportError: - return False - - if len(req.specifier) > 0: - if hasattr(pkg, "__version__"): - return pkg.__version__ in req.specifier - else: - return False - - return True diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml deleted file mode 100644 index a30cd375635..00000000000 --- a/python/cugraph-pyg/pyproject.toml +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -[build-system] - -requires = [ - "rapids-build-backend>=0.3.1,<0.4.0.dev0", - "setuptools>=61.0.0", - "wheel", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -build-backend = "rapids_build_backend.build" - -[tool.pytest.ini_options] -testpaths = ["cugraph_pyg/tests"] - -[project] -name = "cugraph-pyg" -dynamic = ["version"] -description = "cugraph-pyg - PyG support for cuGraph massive-scale, ultra-fast GPU graph analytics." -authors = [ - { name = "NVIDIA Corporation" }, -] -license = { text = "Apache 2.0" } -requires-python = ">=3.10" -classifiers = [ - "Intended Audience :: Developers", - "Programming Language :: Python", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", -] -dependencies = [ - "cugraph==24.12.*,>=0.0.0a0", - "numba>=0.57", - "numpy>=1.23,<3.0a0", - "pylibcugraphops==24.12.*,>=0.0.0a0", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. - -[project.urls] -Homepage = "https://github.com/rapidsai/cugraph" -Documentation = "https://docs.rapids.ai/api/cugraph/stable/" - -[project.optional-dependencies] -test = [ - "pandas", - "pylibwholegraph==24.12.*,>=0.0.0a0", - "pytest", - "pytest-benchmark", - "pytest-cov", - "pytest-xdist", - "scipy", - "tensordict>=0.1.2", - "torch>=2.3", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. - -[tool.setuptools] -license-files = ["LICENSE"] - -[tool.setuptools.dynamic] -version = {file = "cugraph_pyg/VERSION"} - -[tool.setuptools.packages.find] -include = [ - "cugraph_pyg*", - "cugraph_pyg.*", -] - -[tool.rapids-build-backend] -build-backend = "setuptools.build_meta" -dependencies-file = "../../dependencies.yaml" -matrix-entry = "cuda_suffixed=true" diff --git a/python/cugraph-pyg/pytest.ini b/python/cugraph-pyg/pytest.ini deleted file mode 100644 index 07c4ffa0958..00000000000 --- a/python/cugraph-pyg/pytest.ini +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -[pytest] -addopts = - --benchmark-warmup=off - --benchmark-max-time=0 - --benchmark-min-rounds=1 - --benchmark-columns="mean, rounds" - --tb=native - ## do not run slow tests/benchmarks by default - -m "not slow" - -markers = - slow: slow-running tests/benchmarks - cugraph_ops: Tests requiring cugraph-ops - mg: Test MG code paths - number of gpu > 1 - sg: Test SG code paths and dask sg tests - number of gpu == 1 - -python_classes = - Bench* - Test* - -python_files = - bench_* - test_* - -python_functions = - bench_* - test_* diff --git a/readme_pages/cugraph_dgl.md b/readme_pages/cugraph_dgl.md deleted file mode 100644 index 7b19787f4c6..00000000000 --- a/readme_pages/cugraph_dgl.md +++ /dev/null @@ -1,27 +0,0 @@ -# cugraph_dgl - -[RAPIDS](https://rapids.ai) cugraph_dgl enables the ability to use cugraph Property Graphs with DGL. This cugraph backend allows DGL users access to a collection of GPU-accelerated algorithms for graph analytics, such as sampling, centrality computation, and community detection. - - -The goal of `cugraph_dgl` is to enable Multi-Node Multi-GPU cugraph accelerated graphs to help train large-scale Graph Neural Networks(GNN) on DGL by providing a duck-typed version of the [DGLGraph](https://docs.dgl.ai/api/python/dgl.DGLGraph.html#dgl.DGLGraph) which uses cugraph for storing graph structure and node/edge feature data. - -## Usage -```diff - -+from cugraph_dgl.convert import cugraph_storage_from_heterograph -+cugraph_g = cugraph_storage_from_heterograph(dgl_g) - -sampler = dgl.dataloading.NeighborSampler( - [15, 10, 5], prefetch_node_feats=['feat'], prefetch_labels=['label']) - -train_dataloader = dgl.dataloading.DataLoader( -- dgl_g, -+ cugraph_g, -train_idx, -sampler, -device=device, -batch_size=1024, -shuffle=True, -drop_last=False, -num_workers=0) -``` diff --git a/readme_pages/cugraph_pyg.md b/readme_pages/cugraph_pyg.md deleted file mode 100644 index 147cd70b944..00000000000 --- a/readme_pages/cugraph_pyg.md +++ /dev/null @@ -1,22 +0,0 @@ -# cugraph_pyg - -[RAPIDS](https://rapids.ai) cugraph_pyg enables the ability to use cugraph Property Graphs with PyTorch Geometric (PyG). PyG users will have access to cuGraph and cuGraph-Service through the PyG GraphStore, FeatureStore, and Sampler interfaces. Through cugraph_pyg, PyG users have the full power of cuGraph's GPU-accelerated algorithms for graph analytics, such as sampling, centrality computation, and community detection. - - -The goal of `cugraph_pyg` is to enable accelerated single-GPU and multi-node, multi-GPU cugraph accelerated graphs to help train large-scale Graph Neural Networks (GNN) on PyG by providing duck-typed drop-in replacements of the `GraphStore`, `FeatureStore`, and `Sampler` interfaces backed by either cuGraph or cuGraph-Service. - -Users of cugraph_pyg have the option of installing either the cugraph or cugraph_service_client packages. Only one is required. - -## Usage -``` -G = cuGraph.PropertyGraph() -... -feature_store, graph_store = to_pyg(G) -sampler = CuGraphSampler( - data=(feature_store, graph_store), - shuffle=True, - num_neighbors=[10,25], - batch_size=50, -) -... 
-``` diff --git a/readme_pages/gnn_support.md b/readme_pages/gnn_support.md index 924c2bf62af..72978883531 100644 --- a/readme_pages/gnn_support.md +++ b/readme_pages/gnn_support.md @@ -27,6 +27,6 @@ An overview of GNN's and how they are used is found in this excellent [blog](htt RAPIDS GNN components improve other industy GNN specific projects. Due to the degree distribution of nodes, memory bottlenecks are the pain point for large scale graphs. To solve this problem, sampling operations form the backbone for Graph Neural Networks (GNN) training. However, current sampling methods provided by other libraries are not optimized enough for the whole process of GNN training. The main limit to performance is moving data between the hosts and devices. In cuGraph, we provide an end-to-end solution from data loading to training all on the GPUs. -CuGraph now supports compatibility with [Deep Graph Library](https://www.dgl.ai/) (DGL) and [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/) (PyG) by allowing conversion between a cuGraph object and a DGL or PyG object, making it possible for DGL and PyG users to access efficient data loader and graph operations (such as uniformed sampling) implementations in cuGraph, as well as keep their models unchanged in DGL or PyG. We have considerable speedup compared with the original implementation in DGL and PyG. +CuGraph now supports compatibility with [Deep Graph Library](https://www.dgl.ai/) (DGL) and [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/) (PyG) by allowing conversion between a cuGraph object and a DGL or PyG object, making it possible for DGL and PyG users to access efficient data loader and graph operations (such as uniformed sampling) implementations in cuGraph, as well as keep their models unchanged in DGL or PyG. We have considerable speedup compared with the original implementation in DGL and PyG. The GNN packages are now developed within the [cugraph-gnn](https://github.com/rapidsai/cugraph-gnn) repository. [](https://developer.nvidia.com/blog/optimizing-fraud-detection-in-financial-services-with-graph-neural-networks-and-nvidia-gpus/) From 460d8e468a73e2d8edee625b9a9fe97ef24fa36f Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Fri, 22 Nov 2024 16:36:57 -0600 Subject: [PATCH 4/4] add telemetry (#4740) Enables telemetry during cugraph's build process. This parses github job metadata to obtain timing information. It should have very little impact on overall build time, and should not interfere with any build tools. 
This implements emitting OpenTelemetry traces and spans, as described in https://github.com/rapidsai/build-infra/issues/139 Authors: - Mike Sarahan (https://github.com/msarahan) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cugraph/pull/4740 --- .github/workflows/pr.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index e82342dfd94..c8bf94b0987 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -10,6 +10,7 @@ concurrency: cancel-in-progress: true jobs: + # Please keep pr-builder as the top job here pr-builder: needs: - changed-files @@ -25,14 +26,24 @@ jobs: - wheel-tests-pylibcugraph - wheel-build-cugraph - wheel-tests-cugraph + - telemetry-setup - devcontainer secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 if: always() with: needs: ${{ toJSON(needs) }} + telemetry-setup: + runs-on: ubuntu-latest + continue-on-error: true + env: + OTEL_SERVICE_NAME: "pr-cugraph" + steps: + - name: Telemetry setup + uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 with: files_yaml: | @@ -63,9 +74,11 @@ jobs: - '!notebooks/**' checks: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false + ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit @@ -161,6 +174,7 @@ jobs: script: ci/test_wheel_cugraph.sh devcontainer: secrets: inherit + needs: telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' @@ -171,3 +185,17 @@ jobs: sccache -z; build-all --verbose -j$(nproc --ignore=1) -DBUILD_CUGRAPH_MG_TESTS=ON; sccache -s; + telemetry-summarize: + runs-on: ubuntu-latest + needs: pr-builder + if: always() + continue-on-error: true + steps: + - name: Load stashed telemetry env vars + uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main + with: + load_service_name: true + - name: Telemetry summarize + uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main + with: + cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}"
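The workflow above only wires the PR jobs to the `rapidsai/shared-actions` telemetry dispatch actions (`telemetry-dispatch-stash-base-env-vars`, `telemetry-dispatch-load-base-env-vars`, `telemetry-dispatch-write-summary`); the span emission itself lives in that shared repository. Conceptually, per the commit message, it amounts to turning GitHub job/step timing metadata into OpenTelemetry spans with explicit start and end times. A minimal sketch of that idea — using hypothetical step names and timestamps and a console exporter in place of the real OTLP endpoint, none of which are taken from the shared actions — might look like:

```python
from datetime import datetime

from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

# Service name mirrors the OTEL_SERVICE_NAME set in the telemetry-setup job.
provider = TracerProvider(resource=Resource.create({"service.name": "pr-cugraph"}))
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)
tracer = trace.get_tracer("ci-telemetry-sketch")


def to_ns(ts: str) -> int:
    """Convert an ISO-8601 timestamp (GitHub API style) to epoch nanoseconds."""
    return int(datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp() * 1e9)


# Hypothetical step metadata; in practice this would come from the GitHub Actions API.
steps = [
    {"name": "checkout", "started_at": "2024-11-22T16:00:00Z", "completed_at": "2024-11-22T16:00:05Z"},
    {"name": "conda-cpp-build", "started_at": "2024-11-22T16:00:05Z", "completed_at": "2024-11-22T16:25:40Z"},
]

# Emit one span per CI step, backdated to the recorded start/end times.
for step in steps:
    span = tracer.start_span(step["name"], start_time=to_ns(step["started_at"]))
    span.set_attribute("ci.step.name", step["name"])
    span.end(end_time=to_ns(step["completed_at"]))
```

In the actual workflow, the exporter does not print to the console: the endpoint credentials are assembled from the `OTEL_EXPORTER_OTLP_*` secrets concatenated into `cert_concat` and passed to `telemetry-dispatch-write-summary` in the final `telemetry-summarize` job.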