From 067a64e4c1968c891794620847b854580550d3e4 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 21 Jul 2023 09:55:43 -0400 Subject: [PATCH 01/72] v23.10 Updates [skip ci] --- .github/workflows/build.yaml | 20 +++++----- .github/workflows/pr.yaml | 32 +++++++-------- .github/workflows/test.yaml | 12 +++--- ci/build_docs.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 26 ++++++------ .../cugraph-service/conda_build_config.yaml | 2 +- conda/recipes/cugraph/conda_build_config.yaml | 2 +- .../pylibcugraph/conda_build_config.yaml | 2 +- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- cpp/libcugraph_etl/CMakeLists.txt | 2 +- dependencies.yaml | 40 +++++++++---------- docs/cugraph/source/conf.py | 4 +- fetch_rapids.cmake | 2 +- .../conda/cugraph_dgl_dev_cuda-118.yaml | 4 +- python/cugraph-dgl/cugraph_dgl/__init__.py | 2 +- python/cugraph-dgl/pyproject.toml | 4 +- python/cugraph-pyg/cugraph_pyg/__init__.py | 2 +- python/cugraph-pyg/pyproject.toml | 4 +- .../client/cugraph_service_client/__init__.py | 2 +- python/cugraph-service/client/pyproject.toml | 2 +- .../server/cugraph_service_server/__init__.py | 2 +- python/cugraph-service/server/pyproject.toml | 16 ++++---- python/cugraph/CMakeLists.txt | 2 +- python/cugraph/cugraph/__init__.py | 2 +- python/cugraph/pyproject.toml | 22 +++++----- python/pylibcugraph/CMakeLists.txt | 2 +- python/pylibcugraph/pylibcugraph/__init__.py | 2 +- python/pylibcugraph/pyproject.toml | 12 +++--- 29 files changed, 115 insertions(+), 115 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index de9caa0fabe..7d191d8e8bf 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibcugraph: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -81,14 +81,14 @@ jobs: # the CMake variables in get_cumlprims_mg.cmake since CMake will just use # the clone as is. 
extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-23.08 + extra-repo-sha: branch-23.10 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/python/pylibcugraph/cugraph-ops/" wheel-publish-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -98,7 +98,7 @@ jobs: wheel-build-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: # the CMake variables in get_cumlprims_mg.cmake since CMake will just use # the clone as is. extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-23.08 + extra-repo-sha: branch-23.10 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 /local-wheelhouse" @@ -119,7 +119,7 @@ jobs: wheel-publish-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4d52cd26de4..d4aff126058 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,41 +24,41 @@ jobs: - wheel-build-cugraph - wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: pull-request conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -68,7 +68,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,19 +78,19 @@ jobs: wheel-build-pylibcugraph: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.10 with: build_type: pull-request package-name: pylibcugraph package-dir: python/pylibcugraph extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-23.08 + extra-repo-sha: branch-23.10 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/python/pylibcugraph/cugraph-ops/" wheel-tests-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.10 with: build_type: pull-request package-name: pylibcugraph @@ -99,26 +99,26 @@ jobs: wheel-build-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.10 with: build_type: pull-request package-name: cugraph package-dir: python/cugraph extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-23.08 + extra-repo-sha: branch-23.10 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 /local-wheelhouse" skbuild-configure-options: "-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/python/cugraph/cugraph-ops/" wheel-tests-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.10 with: build_type: pull-request package-name: cugraph # Always want to test against latest dask/distributed. 
- test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" + test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10" # Skip dataset downloads on arm to save CI time -- arm only runs smoke tests. - test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" + test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10" test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets python -m pytest -m sg ./python/cugraph/cugraph/tests" test-smoketest: "python ci/wheel_smoke_test_cugraph.py" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d697b8f1649..61e04e6b12f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibcugraph: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets python -m pytest ./python/pylibcugraph/pylibcugraph/tests" wheel-tests-cugraph: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.08 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -50,6 +50,6 @@ jobs: sha: ${{ inputs.sha }} package-name: cugraph # Always want to test against latest dask/distributed. 
- test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" - test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08" + test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10" + test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10" test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets python -m pytest -m sg ./python/cugraph/cugraph/tests" diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 5d038bf23a8..2941d062d80 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -37,7 +37,7 @@ rapids-mamba-retry install \ rapids-logger "Install cugraph-dgl" rapids-mamba-retry install "${PYTHON_CHANNEL}/linux-64/cugraph-dgl-*.tar.bz2" -export RAPIDS_VERSION_NUMBER="23.08" +export RAPIDS_VERSION_NUMBER="23.10" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 16a4d4f0dbc..80702b1f547 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -14,13 +14,13 @@ dependencies: - cmake>=3.26.4 - cuda-version=11.8 - cudatoolkit -- cudf==23.8.* +- cudf==23.10.* - cupy>=12.0.0 - cxx-compiler - cython>=0.29,<0.30 - dask-core>=2023.5.1 -- dask-cuda==23.8.* -- dask-cudf==23.8.* +- dask-cuda==23.10.* +- dask-cudf==23.10.* - dask>=2023.5.1 - distributed>=2023.5.1 - doxygen @@ -30,11 +30,11 @@ dependencies: - graphviz - gtest>=1.13.0 - ipython -- libcudf==23.8.* -- libcugraphops==23.8.* -- libraft-headers==23.8.* -- libraft==23.8.* -- librmm==23.8.* +- libcudf==23.10.* +- libcugraphops==23.10.* +- libraft-headers==23.10.* +- libraft==23.10.* +- librmm==23.10.* - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 @@ -48,17 +48,17 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibcugraphops==23.8.* -- pylibraft==23.8.* +- pylibcugraphops==23.10.* +- pylibraft==23.10.* - pytest - pytest-benchmark - pytest-cov - pytest-xdist - python-louvain -- raft-dask==23.8.* +- raft-dask==23.10.* - recommonmark - requests -- rmm==23.8.* +- rmm==23.10.* - scikit-build>=0.13.1 - scikit-learn>=0.23.1 - scipy @@ -67,5 +67,5 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - ucx-proc=*=gpu -- ucx-py==0.33.* +- ucx-py==0.34.* name: all_cuda-118_arch-x86_64 diff --git a/conda/recipes/cugraph-service/conda_build_config.yaml b/conda/recipes/cugraph-service/conda_build_config.yaml index af1d362141a..5fe8d372eba 100644 --- a/conda/recipes/cugraph-service/conda_build_config.yaml +++ b/conda/recipes/cugraph-service/conda_build_config.yaml @@ -1,2 +1,2 @@ ucx_py_version: - - "0.33.*" + - "0.34.*" diff --git a/conda/recipes/cugraph/conda_build_config.yaml b/conda/recipes/cugraph/conda_build_config.yaml index 4530a4c942d..ba5b46dda1f 100644 --- a/conda/recipes/cugraph/conda_build_config.yaml +++ 
b/conda/recipes/cugraph/conda_build_config.yaml @@ -17,4 +17,4 @@ sysroot_version: - "2.17" ucx_py_version: - - "0.33.*" + - "0.34.*" diff --git a/conda/recipes/pylibcugraph/conda_build_config.yaml b/conda/recipes/pylibcugraph/conda_build_config.yaml index 4530a4c942d..ba5b46dda1f 100644 --- a/conda/recipes/pylibcugraph/conda_build_config.yaml +++ b/conda/recipes/pylibcugraph/conda_build_config.yaml @@ -17,4 +17,4 @@ sysroot_version: - "2.17" ucx_py_version: - - "0.33.*" + - "0.34.*" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8af50e5f72b..53c3deccaaa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUGRAPH) -project(CUGRAPH VERSION 23.08.00 LANGUAGES C CXX CUDA) +project(CUGRAPH VERSION 23.10.00 LANGUAGES C CXX CUDA) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 4a58e314520..eb414925388 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcugraph" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER=23.08 +PROJECT_NUMBER=23.10 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcugraph_etl/CMakeLists.txt b/cpp/libcugraph_etl/CMakeLists.txt index 929701a4948..18271871087 100644 --- a/cpp/libcugraph_etl/CMakeLists.txt +++ b/cpp/libcugraph_etl/CMakeLists.txt @@ -25,7 +25,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUGRAPH_ETL) -project(CUGRAPH_ETL VERSION 23.08.00 LANGUAGES C CXX CUDA) +project(CUGRAPH_ETL VERSION 23.10.00 LANGUAGES C CXX CUDA) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0) diff --git a/dependencies.yaml b/dependencies.yaml index 572638069dc..b6d5f3fa7e9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -233,10 +233,10 @@ dependencies: - cxx-compiler - gmock>=1.13.0 - gtest>=1.13.0 - - libcugraphops==23.8.* - - libraft-headers==23.8.* - - libraft==23.8.* - - librmm==23.8.* + - libcugraphops==23.10.* + - libraft-headers==23.10.* + - libraft==23.10.* + - librmm==23.10.* - openmpi # Required for building cpp-mgtests (multi-GPU tests) specific: - output_types: [conda] @@ -281,7 +281,7 @@ dependencies: - sphinx-markdown-tables - sphinx<6 - sphinxcontrib-websupport - - pylibcugraphops==23.8.* + - pylibcugraphops==23.10.* py_version: specific: - output_types: [conda] @@ -308,38 +308,38 @@ dependencies: - output_types: [conda, pyproject] packages: - cython>=0.29,<0.30 - - &pylibraft pylibraft==23.8.* - - &rmm rmm==23.8.* + - &pylibraft pylibraft==23.10.* + - &rmm rmm==23.10.* - scikit-build>=0.13.1 python_build_cugraph: common: - output_types: [conda, pyproject] packages: - - pylibcugraph==23.8.* + - pylibcugraph==23.10.* python_run_cugraph: common: - output_types: [conda, pyproject] packages: - - &cudf cudf==23.8.* + - &cudf cudf==23.10.* - &dask dask>=2023.5.1 - &distributed distributed>=2023.5.1 - - &dask_cuda dask-cuda==23.8.* - - &dask_cudf dask-cudf==23.8.* + - &dask_cuda dask-cuda==23.10.* + - &dask_cudf dask-cudf==23.10.* - &numba numba>=0.57 - - raft-dask==23.8.* + - raft-dask==23.10.* - *rmm - - &ucx_py ucx-py==0.33.* + - &ucx_py ucx-py==0.34.* - output_types: conda packages: - &cupy cupy>=12.0.0 - &dask-core dask-core>=2023.5.1 - - libcudf==23.8.* + - libcudf==23.10.* - 
nccl>=2.9.9 - ucx-proc=*=gpu - output_types: pyproject packages: - &cupy_pip cupy-cuda11x>=12.0.0 - - pylibcugraph==23.8.* + - pylibcugraph==23.10.* python_run_pylibcugraph: common: - output_types: [conda, pyproject] @@ -354,7 +354,7 @@ dependencies: - &numpy numpy>=1.21 - output_types: [pyproject] packages: - - &cugraph cugraph==23.8.* + - &cugraph cugraph==23.10.* python_run_cugraph_pyg: common: - output_types: [conda, pyproject] @@ -391,7 +391,7 @@ dependencies: packages: - *cupy_pip - *cugraph - - cugraph-service-client==23.8.* + - cugraph-service-client==23.10.* doc: common: - output_types: [conda] @@ -405,7 +405,7 @@ dependencies: - sphinxcontrib-websupport - sphinx-markdown-tables - sphinx-copybutton - - pylibcugraphops==23.8.* + - pylibcugraphops==23.10.* test_notebook: common: - output_types: [conda, requirements] @@ -444,8 +444,8 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==23.8.* - - pylibcugraphops==23.8.* + - cugraph==23.10.* + - pylibcugraphops==23.10.* - pytorch>=2.0 - pytorch-cuda==11.8 - dgl>=1.1.0.cu* diff --git a/docs/cugraph/source/conf.py b/docs/cugraph/source/conf.py index b64901772dc..a96f0fc1e82 100644 --- a/docs/cugraph/source/conf.py +++ b/docs/cugraph/source/conf.py @@ -76,9 +76,9 @@ # built documents. # # The short X.Y version. -version = '23.08' +version = '23.10' # The full version, including alpha/beta/rc tags. -release = '23.08.00' +release = '23.10.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index dbbbbd4d82e..c32dc74da40 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.08/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake ) endif() diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index 6961a485742..2bb4b0f3cd3 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -9,10 +9,10 @@ channels: - conda-forge - nvidia dependencies: -- cugraph==23.8.* +- cugraph==23.10.* - dgl>=1.1.0.cu* - pre-commit -- pylibcugraphops==23.8.* +- pylibcugraphops==23.10.* - pytest - pytorch-cuda==11.8 - pytorch>=2.0 diff --git a/python/cugraph-dgl/cugraph_dgl/__init__.py b/python/cugraph-dgl/cugraph_dgl/__init__.py index 0609aad88b1..b30cd6c79d9 100644 --- a/python/cugraph-dgl/cugraph_dgl/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/__init__.py @@ -20,4 +20,4 @@ import cugraph_dgl.dataloading import cugraph_dgl.nn -__version__ = "23.08.00" +__version__ = "23.10.00" diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml index 110a2d90154..4205ac69df5 100644 --- a/python/cugraph-dgl/pyproject.toml +++ b/python/cugraph-dgl/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [project] name = "cugraph-dgl" -version = "23.08.00" +version = "23.10.00" description = "cugraph extensions for DGL" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies 
= [ - "cugraph==23.8.*", + "cugraph==23.10.*", "numba>=0.57", "numpy>=1.21", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cugraph-pyg/cugraph_pyg/__init__.py b/python/cugraph-pyg/cugraph_pyg/__init__.py index 48c2f2193b8..f8187059b86 100644 --- a/python/cugraph-pyg/cugraph_pyg/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/__init__.py @@ -11,4 +11,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "23.08.00" +__version__ = "23.10.00" diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index a25aee5008b..a28a4b3e905 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -12,7 +12,7 @@ testpaths = ["cugraph_pyg/tests"] [project] name = "cugraph_pyg" -version = "23.08.00" +version = "23.10.00" description = "cugraph_pyg - PyG support for cuGraph massive-scale, ultra-fast GPU graph analytics." authors = [ { name = "NVIDIA Corporation" }, @@ -26,7 +26,7 @@ classifiers = [ "Programming Language :: Python :: 3.10", ] dependencies = [ - "cugraph==23.8.*", + "cugraph==23.10.*", "numba>=0.57", "numpy>=1.21", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cugraph-service/client/cugraph_service_client/__init__.py b/python/cugraph-service/client/cugraph_service_client/__init__.py index bc224dbd372..229d07b8bc6 100644 --- a/python/cugraph-service/client/cugraph_service_client/__init__.py +++ b/python/cugraph-service/client/cugraph_service_client/__init__.py @@ -35,4 +35,4 @@ from cugraph_service_client.client import CugraphServiceClient from cugraph_service_client.remote_graph import RemoteGraph -__version__ = "23.08.00" +__version__ = "23.10.00" diff --git a/python/cugraph-service/client/pyproject.toml b/python/cugraph-service/client/pyproject.toml index b9369eefb71..cef9391805e 100644 --- a/python/cugraph-service/client/pyproject.toml +++ b/python/cugraph-service/client/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [project] name = "cugraph-service-client" -version = "23.08.00" +version = "23.10.00" description = "cuGraph Service client" readme = { file = "README.md", content-type = "text/markdown" } authors = [ diff --git a/python/cugraph-service/server/cugraph_service_server/__init__.py b/python/cugraph-service/server/cugraph_service_server/__init__.py index 22d48c9f714..017f0990f89 100644 --- a/python/cugraph-service/server/cugraph_service_server/__init__.py +++ b/python/cugraph-service/server/cugraph_service_server/__init__.py @@ -61,4 +61,4 @@ def start_server_blocking( server.serve() # blocks until Ctrl-C (kill -2) -__version__ = "23.08.00" +__version__ = "23.10.00" diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index 680811512c3..cf14f04dfd1 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [project] name = "cugraph-service-server" -version = "23.08.00" +version = "23.10.00" description = "cuGraph Service server" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -19,19 +19,19 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - 
"cudf==23.8.*", - "cugraph-service-client==23.8.*", - "cugraph==23.8.*", + "cudf==23.10.*", + "cugraph-service-client==23.10.*", + "cugraph==23.10.*", "cupy-cuda11x>=12.0.0", - "dask-cuda==23.8.*", - "dask-cudf==23.8.*", + "dask-cuda==23.10.*", + "dask-cudf==23.10.*", "dask>=2023.5.1", "distributed>=2023.5.1", "numba>=0.57", "numpy>=1.21", - "rmm==23.8.*", + "rmm==23.10.*", "thriftpy2", - "ucx-py==0.33.*", + "ucx-py==0.34.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index f405ad4f360..c770a758698 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cugraph_version 23.08.00) +set(cugraph_version 23.10.00) include(../../fetch_rapids.cmake) diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index 3b9c4e007e2..43cb30beceb 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -120,4 +120,4 @@ from cugraph import exceptions -__version__ = "23.08.00" +__version__ = "23.10.00" diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index 8dac14db659..b71b65cd937 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -6,9 +6,9 @@ requires = [ "cmake>=3.26.4", "cython>=0.29,<0.30", "ninja", - "pylibcugraph==23.8.*", - "pylibraft==23.8.*", - "rmm==23.8.*", + "pylibcugraph==23.10.*", + "pylibraft==23.10.*", + "rmm==23.10.*", "scikit-build>=0.13.1", "setuptools", "wheel", @@ -20,7 +20,7 @@ testpaths = ["cugraph/tests"] [project] name = "cugraph" -version = "23.08.00" +version = "23.10.00" description = "cuGraph - RAPIDS GPU Graph Analytics" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -29,17 +29,17 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.8.*", + "cudf==23.10.*", "cupy-cuda11x>=12.0.0", - "dask-cuda==23.8.*", - "dask-cudf==23.8.*", + "dask-cuda==23.10.*", + "dask-cudf==23.10.*", "dask>=2023.5.1", "distributed>=2023.5.1", "numba>=0.57", - "pylibcugraph==23.8.*", - "raft-dask==23.8.*", - "rmm==23.8.*", - "ucx-py==0.33.*", + "pylibcugraph==23.10.*", + "raft-dask==23.10.*", + "rmm==23.10.*", + "ucx-py==0.34.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", 
diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt index 8f96245e383..65ccdec1af8 100644 --- a/python/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(pylibcugraph_version 23.08.00) +set(pylibcugraph_version 23.10.00) include(../../fetch_rapids.cmake) 
diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 5c03d8f98cc..be53946b395 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -83,4 +83,4 @@ from pylibcugraph import exceptions -__version__ = "23.08.00" +__version__ = "23.10.00" 
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml index 3e891d14803..10fba0b7722 100644 --- a/python/pylibcugraph/pyproject.toml +++ b/python/pylibcugraph/pyproject.toml @@ -6,8 +6,8 @@ requires = [ "cmake>=3.26.4", "cython>=0.29,<0.30", "ninja", - "pylibraft==23.8.*", - "rmm==23.8.*", + "pylibraft==23.10.*", + "rmm==23.10.*", "scikit-build>=0.13.1", "setuptools", "wheel", @@ -19,7 +19,7 @@ testpaths = ["pylibcugraph/tests"] [project] name = "pylibcugraph" -version = "23.08.00" +version = "23.10.00" description = "pylibcugraph - Python bindings for the libcugraph cuGraph C/C++/CUDA library" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -28,8 -28,8 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "pylibraft==23.8.*", - "rmm==23.8.*", + "pylibraft==23.10.*", + "rmm==23.10.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -40,7 +40,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "cudf==23.8.*", + "cudf==23.10.*", "networkx>=2.5.1", "numpy>=1.21", "pandas", 
From d1c53430dabec3b50e75907ee2f607ddc187b2f7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 4 Aug 2023 13:03:27 -0700 Subject: [PATCH 02/72] Update to Cython 3.0.0 (#3716) 
This PR contains the minimal set of changes to compile using Cython 3 without warnings. Future PRs can be made to take advantage of new or improved features. The main change is that the graph_primtypes module runs into https://github.com/cython/cython/issues/5554. I was able to work around that issue by removing all the fused type logic and instead just duplicating one function. There were multiple functions present that were not used at all, corresponding to fused type specializations that were never instantiated, so the changes still ended up being a net reduction in code.
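For reviewers, a schematic sketch of the resulting call pattern (illustration only, not code from this PR; the dispatch function name and the dtype check are assumptions): with the fused-type `get_graph_view` helper gone, Python-level callers pick the concrete float/double specialization explicitly, e.g. via the two wrappers kept in `ktruss_subgraph_wrapper.pyx`.

```python
# Illustration only: explicit per-dtype dispatch replacing the fused-type helper.
import numpy as np

from cugraph.community.ktruss_subgraph_wrapper import (
    ktruss_subgraph_double,
    ktruss_subgraph_float,
)


def ktruss_subgraph_dispatch(input_graph, k, use_weights=True):
    # Hypothetical caller: choose the specialization based on the edge-weight dtype.
    weights = input_graph.edgelist.edgelist_df["weights"]
    if weights.dtype == np.float64:
        return ktruss_subgraph_double(input_graph, k, use_weights)
    return ktruss_subgraph_float(input_graph, k, use_weights)
```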
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3716 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/cugraph-pyg/meta.yaml | 2 +- conda/recipes/cugraph/meta.yaml | 2 +- conda/recipes/pylibcugraph/meta.yaml | 2 +- dependencies.yaml | 2 +- .../community/ktruss_subgraph_wrapper.pyx | 6 +-- .../cugraph/structure/graph_primtypes.pxd | 17 +------ .../cugraph/structure/graph_primtypes.pyx | 51 +++++++------------ python/cugraph/pyproject.toml | 2 +- .../pylibcugraph/uniform_neighbor_sample.pyx | 18 ++++--- python/pylibcugraph/pyproject.toml | 2 +- 12 files changed, 41 insertions(+), 67 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 80702b1f547..c92f1f47fc9 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cudf==23.10.* - cupy>=12.0.0 - cxx-compiler -- cython>=0.29,<0.30 +- cython>=3.0.0 - dask-core>=2023.5.1 - dask-cuda==23.10.* - dask-cudf==23.10.* diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index e8c40f5eed6..0b211458562 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -17,7 +17,7 @@ dependencies: - cudf==23.10.* - cupy>=12.0.0 - cxx-compiler -- cython>=0.29,<0.30 +- cython>=3.0.0 - dask-core>=2023.5.1 - dask-cuda==23.10.* - dask-cudf==23.10.* diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 6ff0fa01c96..66397778a9e 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -22,7 +22,7 @@ requirements: build: - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - cython >=0.29,<0.30 + - cython >=3.0.0 - python - scikit-build >=0.13.1 run: diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index e2b9d38c181..e65e4dd8451 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -55,7 +55,7 @@ requirements: - cudatoolkit {% endif %} - cudf ={{ minor_version }} - - cython >=0.29,<0.30 + - cython >=3.0.0 - libcugraph ={{ version }} - pylibraft ={{ minor_version }} - python diff --git a/conda/recipes/pylibcugraph/meta.yaml b/conda/recipes/pylibcugraph/meta.yaml index aa82c20ad44..4ac3bb2dde1 100644 --- a/conda/recipes/pylibcugraph/meta.yaml +++ b/conda/recipes/pylibcugraph/meta.yaml @@ -54,7 +54,7 @@ requirements: {% if cuda_major == "11" %} - cudatoolkit {% endif %} - - cython >=0.29,<0.30 + - cython >=3.0.0 - libcugraph ={{ version }} - pylibraft ={{ minor_version }} - python diff --git a/dependencies.yaml b/dependencies.yaml index 7a7af2b3699..1c2d30c0546 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -308,7 +308,7 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - cython>=0.29,<0.30 + - cython>=3.0.0 - &pylibraft pylibraft==23.10.* - &rmm rmm==23.10.* - scikit-build>=0.13.1 diff --git a/python/cugraph/cugraph/community/ktruss_subgraph_wrapper.pyx b/python/cugraph/cugraph/community/ktruss_subgraph_wrapper.pyx index d3b7a38ba41..8b705e8a7b4 100644 --- a/python/cugraph/cugraph/community/ktruss_subgraph_wrapper.pyx +++ b/python/cugraph/cugraph/community/ktruss_subgraph_wrapper.pyx @@ -1,4 +1,4 @@ -# 
Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -23,12 +23,12 @@ import numpy as np def ktruss_subgraph_float(input_graph, k, use_weights): - cdef GraphCOOViewFloat in_graph = get_graph_view[GraphCOOViewFloat](input_graph, use_weights) + cdef GraphCOOViewFloat in_graph = get_coo_float_graph_view(input_graph, use_weights) return coo_to_df(move(k_truss_subgraph[int,int,float](in_graph, k))) def ktruss_subgraph_double(input_graph, k, use_weights): - cdef GraphCOOViewDouble in_graph = get_graph_view[GraphCOOViewDouble](input_graph, use_weights) + cdef GraphCOOViewDouble in_graph = get_coo_double_graph_view(input_graph, use_weights) return coo_to_df(move(k_truss_subgraph[int,int,double](in_graph, k))) diff --git a/python/cugraph/cugraph/structure/graph_primtypes.pxd b/python/cugraph/cugraph/structure/graph_primtypes.pxd index 4e5a380f798..eaf552195da 100644 --- a/python/cugraph/cugraph/structure/graph_primtypes.pxd +++ b/python/cugraph/cugraph/structure/graph_primtypes.pxd @@ -154,22 +154,9 @@ ctypedef GraphCOOView[int,int,double] GraphCOOViewDouble ctypedef GraphCSRView[int,int,float] GraphCSRViewFloat ctypedef GraphCSRView[int,int,double] GraphCSRViewDouble -ctypedef fused GraphCOOViewType: - GraphCOOViewFloat - GraphCOOViewDouble - -ctypedef fused GraphCSRViewType: - GraphCSRViewFloat - GraphCSRViewDouble - -ctypedef fused GraphViewType: - GraphCOOViewFloat - GraphCOOViewDouble - GraphCSRViewFloat - GraphCSRViewDouble - cdef move_device_buffer_to_column(unique_ptr[device_buffer] device_buffer_unique_ptr, dtype) cdef move_device_buffer_to_series(unique_ptr[device_buffer] device_buffer_unique_ptr, dtype, series_name) cdef coo_to_df(GraphCOOPtrType graph) cdef csr_to_series(GraphCSRPtrType graph) -cdef GraphViewType get_graph_view(input_graph, bool weightless=*, GraphViewType* dummy=*) +cdef GraphCOOViewFloat get_coo_float_graph_view(input_graph, bool weighted=*) +cdef GraphCOOViewDouble get_coo_double_graph_view(input_graph, bool weighted=*) diff --git a/python/cugraph/cugraph/structure/graph_primtypes.pyx b/python/cugraph/cugraph/structure/graph_primtypes.pyx index fadd0f73a08..10f3871e157 100644 --- a/python/cugraph/cugraph/structure/graph_primtypes.pyx +++ b/python/cugraph/cugraph/structure/graph_primtypes.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -101,28 +101,27 @@ cdef csr_to_series(GraphCSRPtrType graph): return (csr_offsets, csr_indices, csr_weights) -cdef GraphCSRViewType get_csr_graph_view(input_graph, bool weighted=True, GraphCSRViewType* dummy=NULL): - if not input_graph.adjlist: - input_graph.view_adj_list() +cdef GraphCOOViewFloat get_coo_float_graph_view(input_graph, bool weighted=True): + # FIXME: this function assumes columns named "src" and "dst" and can only + # be used for SG graphs due to that assumption. 
+ if not input_graph.edgelist: + input_graph.view_edge_list() + + num_edges = input_graph.number_of_edges(directed_edges=True) + num_verts = input_graph.number_of_vertices() - cdef uintptr_t c_off = input_graph.adjlist.offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_ind = input_graph.adjlist.indices.__cuda_array_interface__['data'][0] + cdef uintptr_t c_src = input_graph.edgelist.edgelist_df['src'].__cuda_array_interface__['data'][0] + cdef uintptr_t c_dst = input_graph.edgelist.edgelist_df['dst'].__cuda_array_interface__['data'][0] cdef uintptr_t c_weights = NULL - if input_graph.adjlist.weights is not None and weighted: - c_weights = input_graph.adjlist.weights.__cuda_array_interface__['data'][0] + # FIXME explicit check for None fails, different behavior than get_csr_graph_view + if input_graph.edgelist.weights and weighted: + c_weights = input_graph.edgelist.edgelist_df['weights'].__cuda_array_interface__['data'][0] - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - cdef GraphCSRViewType in_graph - if GraphCSRViewType is GraphCSRViewFloat: - in_graph = GraphCSRViewFloat(c_off, c_ind, c_weights, num_verts, num_edges) - elif GraphCSRViewType is GraphCSRViewDouble: - in_graph = GraphCSRViewDouble(c_off, c_ind, c_weights, num_verts, num_edges) - return in_graph + return GraphCOOViewFloat(c_src, c_dst, c_weights, num_verts, num_edges) -cdef GraphCOOViewType get_coo_graph_view(input_graph, bool weighted=True, GraphCOOViewType* dummy=NULL): +cdef GraphCOOViewDouble get_coo_double_graph_view(input_graph, bool weighted=True): # FIXME: this function assumes columns named "src" and "dst" and can only # be used for SG graphs due to that assumption. if not input_graph.edgelist: @@ -139,20 +138,4 @@ cdef GraphCOOViewType get_coo_graph_view(input_graph, bool weighted=True, GraphC if input_graph.edgelist.weights and weighted: c_weights = input_graph.edgelist.edgelist_df['weights'].__cuda_array_interface__['data'][0] - cdef GraphCOOViewType in_graph - if GraphCOOViewType is GraphCOOViewFloat: - in_graph = GraphCOOViewFloat(c_src, c_dst, c_weights, num_verts, num_edges) - elif GraphCOOViewType is GraphCOOViewDouble: - in_graph = GraphCOOViewDouble(c_src, c_dst, c_weights, num_verts, num_edges) - return in_graph - - -cdef GraphViewType get_graph_view(input_graph, bool weighted = True, GraphViewType* dummy=NULL): - if GraphViewType is GraphCOOViewFloat: - return get_coo_graph_view[GraphCOOViewFloat](input_graph, weighted, dummy) - elif GraphViewType is GraphCOOViewDouble: - return get_coo_graph_view[GraphCOOViewDouble](input_graph, weighted, dummy) - elif GraphViewType is GraphCSRViewFloat: - return get_csr_graph_view[GraphCSRViewFloat](input_graph, weighted, dummy) - elif GraphViewType is GraphCSRViewDouble: - return get_csr_graph_view[GraphCSRViewDouble](input_graph, weighted, dummy) + return GraphCOOViewDouble(c_src, c_dst, c_weights, num_verts, num_edges) diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index b71b65cd937..343ae1f748f 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "cmake>=3.26.4", - "cython>=0.29,<0.30", + "cython>=3.0.0", "ninja", "pylibcugraph==23.10.*", "pylibraft==23.10.*", diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index d19162d503f..bc2aa9205f1 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ 
b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -82,14 +82,14 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, *, bool_t with_replacement, bool_t do_expensive_check, - bool_t with_edge_properties=False, + with_edge_properties=False, batch_id_list=None, label_list=None, label_to_output_comm_rank=None, prior_sources_behavior=None, - bool_t deduplicate_sources=False, - bool_t return_hops=False, - bool_t renumber=False, + deduplicate_sources=False, + return_hops=False, + renumber=False, random_state=None): """ Does neighborhood sampling, which samples nodes from a graph based on the @@ -177,6 +177,10 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, resource_handle.c_resource_handle_ptr cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr + cdef bool_t c_deduplicate_sources = deduplicate_sources + cdef bool_t c_return_hops = return_hops + cdef bool_t c_renumber = renumber + assert_CAI_type(start_list, "start_list") assert_CAI_type(batch_id_list, "batch_id_list", True) assert_CAI_type(label_list, "label_list", True) @@ -271,10 +275,10 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, assert_success(error_code, error_ptr, "cugraph_sampling_options_create") cugraph_sampling_set_with_replacement(sampling_options, with_replacement) - cugraph_sampling_set_return_hops(sampling_options, return_hops) - cugraph_sampling_set_dedupe_sources(sampling_options, deduplicate_sources) + cugraph_sampling_set_return_hops(sampling_options, c_return_hops) + cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources) cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e) - cugraph_sampling_set_renumber_results(sampling_options, renumber) + cugraph_sampling_set_renumber_results(sampling_options, c_renumber) error_code = cugraph_uniform_neighbor_sample( c_resource_handle_ptr, diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml index 10fba0b7722..8301c25a11b 100644 --- a/python/pylibcugraph/pyproject.toml +++ b/python/pylibcugraph/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "cmake>=3.26.4", - "cython>=0.29,<0.30", + "cython>=3.0.0", "ninja", "pylibraft==23.10.*", "rmm==23.10.*", From 5204c36757e3f3c61af8ada52fef4779a663d756 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 9 Aug 2023 14:16:01 -0500 Subject: [PATCH 03/72] Updates latest dask versions needed for testing 23.10. 
--- ci/test_wheel_cugraph.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/ci/test_wheel_cugraph.sh b/ci/test_wheel_cugraph.sh index a117e00b8a2..1c356ba3073 100755 --- a/ci/test_wheel_cugraph.sh +++ b/ci/test_wheel_cugraph.sh @@ -9,7 +9,7 @@ RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-whe python -m pip install --no-deps ./local-pylibcugraph-dep/pylibcugraph*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +python -m pip install git+https://github.com/dask/dask.git@2023.7.1 git+https://github.com/dask/distributed.git@2023.7.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 # Only download test data for x86 arch=$(uname -m) 
From a1a85a7847b4efe9e1fb59b9cc75116c1ed7f50d Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Sat, 12 Aug 2023 18:08:14 -0500 Subject: [PATCH 04/72] Fixes `KeyError` for `get_two_hop_neighbors` when called with a small start vertices list (#3778) 
closes #3745 This PR replaces the `get_distributed_data()` call with `persist_dask_df_equal_parts_per_worker()` and `get_persisted_df_worker_map()` to avoid a problem where `get_distributed_data()` does not distribute data properly across all workers. This resulted in a `KeyError` when the data was accessed by a worker that was not a key in the map. More details are in the [linked issue](https://github.com/rapidsai/cugraph/issues/3745). This PR also does minor refactoring in `get_two_hop_neighbors()` and reorganizes the imports according to [PEP 8](https://peps.python.org/pep-0008/#imports). Tested manually on a 4-GPU system: the problem described in #3745 was reproduced, the change in this PR was applied and re-run, and the error no longer occurred. 
Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3778 --- .../simpleDistributedGraph.py | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) 
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 90db2c6b1f5..5ab5935290d 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -11,36 +11,35 @@ # See the License for the specific language governing permissions and # limitations under the License.
-from cugraph.structure import graph_primtypes_wrapper -from cugraph.structure.graph_primtypes_wrapper import Direction -from cugraph.structure.number_map import NumberMap -from cugraph.structure.symmetrize import symmetrize -import cudf +import gc +from typing import Union import warnings -import dask_cudf + +import cudf import cupy as cp import dask -from typing import Union +import dask_cudf +from dask import delayed +from dask.distributed import wait, default_client import numpy as np -import gc from pylibcugraph import ( MGGraph, ResourceHandle, GraphProperties, + get_two_hop_neighbors as pylibcugraph_get_two_hop_neighbors, + select_random_vertices as pylibcugraph_select_random_vertices, ) -from dask.distributed import wait, default_client +from cugraph.structure import graph_primtypes_wrapper +from cugraph.structure.graph_primtypes_wrapper import Direction +from cugraph.structure.number_map import NumberMap +from cugraph.structure.symmetrize import symmetrize from cugraph.dask.common.part_utils import ( get_persisted_df_worker_map, persist_dask_df_equal_parts_per_worker, ) -from cugraph.dask.common.input_utils import get_distributed_data -from pylibcugraph import ( - get_two_hop_neighbors as pylibcugraph_get_two_hop_neighbors, - select_random_vertices as pylibcugraph_select_random_vertices, -) +from cugraph.dask import get_n_workers import cugraph.dask.comms.comms as Comms -from dask import delayed class simpleDistributedGraphImpl: @@ -784,6 +783,15 @@ def get_two_hop_neighbors(self, start_vertices=None): the second vertex id of a pair, if an external vertex id is defined by only one column """ + _client = default_client() + + def _call_plc_two_hop_neighbors(sID, mg_graph_x, start_vertices): + return pylibcugraph_get_two_hop_neighbors( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + start_vertices=start_vertices, + do_expensive_check=False, + ) if isinstance(start_vertices, int): start_vertices = [start_vertices] @@ -805,20 +813,13 @@ def get_two_hop_neighbors(self, start_vertices=None): ) start_vertices = start_vertices.astype(start_vertices_type) - start_vertices = get_distributed_data(start_vertices) - wait(start_vertices) - start_vertices = start_vertices.worker_to_parts - - def _call_plc_two_hop_neighbors(sID, mg_graph_x, start_vertices): - return pylibcugraph_get_two_hop_neighbors( - resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), - graph=mg_graph_x, - start_vertices=start_vertices, - do_expensive_check=False, + n_workers = get_n_workers() + start_vertices = start_vertices.repartition(npartitions=n_workers) + start_vertices = persist_dask_df_equal_parts_per_worker( + start_vertices, _client ) + start_vertices = get_persisted_df_worker_map(start_vertices, _client) - _client = default_client() - if start_vertices is not None: result = [ _client.submit( _call_plc_two_hop_neighbors, @@ -828,7 +829,7 @@ def _call_plc_two_hop_neighbors(sID, mg_graph_x, start_vertices): workers=[w], allow_other_workers=False, ) - for w in Comms.get_workers() + for w in start_vertices.keys() ] else: result = [ @@ -855,7 +856,6 @@ def convert_to_cudf(cp_arrays): df["second"] = second return df - _client = default_client() cudf_result = [ _client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result ] From 20dca85c65204ed6b63e60454f8f3bff4dc490be Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Mon, 14 Aug 2023 15:06:04 +0100 Subject: [PATCH 05/72] Makes copy of input ddf to work around dropped 
column names (#3776) When creating multiple graphs with the same dask_cudf dataframe, there is a metadata mismatch when one or more partitions are empty. In fact, during the second graph creation with the dask_cudf dataframe that was used/modified earlier, the metadata are not preserved for partitions with empty dataframes. This is because a _reference_ to the input dataframe, partly destroyed (modified) during the first graph creation, is reused in the second graph creation. This PR makes a copy of the input dataframe right after the repartition call to avoid that alteration. 
Authors: - jnke2016 (jnke@gmail.com) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Alex Barghi (https://github.com/alexbarghi-nv) - Rick Ratzel (https://github.com/rlratzel) --- .../structure/graph_implementation/simpleDistributedGraph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) 
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 90db2c6b1f5..bb546ab4e5d 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -182,6 +182,8 @@ def __from_edgelist( workers = _client.scheduler_info()["workers"] # Repartition to 2 partitions per GPU for memory efficient process input_ddf = input_ddf.repartition(npartitions=len(workers) * 2) + # FIXME: Make a copy of the input ddf before implicitly altering it. + input_ddf = input_ddf.map_partitions(lambda df: df.copy()) # The dataframe will be symmetrized iff the graph is undirected # otherwise, the inital dataframe will be returned if edge_attr is not None: @@ -318,7 +320,6 @@ def __from_edgelist( is_symmetric=not self.properties.directed, ) ddf = ddf.repartition(npartitions=len(workers) * 2) - ddf = ddf.map_partitions(lambda df: df.copy()) ddf = persist_dask_df_equal_parts_per_worker(ddf, _client) num_edges = len(ddf) ddf = get_persisted_df_worker_map(ddf, _client) 
From 28fa98fb2f67754b5ff2eff9b7f7f4cd095d07e4 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Wed, 16 Aug 2023 01:03:33 -0500 Subject: [PATCH 06/72] Add new cugraph-nx package (networkx backend using pylibcugraph) (#3614) 
`cugraph-nx` is working with two algorithms (`betweenness_centrality`, `edge_betweenness_centrality`). It can efficiently (i.e., faster than `cugraph` in my benchmarks) convert from (and to) networkx graphs, and runs (and passes) networkx tests using networkx dispatching machinery. It renumbers vertex labels, which we handle with python dicts. It only depends on `networkx`, `cupy`, and `pylibcugraph`. `cupy` is a transitive dependency from `pylibcugraph`, so we don't actually need to list it as a dependency. I think it's nice the way we depend on `networkx`, which allows us to use its exceptions and utilities. There is still plenty to do (and I have a few questions), but I think it's a good start and gets cugraph in on the NetworkX dispatching party. I was planning on including `cugraph-pg` with this PR, but now I think it's probably better to have that in a new PR.
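For orientation, a minimal usage sketch (not part of this diff; it assumes a NetworkX version with backend dispatching, that the backend entry point is registered under the name `cugraph`, and that `from_networkx`/`betweenness_centrality` are exposed at the package top level):

```python
import networkx as nx
import cugraph_nx as cnx

G = nx.karate_club_graph()

# Dispatch through NetworkX, letting it convert the graph and hand the call to the GPU backend.
bc = nx.betweenness_centrality(G, backend="cugraph")

# Or convert explicitly and call the cugraph-nx implementation directly.
cG = cnx.from_networkx(G)
bc_direct = cnx.betweenness_centrality(cG)
```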
Authors: - Erik Welch (https://github.com/eriknw) - Rick Ratzel (https://github.com/rlratzel) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3614 --- build.sh | 14 +- .../traversal/od_shortest_distances_test.cpp | 8 +- .../source/installation/source_build.md | 1 + python/cugraph-nx/.flake8 | 13 + python/cugraph-nx/LICENSE | 1 + python/cugraph-nx/Makefile | 7 + python/cugraph-nx/README.md | 34 + python/cugraph-nx/conftest.py | 28 + python/cugraph-nx/cugraph_nx/__init__.py | 20 + .../cugraph_nx/algorithms/__init__.py | 14 + .../algorithms/centrality/__init__.py | 13 + .../algorithms/centrality/betweenness.py | 72 +++ .../cugraph-nx/cugraph_nx/classes/__init__.py | 15 + .../cugraph-nx/cugraph_nx/classes/digraph.py | 61 ++ python/cugraph-nx/cugraph_nx/classes/graph.py | 589 ++++++++++++++++++ python/cugraph-nx/cugraph_nx/convert.py | 528 ++++++++++++++++ python/cugraph-nx/cugraph_nx/interface.py | 80 +++ .../cugraph-nx/cugraph_nx/tests/__init__.py | 0 .../cugraph_nx/tests/bench_convert.py | 176 ++++++ .../cugraph-nx/cugraph_nx/tests/conftest.py | 31 + .../cugraph_nx/tests/test_convert.py | 203 ++++++ .../cugraph_nx/tests/test_match_api.py | 40 ++ python/cugraph-nx/cugraph_nx/typing.py | 25 + .../cugraph-nx/cugraph_nx/utils/__init__.py | 13 + .../cugraph-nx/cugraph_nx/utils/decorators.py | 60 ++ python/cugraph-nx/lint.yaml | 86 +++ python/cugraph-nx/pyproject.toml | 213 +++++++ python/cugraph-nx/run_nx_tests.sh | 16 + python/cugraph-nx/setup.py | 15 + 29 files changed, 2371 insertions(+), 5 deletions(-) create mode 100644 python/cugraph-nx/.flake8 create mode 120000 python/cugraph-nx/LICENSE create mode 100644 python/cugraph-nx/Makefile create mode 100644 python/cugraph-nx/README.md create mode 100644 python/cugraph-nx/conftest.py create mode 100644 python/cugraph-nx/cugraph_nx/__init__.py create mode 100644 python/cugraph-nx/cugraph_nx/algorithms/__init__.py create mode 100644 python/cugraph-nx/cugraph_nx/algorithms/centrality/__init__.py create mode 100644 python/cugraph-nx/cugraph_nx/algorithms/centrality/betweenness.py create mode 100644 python/cugraph-nx/cugraph_nx/classes/__init__.py create mode 100644 python/cugraph-nx/cugraph_nx/classes/digraph.py create mode 100644 python/cugraph-nx/cugraph_nx/classes/graph.py create mode 100644 python/cugraph-nx/cugraph_nx/convert.py create mode 100644 python/cugraph-nx/cugraph_nx/interface.py create mode 100644 python/cugraph-nx/cugraph_nx/tests/__init__.py create mode 100644 python/cugraph-nx/cugraph_nx/tests/bench_convert.py create mode 100644 python/cugraph-nx/cugraph_nx/tests/conftest.py create mode 100644 python/cugraph-nx/cugraph_nx/tests/test_convert.py create mode 100644 python/cugraph-nx/cugraph_nx/tests/test_match_api.py create mode 100644 python/cugraph-nx/cugraph_nx/typing.py create mode 100644 python/cugraph-nx/cugraph_nx/utils/__init__.py create mode 100644 python/cugraph-nx/cugraph_nx/utils/decorators.py create mode 100644 python/cugraph-nx/lint.yaml create mode 100644 python/cugraph-nx/pyproject.toml create mode 100755 python/cugraph-nx/run_nx_tests.sh create mode 100644 python/cugraph-nx/setup.py diff --git a/build.sh b/build.sh index a4232bdaed0..74bdb3c6a2f 100755 --- a/build.sh +++ b/build.sh @@ -30,6 +30,7 @@ VALIDARGS=" cpp-mgtests cugraph-pyg cugraph-dgl + cugraph-nx docs -v -g @@ -53,6 +54,7 @@ HELP="$0 [ ...] [ ...] 
pylibcugraph - build the pylibcugraph Python package cugraph-pyg - build the cugraph-pyg Python package cugraph - build the cugraph Python package + cugraph-nx - build the cugraph-nx Python package cugraph-service - build the cugraph-service_client and cugraph-service_server Python package cpp-mgtests - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency. cugraph-dgl - build the cugraph-dgl extensions for DGL @@ -206,7 +208,8 @@ if hasArg uninstall; then # FIXME: if multiple versions of these packages are installed, this only # removes the latest one and leaves the others installed. build.sh uninstall # can be run multiple times to remove all of them, but that is not obvious. - pip uninstall -y pylibcugraph cugraph cugraph-service-client cugraph-service-server cugraph-dgl cugraph-pyg + pip uninstall -y pylibcugraph cugraph cugraph-service-client cugraph-service-server \ + cugraph-dgl cugraph-pyg cugraph-nx fi if hasArg clean; then @@ -379,6 +382,15 @@ if hasArg cugraph-dgl; then fi fi +# Build and install the cugraph-nx Python package +if hasArg cugraph-nx; then + if hasArg --clean; then + cleanPythonDir ${REPODIR}/python/cugraph-nx + else + python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-nx + fi +fi + # Build the docs if hasArg docs; then if [ ! -d ${LIBCUGRAPH_BUILD_DIR} ]; then diff --git a/cpp/tests/traversal/od_shortest_distances_test.cpp b/cpp/tests/traversal/od_shortest_distances_test.cpp index cc283f24dfd..e4fbbdf9275 100644 --- a/cpp/tests/traversal/od_shortest_distances_test.cpp +++ b/cpp/tests/traversal/od_shortest_distances_test.cpp @@ -225,27 +225,27 @@ class Tests_ODShortestDistances using Tests_ODShortestDistances_File = Tests_ODShortestDistances; using Tests_ODShortestDistances_Rmat = Tests_ODShortestDistances; -TEST_P(Tests_ODShortestDistances_File, CheckInt32Int32Float) +TEST_P(Tests_ODShortestDistances_File, DISABLED_CheckInt32Int32Float) { auto param = GetParam(); run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_ODShortestDistances_Rmat, CheckInt32Int32Float) +TEST_P(Tests_ODShortestDistances_Rmat, DISABLED_CheckInt32Int32Float) { auto param = GetParam(); run_current_test( std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_ODShortestDistances_Rmat, CheckInt32Int64Float) +TEST_P(Tests_ODShortestDistances_Rmat, DISABLED_CheckInt32Int64Float) { auto param = GetParam(); run_current_test( std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_ODShortestDistances_Rmat, CheckInt64Int64Float) +TEST_P(Tests_ODShortestDistances_Rmat, DISABLED_CheckInt64Int64Float) { auto param = GetParam(); run_current_test( diff --git a/docs/cugraph/source/installation/source_build.md b/docs/cugraph/source/installation/source_build.md index 9a93ed6c575..453149d6cea 100644 --- a/docs/cugraph/source/installation/source_build.md +++ b/docs/cugraph/source/installation/source_build.md @@ -84,6 +84,7 @@ build.sh [ ...] [ ...] libcugraph_etl - build libcugraph_etl.so and SG test binaries pylibcugraph - build the pylibcugraph Python package cugraph - build the cugraph Python package + cugraph-nx - build the cugraph-nx Python package cugraph-service - build the cugraph-service_client and cugraph-service_server Python package cpp-mgtests - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency. 
cugraph-dgl - build the cugraph-dgl extensions for DGL diff --git a/python/cugraph-nx/.flake8 b/python/cugraph-nx/.flake8 new file mode 100644 index 00000000000..f66815e8507 --- /dev/null +++ b/python/cugraph-nx/.flake8 @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +[flake8] +max-line-length = 88 +inline-quotes = " +extend-ignore = + E203, + SIM105, + SIM401, +# E203 whitespace before ':' (to be compatible with black) +per-file-ignores = + cugraph_nx/tests/*.py:T201, + __init__.py:F401,F403, diff --git a/python/cugraph-nx/LICENSE b/python/cugraph-nx/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/cugraph-nx/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/cugraph-nx/Makefile b/python/cugraph-nx/Makefile new file mode 100644 index 00000000000..c9caf147d53 --- /dev/null +++ b/python/cugraph-nx/Makefile @@ -0,0 +1,7 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +SHELL= /bin/bash + +lint: + git ls-files | xargs pre-commit run --config lint.yaml --files +lint-update: + pre-commit autoupdate --config lint.yaml diff --git a/python/cugraph-nx/README.md b/python/cugraph-nx/README.md new file mode 100644 index 00000000000..80dc473f20b --- /dev/null +++ b/python/cugraph-nx/README.md @@ -0,0 +1,34 @@ +# cugraph-nx + +## Description +[RAPIDS](https://rapids.ai) cugraph-nx is a [backend to NetworkX](https://networkx.org/documentation/stable/reference/classes/index.html#backends) +with minimal dependencies (`networkx`, `cupy`, and `pylibcugraph`) to run graph algorithms on the GPU. + +_Nightly conda packages and pip wheels coming soon._ + +### Contribute + +Follow instructions for [contributing to cugraph](https://github.com/rapidsai/cugraph/blob/branch-23.10/readme_pages/CONTRIBUTING.md) +and [building from source](https://docs.rapids.ai/api/cugraph/stable/installation/source_build/), then build cugraph-nx in develop (i.e., editable) mode: +``` +$ ./build.sh cugraph-nx --pydevelop +``` + +### Run tests + +Run cugraph-nx tests from `cugraph/python/cugraph-nx` directory: +``` +$ pytest +``` +Run cugraph-nx benchmarks: +``` +$ pytest --bench +``` +Run networkx tests (requires networkx version 3.2): +``` +$ ./run_nx_tests.sh +``` +Additional arguments may be passed to pytest such as: +``` +$ ./run_nx_tests.sh -x --sw -k betweenness +``` diff --git a/python/cugraph-nx/conftest.py b/python/cugraph-nx/conftest.py new file mode 100644 index 00000000000..e329b28d81c --- /dev/null +++ b/python/cugraph-nx/conftest.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
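+
+# The options added below are interpreted in
+# cugraph_nx/tests/conftest.py::pytest_configure to decide whether benchmarks,
+# the regular tests, or both are collected.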
+ + +def pytest_addoption(parser): + parser.addoption( + "--bench", + action="store_true", + default=False, + help="Run benchmarks (sugar for --benchmark-enable) and skip other tests" + " (to run both benchmarks AND tests, use --all)", + ) + parser.addoption( + "--all", + action="store_true", + default=False, + help="Run benchmarks AND tests (unlike --bench, which only runs benchmarks)", + ) diff --git a/python/cugraph-nx/cugraph_nx/__init__.py b/python/cugraph-nx/cugraph_nx/__init__.py new file mode 100644 index 00000000000..28066fe2b02 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from networkx.exception import * + +from . import algorithms, classes, convert, utils +from .algorithms import * +from .classes import * +from .convert import * + +__version__ = "23.10.00" diff --git a/python/cugraph-nx/cugraph_nx/algorithms/__init__.py b/python/cugraph-nx/cugraph_nx/algorithms/__init__.py new file mode 100644 index 00000000000..d014f7f401f --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/algorithms/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from . import centrality +from .centrality import * diff --git a/python/cugraph-nx/cugraph_nx/algorithms/centrality/__init__.py b/python/cugraph-nx/cugraph_nx/algorithms/centrality/__init__.py new file mode 100644 index 00000000000..2ac6242e8a4 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/algorithms/centrality/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .betweenness import * diff --git a/python/cugraph-nx/cugraph_nx/algorithms/centrality/betweenness.py b/python/cugraph-nx/cugraph_nx/algorithms/centrality/betweenness.py new file mode 100644 index 00000000000..a5962a52865 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/algorithms/centrality/betweenness.py @@ -0,0 +1,72 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pylibcugraph as plc + +from cugraph_nx.convert import _to_graph +from cugraph_nx.utils import networkx_algorithm + +__all__ = ["betweenness_centrality", "edge_betweenness_centrality"] + + +@networkx_algorithm +def betweenness_centrality( + G, k=None, normalized=True, weight=None, endpoints=False, seed=None +): + if weight is not None: + raise NotImplementedError( + "Weighted implementation of betweenness centrality not currently supported" + ) + G = _to_graph(G, weight) + node_ids, values = plc.betweenness_centrality( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + k=k, + random_state=seed, + normalized=normalized, + include_endpoints=endpoints, + do_expensive_check=False, + ) + return G._nodearrays_to_dict(node_ids, values) + + +@betweenness_centrality._can_run +def _(G, k=None, normalized=True, weight=None, endpoints=False, seed=None): + return weight is None + + +@networkx_algorithm +def edge_betweenness_centrality(G, k=None, normalized=True, weight=None, seed=None): + if weight is not None: + raise NotImplementedError( + "Weighted implementation of betweenness centrality not currently supported" + ) + G = _to_graph(G, weight) + src_ids, dst_ids, values, _edge_ids = plc.edge_betweenness_centrality( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + k=k, + random_state=seed, + normalized=normalized, + do_expensive_check=False, + ) + if not G.is_directed(): + mask = src_ids <= dst_ids + src_ids = src_ids[mask] + dst_ids = dst_ids[mask] + values = 2 * values[mask] + return G._edgearrays_to_dict(src_ids, dst_ids, values) + + +@edge_betweenness_centrality._can_run +def _(G, k=None, normalized=True, weight=None, seed=None): + return weight is None diff --git a/python/cugraph-nx/cugraph_nx/classes/__init__.py b/python/cugraph-nx/cugraph_nx/classes/__init__.py new file mode 100644 index 00000000000..e47641ae812 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/classes/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .graph import Graph + +from .digraph import DiGraph # isort:skip diff --git a/python/cugraph-nx/cugraph_nx/classes/digraph.py b/python/cugraph-nx/cugraph_nx/classes/digraph.py new file mode 100644 index 00000000000..0cea08f3e77 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/classes/digraph.py @@ -0,0 +1,61 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import TYPE_CHECKING + +import networkx as nx + +import cugraph_nx as cnx + +from .graph import Graph + +if TYPE_CHECKING: + from cugraph_nx.typing import NodeKey + +__all__ = ["DiGraph"] + +networkx_api = cnx.utils.decorators.networkx_class(nx.DiGraph) + + +class DiGraph(Graph): + ################# + # Class methods # + ################# + + @classmethod + @networkx_api + def is_directed(cls) -> bool: + return True + + @classmethod + def to_networkx_class(cls) -> type[nx.DiGraph]: + return nx.DiGraph + + @networkx_api + def number_of_edges( + self, u: NodeKey | None = None, v: NodeKey | None = None + ) -> int: + if u is not None or v is not None: + raise NotImplementedError + return self.row_indices.size + + ########################## + # NetworkX graph methods # + ########################## + + @networkx_api + def reverse(self, copy: bool = True) -> DiGraph: + return self._copy(not copy, self.__class__, reverse=True) + + # Many more methods to implement... diff --git a/python/cugraph-nx/cugraph_nx/classes/graph.py b/python/cugraph-nx/cugraph_nx/classes/graph.py new file mode 100644 index 00000000000..3d561815de6 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/classes/graph.py @@ -0,0 +1,589 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import operator as op +from copy import deepcopy +from typing import TYPE_CHECKING, ClassVar + +import cupy as cp +import networkx as nx +import numpy as np +import pylibcugraph as plc + +import cugraph_nx as cnx + +if TYPE_CHECKING: + from collections.abc import Iterator + + from cugraph_nx.typing import ( + AttrKey, + Dtype, + EdgeTuple, + EdgeValue, + IndexValue, + NodeKey, + NodeValue, + ) + +__all__ = ["Graph"] + +networkx_api = cnx.utils.decorators.networkx_class(nx.Graph) + + +class Graph: + # Tell networkx to dispatch calls with this object to cugraph-nx + __networkx_plugin__: ClassVar[str] = "cugraph" + + # networkx properties + graph: dict + graph_attr_dict_factory: ClassVar[type] = dict + + # Not networkx properties + # We store edge data in COO format with {row,col}_indices and edge_values. + row_indices: cp.ndarray[IndexValue] + col_indices: cp.ndarray[IndexValue] + edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] + edge_masks: dict[AttrKey, cp.ndarray[bool]] + node_values: dict[AttrKey, cp.ndarray[NodeValue]] + node_masks: dict[AttrKey, cp.ndarray[bool]] + key_to_id: dict[NodeKey, IndexValue] | None + _id_to_key: dict[IndexValue, NodeKey] | None + _N: int + + #################### + # Creation methods # + #################### + + @classmethod + def from_coo( + cls, + N: int, + row_indices: cp.ndarray[IndexValue], + col_indices: cp.ndarray[IndexValue], + edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None, + edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + node_values: dict[AttrKey, cp.ndarray[NodeValue]] | None = None, + node_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + *, + key_to_id: dict[NodeKey, IndexValue] | None = None, + id_to_key: dict[IndexValue, NodeKey] | None = None, + **attr, + ) -> Graph: + new_graph = object.__new__(cls) + new_graph.row_indices = row_indices + new_graph.col_indices = col_indices + new_graph.edge_values = {} if edge_values is None else dict(edge_values) + new_graph.edge_masks = {} if edge_masks is None else dict(edge_masks) + new_graph.node_values = {} if node_values is None else dict(node_values) + new_graph.node_masks = {} if node_masks is None else dict(node_masks) + new_graph.key_to_id = None if key_to_id is None else dict(key_to_id) + new_graph._id_to_key = None if id_to_key is None else dict(id_to_key) + new_graph._N = op.index(N) # Ensure N is integral + new_graph.graph = new_graph.graph_attr_dict_factory() + new_graph.graph.update(attr) + size = new_graph.row_indices.size + # Easy and fast sanity checks + if size != new_graph.col_indices.size: + raise ValueError + for attr in ["edge_values", "edge_masks"]: + if datadict := getattr(new_graph, attr): + for key, val in datadict.items(): + if val.shape[0] != size: + raise ValueError(key) + for attr in ["node_values", "node_masks"]: + if datadict := getattr(new_graph, attr): + for key, val in datadict.items(): + if val.shape[0] != N: + raise ValueError(key) + if new_graph.key_to_id is not None and len(new_graph.key_to_id) != N: + raise ValueError + if new_graph._id_to_key is not None and len(new_graph._id_to_key) != N: + raise ValueError + return new_graph + + @classmethod + def from_csr( + cls, + indptr: cp.ndarray[IndexValue], + col_indices: cp.ndarray[IndexValue], + edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None, + edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + node_values: dict[AttrKey, cp.ndarray[NodeValue]] | None = None, + node_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + 
*, + key_to_id: dict[NodeKey, IndexValue] | None = None, + id_to_key: dict[IndexValue, NodeKey] | None = None, + **attr, + ) -> Graph: + N = indptr.size - 1 + row_indices = cp.array( + # cp.repeat is slow to use here, so use numpy instead + np.repeat(np.arange(N, dtype=np.int32), cp.diff(indptr).get()) + ) + return cls.from_coo( + N, + row_indices, + col_indices, + edge_values, + edge_masks, + node_values, + node_masks, + key_to_id=key_to_id, + id_to_key=id_to_key, + **attr, + ) + + @classmethod + def from_csc( + cls, + indptr: cp.ndarray[IndexValue], + row_indices: cp.ndarray[IndexValue], + edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None, + edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + node_values: dict[AttrKey, cp.ndarray[NodeValue]] | None = None, + node_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + *, + key_to_id: dict[NodeKey, IndexValue] | None = None, + id_to_key: dict[IndexValue, NodeKey] | None = None, + **attr, + ) -> Graph: + N = indptr.size - 1 + col_indices = cp.array( + # cp.repeat is slow to use here, so use numpy instead + np.repeat(np.arange(N, dtype=np.int32), cp.diff(indptr).get()) + ) + return cls.from_coo( + N, + row_indices, + col_indices, + edge_values, + edge_masks, + node_values, + node_masks, + key_to_id=key_to_id, + id_to_key=id_to_key, + **attr, + ) + + @classmethod + def from_dcsr( + cls, + N: int, + compressed_rows: cp.ndarray[IndexValue], + indptr: cp.ndarray[IndexValue], + col_indices: cp.ndarray[IndexValue], + edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None, + edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + node_values: dict[AttrKey, cp.ndarray[NodeValue]] | None = None, + node_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + *, + key_to_id: dict[NodeKey, IndexValue] | None = None, + id_to_key: dict[IndexValue, NodeKey] | None = None, + **attr, + ) -> Graph: + row_indices = cp.array( + # cp.repeat is slow to use here, so use numpy instead + np.repeat(compressed_rows.get(), cp.diff(indptr).get()) + ) + return cls.from_coo( + N, + row_indices, + col_indices, + edge_values, + edge_masks, + node_values, + node_masks, + key_to_id=key_to_id, + id_to_key=id_to_key, + **attr, + ) + + @classmethod + def from_dcsc( + cls, + N: int, + compressed_cols: cp.ndarray[IndexValue], + indptr: cp.ndarray[IndexValue], + row_indices: cp.ndarray[IndexValue], + edge_values: dict[AttrKey, cp.ndarray[EdgeValue]] | None = None, + edge_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + node_values: dict[AttrKey, cp.ndarray[NodeValue]] | None = None, + node_masks: dict[AttrKey, cp.ndarray[bool]] | None = None, + *, + key_to_id: dict[NodeKey, IndexValue] | None = None, + id_to_key: dict[IndexValue, NodeKey] | None = None, + **attr, + ) -> Graph: + col_indices = cp.array( + # cp.repeat is slow to use here, so use numpy instead + np.repeat(compressed_cols.get(), cp.diff(indptr).get()) + ) + return cls.from_coo( + N, + row_indices, + col_indices, + edge_values, + edge_masks, + node_values, + node_masks, + key_to_id=key_to_id, + id_to_key=id_to_key, + **attr, + ) + + def __new__(cls, incoming_graph_data=None, **attr) -> Graph: + if incoming_graph_data is None: + new_graph = cls.from_coo(0, cp.empty(0, np.int32), cp.empty(0, np.int32)) + elif incoming_graph_data.__class__ is new_graph.__class__: + new_graph = incoming_graph_data.copy() + elif incoming_graph_data.__class__ is new_graph.to_networkx_class(): + new_graph = cnx.from_networkx(incoming_graph_data, preserve_all_attrs=True) + else: + raise 
NotImplementedError + new_graph.graph.update(attr) + return new_graph + + ################# + # Class methods # + ################# + + @classmethod + @networkx_api + def is_directed(cls) -> bool: + return False + + @classmethod + @networkx_api + def is_multigraph(cls) -> bool: + return False + + @classmethod + @networkx_api + def to_directed_class(cls) -> type[cnx.DiGraph]: + return cnx.DiGraph + + @classmethod + def to_networkx_class(cls) -> type[nx.Graph]: + return nx.Graph + + @classmethod + @networkx_api + def to_undirected_class(cls) -> type[Graph]: + return Graph + + ############## + # Properties # + ############## + + @property + def edge_dtypes(self) -> dict[AttrKey, Dtype]: + return {key: val.dtype for key, val in self.edge_values.items()} + + @property + def node_dtypes(self) -> dict[AttrKey, Dtype]: + return {key: val.dtype for key, val in self.node_values.items()} + + @property + def id_to_key(self) -> dict[IndexValue, NodeKey] | None: + if self.key_to_id is None: + return None + if self._id_to_key is None: + self._id_to_key = {val: key for key, val in self.key_to_id.items()} + return self._id_to_key + + name = nx.Graph.name + + ################## + # Dunder methods # + ################## + + @networkx_api + def __contains__(self, n: NodeKey) -> bool: + if self.key_to_id is not None: + container = self.key_to_id + else: + container = range(self._N) + try: + return n in container + except TypeError: + return False + + @networkx_api + def __iter__(self) -> Iterator[NodeKey]: + if self.key_to_id is not None: + return iter(self.key_to_id) + return iter(range(self._N)) + + @networkx_api + def __len__(self) -> int: + return self._N + + __str__ = nx.Graph.__str__ + + ########################## + # NetworkX graph methods # + ########################## + + @networkx_api + def clear(self) -> None: + self.edge_values.clear() + self.edge_masks.clear() + self.node_values.clear() + self.node_masks.clear() + self.graph.clear() + self.row_indices = cp.empty(0, self.row_indices.dtype) + self.col_indices = cp.empty(0, self.col_indices.dtype) + self._N = 0 + self.key_to_id = None + self._id_to_key = None + + @networkx_api + def clear_edges(self) -> None: + self.edge_values.clear() + self.edge_masks.clear() + self.row_indices = cp.empty(0, self.row_indices.dtype) + self.col_indices = cp.empty(0, self.col_indices.dtype) + + @networkx_api + def copy(self, as_view: bool = False) -> Graph: + # Does shallow copy in networkx + return self._copy(as_view, self.__class__) + + @networkx_api + def get_edge_data( + self, u: NodeKey, v: NodeKey, default: EdgeValue | None = None + ) -> dict[AttrKey, EdgeValue]: + if self.key_to_id is not None: + try: + u = self.key_to_id[u] + v = self.key_to_id[v] + except KeyError: + return default + index = cp.nonzero((self.row_indices == u) & (self.col_indices == v))[0] + if index.size == 0: + return default + [index] = index.tolist() + if not self.edge_values: + return {} + return { + key: val[index].tolist() + for key, val in self.edge_values.items() + if key not in self.edge_masks or self.edge_masks[key][index] + } + + @networkx_api + def has_edge(self, u: NodeKey, v: NodeKey) -> bool: + if self.key_to_id is not None: + try: + u = self.key_to_id[u] + v = self.key_to_id[v] + except KeyError: + return False + return bool(((self.row_indices == u) & (self.col_indices == v)).any()) + + @networkx_api + def has_node(self, n: NodeKey) -> bool: + return n in self + + @networkx_api + def nbunch_iter(self, nbunch=None) -> Iterator[NodeKey]: + if nbunch is None: + return iter(self) 
+ if nbunch in self: + return iter([nbunch]) + return (node for node in nbunch if node in self) + + @networkx_api + def number_of_edges( + self, u: NodeKey | None = None, v: NodeKey | None = None + ) -> int: + if u is not None or v is not None: + raise NotImplementedError + return self.size() + + @networkx_api + def number_of_nodes(self) -> int: + return self._N + + @networkx_api + def order(self) -> int: + return self._N + + @networkx_api + def size(self, weight: AttrKey | None = None) -> int: + if weight is not None: + raise NotImplementedError + # If no self-edges, then `self.row_indices.size // 2` + return int((self.row_indices <= self.col_indices).sum()) + + @networkx_api + def to_directed(self, as_view: bool = False) -> cnx.DiGraph: + return self._copy(as_view, self.to_directed_class()) + + @networkx_api + def to_undirected(self, as_view: bool = False) -> Graph: + # Does deep copy in networkx + return self.copy(as_view) + + # Not implemented... + # adj, adjacency, add_edge, add_edges_from, add_node, + # add_nodes_from, add_weighted_edges_from, degree, + # edge_subgraph, edges, neighbors, nodes, remove_edge, + # remove_edges_from, remove_node, remove_nodes_from, subgraph, update + + ################### + # Private methods # + ################### + + def _copy(self, as_view: bool, cls: type[Graph], reverse: bool = False): + indptr = self.indptr + row_indices = self.row_indices + col_indices = self.col_indices + edge_values = self.edge_values + edge_masks = self.edge_masks + node_values = self.node_values + node_masks = self.node_masks + key_to_id = self.key_to_id + id_to_key = None if key_to_id is None else self._id_to_key + if not as_view: + indptr = indptr.copy() + row_indices = row_indices.copy() + col_indices = col_indices.copy() + edge_values = {key: val.copy() for key, val in edge_values.items()} + edge_masks = {key: val.copy() for key, val in edge_masks.items()} + node_values = {key: val.copy() for key, val in node_values.items()} + node_masks = {key: val.copy() for key, val in node_masks.items()} + if key_to_id is not None: + key_to_id = key_to_id.copy() + if id_to_key is not None: + id_to_key = id_to_key.copy() + if reverse: + row_indices, col_indices = col_indices, row_indices + rv = cls.from_coo( + indptr, + row_indices, + col_indices, + edge_values, + edge_masks, + node_values, + node_masks, + key_to_id=key_to_id, + id_to_key=id_to_key, + ) + if as_view: + rv.graph = self.graph + else: + rv.graph.update(deepcopy(self.graph)) + return rv + + def _get_plc_graph( + self, + edge_attr: AttrKey | None = None, + edge_default: EdgeValue | None = None, + edge_dtype: Dtype | None = None, + *, + store_transposed: bool = False, + ): + if edge_attr is None: + edge_array = None + elif edge_attr not in self.edge_values: + raise KeyError("Graph has no edge attribute {edge_attr!r}") + elif edge_attr not in self.edge_masks: + edge_array = self.edge_values[edge_attr] + elif not self.edge_masks[edge_attr].all(): + if edge_default is None: + raise NotImplementedError( + "Missing edge attributes is not yet implemented" + ) + edge_array = cp.where( + self.edge_masks[edge_attr], self.edge_values[edge_attr], edge_default + ) + else: + # Mask is all True; don't need anymore + del self.edge_masks[edge_attr] + edge_array = self.edge_values[edge_attr] + # Should we cache PLC graph? 
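+        # Note: node ids are already compact integers in [0, N) (external node
+        # keys, if any, are tracked separately in key_to_id), so the COO arrays
+        # can be passed to pylibcugraph below with renumber=False.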
+ if edge_dtype is not None: + edge_dtype = np.dtype(edge_dtype) + if edge_array.dtype != edge_dtype: + edge_array = edge_array.astype(edge_dtype) + return plc.SGGraph( + resource_handle=plc.ResourceHandle(), + graph_properties=plc.GraphProperties( + is_multigraph=self.is_multigraph(), + is_symmetric=not self.is_directed(), + ), + src_or_offset_array=self.row_indices, + dst_or_index_array=self.col_indices, + weight_array=edge_array, + store_transposed=store_transposed, + renumber=False, + do_expensive_check=False, + ) + + def _nodearrays_to_dict( + self, node_ids: cp.ndarray[IndexValue], values: cp.ndarray[NodeValue] + ) -> dict[NodeKey, NodeValue]: + it = zip(node_ids.tolist(), values.tolist()) + if (id_to_key := self.id_to_key) is not None: + return {id_to_key[key]: val for key, val in it} + return dict(it) + + def _edgearrays_to_dict( + self, + src_ids: cp.ndarray[IndexValue], + dst_ids: cp.ndarray[IndexValue], + values: cp.ndarray[EdgeValue], + ) -> dict[EdgeTuple, EdgeValue]: + it = zip(zip(src_ids.tolist(), dst_ids.tolist()), values.tolist()) + if (id_to_key := self.id_to_key) is not None: + return { + (id_to_key[src_id], id_to_key[dst_id]): val + for (src_id, dst_id), val in it + } + return dict(it) + + def _dict_to_nodearrays( + self, + d: dict[NodeKey, NodeValue], + dtype: Dtype | None = None, + ) -> tuple[cp.ndarray[IndexValue], cp.ndarray[NodeValue]]: + if self.key_to_id is None: + indices_iter = d + else: + indices_iter = map(self.key_to_id.__getitem__, d) + node_ids = cp.fromiter(indices_iter, np.int32) + if dtype is None: + values = cp.array(list(d.values())) + else: + values = cp.fromiter(d.values(), dtype) + return node_ids, values + + # def _dict_to_nodearray( + # self, + # d: dict[NodeKey, NodeValue] | cp.ndarray[NodeValue], + # default: NodeValue | None = None, + # dtype: Dtype | None = None, + # ) -> cp.ndarray[NodeValue]: + # if isinstance(d, cp.ndarray): + # if d.shape[0] != len(self): + # raise ValueError + # return d + # if default is None: + # val_iter = map(d.__getitem__, self) + # else: + # val_iter = (d.get(node, default) for node in self) + # if dtype is None: + # return cp.array(list(val_iter)) + # return cp.fromiter(val_iter, dtype) diff --git a/python/cugraph-nx/cugraph_nx/convert.py b/python/cugraph-nx/cugraph_nx/convert.py new file mode 100644 index 00000000000..530dd700f35 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/convert.py @@ -0,0 +1,528 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import itertools +import operator as op +from collections import Counter +from collections.abc import Mapping +from typing import TYPE_CHECKING + +import cupy as cp +import networkx as nx +import numpy as np + +import cugraph_nx as cnx + +if TYPE_CHECKING: + from cugraph_nx.typing import AttrKey, Dtype, EdgeValue, NodeValue + +__all__ = [ + "from_networkx", + "to_networkx", +] + +concat = itertools.chain.from_iterable +# A "required" attribute is one that all edges or nodes must have or KeyError is raised +REQUIRED = ... + + +def from_networkx( + graph: nx.Graph, + edge_attrs: AttrKey | dict[AttrKey, EdgeValue | None] | None = None, + edge_dtypes: Dtype | dict[AttrKey, Dtype | None] | None = None, + *, + node_attrs: AttrKey | dict[AttrKey, NodeValue | None] | None = None, + node_dtypes: Dtype | dict[AttrKey, Dtype | None] | None = None, + preserve_all_attrs: bool = False, + preserve_edge_attrs: bool = False, + preserve_node_attrs: bool = False, + preserve_graph_attrs: bool = False, + as_directed: bool = False, + name: str | None = None, + graph_name: str | None = None, +) -> cnx.Graph: + """Convert a networkx graph to cugraph_nx graph; can convert all attributes. + + Parameters + ---------- + G : networkx.Graph + edge_attrs : str or dict, optional + Dict that maps edge attributes to default values if missing in ``G``. + If None, then no edge attributes will be converted. + If default value is None, then missing values are handled with a mask. + A default value of ``cnx.convert.REQUIRED`` or ``...`` indicates that + all edges have data for this attribute, and raise `KeyError` if not. + For convenience, `edge_attrs` may be a single attribute with default 1; + for example ``edge_attrs="weight"``. + edge_dtypes : dtype or dict, optional + node_attrs : str or dict, optional + Dict that maps node attributes to default values if missing in ``G``. + If None, then no node attributes will be converted. + If default value is None, then missing values are handled with a mask. + A default value of ``cnx.convert.REQUIRED`` or ``...`` indicates that + all edges have data for this attribute, and raise `KeyError` if not. + For convenience, `node_attrs` may be a single attribute with no default; + for example ``node_attrs="weight"``. + node_dtypes : dtype or dict, optional + preserve_all_attrs : bool, default False + If True, then equivalent to setting preserve_edge_attrs, preserve_node_attrs, + and preserve_graph_attrs to True. + preserve_edge_attrs : bool, default False + Whether to preserve all edge attributes. + preserve_node_attrs : bool, default False + Whether to preserve all node attributes. + preserve_graph_attrs : bool, default False + Whether to preserve all graph attributes. + as_directed : bool, default False + If True, then the returned graph will be directed regardless of input. + If False, then the returned graph type is determined by input graph. + name : str, optional + The name of the algorithm when dispatched from networkx. + graph_name : str, optional + The name of the graph argument geing converted when dispatched from networkx. + + Returns + ------- + cugraph_nx.Graph + + Notes + ----- + For optimal performance, be as specific as possible about what is being converted: + + 1. Do you need edge values? Creating a graph with just the structure is the fastest. + 2. Do you know the edge attribute(s) you need? Specify with `edge_attrs`. + 3. Do you know the default values? Specify with ``edge_attrs={weight: default}``. + 4. 
Do you know if all edges have values? Specify with ``edge_attrs={weight: ...}``. + 5. Do you know the dtype of attributes? Specify with `edge_dtypes`. + + Conversely, using ``preserve_edge_attrs=True`` or ``preserve_all_attrs=True`` are + the slowest, but are also the most flexible and generic. + + See Also + -------- + to_networkx : The opposite; convert cugraph_nx graph to networkx graph + """ + # This uses `graph._adj` and `graph._node`, which are private attributes in NetworkX + if not isinstance(graph, nx.Graph): + if isinstance(graph, nx.classes.reportviews.NodeView): + # Convert to a Graph with only nodes (no edges) + G = nx.Graph() + G.add_nodes_from(graph.items()) + graph = G + else: + raise TypeError(f"Expected networkx.Graph; got {type(graph)}") + elif graph.is_multigraph(): + raise NotImplementedError("MultiGraph support is not yet implemented") + + if preserve_all_attrs: + preserve_edge_attrs = True + preserve_node_attrs = True + preserve_graph_attrs = True + + if edge_attrs is not None: + if isinstance(edge_attrs, Mapping): + # Copy so we don't mutate the original + edge_attrs = dict(edge_attrs) + else: + edge_attrs = {edge_attrs: 1} + + if node_attrs is not None: + if isinstance(node_attrs, Mapping): + # Copy so we don't mutate the original + node_attrs = dict(node_attrs) + else: + node_attrs = {node_attrs: None} + + if graph.__class__ in {nx.Graph, nx.DiGraph}: + # This is a NetworkX private attribute, but is much faster to use + adj = graph._adj + else: + adj = graph.adj + if isinstance(adj, nx.classes.coreviews.FilterAdjacency): + adj = {k: dict(v) for k, v in adj.items()} + + N = len(adj) + if ( + not preserve_edge_attrs + and not edge_attrs + # Faster than graph.number_of_edges() == 0 + or next(concat(rowdata.values() for rowdata in adj.values()), None) is None + ): + # Either we weren't asked to preserve edge attributes, or there are no edges + edge_attrs = None + elif preserve_edge_attrs: + # Using comprehensions should be just as fast starting in Python 3.11 + attr_sets = set(map(frozenset, concat(map(dict.values, adj.values())))) + attrs = frozenset.union(*attr_sets) + edge_attrs = dict.fromkeys(attrs, REQUIRED) + if len(attr_sets) > 1: + # Determine which edges have missing data + for attr, count in Counter(concat(attr_sets)).items(): + if count != len(attr_sets): + edge_attrs[attr] = None + elif None in edge_attrs.values(): + # Required edge attributes have a default of None in `edge_attrs` + # Verify all edge attributes are present! 
+ required = frozenset( + attr for attr, default in edge_attrs.items() if default is None + ) + if len(required) == 1: + # Fast path for the common case of a single attribute with no default + [attr] = required + it = ( + attr in edgedata + for rowdata in adj.values() + for edgedata in rowdata.values() + ) + if next(it): + if all(it): + # All edges have data + edge_attrs[attr] = REQUIRED + # Else some edges have attribute (default already None) + elif not any(it): + # No edges have attribute + del edge_attrs[attr] + # Else some edges have attribute (default already None) + else: + attr_sets = set( + map(required.intersection, concat(map(dict.values, adj.values()))) + ) + for attr in required - frozenset.union(*attr_sets): + # No edges have these attributes + del edge_attrs[attr] + for attr in frozenset.intersection(*attr_sets): + # All edges have these attributes + edge_attrs[attr] = REQUIRED + + if N == 0: + node_attrs = None + elif preserve_node_attrs: + attr_sets = set(map(frozenset, graph._node.values())) + attrs = frozenset.union(*attr_sets) + node_attrs = dict.fromkeys(attrs, REQUIRED) + if len(attr_sets) > 1: + # Determine which nodes have missing data + for attr, count in Counter(concat(attr_sets)).items(): + if count != len(attr_sets): + node_attrs[attr] = None + elif node_attrs and None in node_attrs.values(): + # Required node attributes have a default of None in `node_attrs` + # Verify all node attributes are present! + required = frozenset( + attr for attr, default in node_attrs.items() if default is None + ) + if len(required) == 1: + # Fast path for the common case of a single attribute with no default + [attr] = required + it = (attr in nodedata for nodedata in graph._node.values()) + if next(it): + if all(it): + # All nodes have data + node_attrs[attr] = REQUIRED + # Else some nodes have attribute (default already None) + elif not any(it): + # No nodes have attribute + del node_attrs[attr] + # Else some nodes have attribute (default already None) + else: + attr_sets = set(map(required.intersection, graph._node.values())) + for attr in required - frozenset.union(*attr_sets): + # No nodes have these attributes + del node_attrs[attr] + for attr in frozenset.intersection(*attr_sets): + # All nodes have these attributes + node_attrs[attr] = REQUIRED + + key_to_id = dict(zip(adj, range(N))) + col_iter = concat(adj.values()) + try: + no_renumber = all(k == v for k, v in key_to_id.items()) + except Exception: + no_renumber = False + if no_renumber: + key_to_id = None + else: + col_iter = map(key_to_id.__getitem__, col_iter) + col_indices = cp.fromiter(col_iter, np.int32) + + edge_values = {} + edge_masks = {} + if edge_attrs: + if edge_dtypes is None: + edge_dtypes = {} + elif not isinstance(edge_dtypes, Mapping): + edge_dtypes = dict.fromkeys(edge_attrs, edge_dtypes) + for edge_attr, edge_default in edge_attrs.items(): + dtype = edge_dtypes.get(edge_attr) + if edge_default is None: + vals = [] + append = vals.append + iter_mask = ( + append( + edgedata[edge_attr] + if (present := edge_attr in edgedata) + else False + ) + or present + for rowdata in adj.values() + for edgedata in rowdata.values() + ) + edge_masks[edge_attr] = cp.fromiter(iter_mask, bool) + edge_values[edge_attr] = cp.array(vals, dtype) + # if vals.ndim > 1: ... 
+ else: + if edge_default is REQUIRED: + # Using comprehensions should be fast starting in Python 3.11 + # iter_values = ( + # edgedata[edge_attr] + # for rowdata in adj.values() + # for edgedata in rowdata.values() + # ) + iter_values = map( + op.itemgetter(edge_attr), concat(map(dict.values, adj.values())) + ) + else: + iter_values = ( + edgedata.get(edge_attr, edge_default) + for rowdata in adj.values() + for edgedata in rowdata.values() + ) + if dtype is None: + edge_values[edge_attr] = cp.array(list(iter_values)) + else: + edge_values[edge_attr] = cp.fromiter(iter_values, dtype) + # if vals.ndim > 1: ... + + row_indices = cp.array( + # cp.repeat is slow to use here, so use numpy instead + np.repeat( + np.arange(N, dtype=np.int32), + np.fromiter(map(len, adj.values()), np.int32), + ) + ) + + node_values = {} + node_masks = {} + if node_attrs: + nodes = graph._node + if node_dtypes is None: + node_dtypes = {} + elif not isinstance(node_dtypes, Mapping): + node_dtypes = dict.fromkeys(node_attrs, node_dtypes) + for node_attr, node_default in node_attrs.items(): + # Iterate over `adj` to ensure consistent order + dtype = node_dtypes.get(node_attr) + if node_default is None: + vals = [] + append = vals.append + iter_mask = ( + append( + nodedata[node_attr] + if (present := node_attr in (nodedata := nodes[node_id])) + else False + ) + or present + for node_id in adj + ) + node_masks[node_attr] = cp.fromiter(iter_mask, bool) + node_values[node_attr] = cp.array(vals, dtype) + # if vals.ndim > 1: ... + else: + if node_default is REQUIRED: + iter_values = (nodes[node_id][node_attr] for node_id in adj) + else: + iter_values = ( + nodes[node_id].get(node_attr, node_default) for node_id in adj + ) + if dtype is None: + node_values[node_attr] = cp.array(list(iter_values)) + else: + node_values[node_attr] = cp.fromiter(iter_values, dtype) + # if vals.ndim > 1: ... + + if graph.is_directed() or as_directed: + klass = cnx.DiGraph + else: + klass = cnx.Graph + rv = klass.from_coo( + N, + row_indices, + col_indices, + edge_values, + edge_masks, + node_values, + node_masks, + key_to_id=key_to_id, + ) + if preserve_graph_attrs: + rv.graph.update(graph.graph) # deepcopy? + return rv + + +def _iter_attr_dicts( + values: dict[AttrKey, cp.ndarray[EdgeValue | NodeValue]], + masks: dict[AttrKey, cp.ndarray[bool]], +): + full_attrs = list(values.keys() - masks.keys()) + if full_attrs: + full_dicts = ( + dict(zip(full_attrs, vals)) + for vals in zip(*(values[attr].tolist() for attr in full_attrs)) + ) + partial_attrs = list(values.keys() & masks.keys()) + if partial_attrs: + partial_dicts = ( + {k: v for k, (v, m) in zip(partial_attrs, vals_masks) if m} + for vals_masks in zip( + *( + zip(values[attr].tolist(), masks[attr].tolist()) + for attr in partial_attrs + ) + ) + ) + if full_attrs and partial_attrs: + full_dicts = (d1.update(d2) or d1 for d1, d2 in zip(full_dicts, partial_dicts)) + elif partial_attrs: + full_dicts = partial_dicts + return full_dicts + + +def to_networkx(G: cnx.Graph) -> nx.Graph: + """Convert a cugraph_nx graph to networkx graph. + + All edge and node attributes and ``G.graph`` properties are converted. 
+ + Parameters + ---------- + G : cugraph_nx.Graph + + Returns + ------- + networkx.Graph + + See Also + -------- + from_networkx : The opposite; convert networkx graph to cugraph_nx graph + """ + rv = G.to_networkx_class()() + id_to_key = G.id_to_key + + node_values = G.node_values + node_masks = G.node_masks + if node_values: + node_iter = range(len(G)) + if id_to_key is not None: + node_iter = map(id_to_key.__getitem__, node_iter) + full_node_dicts = _iter_attr_dicts(node_values, node_masks) + rv.add_nodes_from(zip(node_iter, full_node_dicts)) + elif id_to_key is not None: + rv.add_nodes_from(id_to_key.values()) + else: + rv.add_nodes_from(range(len(G))) + + row_indices = G.row_indices + col_indices = G.col_indices + edge_values = G.edge_values + edge_masks = G.edge_masks + if edge_values and not G.is_directed(): + # Only add upper triangle of the adjacency matrix so we don't double-add edges + mask = row_indices <= col_indices + row_indices = row_indices[mask] + col_indices = col_indices[mask] + edge_values = {k: v[mask] for k, v in edge_values.items()} + if edge_masks: + edge_masks = {k: v[mask] for k, v in edge_masks.items()} + row_indices = row_iter = row_indices.tolist() + col_indices = col_iter = col_indices.tolist() + if id_to_key is not None: + row_iter = map(id_to_key.__getitem__, row_indices) + col_iter = map(id_to_key.__getitem__, col_indices) + if edge_values: + full_edge_dicts = _iter_attr_dicts(edge_values, edge_masks) + rv.add_edges_from(zip(row_iter, col_iter, full_edge_dicts)) + else: + rv.add_edges_from(zip(row_iter, col_iter)) + + rv.graph.update(G.graph) + return rv + + +def _to_graph( + G, + edge_attr: AttrKey | None = None, + edge_default: EdgeValue | None = 1, + edge_dtype: Dtype | None = None, +) -> cnx.Graph | cnx.DiGraph: + """Ensure that input type is a cugraph_nx graph, and convert if necessary. + + Directed and undirected graphs are both allowed. + This is an internal utility function and may change or be removed. + """ + if isinstance(G, cnx.Graph): + return G + if isinstance(G, nx.Graph): + return from_networkx( + G, {edge_attr: edge_default} if edge_attr is not None else None, edge_dtype + ) + # TODO: handle cugraph.Graph + raise TypeError + + +def _to_directed_graph( + G, + edge_attr: AttrKey | None = None, + edge_default: EdgeValue | None = 1, + edge_dtype: Dtype | None = None, +) -> cnx.DiGraph: + """Ensure that input type is a cugraph_nx DiGraph, and convert if necessary. + + Undirected graphs will be converted to directed. + This is an internal utility function and may change or be removed. + """ + if isinstance(G, cnx.DiGraph): + return G + if isinstance(G, cnx.Graph): + return G.to_directed() + if isinstance(G, nx.Graph): + return from_networkx( + G, + {edge_attr: edge_default} if edge_attr is not None else None, + edge_dtype, + as_directed=True, + ) + # TODO: handle cugraph.Graph + raise TypeError + + +def _to_undirected_graph( + G, + edge_attr: AttrKey | None = None, + edge_default: EdgeValue | None = 1, + edge_dtype: Dtype | None = None, +) -> cnx.Graph: + """Ensure that input type is a cugraph_nx Graph, and convert if necessary. + + Only undirected graphs are allowed. Directed graphs will raise ValueError. + This is an internal utility function and may change or be removed. 
+ """ + if isinstance(G, cnx.Graph): + if G.is_directed(): + raise ValueError("Only undirected graphs supported; got a directed graph") + return G + if isinstance(G, nx.Graph): + return from_networkx( + G, {edge_attr: edge_default} if edge_attr is not None else None, edge_dtype + ) + # TODO: handle cugraph.Graph + raise TypeError diff --git a/python/cugraph-nx/cugraph_nx/interface.py b/python/cugraph-nx/cugraph_nx/interface.py new file mode 100644 index 00000000000..fe492c43ca2 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/interface.py @@ -0,0 +1,80 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import networkx as nx + +import cugraph_nx as cnx + + +class BackendInterface: + # Required conversions + @staticmethod + def convert_from_nx(graph, *args, edge_attrs=None, weight=None, **kwargs): + if weight is not None: + # For networkx 3.0 and 3.1 compatibility + if edge_attrs is not None: + raise TypeError( + "edge_attrs and weight arguments should not both be given" + ) + edge_attrs = {weight: 1} + return cnx.from_networkx(graph, *args, edge_attrs=edge_attrs, **kwargs) + + @staticmethod + def convert_to_nx(obj, *, name: str | None = None): + if isinstance(obj, cnx.Graph): + return cnx.to_networkx(obj) + return obj + + @staticmethod + def on_start_tests(items): + try: + import pytest + except ModuleNotFoundError: + return + + def key(testpath): + filename, path = testpath.split(":") + *names, testname = path.split(".") + if names: + [classname] = names + return (testname, frozenset({classname, filename})) + return (testname, frozenset({filename})) + + string_attribute = "unable to handle string attributes" + + skip = { + key("test_pajek.py:TestPajek.test_ignored_attribute"): string_attribute, + key( + "test_agraph.py:TestAGraph.test_no_warnings_raised" + ): "pytest.warn(None) deprecated", + } + for item in items: + kset = set(item.keywords) + for (test_name, keywords), reason in skip.items(): + if item.name == test_name and keywords.issubset(kset): + item.add_marker(pytest.mark.xfail(reason=reason)) + + @classmethod + def can_run(cls, name, args, kwargs): + """Can this backend run the specified algorithms with the given arguments? + + This is a proposed API to add to networkx dispatching machinery and may change. 
+ """ + return ( + hasattr(cls, name) + and getattr(cls, name).can_run(*args, **kwargs) + # We don't support MultiGraphs yet + and not any(isinstance(x, nx.MultiGraph) for x in args) + and not any(isinstance(x, nx.MultiGraph) for x in kwargs.values()) + ) diff --git a/python/cugraph-nx/cugraph_nx/tests/__init__.py b/python/cugraph-nx/cugraph_nx/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cugraph-nx/cugraph_nx/tests/bench_convert.py b/python/cugraph-nx/cugraph_nx/tests/bench_convert.py new file mode 100644 index 00000000000..85ef66ac918 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/tests/bench_convert.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import random + +import networkx as nx +import numpy as np +import pytest + +import cugraph_nx as cnx + +try: + import cugraph +except ModuleNotFoundError: + cugraph = None +try: + import scipy +except ModuleNotFoundError: + scipy = None + +# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" +# fixture will be available automatically. Check that this fixture is available +# by trying to import rapids_pytest_benchmark, and if that fails, set +# "gpubenchmark" to the standard "benchmark" fixture provided by +# pytest-benchmark. 
+try: + import rapids_pytest_benchmark # noqa: F401 +except ModuleNotFoundError: + import pytest_benchmark + + gpubenchmark = pytest_benchmark.plugin.benchmark + + +def _bench_helper(gpubenchmark, N, attr_kind, create_using, method): + G = method(N, create_using=create_using) + if attr_kind: + skip = True + for *_ids, edgedict in G.edges(data=True): + skip = not skip + if skip and attr_kind not in {"full", "required", "required_dtype"}: + continue + edgedict["x"] = random.randint(0, 100000) + if attr_kind == "preserve": + gpubenchmark(cnx.from_networkx, G, preserve_edge_attrs=True) + elif attr_kind == "half_missing": + gpubenchmark(cnx.from_networkx, G, edge_attrs={"x": None}) + elif attr_kind == "required": + gpubenchmark(cnx.from_networkx, G, edge_attrs={"x": ...}) + elif attr_kind == "required_dtype": + gpubenchmark( + cnx.from_networkx, G, edge_attrs={"x": ...}, edge_dtypes={"x": np.int32} + ) + else: # full, half_default + gpubenchmark(cnx.from_networkx, G, edge_attrs={"x": 0}) + else: + gpubenchmark(cnx.from_networkx, G) + + +def _bench_helper_cugraph( + gpubenchmark, N, attr_kind, create_using, method, do_renumber +): + G = method(N, create_using=create_using) + if attr_kind: + for *_ids, edgedict in G.edges(data=True): + edgedict["x"] = random.randint(0, 100000) + gpubenchmark(cugraph.utilities.convert_from_nx, G, "x", do_renumber=do_renumber) + else: + gpubenchmark(cugraph.utilities.convert_from_nx, G, do_renumber=do_renumber) + + +def _bench_helper_scipy(gpubenchmark, N, attr_kind, create_using, method, fmt): + G = method(N, create_using=create_using) + if attr_kind: + for *_ids, edgedict in G.edges(data=True): + edgedict["x"] = random.randint(0, 100000) + gpubenchmark(nx.to_scipy_sparse_array, G, weight="x", format=fmt) + else: + gpubenchmark(nx.to_scipy_sparse_array, G, weight=None, format=fmt) + + +@pytest.mark.parametrize("N", [1, 10**6]) +@pytest.mark.parametrize( + "attr_kind", + [ + "required_dtype", + "required", + "full", + "half_missing", + "half_default", + "preserve", + None, + ], +) +@pytest.mark.parametrize("create_using", [nx.Graph, nx.DiGraph]) +def bench_cycle_graph(gpubenchmark, N, attr_kind, create_using): + _bench_helper(gpubenchmark, N, attr_kind, create_using, nx.cycle_graph) + + +@pytest.mark.skipif("not cugraph") +@pytest.mark.parametrize("N", [1, 10**6]) +@pytest.mark.parametrize("attr_kind", ["full", None]) +@pytest.mark.parametrize("create_using", [nx.Graph, nx.DiGraph]) +@pytest.mark.parametrize("do_renumber", [True, False]) +def bench_cycle_graph_cugraph(gpubenchmark, N, attr_kind, create_using, do_renumber): + if N == 1 and not do_renumber: + do_renumber = True + _bench_helper_cugraph( + gpubenchmark, N, attr_kind, create_using, nx.cycle_graph, do_renumber + ) + + +@pytest.mark.skipif("not scipy") +@pytest.mark.parametrize("N", [1, 10**6]) +@pytest.mark.parametrize("attr_kind", ["full", None]) +@pytest.mark.parametrize("create_using", [nx.Graph, nx.DiGraph]) +@pytest.mark.parametrize("fmt", ["coo", "csr"]) +def bench_cycle_graph_scipy(gpubenchmark, N, attr_kind, create_using, fmt): + _bench_helper_scipy(gpubenchmark, N, attr_kind, create_using, nx.cycle_graph, fmt) + + +@pytest.mark.parametrize("N", [1, 1500]) +@pytest.mark.parametrize( + "attr_kind", + [ + "required_dtype", + "required", + "full", + "half_missing", + "half_default", + "preserve", + None, + ], +) +@pytest.mark.parametrize("create_using", [nx.Graph, nx.DiGraph]) +def bench_complete_graph_edgedata(gpubenchmark, N, attr_kind, create_using): + _bench_helper(gpubenchmark, N, attr_kind, 
create_using, nx.complete_graph) + + +@pytest.mark.parametrize("N", [3000]) +@pytest.mark.parametrize("attr_kind", [None]) +@pytest.mark.parametrize("create_using", [nx.Graph, nx.DiGraph]) +def bench_complete_graph_noedgedata(gpubenchmark, N, attr_kind, create_using): + _bench_helper(gpubenchmark, N, attr_kind, create_using, nx.complete_graph) + + +@pytest.mark.skipif("not cugraph") +@pytest.mark.parametrize("N", [1, 1500]) +@pytest.mark.parametrize("attr_kind", ["full", None]) +@pytest.mark.parametrize("create_using", [nx.Graph, nx.DiGraph]) +@pytest.mark.parametrize("do_renumber", [True, False]) +def bench_complete_graph_cugraph(gpubenchmark, N, attr_kind, create_using, do_renumber): + if N == 1 and not do_renumber: + do_renumber = True + _bench_helper_cugraph( + gpubenchmark, N, attr_kind, create_using, nx.complete_graph, do_renumber + ) + + +@pytest.mark.skipif("not scipy") +@pytest.mark.parametrize("N", [1, 1500]) +@pytest.mark.parametrize("attr_kind", ["full", None]) +@pytest.mark.parametrize("create_using", [nx.Graph, nx.DiGraph]) +@pytest.mark.parametrize("fmt", ["coo", "csr"]) +def bench_complete_graph_scipy(gpubenchmark, N, attr_kind, create_using, fmt): + _bench_helper_scipy( + gpubenchmark, N, attr_kind, create_using, nx.complete_graph, fmt + ) diff --git a/python/cugraph-nx/cugraph_nx/tests/conftest.py b/python/cugraph-nx/cugraph_nx/tests/conftest.py new file mode 100644 index 00000000000..e5a250784b2 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/tests/conftest.py @@ -0,0 +1,31 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +def pytest_configure(config): + if config.getoption("--all", False): + # Run benchmarks AND tests + config.option.benchmark_skip = False + config.option.benchmark_enable = True + elif config.getoption("--bench", False) or config.getoption( + "--benchmark-enable", False + ): + # Run benchmarks (and only benchmarks) with `--bench` argument + config.option.benchmark_skip = False + config.option.benchmark_enable = True + if not config.option.keyword: + config.option.keyword = "bench_" + else: + # Run only tests + config.option.benchmark_skip = True + config.option.benchmark_enable = False + if not config.option.keyword: + config.option.keyword = "test_" diff --git a/python/cugraph-nx/cugraph_nx/tests/test_convert.py b/python/cugraph-nx/cugraph_nx/tests/test_convert.py new file mode 100644 index 00000000000..7efba9ea555 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/tests/test_convert.py @@ -0,0 +1,203 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import cupy as cp +import networkx as nx +import pytest + +import cugraph_nx as cnx +from cugraph_nx import interface + + +@pytest.mark.parametrize("graph_class", [nx.Graph, nx.DiGraph]) +@pytest.mark.parametrize( + "kwargs", + [ + {}, + {"preserve_edge_attrs": True}, + {"preserve_node_attrs": True}, + {"preserve_all_attrs": True}, + {"edge_attrs": {"x": 0}}, + {"edge_attrs": {"x": None}}, + {"edge_attrs": {"x": cnx.convert.REQUIRED}}, + {"edge_attrs": {"x": ...}}, # sugar for REQUIRED + {"edge_attrs": "x"}, + {"node_attrs": {"x": 0}}, + {"node_attrs": {"x": None}}, + {"node_attrs": {"x": cnx.convert.REQUIRED}}, + {"node_attrs": {"x": ...}}, # sugar for REQUIRED + {"node_attrs": "x"}, + ], +) +def test_convert_empty(graph_class, kwargs): + G = graph_class() + cG = cnx.from_networkx(G, **kwargs) + H = cnx.to_networkx(cG) + assert G.number_of_nodes() == cG.number_of_nodes() == H.number_of_nodes() == 0 + assert G.number_of_edges() == cG.number_of_edges() == H.number_of_edges() == 0 + assert cG.edge_values == cG.edge_masks == cG.node_values == cG.node_masks == {} + assert G.graph == cG.graph == H.graph == {} + + +def test_convert(): + # FIXME: can we break this into smaller tests? + G = nx.Graph() + G.add_edge(0, 1, x=2) + G.add_node(0, foo=10) + G.add_node(1, foo=20, bar=100) + for kwargs in [ + {"preserve_edge_attrs": True}, + {"preserve_all_attrs": True}, + {"edge_attrs": {"x": 0}}, + {"edge_attrs": {"x": None}, "node_attrs": {"bar": None}}, + {"edge_attrs": "x", "edge_dtypes": int}, + { + "edge_attrs": {"x": cnx.convert.REQUIRED}, + "node_attrs": {"foo": cnx.convert.REQUIRED}, + }, + {"edge_attrs": {"x": ...}, "node_attrs": {"foo": ...}}, # sugar for REQUIRED + ]: + # All edges have "x" attribute, so all kwargs are equivalent + cG = cnx.from_networkx(G, **kwargs) + cp.testing.assert_array_equal(cG.row_indices, [0, 1]) + cp.testing.assert_array_equal(cG.col_indices, [1, 0]) + cp.testing.assert_array_equal(cG.edge_values["x"], [2, 2]) + assert len(cG.edge_values) == 1 + assert cG.edge_masks == {} + H = cnx.to_networkx(cG) + assert G.number_of_nodes() == cG.number_of_nodes() == H.number_of_nodes() == 2 + assert G.number_of_edges() == cG.number_of_edges() == H.number_of_edges() == 1 + assert G.adj == H.adj + + with pytest.raises(KeyError, match="bar"): + cnx.from_networkx(G, node_attrs={"bar": ...}) + + # Structure-only graph (no edge attributes) + cG = cnx.from_networkx(G, preserve_node_attrs=True) + cp.testing.assert_array_equal(cG.row_indices, [0, 1]) + cp.testing.assert_array_equal(cG.col_indices, [1, 0]) + cp.testing.assert_array_equal(cG.node_values["foo"], [10, 20]) + assert cG.edge_values == cG.edge_masks == {} + H = cnx.to_networkx(cG) + assert set(G.edges) == set(H.edges) == {(0, 1)} + assert G.nodes == H.nodes + + # Fill completely missing attribute with default value + cG = cnx.from_networkx(G, edge_attrs={"y": 0}) + cp.testing.assert_array_equal(cG.row_indices, [0, 1]) + cp.testing.assert_array_equal(cG.col_indices, [1, 0]) + cp.testing.assert_array_equal(cG.edge_values["y"], [0, 0]) + assert len(cG.edge_values) == 1 + assert cG.edge_masks == cG.node_values == cG.node_masks == {} + H = cnx.to_networkx(cG) + assert list(H.edges(data=True)) == [(0, 1, {"y": 0})] + + # If attribute is completely missing (and no default), then just ignore it + cG = cnx.from_networkx(G, edge_attrs={"y": None}) + cp.testing.assert_array_equal(cG.row_indices, [0, 1]) + 
cp.testing.assert_array_equal(cG.col_indices, [1, 0]) + assert sorted(cG.edge_values) == sorted(cG.edge_masks) == [] + H = cnx.to_networkx(cG) + assert list(H.edges(data=True)) == [(0, 1, {})] + + G.add_edge(0, 2) + # Some edges are missing 'x' attribute; need to use a mask + for kwargs in [{"preserve_edge_attrs": True}, {"edge_attrs": {"x": None}}]: + cG = cnx.from_networkx(G, **kwargs) + cp.testing.assert_array_equal(cG.row_indices, [0, 0, 1, 2]) + cp.testing.assert_array_equal(cG.col_indices, [1, 2, 0, 0]) + assert sorted(cG.edge_values) == sorted(cG.edge_masks) == ["x"] + cp.testing.assert_array_equal(cG.edge_masks["x"], [True, False, True, False]) + cp.testing.assert_array_equal(cG.edge_values["x"][cG.edge_masks["x"]], [2, 2]) + H = cnx.to_networkx(cG) + assert list(H.edges(data=True)) == [(0, 1, {"x": 2}), (0, 2, {})] + + with pytest.raises(KeyError, match="x"): + cnx.from_networkx(G, edge_attrs={"x": cnx.convert.REQUIRED}) + with pytest.raises(KeyError, match="x"): + cnx.from_networkx(G, edge_attrs={"x": ...}) + with pytest.raises(KeyError, match="bar"): + cnx.from_networkx(G, node_attrs={"bar": cnx.convert.REQUIRED}) + with pytest.raises(KeyError, match="bar"): + cnx.from_networkx(G, node_attrs={"bar": ...}) + + # Now for something more complicated... + G = nx.Graph() + G.add_edge(10, 20, x=1) + G.add_edge(10, 30, x=2, y=1.5) + G.add_node(10, foo=100) + G.add_node(20, foo=200, bar=1000) + G.add_node(30, foo=300) + # Some edges have masks, some don't + for kwargs in [ + {"preserve_edge_attrs": True}, + {"preserve_all_attrs": True}, + {"edge_attrs": {"x": None, "y": None}}, + {"edge_attrs": {"x": 0, "y": None}}, + {"edge_attrs": {"x": 0, "y": None}}, + {"edge_attrs": {"x": 0, "y": None}, "edge_dtypes": {"x": int, "y": float}}, + ]: + cG = cnx.from_networkx(G, **kwargs) + assert cG.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... + cp.testing.assert_array_equal(cG.row_indices, [0, 0, 1, 2]) + cp.testing.assert_array_equal(cG.col_indices, [1, 2, 0, 0]) + cp.testing.assert_array_equal(cG.edge_values["x"], [1, 2, 1, 2]) + assert sorted(cG.edge_masks) == ["y"] + cp.testing.assert_array_equal(cG.edge_masks["y"], [False, True, False, True]) + cp.testing.assert_array_equal( + cG.edge_values["y"][cG.edge_masks["y"]], [1.5, 1.5] + ) + H = cnx.to_networkx(cG) + assert G.adj == H.adj + + # Some nodes have masks, some don't + for kwargs in [ + {"preserve_node_attrs": True}, + {"preserve_all_attrs": True}, + {"node_attrs": {"foo": None, "bar": None}}, + {"node_attrs": {"foo": None, "bar": None}}, + {"node_attrs": {"foo": 0, "bar": None, "missing": None}}, + ]: + cG = cnx.from_networkx(G, **kwargs) + assert cG.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... 
+ cp.testing.assert_array_equal(cG.row_indices, [0, 0, 1, 2]) + cp.testing.assert_array_equal(cG.col_indices, [1, 2, 0, 0]) + cp.testing.assert_array_equal(cG.node_values["foo"], [100, 200, 300]) + assert sorted(cG.node_masks) == ["bar"] + cp.testing.assert_array_equal(cG.node_masks["bar"], [False, True, False]) + cp.testing.assert_array_equal( + cG.node_values["bar"][cG.node_masks["bar"]], [1000] + ) + H = cnx.to_networkx(cG) + assert G.nodes == H.nodes + + # Check default values for nodes + for kwargs in [ + {"node_attrs": {"foo": None, "bar": 0}}, + {"node_attrs": {"foo": None, "bar": 0, "missing": None}}, + {"node_attrs": {"bar": 0}}, + {"node_attrs": {"bar": 0}, "node_dtypes": {"bar": int}}, + {"node_attrs": {"bar": 0, "foo": None}, "node_dtypes": int}, + ]: + cG = cnx.from_networkx(G, **kwargs) + assert cG.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... + cp.testing.assert_array_equal(cG.row_indices, [0, 0, 1, 2]) + cp.testing.assert_array_equal(cG.col_indices, [1, 2, 0, 0]) + cp.testing.assert_array_equal(cG.node_values["bar"], [0, 1000, 0]) + assert cG.node_masks == {} + + with pytest.raises( + TypeError, match="edge_attrs and weight arguments should not both be given" + ): + interface.BackendInterface.convert_from_nx(G, edge_attrs={"x": 1}, weight="x") + with pytest.raises(TypeError, match="Expected networkx.Graph"): + cnx.from_networkx({}) diff --git a/python/cugraph-nx/cugraph_nx/tests/test_match_api.py b/python/cugraph-nx/cugraph_nx/tests/test_match_api.py new file mode 100644 index 00000000000..f2b88c7f137 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/tests/test_match_api.py @@ -0,0 +1,40 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect + +import networkx as nx + +import cugraph_nx as cnx +from cugraph_nx.utils import networkx_algorithm + + +def test_match_signature_and_names(): + """Simple test to ensure our signatures and basic module layout match networkx.""" + for name, func in vars(cnx.interface.BackendInterface).items(): + if not isinstance(func, networkx_algorithm): + continue + dispatchable_func = nx.utils.backends._registered_algorithms[name] + orig_func = dispatchable_func.orig_func + # Matching signatures? + sig = inspect.signature(orig_func) + assert sig == inspect.signature(func) + # Matching function names? + assert func.__name__ == dispatchable_func.__name__ == orig_func.__name__ + # Matching dispatch names? + assert func.name == dispatchable_func.name + # Matching modules (i.e., where function defined)? + assert ( + "networkx." + func.__module__.split(".", 1)[1] + == dispatchable_func.__module__ + == orig_func.__module__ + ) diff --git a/python/cugraph-nx/cugraph_nx/typing.py b/python/cugraph-nx/cugraph_nx/typing.py new file mode 100644 index 00000000000..d3045ab4656 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/typing.py @@ -0,0 +1,25 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from collections.abc import Hashable +from typing import TypeVar + +AttrKey = TypeVar("AttrKey", bound=Hashable) +EdgeKey = TypeVar("EdgeKey", bound=Hashable) +NodeKey = TypeVar("NodeKey", bound=Hashable) +EdgeTuple = tuple[NodeKey, NodeKey] +EdgeValue = TypeVar("EdgeValue") +NodeValue = TypeVar("NodeValue") +IndexValue = TypeVar("IndexValue") +Dtype = TypeVar("Dtype") diff --git a/python/cugraph-nx/cugraph_nx/utils/__init__.py b/python/cugraph-nx/cugraph_nx/utils/__init__.py new file mode 100644 index 00000000000..f7ef42c8677 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .decorators import * diff --git a/python/cugraph-nx/cugraph_nx/utils/decorators.py b/python/cugraph-nx/cugraph_nx/utils/decorators.py new file mode 100644 index 00000000000..7bda3e58b6b --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/utils/decorators.py @@ -0,0 +1,60 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from functools import partial, update_wrapper + +from networkx.utils.decorators import not_implemented_for + +from cugraph_nx.interface import BackendInterface + +__all__ = ["not_implemented_for", "networkx_algorithm"] + + +def networkx_class(api): + def inner(func): + func.__doc__ = getattr(api, func.__name__).__doc__ + return func + + return inner + + +class networkx_algorithm: + def __new__(cls, func=None, *, name=None): + if func is None: + return partial(networkx_algorithm, name=name) + instance = object.__new__(cls) + # update_wrapper sets __wrapped__, which will be used for the signature + update_wrapper(instance, func) + instance.__defaults__ = func.__defaults__ + instance.__kwdefaults__ = func.__kwdefaults__ + instance.name = func.__name__ if name is None else name + instance.can_run = _default_can_run + setattr(BackendInterface, instance.name, instance) + return instance + + def _can_run(self, func): + """Set the `can_run` attribute to the decorated function.""" + self.can_run = func + + def __call__(self, /, *args, **kwargs): + return self.__wrapped__(*args, **kwargs) + + def __reduce__(self): + return _restore_networkx_dispatched, (self.name,) + + +def _default_can_run(*args, **kwargs): + return True + + +def _restore_networkx_dispatched(name): + return getattr(BackendInterface, name) diff --git a/python/cugraph-nx/lint.yaml b/python/cugraph-nx/lint.yaml new file mode 100644 index 00000000000..04747a2b49b --- /dev/null +++ b/python/cugraph-nx/lint.yaml @@ -0,0 +1,86 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# https://pre-commit.com/ +# +# Before first use: `pre-commit install` +# To run: `make lint` +# To update: `make lint-update` +# - &flake8_dependencies below needs updated manually +fail_fast: false +default_language_version: + python: python3 +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-added-large-files + - id: check-case-conflict + - id: check-merge-conflict + - id: check-symlinks + - id: check-ast + - id: check-toml + - id: check-yaml + - id: debug-statements + - id: end-of-file-fixer + exclude_types: [svg] + - id: mixed-line-ending + - id: trailing-whitespace + - repo: https://github.com/abravalheri/validate-pyproject + rev: v0.13 + hooks: + - id: validate-pyproject + name: Validate pyproject.toml + - repo: https://github.com/PyCQA/autoflake + rev: v2.2.0 + hooks: + - id: autoflake + args: [--in-place] + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + - repo: https://github.com/asottile/pyupgrade + rev: v3.10.1 + hooks: + - id: pyupgrade + args: [--py39-plus] + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + # - id: black-jupyter + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.284 + hooks: + - id: ruff + args: [--fix-only, --show-fixes] + - repo: https://github.com/PyCQA/flake8 + rev: 6.1.0 + hooks: + - id: flake8 + additional_dependencies: &flake8_dependencies + # These versions need updated manually + - flake8==6.1.0 + - flake8-bugbear==23.7.10 + - flake8-simplify==0.20.0 + - repo: https://github.com/asottile/yesqa + rev: v1.5.0 + hooks: + - id: yesqa + additional_dependencies: *flake8_dependencies + - repo: https://github.com/codespell-project/codespell + rev: v2.2.5 + hooks: + - id: codespell + types_or: [python, rst, markdown] + additional_dependencies: [tomli] + files: ^(cugraph_nx|docs)/ + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.0.284 + hooks: + - id: ruff + - repo: 
https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: no-commit-to-branch + args: [-p, "^branch-2....$"] diff --git a/python/cugraph-nx/pyproject.toml b/python/cugraph-nx/pyproject.toml new file mode 100644 index 00000000000..8b0ae11fbe0 --- /dev/null +++ b/python/cugraph-nx/pyproject.toml @@ -0,0 +1,213 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +[build-system] + +requires = [ + "setuptools", + "wheel", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +build-backend = "setuptools.build_meta" + +[project] +name = "cugraph-nx" +version = "23.10.00" +description = "cugraph backend for NetworkX" +readme = { file = "README.md", content-type = "text/markdown" } +authors = [ + { name = "NVIDIA Corporation" }, +] +license = { text = "Apache 2.0" } +requires-python = ">=3.9" +classifiers = [ + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3 :: Only", + "Intended Audience :: Developers", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ + "cupy-cuda11x>=12.0.0", + "pylibcugraph==23.10.*", + "networkx >=3.0", +] + +[project.optional-dependencies] +test = [ + "pytest", + "pytest-benchmark", + "pytest-mpl", +] + +[project.urls] +Homepage = "https://github.com/rapidsai/cugraph" +Documentation = "https://docs.rapids.ai/api/cugraph/stable/" + +[project.entry-points."networkx.plugins"] +cugraph = "cugraph_nx.interface:BackendInterface" + +[tool.setuptools] +license-files = ["LICENSE"] + +[tool.setuptools.packages.find] +include = [ + "cugraph_nx*", + "cugraph_nx.*", +] + +[tool.black] +line-length = 88 +target-version = ["py39", "py310", "py311"] + +[tool.isort] +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] +profile = "black" +skip_gitignore = true +float_to_top = true +default_section = "THIRDPARTY" +known_first_party = "cugraph_nx" +line_length = 88 + +[tool.pytest.ini_options] +minversion = "6.0" +testpaths = "cugraph_nx/tests" +xfail_strict = true +markers = [ + "slow: Skipped unless --runslow passed", +] +log_cli_level = "info" +filterwarnings = [ + # See: https://docs.python.org/3/library/warnings.html#describing-warning-filters + # and: https://docs.pytest.org/en/7.2.x/how-to/capture-warnings.html#controlling-warnings + # "error", +] +python_files = [ + "bench_*.py", + "test_*.py", +] +python_functions = [ + "bench_*", + "test_*", +] +addopts = [ + "--strict-config", # Force error if config is mispelled + "--strict-markers", # Force error if marker is mispelled (must be defined in config) + # "-ra", # Print summary of all fails/errors + "--benchmark-warmup=off", + "--benchmark-max-time=0", + "--benchmark-min-rounds=3", + "--benchmark-columns=min,median,max", +] + +[tool.coverage.run] +branch = true +source = ["cugraph_nx"] +omit = [] + +[tool.coverage.report] +ignore_errors = false +precision = 1 +fail_under = 0 +skip_covered = true +skip_empty = true +exclude_lines = [ + "pragma: no cover", + "raise AssertionError", + "raise NotImplementedError", +] + +[tool.ruff] +# https://github.com/charliermarsh/ruff/ +line-length = 88 +target-version = "py39" +select = [ + "ALL", +] +external = [ + # noqa codes that ruff doesn't know about: 
https://github.com/charliermarsh/ruff#external +] +ignore = [ + # Would be nice to fix these + "D100", # Missing docstring in public module + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D104", # Missing docstring in public package + "D105", # Missing docstring in magic method + + # Maybe consider + # "SIM300", # Yoda conditions are discouraged, use ... instead (Note: we're not this picky) + # "SIM401", # Use dict.get ... instead of if-else-block (Note: if-else better for coverage and sometimes clearer) + # "TRY004", # Prefer `TypeError` exception for invalid type (Note: good advice, but not worth the nuisance) + # "TRY200", # Use `raise from` to specify exception cause (Note: sometimes okay to raise original exception) + + # Intentionally ignored + "A003", # Class attribute ... is shadowing a python builtin + "ANN101", # Missing type annotation for `self` in method + "ARG004", # Unused static method argument: `...` + "COM812", # Trailing comma missing + "D203", # 1 blank line required before class docstring (Note: conflicts with D211, which is preferred) + "D400", # First line should end with a period (Note: prefer D415, which also allows "?" and "!") + "F403", # `from .classes import *` used; unable to detect undefined names (Note: used to match networkx) + "N801", # Class name ... should use CapWords convention (Note:we have a few exceptions to this) + "N802", # Function name ... should be lowercase + "N803", # Argument name ... should be lowercase (Maybe okay--except in tests) + "N806", # Variable ... in function should be lowercase + "N807", # Function name should not start and end with `__` + "N818", # Exception name ... should be named with an Error suffix (Note: good advice) + "PLR0911", # Too many return statements + "PLR0912", # Too many branches + "PLR0913", # Too many arguments to function call + "PLR0915", # Too many statements + "PLR2004", # Magic number used in comparison, consider replacing magic with a constant variable + "PLW2901", # Outer for loop variable ... overwritten by inner assignment target (Note: good advice, but too strict) + "RET502", # Do not implicitly `return None` in function able to return non-`None` value + "RET503", # Missing explicit `return` at the end of function able to return non-`None` value + "RET504", # Unnecessary variable assignment before `return` statement + "S110", # `try`-`except`-`pass` detected, consider logging the exception (Note: good advice, but we don't log) + "S112", # `try`-`except`-`continue` detected, consider logging the exception (Note: good advice, but we don't log) + "SIM102", # Use a single `if` statement instead of nested `if` statements (Note: often necessary) + "SIM105", # Use contextlib.suppress(...) instead of try-except-pass (Note: try-except-pass is much faster) + "SIM108", # Use ternary operator ... instead of if-else-block (Note: if-else better for coverage and sometimes clearer) + "TRY003", # Avoid specifying long messages outside the exception class (Note: why?) + + # Ignored categories + "C90", # mccabe (Too strict, but maybe we should make things less complex) + "I", # isort (Should we replace `isort` with this?) + "ANN", # flake8-annotations + "BLE", # flake8-blind-except (Maybe consider) + "FBT", # flake8-boolean-trap (Why?) 
+ "DJ", # flake8-django (We don't use django) + "EM", # flake8-errmsg (Perhaps nicer, but too much work) + # "ICN", # flake8-import-conventions (Doesn't allow "_" prefix such as `_np`) + "PYI", # flake8-pyi (We don't have stub files yet) + "SLF", # flake8-self (We can use our own private variables--sheesh!) + "TID", # flake8-tidy-imports (Rely on isort and our own judgement) + # "TCH", # flake8-type-checking + "ARG", # flake8-unused-arguments (Sometimes helpful, but too strict) + "TD", # flake8-todos (Maybe okay to add some of these) + "FIX", # flake8-fixme (like flake8-todos) + "ERA", # eradicate (We like code in comments!) + "PD", # pandas-vet (Intended for scripts that use pandas, not libraries) +] + +[tool.ruff.per-file-ignores] +"__init__.py" = ["F401"] # Allow unused imports (w/o defining `__all__`) +# Allow assert, print, RNG, and no docstring +"cugraph_nx/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"] + +[tool.ruff.flake8-annotations] +mypy-init-return = true + +[tool.ruff.flake8-builtins] +builtins-ignorelist = ["copyright"] + +[tool.ruff.flake8-pytest-style] +fixture-parentheses = false +mark-parentheses = false + +[tool.ruff.pydocstyle] +convention = "numpy" diff --git a/python/cugraph-nx/run_nx_tests.sh b/python/cugraph-nx/run_nx_tests.sh new file mode 100755 index 00000000000..5d3b616304d --- /dev/null +++ b/python/cugraph-nx/run_nx_tests.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# +# Copyright (c) 2023, NVIDIA CORPORATION. + +# NETWORKX_GRAPH_CONVERT=cugraph is necessary to test our backend. +# +# NETWORKX_TEST_FALLBACK_TO_NX=True is optional +# With this set, input graphs will not be converted to cugraph-nx and the networkx algorithm +# will be called for algorithms that we don't implement or if we raise NotImplementedError. +# This is sometimes helpful to get increased testing and coverage, but testing takes longer. +# Without it, tests will xfail when encountering a function that we don't implement. +# +# Coverage of `cugraph_nx.algorithms` is reported and is a good sanity check that algorithms run. + +# NETWORKX_GRAPH_CONVERT=cugraph NETWORKX_BACKEND_TEST_EXHAUSTIVE=True pytest --pyargs networkx "$@" +NETWORKX_TEST_BACKEND=cugraph NETWORKX_TEST_FALLBACK_TO_NX=True pytest --pyargs networkx --cov=cugraph_nx/algorithms --cov-report term-missing --no-cov-on-fail "$@" diff --git a/python/cugraph-nx/setup.py b/python/cugraph-nx/setup.py new file mode 100644 index 00000000000..87c0e10646d --- /dev/null +++ b/python/cugraph-nx/setup.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from setuptools import setup + +setup() From 5a57be973930fc32a3432192644198c4282c1ca9 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Wed, 16 Aug 2023 16:27:49 -0400 Subject: [PATCH 07/72] Fix ValueError Caused By Batches With No Samples (#3789) There is currently a bug in the bulk sampler where empty samples (which result from batches where all input seeds have 0 out-degree) cause a `ValueError` to be raised since the batch ids do not match with what is expected based on the number of returned offset values. This PR resolves this issue by checking whether the maximum batch id for a partition exceeds what is expected, and if so, renumbers the batch ids to be contiguous starting from 0. This has the effect of throwing out the empty batches, which is acceptable behavior in both `cuGraph-DGL` and `cuGraph-PyG`. Closes #3785 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3789 --- .../cugraph/gnn/data_loading/bulk_sampler.py | 14 +++++ .../gnn/data_loading/bulk_sampler_io.py | 43 ++++++++++--- .../tests/sampling/test_bulk_sampler.py | 57 +++++++++++++++++ .../tests/sampling/test_bulk_sampler_mg.py | 63 +++++++++++++++++++ 4 files changed, 169 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py index 90d290cbf0f..92caba6dbaf 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py @@ -32,6 +32,20 @@ class EXPERIMENTAL__BulkSampler: + """ + Performs sampling based on input seeds grouped into batches by + a batch id. Writes the output minibatches to parquet, with + partition sizes specified by the user. Allows controlling the + number of input seeds per sampling function call. Supports + basic logging. + + Batches in each partition that are empty are discarded, and the remaining non-empty + batches are renumbered to be contiguous starting from the first + batch id in the partition. + This means that the output batch ids may not match the input batch ids. + See GitHub issue #3794 for more details. + """ + start_col_name = "_START_" batch_col_name = "_BATCH_" diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index f6c5a7e970b..04917143030 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -28,6 +28,12 @@ def _write_samples_to_parquet( ) -> cudf.Series: """ Writes the samples to parquet. + + Batches that are empty are discarded, and the remaining non-empty + batches are renumbered to be contiguous starting from the first + batch id. This means that the output batch ids may not match + the input batch ids. + results: cudf.DataFrame The results dataframe containing the sampled minibatches. 
offsets: cudf.DataFrame @@ -54,35 +60,46 @@ def _write_samples_to_parquet( if partition_info != "sg" and (not isinstance(partition_info, dict)): raise ValueError("Invalid value of partition_info") - max_batch_id = offsets.batch_id.max() + # Offsets is always in order, so the last batch id is always the highest + max_batch_id = offsets.batch_id.iloc[len(offsets) - 1] results.dropna(axis=1, how="all", inplace=True) results["hop_id"] = results["hop_id"].astype("uint8") for p in range(0, len(offsets), batches_per_partition): offsets_p = offsets.iloc[p : p + batches_per_partition] start_batch_id = offsets_p.batch_id.iloc[0] - end_batch_id = offsets_p.batch_id.iloc[-1] + end_batch_id = offsets_p.batch_id.iloc[len(offsets_p) - 1] + + reached_end = end_batch_id == max_batch_id start_ix = offsets_p.offsets.iloc[0] - if end_batch_id == max_batch_id: + if reached_end: end_ix = len(results) else: offsets_z = offsets[offsets.batch_id == (end_batch_id + 1)] end_ix = offsets_z.offsets.iloc[0] - full_output_path = os.path.join( - output_path, f"batch={start_batch_id}-{end_batch_id}.parquet" - ) results_p = results.iloc[start_ix:end_ix].reset_index(drop=True) - results_p["batch_id"] = offsets_p.batch_id.repeat( + if end_batch_id - start_batch_id + 1 > len(offsets_p): + # This occurs when some batches returned 0 samples. + # To properly account this, the remaining batches are + # renumbered to have contiguous batch ids and the empty + # samples are dropped. + offsets_p.drop("batch_id", axis=1, inplace=True) + batch_id_range = cudf.Series(cupy.arange(len(offsets_p))) + end_batch_id = start_batch_id + len(offsets_p) - 1 + else: + batch_id_range = offsets_p.batch_id + + results_p["batch_id"] = batch_id_range.repeat( cupy.diff(offsets_p.offsets.values, append=end_ix) ).values if renumber_map is not None: renumber_map_start_ix = offsets_p.renumber_map_offsets.iloc[0] - if end_batch_id == max_batch_id: + if reached_end: renumber_map_end_ix = len(renumber_map) else: renumber_map_end_ix = offsets_z.renumber_map_offsets.iloc[0] @@ -124,6 +141,10 @@ def _write_samples_to_parquet( else: results_p["map"] = final_map_series + full_output_path = os.path.join( + output_path, f"batch={start_batch_id}-{end_batch_id}.parquet" + ) + results_p.to_parquet( full_output_path, compression=None, index=False, force_nullable_schema=True ) @@ -140,6 +161,12 @@ def write_samples( ): """ Writes the samples to parquet. + + Batches in each partition that are empty are discarded, and the remaining non-empty + batches are renumbered to be contiguous starting from the first + batch id in the partition. + This means that the output batch ids may not match the input batch ids. + results: cudf.DataFrame The results dataframe containing the sampled minibatches. 
offsets: cudf.DataFrame diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py index cbd8321a338..5ea79e0893a 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py @@ -240,3 +240,60 @@ def test_bulk_sampler_partitions(scratch_dir): ] ).nunique() assert len(map_current_batch) == n_unique + + +@pytest.mark.sg +def test_bulk_sampler_empty_batches(scratch_dir): + edgelist = cudf.DataFrame( + { + "src": [0, 0, 1, 2, 3, 4, 5, 6], + "dst": [3, 2, 0, 7, 8, 9, 1, 2], + } + ) + + batches = cudf.DataFrame( + { + "start": [0, 1, 2, 7, 8, 9, 3, 2, 7], + "batch": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype="int32"), + } + ) + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(edgelist, source="src", destination="dst") + + samples_path = os.path.join(scratch_dir, "test_bulk_sampler_empty_batches") + create_directory_with_overwrite(samples_path) + + bs = BulkSampler( + batch_size=3, + output_path=samples_path, + graph=G, + fanout_vals=[-1, -1], + with_replacement=False, + batches_per_partition=6, + renumber=False, + ) + bs.add_batches(batches, start_col_name="start", batch_col_name="batch") + bs.flush() + + assert len(os.listdir(samples_path)) == 1 + + df = cudf.read_parquet(os.path.join(samples_path, "batch=0-1.parquet")) + + assert df[ + (df.batch_id == 0) & (df.hop_id == 0) + ].destinations.sort_values().values_host.tolist() == [0, 2, 3, 7] + + assert df[ + (df.batch_id == 0) & (df.hop_id == 1) + ].destinations.sort_values().values_host.tolist() == [2, 3, 7, 8] + + assert df[ + (df.batch_id == 1) & (df.hop_id == 0) + ].destinations.sort_values().values_host.tolist() == [7, 8] + + assert len(df[(df.batch_id == 1) & (df.hop_id == 1)]) == 0 + + assert df.batch_id.max() == 1 + + shutil.rmtree(samples_path) diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py index b7cd4b0822b..eded435f897 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py @@ -184,3 +184,66 @@ def test_bulk_sampler_partitions(dask_client, scratch_dir, mg_input): ] ).nunique() assert len(map_current_batch) == n_unique + + +@pytest.mark.mg +def test_bulk_sampler_empty_batches(dask_client, scratch_dir): + edgelist = dask_cudf.from_cudf( + cudf.DataFrame( + { + "src": [0, 0, 1, 2, 3, 4, 5, 6, 4, 4], + "dst": [3, 2, 0, 7, 8, 9, 1, 2, 8, 1], + } + ), + npartitions=2, + ) + + batches = dask_cudf.from_cudf( + cudf.DataFrame( + { + "start": [0, 1, 2, 7, 8, 9, 3, 2, 7], + "batch": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2, 2], dtype="int32"), + } + ), + npartitions=2, + ) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(edgelist, source="src", destination="dst") + + samples_path = os.path.join(scratch_dir, "mg_test_bulk_sampler_empty_batches") + create_directory_with_overwrite(samples_path) + + bs = BulkSampler( + batch_size=3, + output_path=samples_path, + graph=G, + fanout_vals=[-1, -1], + with_replacement=False, + batches_per_partition=6, + renumber=False, + ) + bs.add_batches(batches, start_col_name="start", batch_col_name="batch") + bs.flush() + + assert len(os.listdir(samples_path)) == 1 + + df = cudf.read_parquet(os.path.join(samples_path, "batch=0-1.parquet")) + + assert df[ + (df.batch_id == 0) & (df.hop_id == 0) + ].destinations.sort_values().values_host.tolist() == [0, 2, 3, 7] + + 
assert df[ + (df.batch_id == 0) & (df.hop_id == 1) + ].destinations.sort_values().values_host.tolist() == [2, 3, 7, 8] + + assert df[ + (df.batch_id == 1) & (df.hop_id == 0) + ].destinations.sort_values().values_host.tolist() == [7, 8] + + assert len(df[(df.batch_id == 1) & (df.hop_id == 1)]) == 0 + + assert df.batch_id.max() == 1 + + shutil.rmtree(samples_path) From 4baa14d55b773803dde0fd285e9ad4c2e6bcafd4 Mon Sep 17 00:00:00 2001 From: ralph <137829296+nv-rliu@users.noreply.github.com> Date: Wed, 16 Aug 2023 17:18:59 -0400 Subject: [PATCH 08/72] Clean-up old testing conventions in `test_ecg.py` (#3779) Closes #3235 This PR cleans up the `test_ecg.py` unit test. It removes the old way of comparing networkx results to cugraph, which used `PurePath` to compare the location of the stored dataset when getting networkx results to compare with cugraph. Since the cugraph tests use the `Datasets` API, the golden results can be fetched based on the name of the dataset being used. Authors: - ralph (https://github.com/nv-rliu) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3779 --- .../cugraph/tests/community/test_ecg.py | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/python/cugraph/cugraph/tests/community/test_ecg.py b/python/cugraph/cugraph/tests/community/test_ecg.py index 4440973df83..be59b5d5bb3 100644 --- a/python/cugraph/cugraph/tests/community/test_ecg.py +++ b/python/cugraph/cugraph/tests/community/test_ecg.py @@ -12,7 +12,6 @@ # limitations under the License. import gc -from pathlib import PurePath import pytest import networkx as nx @@ -32,12 +31,12 @@ def cugraph_call(G, min_weight, ensemble_size): return score, num_parts -def golden_call(graph_file): - if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "dolphins.csv": +def golden_call(filename): + if filename == "dolphins": return 0.4962422251701355 - if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "karate.csv": + if filename == "karate": return 0.38428664207458496 - if graph_file == PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "netscience.csv": + if filename == "netscience": return 0.9279554486274719 @@ -49,16 +48,14 @@ def golden_call(graph_file): @pytest.mark.sg -@pytest.mark.parametrize("graph_file", DATASETS) +@pytest.mark.parametrize("dataset", DATASETS) @pytest.mark.parametrize("min_weight", MIN_WEIGHTS) @pytest.mark.parametrize("ensemble_size", ENSEMBLE_SIZES) -def test_ecg_clustering(graph_file, min_weight, ensemble_size): +def test_ecg_clustering(dataset, min_weight, ensemble_size): gc.collect() # Read in the graph and get a cugraph object - - G = graph_file.get_graph() - dataset_path = graph_file.get_path() + G = dataset.get_graph() # read_weights_in_sp=False => value column dtype is float64 G.edgelist.edgelist_df["weights"] = G.edgelist.edgelist_df["weights"].astype( "float64" @@ -66,7 +63,8 @@ def test_ecg_clustering(graph_file, min_weight, ensemble_size): # Get the modularity score for partitioning versus random assignment cu_score, num_parts = cugraph_call(G, min_weight, ensemble_size) - golden_score = golden_call(dataset_path) + filename = dataset.metadata["name"] + golden_score = golden_call(filename) # Assert that the partitioning has better modularity than the random # assignment @@ -74,13 +72,13 @@ def test_ecg_clustering(graph_file, min_weight, ensemble_size): @pytest.mark.sg -@pytest.mark.parametrize("graph_file", DATASETS) +@pytest.mark.parametrize("dataset", DATASETS) 
@pytest.mark.parametrize("min_weight", MIN_WEIGHTS) @pytest.mark.parametrize("ensemble_size", ENSEMBLE_SIZES) -def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size): +def test_ecg_clustering_nx(dataset, min_weight, ensemble_size): gc.collect() - dataset_path = graph_file.get_path() + dataset_path = dataset.get_path() # Read in the graph and get a NetworkX graph M = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) G = nx.from_pandas_edgelist( From f1c0054ba6951745d801e14158dacdf81f5f5703 Mon Sep 17 00:00:00 2001 From: ralph <137829296+nv-rliu@users.noreply.github.com> Date: Wed, 16 Aug 2023 17:44:10 -0400 Subject: [PATCH 09/72] Calling `dataset.get_edgelist()` returns a copy of an edge list instead of global (#3777) Closes #3421 This PR fixes a bug where `get_edgelist()` would return a global edge list, which causes any changes made to the first returned edge list to be reflected in subsequent calls to `get_edgelist()`. Now, `get_edgelist()` returns a copy of the dataframe stored by the `Dataset` object. Authors: - ralph (https://github.com/nv-rliu) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3777 --- python/cugraph/cugraph/datasets/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index 229d0fda632..b276a87b88e 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -191,7 +191,7 @@ def get_edgelist(self, download=False): header=header, ) - return self._edgelist + return self._edgelist.copy() def get_graph( self, From a3bb1fbfdfa5087a60d1cf64a803f152756e1935 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Thu, 17 Aug 2023 08:55:14 -0400 Subject: [PATCH 10/72] Allow models to use a lightweight sparse structure (#3782) This PR introduces `SparseGraph` class to allow `SAGEConv` to use a more lightweight graph structure. The goal is to provide an option to bypass `to_block` which is the bottleneck in the sampling workflow. I will submit another PR to extend the pattern to other models. Authors: - Tingyu Wang (https://github.com/tingyu66) - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3782 --- .../cugraph_dgl/nn/conv/__init__.py | 3 + .../cugraph-dgl/cugraph_dgl/nn/conv/base.py | 172 +++++++++++++++++- .../cugraph_dgl/nn/conv/sageconv.py | 55 +++--- python/cugraph-dgl/tests/conftest.py | 24 +++ python/cugraph-dgl/tests/nn/test_sageconv.py | 27 ++- .../cugraph-dgl/tests/nn/test_sparsegraph.py | 50 +++++ 6 files changed, 298 insertions(+), 33 deletions(-) create mode 100644 python/cugraph-dgl/tests/nn/test_sparsegraph.py diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py index 7d3a660c052..e5acbf34478 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py @@ -10,12 +10,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from .base import SparseGraph from .gatconv import GATConv from .relgraphconv import RelGraphConv from .sageconv import SAGEConv from .transformerconv import TransformerConv __all__ = [ + "SparseGraph", "GATConv", "RelGraphConv", "SAGEConv", diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index 9eb1e5222ca..0eeaed29d86 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -11,14 +11,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional, Tuple, Union + from cugraph.utilities.utils import import_optional torch = import_optional("torch") -nn = import_optional("torch.nn") ops_torch = import_optional("pylibcugraphops.pytorch") -class BaseConv(nn.Module): +class BaseConv(torch.nn.Module): r"""An abstract base class for cugraph-ops nn module.""" def __init__(self): @@ -48,3 +49,170 @@ def pad_offsets(self, offsets: torch.Tensor, size: int) -> torch.Tensor: self._cached_offsets_fg[offsets.numel() : size] = offsets[-1] return self._cached_offsets_fg[:size] + + +def compress_ids(ids: torch.Tensor, size: int) -> torch.Tensor: + return torch._convert_indices_from_coo_to_csr( + ids, size, out_int32=ids.dtype == torch.int32 + ) + + +def decompress_ids(c_ids: torch.Tensor) -> torch.Tensor: + ids = torch.arange(c_ids.numel() - 1, dtype=c_ids.dtype, device=c_ids.device) + return ids.repeat_interleave(c_ids[1:] - c_ids[:-1]) + + +class SparseGraph(object): + r"""A god-class to store different sparse formats needed by cugraph-ops + and facilitate sparse format conversions. + + Parameters + ---------- + size: tuple of int + Size of the adjacency matrix: (num_src_nodes, num_dst_nodes). + + src_ids: torch.Tensor + Source indices of the edges. + + dst_ids: torch.Tensor, optional + Destination indices of the edges. + + csrc_ids: torch.Tensor, optional + Compressed source indices. It is a monotonically increasing array of + size (num_src_nodes + 1,). For the k-th source node, its neighborhood + consists of the destinations between `dst_indices[csrc_indices[k]]` and + `dst_indices[csrc_indices[k+1]]`. + + cdst_ids: torch.Tensor, optional + Compressed destination indices. It is a monotonically increasing array of + size (num_dst_nodes + 1,). For the k-th destination node, its neighborhood + consists of the sources between `src_indices[cdst_indices[k]]` and + `src_indices[cdst_indices[k+1]]`. + + dst_ids_is_sorted: bool + Whether `dst_ids` has been sorted in an ascending order. When sorted, + creating CSC layout is much faster. + + formats: str or tuple of str, optional + The desired sparse formats to create for the graph. + + reduce_memory: bool, optional + When set, the tensors are not required by the desired formats will be + set to `None`. + + Notes + ----- + For MFGs (sampled graphs), the node ids must have been renumbered. 
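+
+    Examples
+    --------
+    A minimal sketch (the tiny tensors below are illustrative only, not part
+    of this change) of building a graph that only materializes the CSC layout:
+
+    >>> import torch
+    >>> src = torch.tensor([0, 1, 2], dtype=torch.int32, device="cuda")
+    >>> dst = torch.tensor([1, 2, 0], dtype=torch.int32, device="cuda")
+    >>> sg = SparseGraph(size=(3, 3), src_ids=src, dst_ids=dst, formats="csc")
+    >>> cdst_ids, src_ids = sg.csc()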
+ """ + + supported_formats = {"coo": ("src_ids", "dst_ids"), "csc": ("cdst_ids", "src_ids")} + + all_tensors = set(["src_ids", "dst_ids", "csrc_ids", "cdst_ids"]) + + def __init__( + self, + size: Tuple[int, int], + src_ids: torch.Tensor, + dst_ids: Optional[torch.Tensor] = None, + csrc_ids: Optional[torch.Tensor] = None, + cdst_ids: Optional[torch.Tensor] = None, + dst_ids_is_sorted: bool = False, + formats: Optional[Union[str, Tuple[str]]] = None, + reduce_memory: bool = True, + ): + self._num_src_nodes, self._num_dst_nodes = size + self._dst_ids_is_sorted = dst_ids_is_sorted + + if dst_ids is None and cdst_ids is None: + raise ValueError("One of 'dst_ids' and 'cdst_ids' must be given.") + + if src_ids is not None: + src_ids = src_ids.contiguous() + + if dst_ids is not None: + dst_ids = dst_ids.contiguous() + + if csrc_ids is not None: + if csrc_ids.numel() != self._num_src_nodes + 1: + raise RuntimeError( + f"Size mismatch for 'csrc_ids': expected ({size[0]+1},), " + f"but got {tuple(csrc_ids.size())}" + ) + csrc_ids = csrc_ids.contiguous() + + if cdst_ids is not None: + if cdst_ids.numel() != self._num_dst_nodes + 1: + raise RuntimeError( + f"Size mismatch for 'cdst_ids': expected ({size[1]+1},), " + f"but got {tuple(cdst_ids.size())}" + ) + cdst_ids = cdst_ids.contiguous() + + self._src_ids = src_ids + self._dst_ids = dst_ids + self._csrc_ids = csrc_ids + self._cdst_ids = cdst_ids + self._perm = None + + if isinstance(formats, str): + formats = (formats,) + + if formats is not None: + for format_ in formats: + assert format_ in SparseGraph.supported_formats + self.__getattribute__(f"_create_{format_}")() + self._formats = formats + + self._reduce_memory = reduce_memory + if reduce_memory: + self.reduce_memory() + + def reduce_memory(self): + """Remove the tensors that are not necessary to create the desired sparse + formats to reduce memory footprint.""" + + self._perm = None + if self._formats is None: + return + + tensors_needed = [] + for f in self._formats: + tensors_needed += SparseGraph.supported_formats[f] + for t in SparseGraph.all_tensors.difference(set(tensors_needed)): + self.__dict__[t] = None + + def _create_coo(self): + if self._dst_ids is None: + self._dst_ids = decompress_ids(self._cdst_ids) + + def _create_csc(self): + if self._cdst_ids is None: + if not self._dst_ids_is_sorted: + self._dst_ids, self._perm = torch.sort(self._dst_ids) + self._src_ids = self._src_ids[self._perm] + self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes) + + def num_src_nodes(self): + return self._num_src_nodes + + def num_dst_nodes(self): + return self._num_dst_nodes + + def formats(self): + return self._formats + + def coo(self) -> Tuple[torch.Tensor, torch.Tensor]: + if "coo" not in self.formats(): + raise RuntimeError( + "The SparseGraph did not create a COO layout. " + "Set 'formats' to include 'coo' when creating the graph." + ) + return (self._src_ids, self._dst_ids) + + def csc(self) -> Tuple[torch.Tensor, torch.Tensor]: + if "csc" not in self.formats(): + raise RuntimeError( + "The SparseGraph did not create a CSC layout. " + "Set 'formats' to include 'csc' when creating the graph." 
+ ) + return (self._cdst_ids, self._src_ids) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py index 403678e24a2..60f4c505e19 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py @@ -14,9 +14,9 @@ cugraph-ops""" # pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments from __future__ import annotations -from typing import Optional +from typing import Optional, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -98,7 +98,7 @@ def reset_parameters(self): def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], feat: torch.Tensor, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -106,37 +106,46 @@ def forward( Parameters ---------- - g : DGLGraph + g : DGLGraph or SparseGraph The graph. feat : torch.Tensor Node features. Shape: :math:`(|V|, D_{in})`. max_in_degree : int - Maximum in-degree of destination nodes. It is only effective when - :attr:`g` is a :class:`DGLBlock`, i.e., bipartite graph. When - :attr:`g` is generated from a neighbor sampler, the value should be - set to the corresponding :attr:`fanout`. If not given, - :attr:`max_in_degree` will be calculated on-the-fly. + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. Returns ------- torch.Tensor Output node features. Shape: :math:`(|V|, D_{out})`. """ - offsets, indices, _ = g.adj_tensors("csc") - - if g.is_block: - if max_in_degree is None: - max_in_degree = g.in_degrees().max().item() - - if max_in_degree < self.MAX_IN_DEGREE_MFG: - _graph = ops_torch.SampledCSC( - offsets, indices, max_in_degree, g.num_src_nodes() - ) - else: - offsets_fg = self.pad_offsets(offsets, g.num_src_nodes() + 1) - _graph = ops_torch.StaticCSC(offsets_fg, indices) + if max_in_degree is None: + max_in_degree = -1 + + if isinstance(g, SparseGraph): + assert "csc" in g.formats() + offsets, indices = g.csc() + _graph = ops_torch.CSC( + offsets=offsets, + indices=indices, + num_src_nodes=g.num_src_nodes(), + dst_max_in_degree=max_in_degree, + ) + elif isinstance(g, dgl.DGLHeteroGraph): + offsets, indices, _ = g.adj_tensors("csc") + _graph = ops_torch.CSC( + offsets=offsets, + indices=indices, + num_src_nodes=g.num_src_nodes(), + dst_max_in_degree=max_in_degree, + ) else: - _graph = ops_torch.StaticCSC(offsets, indices) + raise TypeError( + f"The graph has to be either a 'SparseGraph' or " + f"'dgl.DGLHeteroGraph', but got '{type(g)}'." 
+ ) feat = self.feat_drop(feat) h = ops_torch.operators.agg_concat_n2n(feat, _graph, self.aggr)[ diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py index dc6b7db9b45..6f8690d1140 100644 --- a/python/cugraph-dgl/tests/conftest.py +++ b/python/cugraph-dgl/tests/conftest.py @@ -13,6 +13,8 @@ import pytest +import torch + from cugraph.testing.mg_utils import ( start_dask_client, stop_dask_client, @@ -31,3 +33,25 @@ def dask_client(): yield dask_client stop_dask_client(dask_client, dask_cluster) + + +class SparseGraphData1: + size = (6, 5) + nnz = 6 + src_ids = torch.IntTensor([0, 1, 2, 3, 2, 5]).cuda() + dst_ids = torch.IntTensor([1, 2, 3, 4, 0, 3]).cuda() + + # CSR + src_ids_sorted_by_src = torch.IntTensor([0, 1, 2, 2, 3, 5]).cuda() + dst_ids_sorted_by_src = torch.IntTensor([1, 2, 0, 3, 4, 3]).cuda() + csrc_ids = torch.IntTensor([0, 1, 2, 4, 5, 5, 6]).cuda() + + # CSC + src_ids_sorted_by_dst = torch.IntTensor([2, 0, 1, 5, 2, 3]).cuda() + dst_ids_sorted_by_dst = torch.IntTensor([0, 1, 2, 3, 3, 4]).cuda() + cdst_ids = torch.IntTensor([0, 1, 2, 3, 5, 6]).cuda() + + +@pytest.fixture +def sparse_graph_1(): + return SparseGraphData1() diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index 38cb020b8bb..447bbe49460 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -14,12 +14,9 @@ import pytest -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv from .common import create_graph1 torch = import_optional("torch") @@ -30,20 +27,31 @@ @pytest.mark.parametrize("idtype_int", [False, True]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("to_block", [False, True]) -def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block): +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_format): SAGEConv = dgl.nn.SAGEConv - CuGraphSAGEConv = cugraph_dgl.nn.SAGEConv device = "cuda" in_feat, out_feat = 5, 2 kwargs = {"aggregator_type": "mean", "bias": bias} g = create_graph1().to(device) + if idtype_int: g = g.int() if to_block: g = dgl.to_block(g) + + size = (g.num_src_nodes(), g.num_dst_nodes()) feat = torch.rand(g.num_src_nodes(), in_feat).to(device) + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + torch.manual_seed(0) conv1 = SAGEConv(in_feat, out_feat, **kwargs).to(device) @@ -57,7 +65,10 @@ def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block): conv2.linear.bias.data[:] = conv1.fc_self.bias.data out1 = conv1(g, feat) - out2 = conv2(g, feat, max_in_degree=max_in_degree) + if sparse_format is not None: + out2 = conv2(sg, feat, max_in_degree=max_in_degree) + else: + out2 = conv2(g, feat, max_in_degree=max_in_degree) assert torch.allclose(out1, out2, atol=1e-06) grad_out = torch.rand_like(out1) diff --git a/python/cugraph-dgl/tests/nn/test_sparsegraph.py b/python/cugraph-dgl/tests/nn/test_sparsegraph.py new file mode 100644 index 00000000000..3fb01575d66 --- 
/dev/null
+++ b/python/cugraph-dgl/tests/nn/test_sparsegraph.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.utilities.utils import import_optional
+from cugraph_dgl.nn import SparseGraph
+
+torch = import_optional("torch")
+
+
+def test_coo2csc(sparse_graph_1):
+    data = sparse_graph_1
+    values = torch.ones(data.nnz).cuda()
+    g = SparseGraph(
+        size=data.size, src_ids=data.src_ids, dst_ids=data.dst_ids, formats="csc"
+    )
+    cdst_ids, src_ids = g.csc()
+
+    new = torch.sparse_csc_tensor(cdst_ids, src_ids, values).cuda()
+    old = torch.sparse_coo_tensor(
+        torch.vstack((data.src_ids, data.dst_ids)), values
+    ).cuda()
+    torch.allclose(new.to_dense(), old.to_dense())
+
+
+def test_csc2coo(sparse_graph_1):
+    data = sparse_graph_1
+    values = torch.ones(data.nnz).cuda()
+    g = SparseGraph(
+        size=data.size,
+        src_ids=data.src_ids_sorted_by_dst,
+        cdst_ids=data.cdst_ids,
+        formats="coo",
+    )
+    src_ids, dst_ids = g.coo()
+
+    new = torch.sparse_coo_tensor(torch.vstack((src_ids, dst_ids)), values).cuda()
+    old = torch.sparse_csc_tensor(
+        data.cdst_ids, data.src_ids_sorted_by_dst, values
+    ).cuda()
+    torch.allclose(new.to_dense(), old.to_dense())

From 5abbf7505e4ee75c63d8bad7e3f56c8e6c9d81ad Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Sat, 19 Aug 2023 18:34:02 +0200
Subject: [PATCH 11/72] Use rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10 for docs
 build (#3811)

cugraph-dgl is not supported for CUDA 12 yet, but the doc build uses
rapidsai/ci:latest, which is now on CUDA 12.
Authors: - Naim (https://github.com/naimnv) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3811 --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 413a24930e8..2d0d58315a0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c1801b468b3..005fe4a0267 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -63,7 +63,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -73,7 +73,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:latest" + container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" run_script: "ci/build_docs.sh" wheel-build-pylibcugraph: needs: checks From 8b1f0149bdde7a107be8804419d6a339333d038e Mon Sep 17 00:00:00 2001 From: ralph <137829296+nv-rliu@users.noreply.github.com> Date: Mon, 21 Aug 2023 15:17:30 -0400 Subject: [PATCH 12/72] Update `python_run_cugraph` in `dependencies.yaml` (#3781) Closes #2811 This PR adds `aiohttp` and `requests` as hard dependencies to cugraph. Without these packages, `cugraph.datasets` is unable to download missing datasets. `cugraph.datasets` uses `cudf.read_csv` to download files from s3. 
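As a rough sketch of the failure mode this fixes (illustrative only; the URL and
column names below are made up and are not taken from this patch):

# Illustrative sketch only. cudf.read_csv hands http(s) paths to fsspec's
# "http" filesystem, which imports aiohttp; without aiohttp installed the
# download of a missing dataset fails with an import error instead of
# returning a DataFrame, hence the new hard dependencies.
import cudf

url = "https://example.com/datasets/karate.csv"  # hypothetical dataset URL
edges = cudf.read_csv(url, names=["src", "dst", "wgt"])
print(len(edges))
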
Authors: - ralph (https://github.com/nv-rliu) Approvers: - Bradley Dice (https://github.com/bdice) - Rick Ratzel (https://github.com/rlratzel) - Brad Rees (https://github.com/BradReesWork) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/3781 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/cugraph/meta.yaml | 3 +++ dependencies.yaml | 11 ++++++----- python/cugraph-service/server/pyproject.toml | 3 --- python/cugraph/pyproject.toml | 4 +--- 6 files changed, 12 insertions(+), 13 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c92f1f47fc9..f3b15b6d13f 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,7 +24,7 @@ dependencies: - dask>=2023.5.1 - distributed>=2023.5.1 - doxygen -- fsspec[http]>=0.6.0 +- fsspec>=0.6.0 - gcc_linux-64=11.* - gmock>=1.13.0 - graphviz diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 0b211458562..b87f1c635f8 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -24,7 +24,7 @@ dependencies: - dask>=2023.5.1 - distributed>=2023.5.1 - doxygen -- fsspec[http]>=0.6.0 +- fsspec>=0.6.0 - gcc_linux-64=11.* - gmock>=1.13.0 - graphviz diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index 126fc9f7490..ec94ee05194 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -64,6 +64,7 @@ requirements: - scikit-build >=0.13.1 - setuptools run: + - aiohttp - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} - cudatoolkit @@ -78,11 +79,13 @@ requirements: - dask ==2023.7.1 - dask-core ==2023.7.1 - distributed ==2023.7.1 + - fsspec>=0.6.0 - libcugraph ={{ version }} - pylibcugraph ={{ version }} - pylibraft ={{ minor_version }} - python - raft-dask ={{ minor_version }} + - requests - ucx-proc=*=gpu - ucx-py {{ ucx_py_version }} diff --git a/dependencies.yaml b/dependencies.yaml index 1c2d30c0546..c8a3f48579c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -332,14 +332,20 @@ dependencies: - &ucx_py ucx-py==0.34.* - output_types: conda packages: + - aiohttp - &cupy cupy>=12.0.0 - &dask-core dask-core>=2023.5.1 + - fsspec>=0.6.0 - libcudf==23.10.* + - requests - nccl>=2.9.9 - ucx-proc=*=gpu - output_types: pyproject packages: - &cupy_pip cupy-cuda11x>=12.0.0 + # cudf uses fsspec but is protocol independent. cugraph + # dataset APIs require [http] extras for use with cudf. + - fsspec[http]>=0.6.0 - pylibcugraph==23.10.* python_run_pylibcugraph: common: @@ -434,12 +440,7 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - aiohttp - # cudf will use fsspec but is protocol independent. cugraph tests - # specifically require http for the test files it asks cudf to read. 
- - fsspec[http]>=0.6.0 - python-louvain - - requests - scikit-learn>=0.23.1 test_python_pylibcugraph: common: diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index cf14f04dfd1..52211b3ff89 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -43,8 +43,6 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ - "aiohttp", - "fsspec[http]>=0.6.0", "networkx>=2.5.1", "numpy>=1.21", "pandas", @@ -53,7 +51,6 @@ test = [ "pytest-cov", "pytest-xdist", "python-louvain", - "requests", "scikit-learn>=0.23.1", "scipy", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index 343ae1f748f..4c49ae259c1 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "dask-cudf==23.10.*", "dask>=2023.5.1", "distributed>=2023.5.1", + "fsspec[http]>=0.6.0", "numba>=0.57", "pylibcugraph==23.10.*", "raft-dask==23.10.*", @@ -50,8 +51,6 @@ classifiers = [ [project.optional-dependencies] test = [ - "aiohttp", - "fsspec[http]>=0.6.0", "networkx>=2.5.1", "numpy>=1.21", "pandas", @@ -60,7 +59,6 @@ test = [ "pytest-cov", "pytest-xdist", "python-louvain", - "requests", "scikit-learn>=0.23.1", "scipy", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 48fc29cd91b3115d691d34625b06984f385bdf2b Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Mon, 21 Aug 2023 21:17:28 -0500 Subject: [PATCH 13/72] Adds missing copyright and license text to __init__.py package files (#3799) closes #3798 This PR adds missing copyright and license text to \_\_init\_\_.py package files. The starting copyright year is based on the earliest entry in the "git log \_\_init\_\_.py" output. 
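For reference, a small sketch (not part of this patch, and not necessarily the
exact command the author used) of how the earliest commit year for a file can be
read from git history, matching the rule described above:

# Illustrative helper: list the commit years touching a file (newest first)
# via "git log --follow" and take the last entry, i.e. the earliest year.
import subprocess

def earliest_commit_year(path: str) -> str:
    years = subprocess.run(
        ["git", "log", "--follow", "--format=%ad", "--date=format:%Y", "--", path],
        capture_output=True, text=True, check=True,
    ).stdout.split()
    return years[-1] if years else ""

print(earliest_commit_year("python/cugraph/cugraph/dask/common/__init__.py"))
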
Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3799 --- .../shared/python/cugraph_benchmarking/__init__.py | 13 +++++++++++++ python/cugraph-dgl/cugraph_dgl/utils/__init__.py | 13 +++++++++++++ python/cugraph-nx/cugraph_nx/tests/__init__.py | 13 +++++++++++++ .../cugraph_service_server/testing/__init__.py | 13 +++++++++++++ python/cugraph-service/tests/__init__.py | 13 +++++++++++++ python/cugraph/cugraph/dask/common/__init__.py | 13 +++++++++++++ python/cugraph/cugraph/dask/comms/__init__.py | 13 +++++++++++++ python/cugraph/cugraph/dask/components/__init__.py | 13 +++++++++++++ .../cugraph/cugraph/dask/link_analysis/__init__.py | 13 +++++++++++++ .../cugraph/dask/link_prediction/__init__.py | 13 +++++++++++++ python/cugraph/cugraph/dask/structure/__init__.py | 13 +++++++++++++ python/cugraph/cugraph/dask/traversal/__init__.py | 13 +++++++++++++ .../cugraph/cugraph/datasets/metadata/__init__.py | 13 +++++++++++++ .../cugraph/cugraph/experimental/compat/__init__.py | 13 +++++++++++++ .../cugraph/experimental/components/__init__.py | 13 +++++++++++++ .../experimental/datasets/metadata/__init__.py | 13 +++++++++++++ .../experimental/link_prediction/__init__.py | 13 +++++++++++++ .../cugraph/experimental/structure/__init__.py | 13 +++++++++++++ .../pylibcugraph/_cugraph_c/__init__.py | 13 +++++++++++++ .../pylibcugraph/components/__init__.py | 13 +++++++++++++ .../pylibcugraph/internal_types/__init__.py | 13 +++++++++++++ .../pylibcugraph/pylibcugraph/structure/__init__.py | 13 +++++++++++++ .../pylibcugraph/pylibcugraph/utilities/__init__.py | 13 +++++++++++++ 23 files changed, 299 insertions(+) diff --git a/benchmarks/shared/python/cugraph_benchmarking/__init__.py b/benchmarks/shared/python/cugraph_benchmarking/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/benchmarks/shared/python/cugraph_benchmarking/__init__.py +++ b/benchmarks/shared/python/cugraph_benchmarking/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph-dgl/cugraph_dgl/utils/__init__.py b/python/cugraph-dgl/cugraph_dgl/utils/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph-dgl/cugraph_dgl/utils/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/cugraph-nx/cugraph_nx/tests/__init__.py b/python/cugraph-nx/cugraph_nx/tests/__init__.py index e69de29bb2d..ce94db52fa2 100644 --- a/python/cugraph-nx/cugraph_nx/tests/__init__.py +++ b/python/cugraph-nx/cugraph_nx/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph-service/server/cugraph_service_server/testing/__init__.py b/python/cugraph-service/server/cugraph_service_server/testing/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph-service/server/cugraph_service_server/testing/__init__.py +++ b/python/cugraph-service/server/cugraph_service_server/testing/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph-service/tests/__init__.py b/python/cugraph-service/tests/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph-service/tests/__init__.py +++ b/python/cugraph-service/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/dask/common/__init__.py b/python/cugraph/cugraph/dask/common/__init__.py index e69de29bb2d..521b825bbf5 100644 --- a/python/cugraph/cugraph/dask/common/__init__.py +++ b/python/cugraph/cugraph/dask/common/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/dask/comms/__init__.py b/python/cugraph/cugraph/dask/comms/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph/cugraph/dask/comms/__init__.py +++ b/python/cugraph/cugraph/dask/comms/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/dask/components/__init__.py b/python/cugraph/cugraph/dask/components/__init__.py index e69de29bb2d..521b825bbf5 100644 --- a/python/cugraph/cugraph/dask/components/__init__.py +++ b/python/cugraph/cugraph/dask/components/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/dask/link_analysis/__init__.py b/python/cugraph/cugraph/dask/link_analysis/__init__.py index e69de29bb2d..521b825bbf5 100644 --- a/python/cugraph/cugraph/dask/link_analysis/__init__.py +++ b/python/cugraph/cugraph/dask/link_analysis/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/dask/link_prediction/__init__.py b/python/cugraph/cugraph/dask/link_prediction/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph/cugraph/dask/link_prediction/__init__.py +++ b/python/cugraph/cugraph/dask/link_prediction/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/dask/structure/__init__.py b/python/cugraph/cugraph/dask/structure/__init__.py index e69de29bb2d..521b825bbf5 100644 --- a/python/cugraph/cugraph/dask/structure/__init__.py +++ b/python/cugraph/cugraph/dask/structure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/dask/traversal/__init__.py b/python/cugraph/cugraph/dask/traversal/__init__.py index e69de29bb2d..521b825bbf5 100644 --- a/python/cugraph/cugraph/dask/traversal/__init__.py +++ b/python/cugraph/cugraph/dask/traversal/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/datasets/metadata/__init__.py b/python/cugraph/cugraph/datasets/metadata/__init__.py index e69de29bb2d..ce94db52fa2 100644 --- a/python/cugraph/cugraph/datasets/metadata/__init__.py +++ b/python/cugraph/cugraph/datasets/metadata/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/compat/__init__.py b/python/cugraph/cugraph/experimental/compat/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph/cugraph/experimental/compat/__init__.py +++ b/python/cugraph/cugraph/experimental/compat/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/components/__init__.py b/python/cugraph/cugraph/experimental/components/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph/cugraph/experimental/components/__init__.py +++ b/python/cugraph/cugraph/experimental/components/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py b/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py +++ b/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/link_prediction/__init__.py b/python/cugraph/cugraph/experimental/link_prediction/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph/cugraph/experimental/link_prediction/__init__.py +++ b/python/cugraph/cugraph/experimental/link_prediction/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/cugraph/cugraph/experimental/structure/__init__.py b/python/cugraph/cugraph/experimental/structure/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/cugraph/cugraph/experimental/structure/__init__.py +++ b/python/cugraph/cugraph/experimental/structure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/__init__.py b/python/pylibcugraph/pylibcugraph/_cugraph_c/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/__init__.py +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/pylibcugraph/pylibcugraph/components/__init__.py b/python/pylibcugraph/pylibcugraph/components/__init__.py index e69de29bb2d..521b825bbf5 100644 --- a/python/pylibcugraph/pylibcugraph/components/__init__.py +++ b/python/pylibcugraph/pylibcugraph/components/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/pylibcugraph/pylibcugraph/internal_types/__init__.py b/python/pylibcugraph/pylibcugraph/internal_types/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/pylibcugraph/pylibcugraph/internal_types/__init__.py +++ b/python/pylibcugraph/pylibcugraph/internal_types/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/pylibcugraph/pylibcugraph/structure/__init__.py b/python/pylibcugraph/pylibcugraph/structure/__init__.py index e69de29bb2d..521b825bbf5 100644 --- a/python/pylibcugraph/pylibcugraph/structure/__init__.py +++ b/python/pylibcugraph/pylibcugraph/structure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/pylibcugraph/pylibcugraph/utilities/__init__.py b/python/pylibcugraph/pylibcugraph/utilities/__init__.py index e69de29bb2d..081b2ae8260 100644 --- a/python/pylibcugraph/pylibcugraph/utilities/__init__.py +++ b/python/pylibcugraph/pylibcugraph/utilities/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 66d6c79cea8de0afb1b71c25a6d4b1b49f6a6731 Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez <97180625+betochimas@users.noreply.github.com> Date: Mon, 21 Aug 2023 21:26:04 -0700 Subject: [PATCH 14/72] Testing util improvements and refactoring (#3705) closes https://github.com/rapidsai/cugraph/issues/2416 Removes a dependency when running python tests, which will speed up test runs. Also introduces a new testing component (`Resultset`) to clean up unit tests by generating golden results and storing them locally/on the cloud. This PR is focused on tests from the `traversal` category. NOTE: Devs will download specific golden results from the cloud to avoid local overhead. 
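To make the intended test-side usage concrete, here is a short sketch built on
the helpers this PR adds (`load_resultset` and `get_resultset` from
`cugraph.testing`); the argument values simply mirror the updated BFS tests and
are not additional API:

# Illustrative usage of the golden-result helpers introduced in this PR.
# load_resultset() downloads and unpacks the results archive on first use and
# reads the mapping file; get_resultset() then returns a stored golden result
# as a cudf.DataFrame keyed by the algorithm/dataset/parameter combination.
import cudf
from cugraph.testing import load_resultset, get_resultset

load_resultset(
    "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz"
)
golden = get_resultset(
    resultset_name="traversal",
    algo="single_source_shortest_path_length",
    graph_dataset="karate",
    graph_directed="True",
    start_vertex="7",
    cutoff="None",
)
golden_distances = cudf.Series(
    golden.distance.values, index=golden.vertex
).to_dict()
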
Authors: - Dylan Chima-Sanchez (https://github.com/betochimas) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3705 --- python/cugraph/cugraph/testing/__init__.py | 11 +- .../cugraph/testing/generate_resultsets.py | 276 ++++++++++++++++++ python/cugraph/cugraph/testing/resultset.py | 136 +++++++++ .../cugraph/tests/traversal/test_bfs.py | 210 +++++++------ .../cugraph/tests/traversal/test_paths.py | 117 +++++--- .../cugraph/tests/traversal/test_sssp.py | 219 ++++++++------ 6 files changed, 743 insertions(+), 226 deletions(-) create mode 100644 python/cugraph/cugraph/testing/generate_resultsets.py create mode 100644 python/cugraph/cugraph/testing/resultset.py diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py index db841a9a865..bde398aadbd 100644 --- a/python/cugraph/cugraph/testing/__init__.py +++ b/python/cugraph/cugraph/testing/__init__.py @@ -11,7 +11,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH, RAPIDS_DATASET_ROOT_DIR +from cugraph.testing.utils import ( + RAPIDS_DATASET_ROOT_DIR_PATH, + RAPIDS_DATASET_ROOT_DIR, +) +from cugraph.testing.resultset import ( + Resultset, + load_resultset, + get_resultset, + results_dir_path, +) from cugraph.datasets import ( cyber, dolphins, diff --git a/python/cugraph/cugraph/testing/generate_resultsets.py b/python/cugraph/cugraph/testing/generate_resultsets.py new file mode 100644 index 00000000000..9724aca32dc --- /dev/null +++ b/python/cugraph/cugraph/testing/generate_resultsets.py @@ -0,0 +1,276 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from tempfile import NamedTemporaryFile +import random + +import numpy as np +import networkx as nx + +import cudf +import cugraph +from cugraph.datasets import dolphins, netscience, karate_disjoint, karate +from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path + + +_resultsets = {} + + +def add_resultset(result_data_dictionary, **kwargs): + rs = Resultset(result_data_dictionary) + hashable_dict_repr = tuple((k, kwargs[k]) for k in sorted(kwargs.keys())) + _resultsets[hashable_dict_repr] = rs + + +if __name__ == "__main__": + # ============================================================================= + # Parameters + # ============================================================================= + SEEDS = [42] + + DIRECTED_GRAPH_OPTIONS = [True, False] + + DEPTH_LIMITS = [None, 1, 5, 18] + + DATASETS = [dolphins, netscience, karate_disjoint] + + # ============================================================================= + # tests/traversal/test_bfs.py + # ============================================================================= + test_bfs_results = {} + + for ds in DATASETS + [karate]: + for seed in SEEDS: + for depth_limit in DEPTH_LIMITS: + for dirctd in DIRECTED_GRAPH_OPTIONS: + # this is used for get_cu_graph_golden_results_and_params + Gnx = utils.generate_nx_graph_from_file( + ds.get_path(), directed=dirctd + ) + random.seed(seed) + start_vertex = random.sample(list(Gnx.nodes()), 1)[0] + golden_values = nx.single_source_shortest_path_length( + Gnx, start_vertex, cutoff=depth_limit + ) + vertices = cudf.Series(golden_values.keys()) + distances = cudf.Series(golden_values.values()) + add_resultset( + {"vertex": vertices, "distance": distances}, + graph_dataset=ds.metadata["name"], + graph_directed=str(dirctd), + algo="single_source_shortest_path_length", + start_vertex=str(start_vertex), + cutoff=str(depth_limit), + ) + + # these are pandas dataframes + for dirctd in DIRECTED_GRAPH_OPTIONS: + Gnx = utils.generate_nx_graph_from_file(karate.get_path(), directed=dirctd) + golden_result = cugraph.bfs_edges(Gnx, source=7) + cugraph_df = cudf.from_pandas(golden_result) + add_resultset( + cugraph_df, + graph_dataset="karate", + graph_directed=str(dirctd), + algo="bfs_edges", + source="7", + ) + + # ============================================================================= + # tests/traversal/test_sssp.py + # ============================================================================= + test_sssp_results = {} + + SOURCES = [1] + + for ds in SMALL_DATASETS: + for source in SOURCES: + Gnx = utils.generate_nx_graph_from_file(ds.get_path(), directed=True) + golden_paths = nx.single_source_dijkstra_path_length(Gnx, source) + vertices = cudf.Series(golden_paths.keys()) + distances = cudf.Series(golden_paths.values()) + add_resultset( + {"vertex": vertices, "distance": distances}, + graph_dataset=ds.metadata["name"], + graph_directed="True", + algo="single_source_dijkstra_path_length", + source=str(source), + ) + + M = utils.read_csv_for_nx(ds.get_path(), read_weights_in_sp=True) + edge_attr = "weight" + Gnx = nx.from_pandas_edgelist( + M, + source="0", + target="1", + edge_attr=edge_attr, + create_using=nx.DiGraph(), + ) + + M["weight"] = M["weight"].astype(np.int32) + Gnx = nx.from_pandas_edgelist( + M, + source="0", + target="1", + edge_attr="weight", + create_using=nx.DiGraph(), + ) + golden_paths_datatypeconv = nx.single_source_dijkstra_path_length( + Gnx, source + ) + vertices_datatypeconv = cudf.Series(golden_paths_datatypeconv.keys()) + 
distances_datatypeconv = cudf.Series(golden_paths_datatypeconv.values()) + add_resultset( + {"vertex": vertices_datatypeconv, "distance": distances_datatypeconv}, + graph_dataset=ds.metadata["name"], + graph_directed="True", + algo="single_source_dijkstra_path_length", + test="data_type_conversion", + source=str(source), + ) + + for dirctd in DIRECTED_GRAPH_OPTIONS: + for source in SOURCES: + Gnx = utils.generate_nx_graph_from_file( + karate.get_path(), directed=dirctd, edgevals=True + ) + add_resultset( + cugraph.sssp(Gnx, source), + graph_dataset="karate", + graph_directed=str(dirctd), + algo="sssp_nonnative", + source=str(source), + ) + + Gnx = nx.Graph() + Gnx.add_edge(0, 1, other=10) + Gnx.add_edge(1, 2, other=20) + df = cugraph.sssp(Gnx, 0, edge_attr="other") + add_resultset(df, algo="sssp_nonnative", test="network_edge_attr") + + # ============================================================================= + # tests/traversal/test_paths.py + # ============================================================================= + CONNECTED_GRAPH = """1,5,3 + 1,4,1 + 1,2,1 + 1,6,2 + 1,7,2 + 4,5,1 + 2,3,1 + 7,6,2 + """ + + DISCONNECTED_GRAPH = CONNECTED_GRAPH + "8,9,4" + + paths = [("1", "1"), ("1", "5"), ("1", "3"), ("1", "6")] + invalid_paths = { + "connected": [("-1", "1"), ("0", "42")], + "disconnected": [("1", "10"), ("1", "8")], + } + + with NamedTemporaryFile(mode="w+", suffix=".csv") as graph_tf: + graph_tf.writelines(DISCONNECTED_GRAPH) + graph_tf.seek(0) + Gnx_DIS = nx.read_weighted_edgelist(graph_tf.name, delimiter=",") + + res1 = nx.shortest_path_length(Gnx_DIS, source="1", weight="weight") + vertices = cudf.Series(res1.keys()) + distances = cudf.Series(res1.values()) + add_resultset( + {"vertex": vertices, "distance": distances}, + algo="shortest_path_length", + graph_dataset="DISCONNECTED", + graph_directed="True", + source="1", + weight="weight", + ) + + # NOTE: Currently, only traversal result files are generated + random.seed(24) + traversal_mappings = cudf.DataFrame( + columns=[ + "#UUID", + "arg0", + "arg0val", + "arg1", + "arg1val", + "arg2", + "arg2val", + "arg3", + "arg3val", + "arg4", + "arg4val", + "arg5", + "arg5val", + "arg6", + "arg6val", + "arg7", + "arg7val", + "arg8", + "arg8val", + "arg9", + "arg9val", + ] + ) + # Generating ALL results files + if not results_dir_path.exists(): + results_dir_path.mkdir(parents=True, exist_ok=True) + + for temp in _resultsets: + res = _resultsets[temp].get_cudf_dataframe() + temp_filename = str(random.getrandbits(50)) + temp_dict = dict(temp) + argnames, argvals = [t for t in temp_dict.keys()], [ + t for t in temp_dict.values() + ] + single_mapping = np.empty(21, dtype=object) + dict_length = len(argnames) + + single_mapping[0] = temp_filename + for i in np.arange(dict_length): + single_mapping[2 * i + 1] = argnames[i] + single_mapping[2 * i + 2] = argvals[i] + temp_mapping = cudf.DataFrame( + [single_mapping], + columns=[ + "#UUID", + "arg0", + "arg0val", + "arg1", + "arg1val", + "arg2", + "arg2val", + "arg3", + "arg3val", + "arg4", + "arg4val", + "arg5", + "arg5val", + "arg6", + "arg6val", + "arg7", + "arg7val", + "arg8", + "arg8val", + "arg9", + "arg9val", + ], + ) + traversal_mappings = cudf.concat( + [traversal_mappings, temp_mapping], axis=0, ignore_index=True + ) + res.to_csv(results_dir_path / (temp_filename + ".csv"), index=False) + traversal_mappings.to_csv( + results_dir_path / "traversal_mappings.csv", index=False, sep=" " + ) diff --git a/python/cugraph/cugraph/testing/resultset.py 
b/python/cugraph/cugraph/testing/resultset.py new file mode 100644 index 00000000000..490e3a7c4ff --- /dev/null +++ b/python/cugraph/cugraph/testing/resultset.py @@ -0,0 +1,136 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tarfile + +import urllib.request + +import cudf +from cugraph.testing import utils + + +results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets" + + +class Resultset: + """ + A Resultset Object, used to store golden results to easily run tests that + need to access said results without the overhead of running an algorithm + to get the results. + + Parameters + ---------- + data_dictionary : dict + The existing algorithm output, expected as a dictionary + """ + + def __init__(self, data_dictionary): + self._data_dictionary = data_dictionary + + def get_cudf_dataframe(self): + """ + Converts the existing algorithm output from a dictionary to + a cudf.DataFrame before writing the DataFrame to output into a csv + """ + return cudf.DataFrame(self._data_dictionary) + + +_resultsets = {} + + +def load_resultset(resultset_name, resultset_download_url): + """ + Read a mapping file (.csv) in the _results_dir and save the + mappings between each unique set of args/identifiers to UUIDs to the + _resultsets dictionary. If .csv does not exist in + _results_dir, use resultset_download_url to download a file to + install/unpack/etc. to _results_dir first. + """ + mapping_file_path = results_dir_path / (resultset_name + "_mappings.csv") + if not mapping_file_path.exists(): + # Downloads a tar gz from s3 bucket, then unpacks the results files + compressed_file_dir = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" + compressed_file_path = compressed_file_dir / "resultsets.tar.gz" + if not results_dir_path.exists(): + results_dir_path.mkdir(parents=True, exist_ok=True) + if not compressed_file_path.exists(): + urllib.request.urlretrieve(resultset_download_url, compressed_file_path) + tar = tarfile.open(str(compressed_file_path), "r:gz") + tar.extractall(str(results_dir_path)) + tar.close() + + # FIXME: This assumes separator is " ", but should this be configurable? + sep = " " + with open(mapping_file_path) as mapping_file: + for line in mapping_file.readlines(): + if line.startswith("#"): + continue + + (uuid, *row_args) = line.split(sep) + if (len(row_args) % 2) != 0: + raise ValueError( + f'bad row in {mapping_file_path}: "{line}", must ' + "contain UUID followed by an even number of items" + ) + row_keys = row_args[::2] + row_vals = row_args[1::2] + row_keys = " ".join(row_keys).split() + row_vals = " ".join(row_vals).split() + arg_dict = dict(zip(row_keys, row_vals)) + arg_dict["resultset_name"] = resultset_name + # Create a unique string key for the _resultsets dict based on + # sorted row_keys. Looking up results based on args will also have + # to sort, but this will ensure results can looked up without + # requiring maintaining a specific order. 
Example: + # {'a': 1, 'z': 9, 'c': 5, 'b': 2} becomes 'a-1-b-2-c-5-z-9' + resultset_key = "-".join( + [ + str(val) + for arg_dict_pair in sorted(arg_dict.items()) + for val in arg_dict_pair + ] + ) + + _resultsets[resultset_key] = uuid + + +def get_resultset(resultset_name, **kwargs): + """ + Returns the golden results for a specific test. + + Parameters + ---------- + resultset_name : String + Name of the test's module (currently just 'traversal' is supported) + + kwargs : + All distinct test details regarding the choice of algorithm, dataset, + and graph + """ + arg_dict = dict(kwargs) + arg_dict["resultset_name"] = resultset_name + # Example: + # {'a': 1, 'z': 9, 'c': 5, 'b': 2} becomes 'a-1-b-2-c-5-z-9' + resultset_key = "-".join( + [ + str(val) + for arg_dict_pair in sorted(arg_dict.items()) + for val in arg_dict_pair + ] + ) + uuid = _resultsets.get(resultset_key) + if uuid is None: + raise KeyError(f"results for {arg_dict} not found") + + results_filename = results_dir_path / (uuid + ".csv") + return cudf.read_csv(results_filename) diff --git a/python/cugraph/cugraph/tests/traversal/test_bfs.py b/python/cugraph/cugraph/tests/traversal/test_bfs.py index 89b00e66baa..164963848ad 100644 --- a/python/cugraph/cugraph/tests/traversal/test_bfs.py +++ b/python/cugraph/cugraph/tests/traversal/test_bfs.py @@ -12,14 +12,10 @@ # limitations under the License. import gc -import random import pytest import cupy as cp import numpy as np -import pandas as pd -import networkx as nx -import networkx.algorithms.centrality.betweenness as nxacb from scipy.sparse import coo_matrix as sp_coo_matrix from scipy.sparse import csr_matrix as sp_csr_matrix from scipy.sparse import csc_matrix as sp_csc_matrix @@ -30,7 +26,13 @@ from cupyx.scipy.sparse import csr_matrix as cp_csr_matrix from cupyx.scipy.sparse import csc_matrix as cp_csc_matrix from pylibcugraph.testing.utils import gen_fixture_params_product -from cugraph.testing import utils, DEFAULT_DATASETS, SMALL_DATASETS +from cugraph.testing import ( + utils, + get_resultset, + load_resultset, + DEFAULT_DATASETS, + SMALL_DATASETS, +) # ============================================================================= @@ -40,6 +42,13 @@ SUBSET_SEED_OPTIONS = [42] +DATASET_STARTS = { + "dolphins": 16, + "karate": 7, + "karate-disjoint": 19, + "netscience": 1237, +} + DEFAULT_EPSILON = 1e-6 DEPTH_LIMITS = [None, 1, 5, 18] @@ -48,8 +57,6 @@ # connected_components calls. cuGraph_input_output_map = { cugraph.Graph: cudf.DataFrame, - nx.Graph: pd.DataFrame, - nx.DiGraph: pd.DataFrame, cp_coo_matrix: tuple, cp_csr_matrix: tuple, cp_csc_matrix: tuple, @@ -83,9 +90,6 @@ def convert_output_to_cudf(input_G_or_matrix, cugraph_result): if expected_return_type is cudf.DataFrame: return cugraph_result - elif expected_return_type is pd.DataFrame: - return cudf.from_pandas(cugraph_result) - # A CuPy/SciPy input means the return value will be a 2-tuple of: # distance: cupy.ndarray # ndarray of shortest distances between source and vertex. @@ -142,10 +146,11 @@ def compare_single_sp_counter(result, expected, epsilon=DEFAULT_EPSILON): return np.isclose(result, expected, rtol=epsilon) -def compare_bfs(benchmark_callable, G, nx_values, start_vertex, depth_limit): +def compare_bfs(benchmark_callable, G, golden_values, start_vertex, depth_limit): """ - Genereate both cugraph and reference bfs traversal. + Generate both cugraph and reference bfs traversal. 
""" + if isinstance(start_vertex, int): result = benchmark_callable(cugraph.bfs_edges, G, start_vertex) cugraph_df = convert_output_to_cudf(G, result) @@ -156,10 +161,10 @@ def compare_bfs(benchmark_callable, G, nx_values, start_vertex, depth_limit): # not contain all the vertices while the cugraph version return # a cudf.DataFrame with all the vertices, also some verification # become slow with the data transfer - compare_func(cugraph_df, nx_values, start_vertex) + compare_func(cugraph_df, golden_values, start_vertex) elif isinstance(start_vertex, list): # For other Verifications - all_nx_values = nx_values + all_golden_values = golden_values all_cugraph_distances = [] def func_to_benchmark(): @@ -173,13 +178,13 @@ def func_to_benchmark(): for (i, sv) in enumerate(start_vertex): cugraph_df = convert_output_to_cudf(G, all_cugraph_distances[i]) - compare_func(cugraph_df, all_nx_values[i], sv) + compare_func(cugraph_df, all_golden_values[i], sv) else: # Unknown type given to seed raise NotImplementedError("Invalid type for start_vertex") -def _compare_bfs(cugraph_df, nx_distances, source): +def _compare_bfs(cugraph_df, golden_distances, source): # This call should only contain 3 columns: # 'vertex', 'distance', 'predecessor' # It also confirms wether or not 'sp_counter' has been created by the call @@ -208,30 +213,31 @@ def _compare_bfs(cugraph_df, nx_distances, source): # We assume that the distances are given back as integers in BFS # max_val = np.iinfo(df['distance'].dtype).max # Unreached vertices have a distance of max_val - missing_vertex_error = 0 distance_mismatch_error = 0 invalid_predecessor_error = 0 - for vertex in nx_distances: + for vertex in golden_distances: if vertex in cu_distances: result = cu_distances[vertex] - expected = nx_distances[vertex] + expected = golden_distances[vertex] if result != expected: print( "[ERR] Mismatch on distances: " - "vid = {}, cugraph = {}, nx = {}".format(vertex, result, expected) + "vid = {}, cugraph = {}, golden = {}".format( + vertex, result, expected + ) ) distance_mismatch_error += 1 if vertex not in cu_predecessors: missing_vertex_error += 1 else: pred = cu_predecessors[vertex] - if vertex != source and pred not in nx_distances: + if vertex != source and pred not in golden_distances: invalid_predecessor_error += 1 else: # The graph is unweighted thus, predecessors are 1 away if vertex != source and ( - (nx_distances[pred] + 1 != cu_distances[vertex]) + (golden_distances[pred] + 1 != cu_distances[vertex]) ): print( "[ERR] Invalid on predecessors: " @@ -245,34 +251,38 @@ def _compare_bfs(cugraph_df, nx_distances, source): assert invalid_predecessor_error == 0, "There are invalid predecessors" -def get_cu_graph_nx_graph_and_params(dataset, directed): +def get_cu_graph_and_params(dataset, directed): """ - Helper for fixtures returning a Nx graph obj and params. + Helper for fixtures returning a cuGraph obj and params. """ # create graph G = dataset.get_graph(create_using=cugraph.Graph(directed=directed)) dataset_path = dataset.get_path() + dataset_name = dataset.metadata["name"] + return (G, dataset_path, dataset_name, directed) - return ( - G, - dataset_path, - directed, - utils.generate_nx_graph_from_file(dataset_path, directed), - ) - -def get_cu_graph_nx_results_and_params(seed, depth_limit, G, dataset, directed, Gnx): +def get_cu_graph_golden_results_and_params( + depth_limit, G, dataset_path, dataset_name, directed, _ +): """ - Helper for fixtures returning Nx results and params. 
+ Helper for fixtures returning golden results and params. """ - random.seed(seed) - start_vertex = random.sample(list(Gnx.nodes()), 1)[0] - - nx_values = nx.single_source_shortest_path_length( - Gnx, start_vertex, cutoff=depth_limit + start_vertex = DATASET_STARTS[dataset_name] + golden_values = get_resultset( + resultset_name="traversal", + algo="single_source_shortest_path_length", + cutoff=str(depth_limit), + graph_dataset=dataset_name, + graph_directed=str(directed), + start_vertex=str(start_vertex), ) - return (G, dataset, directed, nx_values, start_vertex, depth_limit) + golden_values = cudf.Series( + golden_values.distance.values, index=golden_values.vertex + ).to_dict() + + return (G, dataset_path, directed, golden_values, start_vertex, depth_limit) # ============================================================================= @@ -289,9 +299,7 @@ def get_cu_graph_nx_results_and_params(seed, depth_limit, G, dataset, directed, # not do this automatically (unlike multiply-parameterized tests). The 2nd # item in the tuple is a label for the param value used when displaying the # full test name. -algo_test_fixture_params = gen_fixture_params_product( - (SEEDS, "seed"), (DEPTH_LIMIT, "depth_limit") -) +algo_test_fixture_params = gen_fixture_params_product((DEPTH_LIMIT, "depth_limit")) graph_fixture_params = gen_fixture_params_product( (DATASETS, "ds"), (DIRECTED, "dirctd") @@ -301,11 +309,12 @@ def get_cu_graph_nx_results_and_params(seed, depth_limit, G, dataset, directed, (SMALL_DATASETS, "ds"), (DIRECTED, "dirctd") ) + # The single param list variants are used when only 1 param combination is # needed (eg. testing non-native input types where tests for other combinations # was covered elsewhere). single_algo_test_fixture_params = gen_fixture_params_product( - ([SEEDS[0]], "seed"), ([DEPTH_LIMIT[0]], "depth_limit") + ([DEPTH_LIMIT[0]], "depth_limit") ) single_small_graph_fixture_params = gen_fixture_params_product( @@ -313,22 +322,32 @@ def get_cu_graph_nx_results_and_params(seed, depth_limit, G, dataset, directed, ) +# Fixture that loads all golden results necessary to run cugraph tests if the +# tests are not already present in the designated results directory. Most of the +# time, this will only check if the module-specific mapping file exists. +@pytest.fixture(scope="module") +def load_traversal_results(): + load_resultset( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + # Fixtures that result in a test-per (dataset X directed/undirected) # combination. These return the path to the dataset, a bool indicating if a # directed graph is being used, and the Nx graph object. 
@pytest.fixture(scope="module", params=graph_fixture_params) -def dataset_nx_graph(request): - return get_cu_graph_nx_graph_and_params(*request.param) +def dataset_golden_results(request): + return get_cu_graph_and_params(*request.param) @pytest.fixture(scope="module", params=small_graph_fixture_params) -def small_dataset_nx_graph(request): - return get_cu_graph_nx_graph_and_params(*request.param) +def small_dataset_golden_results(request): + return get_cu_graph_and_params(*request.param) @pytest.fixture(scope="module", params=single_small_graph_fixture_params) -def single_small_dataset_nx_graph(request): - return get_cu_graph_nx_graph_and_params(*request.param) +def single_small_dataset_golden_results(request): + return get_cu_graph_and_params(*request.param) # Fixtures that result in a test-per (dataset_nx_graph combinations X algo_test @@ -337,42 +356,29 @@ def single_small_dataset_nx_graph(request): # results, the starting vertex for BFS, and flag if shortes path counting was # used. @pytest.fixture(scope="module", params=algo_test_fixture_params) -def dataset_nxresults_startvertex_spc(dataset_nx_graph, request): - return get_cu_graph_nx_results_and_params(*request.param, *dataset_nx_graph) +def dataset_goldenresults_startvertex_spc( + dataset_golden_results, load_traversal_results, request +): + return get_cu_graph_golden_results_and_params( + *request.param, *dataset_golden_results, load_traversal_results + ) @pytest.fixture(scope="module", params=single_algo_test_fixture_params) -def single_dataset_nxresults_startvertex_spc(single_small_dataset_nx_graph, request): - return get_cu_graph_nx_results_and_params( - *request.param, *single_small_dataset_nx_graph +def single_dataset_goldenresults_startvertex_spc( + single_small_dataset_golden_results, load_traversal_results, request +): + return get_cu_graph_golden_results_and_params( + *request.param, *single_small_dataset_golden_results, load_traversal_results ) -@pytest.fixture(scope="module") -def dataset_nxresults_allstartvertices_spc(small_dataset_nx_graph): - - dataset, directed, Gnx = small_dataset_nx_graph - use_spc = True - - start_vertices = [start_vertex for start_vertex in Gnx] - - all_nx_values = [] - for start_vertex in start_vertices: - _, _, nx_sp_counter = nxacb._single_source_shortest_path_basic( - Gnx, start_vertex - ) - nx_values = nx_sp_counter - all_nx_values.append(nx_values) - - return (dataset, directed, all_nx_values, start_vertices, use_spc) - - # ============================================================================= # Tests # ============================================================================= @pytest.mark.sg @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_INPUT_TYPES) -def test_bfs(gpubenchmark, dataset_nxresults_startvertex_spc, cugraph_input_type): +def test_bfs(gpubenchmark, dataset_goldenresults_startvertex_spc, cugraph_input_type): """ Test BFS traversal on random source with distance and predecessors """ @@ -380,52 +386,62 @@ def test_bfs(gpubenchmark, dataset_nxresults_startvertex_spc, cugraph_input_type G, dataset, directed, - nx_values, + golden_values, start_vertex, depth_limit, - ) = dataset_nxresults_startvertex_spc + ) = dataset_goldenresults_startvertex_spc - # special case: ensure cugraph and Nx Graph types are DiGraphs if - # "directed" is set, since the graph type parameterization is currently - # independent of the directed parameter. Unfortunately this does not - # change the "id" in the pytest output. 
Ignore for nonnative inputs if directed: if isinstance(cugraph_input_type, cugraph.Graph): cugraph_input_type = cugraph.Graph(directed=True) - elif cugraph_input_type is nx.Graph: - cugraph_input_type = nx.DiGraph if not isinstance(cugraph_input_type, cugraph.Graph): G_or_matrix = utils.create_obj_from_csv(dataset, cugraph_input_type) else: G_or_matrix = G - compare_bfs(gpubenchmark, G_or_matrix, nx_values, start_vertex, depth_limit) + compare_bfs(gpubenchmark, G_or_matrix, golden_values, start_vertex, depth_limit) @pytest.mark.sg -@pytest.mark.parametrize( - "cugraph_input_type", utils.NX_INPUT_TYPES + utils.MATRIX_INPUT_TYPES -) -def test_bfs_nonnative_inputs( - gpubenchmark, single_dataset_nxresults_startvertex_spc, cugraph_input_type +@pytest.mark.parametrize("cugraph_input_type", utils.MATRIX_INPUT_TYPES) +def test_bfs_nonnative_inputs_matrix( + gpubenchmark, single_dataset_goldenresults_startvertex_spc, cugraph_input_type ): - test_bfs(gpubenchmark, single_dataset_nxresults_startvertex_spc, cugraph_input_type) + test_bfs( + gpubenchmark, single_dataset_goldenresults_startvertex_spc, cugraph_input_type + ) @pytest.mark.sg -@pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_INPUT_TYPES) -def test_bfs_invalid_start( - gpubenchmark, dataset_nxresults_startvertex_spc, cugraph_input_type +def test_bfs_nonnative_inputs_nx( + single_dataset_goldenresults_startvertex_spc, ): ( - G, - dataset, + _, + _, directed, - nx_values, + golden_values, start_vertex, - depth_limit, - ) = dataset_nxresults_startvertex_spc + _, + ) = single_dataset_goldenresults_startvertex_spc + + cugraph_df = get_resultset( + resultset_name="traversal", + algo="bfs_edges", + graph_dataset="karate", + graph_directed=str(directed), + source=str(start_vertex), + ) + + compare_func = _compare_bfs + compare_func(cugraph_df, golden_values, start_vertex) + + +@pytest.mark.sg +@pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_INPUT_TYPES) +def test_bfs_invalid_start(dataset_goldenresults_startvertex_spc, cugraph_input_type): + (G, _, _, _, start_vertex, depth_limit) = dataset_goldenresults_startvertex_spc el = G.view_edge_list() diff --git a/python/cugraph/cugraph/tests/traversal/test_paths.py b/python/cugraph/cugraph/tests/traversal/test_paths.py index 8a751ba8840..5ee22874f4a 100644 --- a/python/cugraph/cugraph/tests/traversal/test_paths.py +++ b/python/cugraph/cugraph/tests/traversal/test_paths.py @@ -15,12 +15,13 @@ from tempfile import NamedTemporaryFile import math +import numpy as np import pytest -import networkx as nx import cudf import cupy import cugraph +from cugraph.testing import get_resultset, load_resultset from cupyx.scipy.sparse import coo_matrix as cupy_coo_matrix @@ -37,13 +38,35 @@ DISCONNECTED_GRAPH = CONNECTED_GRAPH + "8,9,4" +# Single value or callable golden results are not added as a Resultset +paths_golden_results = { + "shortest_path_length_1_1": 0, + "shortest_path_length_1_5": 2.0, + "shortest_path_length_1_3": 2.0, + "shortest_path_length_1_6": 2.0, + "shortest_path_length_-1_1": ValueError, + "shortest_path_length_1_10": ValueError, + "shortest_path_length_0_42": ValueError, + "shortest_path_length_1_8": 3.4028235e38, +} + + +# Fixture that loads all golden results necessary to run cugraph tests if the +# tests are not already present in the designated results directory. Most of the +# time, this will only check if the module-specific mapping file exists. 
+@pytest.fixture(scope="module") +def load_traversal_results(): + load_resultset( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + @pytest.fixture def graphs(request): with NamedTemporaryFile(mode="w+", suffix=".csv") as graph_tf: graph_tf.writelines(request.param) graph_tf.seek(0) - nx_G = nx.read_weighted_edgelist(graph_tf.name, delimiter=",") cudf_df = cudf.read_csv( graph_tf.name, names=["src", "dst", "data"], @@ -74,57 +97,48 @@ def graphs(request): (weights, (i, j)), shape=(largest_vertex + 1, largest_vertex + 1) ) - yield cugraph_G, nx_G, cupy_df + yield cugraph_G, cupy_df @pytest.mark.sg @pytest.mark.parametrize("graphs", [CONNECTED_GRAPH], indirect=True) def test_connected_graph_shortest_path_length(graphs): - cugraph_G, nx_G, cupy_df = graphs + cugraph_G, cupy_df = graphs path_1_to_1_length = cugraph.shortest_path_length(cugraph_G, 1, 1) + # FIXME: aren't the first two assertions in each batch redundant? assert path_1_to_1_length == 0.0 - assert path_1_to_1_length == nx.shortest_path_length( - nx_G, "1", target="1", weight="weight" - ) - assert path_1_to_1_length == cugraph.shortest_path_length(nx_G, "1", "1") + assert path_1_to_1_length == paths_golden_results["shortest_path_length_1_1"] assert path_1_to_1_length == cugraph.shortest_path_length(cupy_df, 1, 1) path_1_to_5_length = cugraph.shortest_path_length(cugraph_G, 1, 5) assert path_1_to_5_length == 2.0 - assert path_1_to_5_length == nx.shortest_path_length( - nx_G, "1", target="5", weight="weight" - ) - assert path_1_to_5_length == cugraph.shortest_path_length(nx_G, "1", "5") + assert path_1_to_5_length == paths_golden_results["shortest_path_length_1_5"] assert path_1_to_5_length == cugraph.shortest_path_length(cupy_df, 1, 5) path_1_to_3_length = cugraph.shortest_path_length(cugraph_G, 1, 3) assert path_1_to_3_length == 2.0 - assert path_1_to_3_length == nx.shortest_path_length( - nx_G, "1", target="3", weight="weight" - ) - assert path_1_to_3_length == cugraph.shortest_path_length(nx_G, "1", "3") + assert path_1_to_3_length == paths_golden_results["shortest_path_length_1_3"] assert path_1_to_3_length == cugraph.shortest_path_length(cupy_df, 1, 3) path_1_to_6_length = cugraph.shortest_path_length(cugraph_G, 1, 6) assert path_1_to_6_length == 2.0 - assert path_1_to_6_length == nx.shortest_path_length( - nx_G, "1", target="6", weight="weight" - ) - assert path_1_to_6_length == cugraph.shortest_path_length(nx_G, "1", "6") + assert path_1_to_6_length == paths_golden_results["shortest_path_length_1_6"] assert path_1_to_6_length == cugraph.shortest_path_length(cupy_df, 1, 6) @pytest.mark.sg @pytest.mark.parametrize("graphs", [CONNECTED_GRAPH], indirect=True) def test_shortest_path_length_invalid_source(graphs): - cugraph_G, nx_G, cupy_df = graphs + cugraph_G, cupy_df = graphs with pytest.raises(ValueError): cugraph.shortest_path_length(cugraph_G, -1, 1) - with pytest.raises(ValueError): - cugraph.shortest_path_length(nx_G, "-1", "1") + result = paths_golden_results["shortest_path_length_-1_1"] + if callable(result): + with pytest.raises(ValueError): + raise result() with pytest.raises(ValueError): cugraph.shortest_path_length(cupy_df, -1, 1) @@ -133,13 +147,15 @@ def test_shortest_path_length_invalid_source(graphs): @pytest.mark.sg @pytest.mark.parametrize("graphs", [DISCONNECTED_GRAPH], indirect=True) def test_shortest_path_length_invalid_target(graphs): - cugraph_G, nx_G, cupy_df = graphs + cugraph_G, cupy_df = graphs with pytest.raises(ValueError): cugraph.shortest_path_length(cugraph_G, 1, 10) 
- with pytest.raises(ValueError): - cugraph.shortest_path_length(nx_G, "1", "10") + result = paths_golden_results["shortest_path_length_1_10"] + if callable(result): + with pytest.raises(ValueError): + raise result() with pytest.raises(ValueError): cugraph.shortest_path_length(cupy_df, 1, 10) @@ -148,13 +164,15 @@ def test_shortest_path_length_invalid_target(graphs): @pytest.mark.sg @pytest.mark.parametrize("graphs", [CONNECTED_GRAPH], indirect=True) def test_shortest_path_length_invalid_vertexes(graphs): - cugraph_G, nx_G, cupy_df = graphs + cugraph_G, cupy_df = graphs with pytest.raises(ValueError): cugraph.shortest_path_length(cugraph_G, 0, 42) - with pytest.raises(ValueError): - cugraph.shortest_path_length(nx_G, "0", "42") + result = paths_golden_results["shortest_path_length_0_42"] + if callable(result): + with pytest.raises(ValueError): + raise result() with pytest.raises(ValueError): cugraph.shortest_path_length(cupy_df, 0, 42) @@ -163,7 +181,7 @@ def test_shortest_path_length_invalid_vertexes(graphs): @pytest.mark.sg @pytest.mark.parametrize("graphs", [DISCONNECTED_GRAPH], indirect=True) def test_shortest_path_length_no_path(graphs): - cugraph_G, nx_G, cupy_df = graphs + cugraph_G, cupy_df = graphs # FIXME: In case there is no path between two vertices, the # result can be either the max of float32 or float64 @@ -171,38 +189,51 @@ def test_shortest_path_length_no_path(graphs): path_1_to_8 = cugraph.shortest_path_length(cugraph_G, 1, 8) assert path_1_to_8 == sys.float_info.max - assert cugraph.shortest_path_length(nx_G, "1", "8") in [max_float_32, path_1_to_8] + + golden_path_1_to_8 = paths_golden_results["shortest_path_length_1_8"] + golden_path_1_to_8 = np.float32(golden_path_1_to_8) + assert golden_path_1_to_8 in [ + max_float_32, + path_1_to_8, + ] assert path_1_to_8 == cugraph.shortest_path_length(cupy_df, 1, 8) @pytest.mark.sg @pytest.mark.parametrize("graphs", [DISCONNECTED_GRAPH], indirect=True) -def test_shortest_path_length_no_target(graphs): - cugraph_G, nx_G, cupy_df = graphs +def test_shortest_path_length_no_target(graphs, load_traversal_results): + cugraph_G, cupy_df = graphs cugraph_path_1_to_all = cugraph.shortest_path_length(cugraph_G, 1) - nx_path_1_to_all = nx.shortest_path_length(nx_G, source="1", weight="weight") - nx_gpu_path_1_to_all = cugraph.shortest_path_length(nx_G, "1") + golden_path_1_to_all = get_resultset( + resultset_name="traversal", + algo="shortest_path_length", + graph_dataset="DISCONNECTED", + graph_directed=str(True), + source="1", + weight="weight", + ) cupy_path_1_to_all = cugraph.shortest_path_length(cupy_df, 1) # Cast networkx graph on cugraph vertex column type from str to int. # SSSP preserves vertex type, convert for comparison - nx_gpu_path_1_to_all["vertex"] = nx_gpu_path_1_to_all["vertex"].astype("int32") - - assert cugraph_path_1_to_all == nx_gpu_path_1_to_all assert cugraph_path_1_to_all == cupy_path_1_to_all # results for vertex 8 and 9 are not returned - assert cugraph_path_1_to_all.shape[0] == len(nx_path_1_to_all) + 2 - + assert cugraph_path_1_to_all.shape[0] == len(golden_path_1_to_all) + 2 for index in range(cugraph_path_1_to_all.shape[0]): - vertex = str(cugraph_path_1_to_all["vertex"][index].item()) + vertex = cugraph_path_1_to_all["vertex"][index].item() distance = cugraph_path_1_to_all["distance"][index].item() # verify cugraph against networkx - if vertex in {"8", "9"}: + if vertex in {8, 9}: # Networkx does not return distances for these vertexes. 
assert distance == sys.float_info.max else: - assert distance == nx_path_1_to_all[vertex] + assert ( + distance + == golden_path_1_to_all.loc[ + golden_path_1_to_all.vertex == vertex + ].distance.iloc[0] + ) diff --git a/python/cugraph/cugraph/tests/traversal/test_sssp.py b/python/cugraph/cugraph/tests/traversal/test_sssp.py index 0d2646b29be..58288e022e8 100644 --- a/python/cugraph/cugraph/tests/traversal/test_sssp.py +++ b/python/cugraph/cugraph/tests/traversal/test_sssp.py @@ -12,12 +12,10 @@ # limitations under the License. import gc -import time import pytest import numpy as np import pandas as pd -import networkx as nx import cudf import cupyx @@ -30,18 +28,19 @@ from scipy.sparse import csr_matrix as sp_csr_matrix from scipy.sparse import csc_matrix as sp_csc_matrix from pylibcugraph.testing.utils import gen_fixture_params_product -from cugraph.testing import utils, UNDIRECTED_DATASETS, SMALL_DATASETS - - -print("Networkx version : {} ".format(nx.__version__)) +from cugraph.testing import ( + utils, + get_resultset, + load_resultset, + UNDIRECTED_DATASETS, + SMALL_DATASETS, +) # Map of cuGraph input types to the expected output type for cuGraph # connected_components calls. cuGraph_input_output_map = { cugraph.Graph: cudf.DataFrame, - nx.Graph: pd.DataFrame, - nx.DiGraph: pd.DataFrame, cp_coo_matrix: tuple, cp_csr_matrix: tuple, cp_csc_matrix: tuple, @@ -128,45 +127,47 @@ def cugraph_call(gpu_benchmark_callable, input_G_or_matrix, source, edgevals=Tru return result_dict, max_val -def networkx_call(graph_file, source, edgevals=True): +def resultset_call(graph_file, source, load_results, edgevals=True): dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) - # Directed NetworkX graph - edge_attr = "weight" if edgevals else None - - Gnx = nx.from_pandas_edgelist( - M, - source="0", - target="1", - edge_attr=edge_attr, - create_using=nx.DiGraph(), - ) - print("NX Solving... ") - t1 = time.time() + dataset_name = graph_file.metadata["name"] if edgevals is False: - nx_paths = nx.single_source_shortest_path_length(Gnx, source) + # FIXME: no test coverage if edgevals is False, this assertion is never reached + assert False + golden_paths = get_resultset( + resultset_name="traversal", + algo="single_source_shortest_path_length", + graph_dataset=dataset_name, + graph_directed=str(True), + source=str(source), + ) else: - # FIXME: The nx call below doesn't return accurate results as it seems to - # not support 'weights'. It matches cuGraph result only if the weight column - # is 1s. - nx_paths = nx.single_source_dijkstra_path_length(Gnx, source) + # FIXME: The golden results (nx) below doesn't return accurate results as it + # seems to not support 'weights'. It matches cuGraph result only if the weight + # column is 1s. 
+ golden_paths = get_resultset( + resultset_name="traversal", + algo="single_source_dijkstra_path_length", + graph_dataset=dataset_name, + graph_directed=str(True), + source=str(source), + ) + golden_paths = cudf.Series( + golden_paths.distance.values, index=golden_paths.vertex + ).to_dict() G = graph_file.get_graph( create_using=cugraph.Graph(directed=True), ignore_weights=not edgevals ) - t2 = time.time() - t1 - print("NX Time : " + str(t2)) - - return (G, dataset_path, source, nx_paths, Gnx) + return (G, dataset_path, graph_file, source, golden_paths) # ============================================================================= # Pytest fixtures # ============================================================================= -# Call gen_fixture_params_product() to caluculate the cartesian product of +# Call gen_fixture_params_product() to calculate the cartesian product of # multiple lists of params. This is required since parameterized fixtures do # not do this automatically (unlike multiply-parameterized tests). The 2nd # item in the tuple is a label for the param value used when displaying the @@ -182,28 +183,36 @@ def networkx_call(graph_file, source, edgevals=True): ) -# These fixtures will call networkx BFS algos and save the result. The networkx -# call is only made only once per input param combination. +# Fixture that loads all golden results necessary to run cugraph tests if the +# tests are not already present in the designated results directory. Most of the +# time, this will only check if the module-specific mapping file exists. +@pytest.fixture(scope="module") +def load_traversal_results(): + load_resultset( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + @pytest.fixture(scope="module", params=fixture_params) -def dataset_source_nxresults(request): +def dataset_source_goldenresults(request): # request.param is a tuple of params from fixture_params. 
When expanded - # with *, will be passed to networkx_call() as args (graph_file, source) - return networkx_call(*(request.param)) + # with *, will be passed to resultset_call() as args (graph_file, source) + return resultset_call(*(request.param), load_traversal_results) @pytest.fixture(scope="module", params=fixture_params_single_dataset) -def single_dataset_source_nxresults(request): - return networkx_call(*(request.param)) +def single_dataset_source_goldenresults(request): + return resultset_call(*(request.param), load_traversal_results) @pytest.fixture(scope="module", params=fixture_params) -def dataset_source_nxresults_weighted(request): - return networkx_call(*(request.param), edgevals=True) +def dataset_source_goldenresults_weighted(request): + return resultset_call(*(request.param), load_traversal_results, edgevals=True) @pytest.fixture(scope="module", params=fixture_params_single_dataset) -def single_dataset_source_nxresults_weighted(request): - return networkx_call(*(request.param), edgevals=True) +def single_dataset_source_goldenresults_weighted(request): + return resultset_call(*(request.param), load_traversal_results, edgevals=True) # ============================================================================= @@ -211,9 +220,9 @@ def single_dataset_source_nxresults_weighted(request): # ============================================================================= @pytest.mark.sg @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES) -def test_sssp(gpubenchmark, dataset_source_nxresults, cugraph_input_type): +def test_sssp(gpubenchmark, dataset_source_goldenresults, cugraph_input_type): # Extract the params generated from the fixture - (G, dataset_path, source, nx_paths, Gnx) = dataset_source_nxresults + (G, dataset_path, _, source, golden_paths) = dataset_source_goldenresults if not isinstance(cugraph_input_type, cugraph.Graph): input_G_or_matrix = utils.create_obj_from_csv( @@ -221,7 +230,6 @@ def test_sssp(gpubenchmark, dataset_source_nxresults, cugraph_input_type): ) else: input_G_or_matrix = G - cu_paths, max_val = cugraph_call(gpubenchmark, input_G_or_matrix, source) # Calculating mismatch @@ -231,14 +239,14 @@ def test_sssp(gpubenchmark, dataset_source_nxresults, cugraph_input_type): # NOTE : If distance type is float64 then cu_paths[vid][0] # should be compared against np.finfo(np.float64).max) if cu_paths[vid][0] != max_val: - if cu_paths[vid][0] != nx_paths[vid]: + if cu_paths[vid][0] != golden_paths[vid]: err = err + 1 # check pred dist + 1 = current dist (since unweighted) pred = cu_paths[vid][1] if vid != source and cu_paths[pred][0] + 1 != cu_paths[vid][0]: err = err + 1 else: - if vid in nx_paths.keys(): + if vid in golden_paths.keys(): err = err + 1 assert err == 0 @@ -246,8 +254,10 @@ def test_sssp(gpubenchmark, dataset_source_nxresults, cugraph_input_type): @pytest.mark.sg @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES) -def test_sssp_invalid_start(gpubenchmark, dataset_source_nxresults, cugraph_input_type): - (G, _, source, nx_paths, Gnx) = dataset_source_nxresults +def test_sssp_invalid_start( + gpubenchmark, dataset_source_goldenresults, cugraph_input_type +): + (G, _, _, source, _) = dataset_source_goldenresults el = G.view_edge_list() newval = max(el.src.max(), el.dst.max()) + 1 @@ -258,22 +268,61 @@ def test_sssp_invalid_start(gpubenchmark, dataset_source_nxresults, cugraph_inpu @pytest.mark.sg -@pytest.mark.parametrize( - "cugraph_input_type", utils.NX_DIR_INPUT_TYPES + utils.MATRIX_INPUT_TYPES -) 
-def test_sssp_nonnative_inputs( - gpubenchmark, single_dataset_source_nxresults, cugraph_input_type +@pytest.mark.parametrize("cugraph_input_type", utils.MATRIX_INPUT_TYPES) +def test_sssp_nonnative_inputs_matrix( + gpubenchmark, single_dataset_source_goldenresults, cugraph_input_type ): - test_sssp(gpubenchmark, single_dataset_source_nxresults, cugraph_input_type) + test_sssp(gpubenchmark, single_dataset_source_goldenresults, cugraph_input_type) + + +@pytest.mark.sg +@pytest.mark.parametrize("directed", [True, False]) +def test_sssp_nonnative_inputs_graph(single_dataset_source_goldenresults, directed): + (_, _, graph_file, source, golden_paths) = single_dataset_source_goldenresults + dataset_name = graph_file.metadata["name"] + result = get_resultset( + resultset_name="traversal", + algo="sssp_nonnative", + graph_dataset=dataset_name, + graph_directed=str(directed), + source=str(source), + ) + if np.issubdtype(result["distance"].dtype, np.integer): + max_val = np.iinfo(result["distance"].dtype).max + else: + max_val = np.finfo(result["distance"].dtype).max + verts = result["vertex"].to_numpy() + dists = result["distance"].to_numpy() + preds = result["predecessor"].to_numpy() + cu_paths = dict(zip(verts, zip(dists, preds))) + + # Calculating mismatch + err = 0 + for vid in cu_paths: + # Validate vertices that are reachable + # NOTE : If distance type is float64 then cu_paths[vid][0] + # should be compared against np.finfo(np.float64).max) + if cu_paths[vid][0] != max_val: + if cu_paths[vid][0] != golden_paths[vid]: + err = err + 1 + # check pred dist + 1 = current dist (since unweighted) + pred = cu_paths[vid][1] + if vid != source and cu_paths[pred][0] + 1 != cu_paths[vid][0]: + err = err + 1 + else: + if vid in golden_paths.keys(): + err = err + 1 + + assert err == 0 @pytest.mark.sg @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES) def test_sssp_edgevals( - gpubenchmark, dataset_source_nxresults_weighted, cugraph_input_type + gpubenchmark, dataset_source_goldenresults_weighted, cugraph_input_type ): # Extract the params generated from the fixture - (G, _, source, nx_paths, Gnx) = dataset_source_nxresults_weighted + (G, _, _, source, golden_paths) = dataset_source_goldenresults_weighted input_G_or_matrix = G cu_paths, max_val = cugraph_call( @@ -286,19 +335,20 @@ def test_sssp_edgevals( # Validate vertices that are reachable # NOTE : If distance type is float64 then cu_paths[vid][0] # should be compared against np.finfo(np.float64).max) + distances = cugraph.sssp(G, source=vid) if cu_paths[vid][0] != max_val: - if cu_paths[vid][0] != nx_paths[vid]: + if cu_paths[vid][0] != golden_paths[vid]: err = err + 1 # check pred dist + edge_weight = current dist if vid != source: pred = cu_paths[vid][1] - edge_weight = Gnx[pred][vid]["weight"] + if G.has_edge(pred, vid): + edge_weight = distances[distances["vertex"] == pred].iloc[0, 0] if cu_paths[pred][0] + edge_weight != cu_paths[vid][0]: err = err + 1 else: - if vid in nx_paths.keys(): + if vid in golden_paths.keys(): err = err + 1 - assert err == 0 @@ -307,10 +357,10 @@ def test_sssp_edgevals( "cugraph_input_type", utils.NX_DIR_INPUT_TYPES + utils.MATRIX_INPUT_TYPES ) def test_sssp_edgevals_nonnative_inputs( - gpubenchmark, single_dataset_source_nxresults_weighted, cugraph_input_type + gpubenchmark, single_dataset_source_goldenresults_weighted, cugraph_input_type ): test_sssp_edgevals( - gpubenchmark, single_dataset_source_nxresults_weighted, cugraph_input_type + gpubenchmark, 
single_dataset_source_goldenresults_weighted, cugraph_input_type ) @@ -319,7 +369,7 @@ def test_sssp_edgevals_nonnative_inputs( @pytest.mark.parametrize("source", SOURCES) def test_sssp_data_type_conversion(graph_file, source): dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) + dataset_name = graph_file.metadata["name"] cu_M = utils.read_csv_file(dataset_path) # cugraph call with int32 weights @@ -334,19 +384,17 @@ def test_sssp_data_type_conversion(graph_file, source): dist_np = df["distance"].to_numpy() pred_np = df["predecessor"].to_numpy() cu_paths = dict(zip(verts_np, zip(dist_np, pred_np))) - - # networkx call with int32 weights - M["weight"] = M["weight"].astype(np.int32) - Gnx = nx.from_pandas_edgelist( - M, - source="0", - target="1", - edge_attr="weight", - create_using=nx.DiGraph(), + golden_paths = get_resultset( + resultset_name="traversal", + algo="single_source_dijkstra_path_length", + graph_dataset=dataset_name, + graph_directed=str(True), + source=str(source), + test="data_type_conversion", ) - # assert nx weights is int - assert type(list(Gnx.edges(data=True))[0][2]["weight"]) is int - nx_paths = nx.single_source_dijkstra_path_length(Gnx, source) + golden_paths = cudf.Series( + golden_paths.distance.values, index=golden_paths.vertex + ).to_dict() # Calculating mismatch err = 0 @@ -354,28 +402,29 @@ def test_sssp_data_type_conversion(graph_file, source): # Validate vertices that are reachable # NOTE : If distance type is float64 then cu_paths[vid][0] # should be compared against np.finfo(np.float64).max) + distances = cugraph.sssp(G, source=vid) if cu_paths[vid][0] != max_val: - if cu_paths[vid][0] != nx_paths[vid]: + if cu_paths[vid][0] != golden_paths[vid]: err = err + 1 # check pred dist + edge_weight = current dist if vid != source: pred = cu_paths[vid][1] - edge_weight = Gnx[pred][vid]["weight"] + if G.has_edge(pred, vid): + edge_weight = distances[distances["vertex"] == pred].iloc[0, 0] if cu_paths[pred][0] + edge_weight != cu_paths[vid][0]: err = err + 1 else: - if vid in nx_paths.keys(): + if vid in golden_paths.keys(): err = err + 1 assert err == 0 @pytest.mark.sg -def test_sssp_networkx_edge_attr(): - G = nx.Graph() - G.add_edge(0, 1, other=10) - G.add_edge(1, 2, other=20) - df = cugraph.sssp(G, 0, edge_attr="other") +def test_sssp_golden_edge_attr(load_traversal_results): + df = get_resultset( + resultset_name="traversal", algo="sssp_nonnative", test="network_edge_attr" + ) df = df.set_index("vertex") assert df.loc[0, "distance"] == 0 assert df.loc[1, "distance"] == 10 From fa99e3471a2bbd59abc1f4f117b6e1ccbaeebd17 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Mon, 21 Aug 2023 23:26:58 -0500 Subject: [PATCH 15/72] Allow cugraph-nx to run networkx tests for nx versions 3.0, 3.1, and 3.2 (#3808) This is one step to support #3770 Note that NetworkX version 3.2 is currently under development. 
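The compatibility handling boils down to parsing the installed NetworkX version during pytest collection and xfail-ing tests the backend cannot pass on that version. A rough, self-contained sketch of the pattern follows; the test name, reason string, and simplified keying are placeholders for illustration only, not the actual xfail table used in `interface.py`:

```
import networkx as nx
import pytest
from packaging.version import parse


def on_start_tests(items):
    # Called from pytest_collection_modifyitems: build a version-dependent
    # table of known-unsupported tests, then mark the collected items xfail.
    xfail = {}
    nxver = parse(nx.__version__)
    if nxver.major == 3 and nxver.minor in {0, 1}:
        # Placeholder entry -- the real table keys on test name plus file/class.
        xfail["test_some_weighted_case"] = "weighted implementation not currently supported"
    for item in items:
        reason = xfail.get(item.name)
        if reason is not None:
            item.add_marker(pytest.mark.xfail(reason=reason))
```

Gating the table on the parsed version keeps a single test suite usable across 3.0, 3.1, and the in-development 3.2 without branching the test code itself.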
CC @rlratzel Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3808 --- python/cugraph-nx/cugraph_nx/interface.py | 117 ++++++++++++++++++++-- python/cugraph-nx/pyproject.toml | 1 + python/cugraph-nx/run_nx_tests.sh | 8 +- 3 files changed, 116 insertions(+), 10 deletions(-) diff --git a/python/cugraph-nx/cugraph_nx/interface.py b/python/cugraph-nx/cugraph_nx/interface.py index fe492c43ca2..ccd8d418d30 100644 --- a/python/cugraph-nx/cugraph_nx/interface.py +++ b/python/cugraph-nx/cugraph_nx/interface.py @@ -22,6 +22,7 @@ class BackendInterface: @staticmethod def convert_from_nx(graph, *args, edge_attrs=None, weight=None, **kwargs): if weight is not None: + # MAINT: networkx 3.0, 3.1 # For networkx 3.0 and 3.1 compatibility if edge_attrs is not None: raise TypeError( @@ -38,6 +39,13 @@ def convert_to_nx(obj, *, name: str | None = None): @staticmethod def on_start_tests(items): + """Modify pytest items after tests have been collected. + + This is called during ``pytest_collection_modifyitems`` phase of pytest. + We use this to set `xfail` on tests we expect to fail. See: + + https://docs.pytest.org/en/stable/reference/reference.html#std-hook-pytest_collection_modifyitems + """ try: import pytest except ModuleNotFoundError: @@ -51,17 +59,110 @@ def key(testpath): return (testname, frozenset({classname, filename})) return (testname, frozenset({filename})) - string_attribute = "unable to handle string attributes" + no_weights = "weighted implementation not currently supported" + no_multigraph = "multigraphs not currently supported" + + xfail = {} + + from packaging.version import parse - skip = { - key("test_pajek.py:TestPajek.test_ignored_attribute"): string_attribute, - key( - "test_agraph.py:TestAGraph.test_no_warnings_raised" - ): "pytest.warn(None) deprecated", - } + nxver = parse(nx.__version__) + if nxver.major == 3 and nxver.minor in {0, 1}: + # MAINT: networkx 3.0, 3.1 + xfail.update( + { + key( + "test_agraph.py:TestAGraph.test_no_warnings_raised" + ): "pytest.warn(None) deprecated", + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_K5" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_P3_normalized" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_P3" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_krackhardt_kite_graph" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality." + "test_krackhardt_kite_graph_normalized" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality." 
+ "test_florentine_families_graph" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_les_miserables_graph" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_ladder_graph" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_G" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_G2" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_G3" + ): no_multigraph, + key( + "test_betweenness_centrality.py:" + "TestWeightedBetweennessCentrality.test_G4" + ): no_multigraph, + key( + "test_betweenness_centrality.py:" + "TestWeightedEdgeBetweennessCentrality.test_K5" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedEdgeBetweennessCentrality.test_C4" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedEdgeBetweennessCentrality.test_P4" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedEdgeBetweennessCentrality.test_balanced_tree" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedEdgeBetweennessCentrality.test_weighted_graph" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedEdgeBetweennessCentrality." + "test_normalized_weighted_graph" + ): no_weights, + key( + "test_betweenness_centrality.py:" + "TestWeightedEdgeBetweennessCentrality.test_weighted_multigraph" + ): no_multigraph, + key( + "test_betweenness_centrality.py:" + "TestWeightedEdgeBetweennessCentrality." + "test_normalized_weighted_multigraph" + ): no_multigraph, + } + ) for item in items: kset = set(item.keywords) - for (test_name, keywords), reason in skip.items(): + for (test_name, keywords), reason in xfail.items(): if item.name == test_name and keywords.issubset(kset): item.add_marker(pytest.mark.xfail(reason=reason)) diff --git a/python/cugraph-nx/pyproject.toml b/python/cugraph-nx/pyproject.toml index 8b0ae11fbe0..df1e7a7a9ab 100644 --- a/python/cugraph-nx/pyproject.toml +++ b/python/cugraph-nx/pyproject.toml @@ -40,6 +40,7 @@ test = [ "pytest", "pytest-benchmark", "pytest-mpl", + "packaging >=21", ] [project.urls] diff --git a/python/cugraph-nx/run_nx_tests.sh b/python/cugraph-nx/run_nx_tests.sh index 5d3b616304d..8736d77010f 100755 --- a/python/cugraph-nx/run_nx_tests.sh +++ b/python/cugraph-nx/run_nx_tests.sh @@ -12,5 +12,9 @@ # # Coverage of `cugraph_nx.algorithms` is reported and is a good sanity check that algorithms run. -# NETWORKX_GRAPH_CONVERT=cugraph NETWORKX_BACKEND_TEST_EXHAUSTIVE=True pytest --pyargs networkx "$@" -NETWORKX_TEST_BACKEND=cugraph NETWORKX_TEST_FALLBACK_TO_NX=True pytest --pyargs networkx --cov=cugraph_nx/algorithms --cov-report term-missing --no-cov-on-fail "$@" +NETWORKX_GRAPH_CONVERT=cugraph NETWORKX_BACKEND_TEST_EXHAUSTIVE=True \ +NETWORKX_TEST_BACKEND=cugraph NETWORKX_TEST_FALLBACK_TO_NX=True \ + pytest --pyargs networkx \ + --cov=cugraph_nx/algorithms \ + --cov-report term-missing --no-cov-on-fail \ + "$@" From f0d16c1bbd66629a5ec9d3a93dec6426b15b1bae Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Tue, 22 Aug 2023 17:11:50 +0200 Subject: [PATCH 16/72] [REVIEW] Cugraph dgl block improvements (#3810) This PR fixes: https://github.com/rapidsai/cugraph/issues/3784 and speeds up MFG creation by `3.5x` . 
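Much of the speedup comes from replacing repeated per-hop tensor reductions with a single groupby over the sampled edge dataframe that records the maximum source/destination id per (batch_id, hop_id); those precomputed ranges are then reused to size every block. A minimal sketch of that idea, using pandas here as a stand-in for cuDF and a made-up toy dataframe:

```
import pandas as pd


def get_source_destination_range(sampled_df):
    # One aggregation over the whole sampled edge list yields the max source
    # and destination id per (batch_id, hop_id) -- enough to size each MFG
    # later without calling .max() on per-hop tensors while building blocks.
    agg = sampled_df.groupby(["batch_id", "hop_id"], as_index=True).agg(
        {"sources": "max", "destinations": "max"}
    )
    agg = agg.rename(
        columns={"sources": "sources_range", "destinations": "destinations_range"}
    )
    return agg.to_dict(orient="index")


# Toy example: two batches, two hops each.
df = pd.DataFrame(
    {
        "batch_id":     [0, 0, 0, 1, 1],
        "hop_id":       [0, 0, 1, 0, 1],
        "sources":      [0, 1, 2, 0, 3],
        "destinations": [1, 2, 4, 2, 5],
    }
)
print(get_source_destination_range(df))
# {(0, 0): {'sources_range': 1, 'destinations_range': 2},
#  (0, 1): {'sources_range': 2, 'destinations_range': 4},
#  (1, 0): {'sources_range': 0, 'destinations_range': 2},
#  (1, 1): {'sources_range': 3, 'destinations_range': 5}}
```

The returned dict is keyed by (batch_id, hop_id) tuples, so block construction can look up vertex ranges directly instead of recomputing them per hop.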
Todo: - [x] Add tests Benchmarked on 6_462_743_488 edges with a batch size of `128` on a 1 V100: Before PR Times: ``` 1min 17s ``` After PR Times: ``` 22 s ``` See link: https://gist.github.com/VibhuJawa/4852203f2e96de09d84d698af945682d **Profiling:** After PR: https://github.com/rapidsai/cugraph/pull/3810 image image Profile of splitting df into tensors : image Authors: - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Tingyu Wang (https://github.com/tingyu66) URL: https://github.com/rapidsai/cugraph/pull/3810 --- .../cugraph_dgl/dataloading/dataset.py | 9 +- .../dataloading/utils/sampling_helpers.py | 175 ++++++++++++------ python/cugraph-dgl/tests/test_dataset.py | 46 +++-- python/cugraph-dgl/tests/test_utils.py | 36 ++++ 4 files changed, 192 insertions(+), 74 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py index 57f7db3be01..e0d51bcf4cf 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py @@ -32,7 +32,13 @@ def __init__( self, total_number_of_nodes: int, edge_dir: str, + return_type: str = "dgl.Block", ): + if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]: + raise ValueError( + "return_type must be either 'dgl.Block' or \ + 'cugraph_dgl.nn.SparseGraph' " + ) # TODO: Deprecate `total_number_of_nodes` # as it is no longer needed # in the next release @@ -40,6 +46,7 @@ def __init__( self.edge_dir = edge_dir self._current_batch_fn = None self._input_files = None + self._return_type = return_type def __len__(self): return self.num_batches @@ -55,7 +62,7 @@ def __getitem__(self, idx: int): if fn != self._current_batch_fn: df = _load_sampled_file(dataset_obj=self, fn=fn) self._current_batches = create_homogeneous_sampled_graphs_from_dataframe( - df, self.edge_dir + sampled_df=df, edge_dir=self.edge_dir, return_type=self._return_type ) current_offset = idx - batch_offset return self._current_batches[current_offset] diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index 9fc0f6a559b..bdac3b1a323 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -18,6 +18,7 @@ dgl = import_optional("dgl") torch = import_optional("torch") +cugraph_dgl = import_optional("cugraph_dgl") def cast_to_tensor(ser: cudf.Series): @@ -40,6 +41,30 @@ def _split_tensor(t, split_indices): return torch.tensor_split(t, split_indices) +def _get_source_destination_range(sampled_df): + o = sampled_df.groupby(["batch_id", "hop_id"], as_index=True).agg( + {"sources": "max", "destinations": "max"} + ) + o.rename( + columns={"sources": "sources_range", "destinations": "destinations_range"}, + inplace=True, + ) + d = o.to_dict(orient="index") + return d + + +def _create_split_dict(tensor): + min_value = tensor.min() + max_value = tensor.max() + indices = torch.arange( + start=min_value + 1, + end=max_value + 1, + device=tensor.device, + ) + split_dict = {i: {} for i in range(min_value, max_value + 1)} + return split_dict, indices + + def _get_renumber_map(df): map = df["map"] df.drop(columns=["map"], inplace=True) @@ -49,9 +74,12 @@ def _get_renumber_map(df): renumber_map_batch_indices = map[1 : map_starting_offset - 1].reset_index(drop=True) renumber_map_batch_indices = 
renumber_map_batch_indices - map_starting_offset - # Drop all rows with NaN values - df.dropna(axis=0, how="all", inplace=True) - df.reset_index(drop=True, inplace=True) + map_end_offset = map_starting_offset + len(renumber_map) + # We only need to drop rows if the length of dataframe is determined by the map + # that is if map_length > sampled edges length + if map_end_offset == len(df): + df.dropna(axis=0, how="all", inplace=True) + df.reset_index(drop=True, inplace=True) return df, cast_to_tensor(renumber_map), cast_to_tensor(renumber_map_batch_indices) @@ -65,24 +93,16 @@ def _get_tensor_d_from_sampled_df(df): Returns: dict: A dictionary of tensors, keyed by batch_id and hop_id. """ + range_d = _get_source_destination_range(df) df, renumber_map, renumber_map_batch_indices = _get_renumber_map(df) batch_id_tensor = cast_to_tensor(df["batch_id"]) - batch_id_min = batch_id_tensor.min() - batch_id_max = batch_id_tensor.max() - batch_indices = torch.arange( - start=batch_id_min + 1, - end=batch_id_max + 1, - device=batch_id_tensor.device, - ) - # TODO: Fix below - # batch_indices = _get_id_tensor_boundaries(batch_id_tensor) - batch_indices = torch.searchsorted(batch_id_tensor, batch_indices).to("cpu") - split_d = {i: {} for i in range(batch_id_min, batch_id_max + 1)} + split_d, batch_indices = _create_split_dict(batch_id_tensor) + batch_split_indices = torch.searchsorted(batch_id_tensor, batch_indices).to("cpu") for column in df.columns: if column != "batch_id": t = cast_to_tensor(df[column]) - split_t = _split_tensor(t, batch_indices) + split_t = _split_tensor(t, batch_split_indices) for bid, batch_t in zip(split_d.keys(), split_t): split_d[bid][column] = batch_t @@ -91,35 +111,37 @@ def _get_tensor_d_from_sampled_df(df): split_d[bid]["map"] = batch_t del df result_tensor_d = {} + # Cache hop_split_d, hop_indices + hop_split_empty_d, hop_indices = None, None for batch_id, batch_d in split_d.items(): hop_id_tensor = batch_d["hop_id"] - hop_id_min = hop_id_tensor.min() - hop_id_max = hop_id_tensor.max() + if hop_split_empty_d is None: + hop_split_empty_d, hop_indices = _create_split_dict(hop_id_tensor) - hop_indices = torch.arange( - start=hop_id_min + 1, - end=hop_id_max + 1, - device=hop_id_tensor.device, - ) - # TODO: Fix below - # hop_indices = _get_id_tensor_boundaries(hop_id_tensor) - hop_indices = torch.searchsorted(hop_id_tensor, hop_indices).to("cpu") - hop_split_d = {i: {} for i in range(hop_id_min, hop_id_max + 1)} + hop_split_d = {k: {} for k in hop_split_empty_d.keys()} + hop_split_indices = torch.searchsorted(hop_id_tensor, hop_indices).to("cpu") for column, t in batch_d.items(): if column not in ["hop_id", "map"]: - split_t = _split_tensor(t, hop_indices) + split_t = _split_tensor(t, hop_split_indices) for hid, ht in zip(hop_split_d.keys(), split_t): hop_split_d[hid][column] = ht + for hid in hop_split_d.keys(): + hop_split_d[hid]["sources_range"] = range_d[(batch_id, hid)][ + "sources_range" + ] + hop_split_d[hid]["destinations_range"] = range_d[(batch_id, hid)][ + "destinations_range" + ] result_tensor_d[batch_id] = hop_split_d - if "map" in batch_d: - result_tensor_d[batch_id]["map"] = batch_d["map"] + result_tensor_d[batch_id]["map"] = batch_d["map"] return result_tensor_d def create_homogeneous_sampled_graphs_from_dataframe( sampled_df: cudf.DataFrame, edge_dir: str = "in", + return_type: str = "dgl.Block", ): """ This helper function creates DGL MFGS for @@ -136,11 +158,16 @@ def create_homogeneous_sampled_graphs_from_dataframe( - output_nodes: The output nodes for the 
batch. - graph_per_hop_ls: A list of DGL MFGS for each hop. """ + if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]: + raise ValueError( + "return_type must be either dgl.Block or cugraph_dgl.nn.SparseGraph" + ) + result_tensor_d = _get_tensor_d_from_sampled_df(sampled_df) del sampled_df result_mfgs = [ _create_homogeneous_sampled_graphs_from_tensors_perhop( - tensors_batch_d, edge_dir + tensors_batch_d, edge_dir, return_type ) for tensors_batch_d in result_tensor_d.values() ] @@ -148,73 +175,111 @@ def create_homogeneous_sampled_graphs_from_dataframe( return result_mfgs -def _create_homogeneous_sampled_graphs_from_tensors_perhop(tensors_batch_d, edge_dir): +def _create_homogeneous_sampled_graphs_from_tensors_perhop( + tensors_batch_d, edge_dir, return_type +): """ This helper function creates sampled DGL MFGS for homogeneous graphs from tensors per hop for a single batch - Args: tensors_batch_d (dict): A dictionary of tensors, keyed by hop_id. edge_dir (str): Direction of edges from samples + metagraph (dgl.metagraph): The metagraph for the sampled graph + return_type (str): The type of graph to return Returns: tuple: A tuple of three elements: - input_nodes: The input nodes for the batch. - output_nodes: The output nodes for the batch. - - graph_per_hop_ls: A list of DGL MFGS for each hop. + - graph_per_hop_ls: A list of MFGS for each hop. """ if edge_dir not in ["in", "out"]: raise ValueError(f"Invalid edge_dir {edge_dir} provided") if edge_dir == "out": raise ValueError("Outwards edges not supported yet") graph_per_hop_ls = [] - seednodes = None + seednodes_range = None for hop_id, tensor_per_hop_d in tensors_batch_d.items(): if hop_id != "map": - block = _create_homogeneous_dgl_block_from_tensor_d( - tensor_per_hop_d, tensors_batch_d["map"], seednodes + if return_type == "dgl.Block": + mfg = _create_homogeneous_dgl_block_from_tensor_d( + tensor_d=tensor_per_hop_d, + renumber_map=tensors_batch_d["map"], + seednodes_range=seednodes_range, + ) + elif return_type == "cugraph_dgl.nn.SparseGraph": + mfg = _create_homogeneous_cugraph_dgl_nn_sparse_graph( + tensor_d=tensor_per_hop_d, seednodes_range=seednodes_range + ) + else: + raise ValueError(f"Invalid return_type {return_type} provided") + seednodes_range = max( + tensor_per_hop_d["sources_range"], + tensor_per_hop_d["destinations_range"], ) - seednodes = torch.concat( - [tensor_per_hop_d["sources"], tensor_per_hop_d["destinations"]] - ) - graph_per_hop_ls.append(block) + graph_per_hop_ls.append(mfg) # default DGL behavior if edge_dir == "in": graph_per_hop_ls.reverse() - - input_nodes = graph_per_hop_ls[0].srcdata[dgl.NID] - output_nodes = graph_per_hop_ls[-1].dstdata[dgl.NID] + if return_type == "dgl.Block": + input_nodes = graph_per_hop_ls[0].srcdata[dgl.NID] + output_nodes = graph_per_hop_ls[-1].dstdata[dgl.NID] + else: + map = tensors_batch_d["map"] + input_nodes = map[0 : graph_per_hop_ls[0].num_src_nodes()] + output_nodes = map[0 : graph_per_hop_ls[-1].num_dst_nodes()] return input_nodes, output_nodes, graph_per_hop_ls -def _create_homogeneous_dgl_block_from_tensor_d(tensor_d, renumber_map, seednodes=None): +def _create_homogeneous_dgl_block_from_tensor_d( + tensor_d, + renumber_map, + seednodes_range=None, +): rs = tensor_d["sources"] rd = tensor_d["destinations"] - - max_src_nodes = rs.max() - max_dst_nodes = rd.max() - if seednodes is not None: - # If we have isolated vertices + max_src_nodes = tensor_d["sources_range"] + max_dst_nodes = tensor_d["destinations_range"] + if seednodes_range is not None: + # If we 
have vertices without outgoing edges, then # sources can be missing from seednodes # so we add them # to ensure all the blocks are - # linedup correctly - max_dst_nodes = max(max_dst_nodes, seednodes.max()) + # lined up correctly + max_dst_nodes = max(max_dst_nodes, seednodes_range) data_dict = {("_N", "_E", "_N"): (rs, rd)} - num_src_nodes = {"_N": max_src_nodes.item() + 1} - num_dst_nodes = {"_N": max_dst_nodes.item() + 1} + num_src_nodes = {"_N": max_src_nodes + 1} + num_dst_nodes = {"_N": max_dst_nodes + 1} + block = dgl.create_block( data_dict=data_dict, num_src_nodes=num_src_nodes, num_dst_nodes=num_dst_nodes ) if "edge_id" in tensor_d: block.edata[dgl.EID] = tensor_d["edge_id"] - block.srcdata[dgl.NID] = renumber_map[block.srcnodes()] - block.dstdata[dgl.NID] = renumber_map[block.dstnodes()] + # Below adds run time overhead + block.srcdata[dgl.NID] = renumber_map[0 : max_src_nodes + 1] + block.dstdata[dgl.NID] = renumber_map[0 : max_dst_nodes + 1] return block +def _create_homogeneous_cugraph_dgl_nn_sparse_graph(tensor_d, seednodes_range): + max_src_nodes = tensor_d["sources_range"] + max_dst_nodes = tensor_d["destinations_range"] + if seednodes_range is not None: + max_dst_nodes = max(max_dst_nodes, seednodes_range) + size = (max_src_nodes + 1, max_dst_nodes + 1) + sparse_graph = cugraph_dgl.nn.SparseGraph( + size=size, + src_ids=tensor_d["sources"], + dst_ids=tensor_d["destinations"], + formats=["csc"], + reduce_memory=True, + ) + return sparse_graph + + def create_heterogeneous_sampled_graphs_from_dataframe( sampled_df: cudf.DataFrame, num_nodes_dict: Dict[str, int], diff --git a/python/cugraph-dgl/tests/test_dataset.py b/python/cugraph-dgl/tests/test_dataset.py index a1da77721a3..69d50261e55 100644 --- a/python/cugraph-dgl/tests/test_dataset.py +++ b/python/cugraph-dgl/tests/test_dataset.py @@ -47,20 +47,18 @@ def create_dgl_mfgs(g, seed_nodes, fanout): return sampler.sample_blocks(g, seed_nodes) -def create_cugraph_dgl_homogenous_mfgs(g, seed_nodes, fanout): +def create_cugraph_dgl_homogenous_mfgs(dgl_blocks, return_type): df_ls = [] unique_vertices_ls = [] - for hop_id, fanout in enumerate(reversed(fanout)): - frontier = g.sample_neighbors(seed_nodes, fanout) - # Set include_dst_in_src to match cugraph behavior - block = dgl.to_block(frontier, seed_nodes, include_dst_in_src=False) - block.edata[dgl.EID] = frontier.edata[dgl.EID] - seed_nodes = block.srcdata[dgl.NID] + for hop_id, block in enumerate(reversed(dgl_blocks)): block = block.to("cpu") src, dst, eid = block.edges("all") eid = block.edata[dgl.EID][eid] + + og_src = block.srcdata[dgl.NID][src] + og_dst = block.dstdata[dgl.NID][dst] unique_vertices = pd.concat( - [pd.Series(dst.numpy()), pd.Series(src.numpy())] + [pd.Series(og_dst.numpy()), pd.Series(og_src.numpy())] ).drop_duplicates(keep="first") unique_vertices_ls.append(unique_vertices) df = cudf.DataFrame( @@ -84,23 +82,24 @@ def create_cugraph_dgl_homogenous_mfgs(g, seed_nodes, fanout): # Have to reindex cause map_ser can be of larger length than df df = df.reindex(df.index.union(map_ser.index)) df["map"] = map_ser - return create_homogeneous_sampled_graphs_from_dataframe(df)[0] + return create_homogeneous_sampled_graphs_from_dataframe( + df, return_type=return_type + )[0] +@pytest.mark.parametrize("return_type", ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]) @pytest.mark.parametrize("seed_node", [3, 4, 5]) -def test_homogeneous_sampled_graphs_from_dataframe(seed_node): +def test_homogeneous_sampled_graphs_from_dataframe(return_type, seed_node): g = dgl.graph(([0, 1, 
2, 3, 4], [1, 2, 3, 4, 5])) fanout = [1, 1, 1] seed_node = torch.as_tensor([seed_node]) - dgl_seed_nodes, dgl_output_nodes, dgl_mfgs = create_cugraph_dgl_homogenous_mfgs( - g, seed_node, fanout - ) + dgl_seed_nodes, dgl_output_nodes, dgl_mfgs = create_dgl_mfgs(g, seed_node, fanout) ( cugraph_seed_nodes, cugraph_output_nodes, cugraph_mfgs, - ) = create_cugraph_dgl_homogenous_mfgs(g, seed_node, fanout) + ) = create_cugraph_dgl_homogenous_mfgs(dgl_mfgs, return_type=return_type) np.testing.assert_equal( cugraph_seed_nodes.cpu().numpy().copy().sort(), @@ -112,7 +111,18 @@ def test_homogeneous_sampled_graphs_from_dataframe(seed_node): cugraph_output_nodes.cpu().numpy().copy().sort(), ) - for dgl_block, cugraph_dgl_block in zip(dgl_mfgs, cugraph_mfgs): - dgl_df = get_edge_df_from_homogenous_block(dgl_block) - cugraph_dgl_df = get_edge_df_from_homogenous_block(cugraph_dgl_block) - pd.testing.assert_frame_equal(dgl_df, cugraph_dgl_df) + if return_type == "dgl.Block": + for dgl_block, cugraph_dgl_block in zip(dgl_mfgs, cugraph_mfgs): + dgl_df = get_edge_df_from_homogenous_block(dgl_block) + cugraph_dgl_df = get_edge_df_from_homogenous_block(cugraph_dgl_block) + pd.testing.assert_frame_equal(dgl_df, cugraph_dgl_df) + else: + for dgl_block, cugraph_dgl_graph in zip(dgl_mfgs, cugraph_mfgs): + # Can not verify edge ids as they are not + # preserved in cugraph_dgl.nn.SparseGraph + assert dgl_block.num_src_nodes() == cugraph_dgl_graph.num_src_nodes() + assert dgl_block.num_dst_nodes() == cugraph_dgl_graph.num_dst_nodes() + dgl_offsets, dgl_indices, _ = dgl_block.adj_tensors("csc") + cugraph_offsets, cugraph_indices = cugraph_dgl_graph.csc() + assert torch.equal(dgl_offsets.to("cpu"), cugraph_offsets.to("cpu")) + assert torch.equal(dgl_indices.to("cpu"), cugraph_indices.to("cpu")) diff --git a/python/cugraph-dgl/tests/test_utils.py b/python/cugraph-dgl/tests/test_utils.py index fd75b1537b5..740db59ce7f 100644 --- a/python/cugraph-dgl/tests/test_utils.py +++ b/python/cugraph-dgl/tests/test_utils.py @@ -20,11 +20,14 @@ _split_tensor, _get_tensor_d_from_sampled_df, create_homogeneous_sampled_graphs_from_dataframe, + _get_source_destination_range, + _create_homogeneous_cugraph_dgl_nn_sparse_graph, ) from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") torch = import_optional("torch") +cugraph_dgl = import_optional("cugraph_dgl") def test_casting_empty_array(): @@ -140,3 +143,36 @@ def test_create_homogeneous_sampled_graphs_from_dataframe(): de, dd = d_block.edges() assert torch.equal(ce, de) assert torch.equal(cd, dd) + + +def test_get_source_destination_range(): + df = get_dummy_sampled_df() + output_d = _get_source_destination_range(df) + + expected_output = { + (0, 0): {"sources_range": 0, "destinations_range": 1}, + (0, 1): {"sources_range": 1, "destinations_range": 2}, + (1, 0): {"sources_range": 0, "destinations_range": 1}, + (1, 1): {"sources_range": 1, "destinations_range": 2}, + (2, 0): {"sources_range": 0, "destinations_range": 2}, + (2, 1): {"sources_range": 2, "destinations_range": 1}, + } + + assert output_d == expected_output + + +def test__create_homogeneous_cugraph_dgl_nn_sparse_graph(): + tensor_d = { + "sources_range": 1, + "destinations_range": 2, + "sources": torch.as_tensor([0, 0, 1, 1], dtype=torch.int64, device="cuda"), + "destinations": torch.as_tensor([0, 0, 1, 2], dtype=torch.int64, device="cuda"), + } + + seednodes_range = 10 + sparse_graph = _create_homogeneous_cugraph_dgl_nn_sparse_graph( + tensor_d, seednodes_range + ) + assert 
sparse_graph.num_src_nodes() == 2 + assert sparse_graph.num_dst_nodes() == seednodes_range + 1 + assert isinstance(sparse_graph, cugraph_dgl.nn.SparseGraph) From 578e7a3405e2369743c0727084e786e10647a762 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 22 Aug 2023 21:46:13 -0500 Subject: [PATCH 17/72] Unpin `dask` and `distributed` for `23.10` development (#3818) This PR unpins `dask` and `distributed` to use nightly builds for `23.10` development. xref: https://github.com/rapidsai/cudf/pull/13935 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - https://github.com/jakirkham - Peter Andreas Entschev (https://github.com/pentschev) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3818 --- ci/test_wheel_cugraph.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 6 +++--- conda/environments/all_cuda-120_arch-x86_64.yaml | 6 +++--- conda/recipes/cugraph-pyg/meta.yaml | 2 +- conda/recipes/cugraph-service/meta.yaml | 2 +- conda/recipes/cugraph/meta.yaml | 6 +++--- dependencies.yaml | 6 +++--- python/cugraph-service/server/pyproject.toml | 4 ++-- python/cugraph/pyproject.toml | 4 ++-- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/ci/test_wheel_cugraph.sh b/ci/test_wheel_cugraph.sh index 1c356ba3073..a117e00b8a2 100755 --- a/ci/test_wheel_cugraph.sh +++ b/ci/test_wheel_cugraph.sh @@ -9,7 +9,7 @@ RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-whe python -m pip install --no-deps ./local-pylibcugraph-dep/pylibcugraph*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@2023.7.1 git+https://github.com/dask/distributed.git@2023.7.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 # Only download test data for x86 arch=$(uname -m) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f3b15b6d13f..1fb267b2986 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -18,11 +18,11 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.5.1 +- dask-core>=2023.7.1 - dask-cuda==23.10.* - dask-cudf==23.10.* -- dask>=2023.5.1 -- distributed>=2023.5.1 +- dask>=2023.7.1 +- distributed>=2023.7.1 - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index b87f1c635f8..6a06e453b84 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -18,11 +18,11 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.5.1 +- dask-core>=2023.7.1 - dask-cuda==23.10.* - dask-cudf==23.10.* -- dask>=2023.5.1 -- distributed>=2023.5.1 +- dask>=2023.7.1 +- distributed>=2023.7.1 - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 026e8318b4e..847a5e9bf85 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -26,7 +26,7 @@ requirements: - python - scikit-build >=0.13.1 run: - - distributed ==2023.7.1 + - distributed >=2023.7.1 - numba >=0.57 - numpy >=1.21 - python 
diff --git a/conda/recipes/cugraph-service/meta.yaml b/conda/recipes/cugraph-service/meta.yaml index d928bae1de4..64d967a5d1f 100644 --- a/conda/recipes/cugraph-service/meta.yaml +++ b/conda/recipes/cugraph-service/meta.yaml @@ -57,7 +57,7 @@ outputs: - cupy >=12.0.0 - dask-cuda ={{ minor_version }} - dask-cudf ={{ minor_version }} - - distributed ==2023.7.1 + - distributed >=2023.7.1 - numba >=0.57 - numpy >=1.21 - python diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index ec94ee05194..bf574c44dd6 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -76,9 +76,9 @@ requirements: - cupy >=12.0.0 - dask-cuda ={{ minor_version }} - dask-cudf ={{ minor_version }} - - dask ==2023.7.1 - - dask-core ==2023.7.1 - - distributed ==2023.7.1 + - dask >=2023.7.1 + - dask-core >=2023.7.1 + - distributed >=2023.7.1 - fsspec>=0.6.0 - libcugraph ={{ version }} - pylibcugraph ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index c8a3f48579c..04c6c21df19 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -322,8 +322,8 @@ dependencies: - output_types: [conda, pyproject] packages: - &cudf cudf==23.10.* - - &dask dask>=2023.5.1 - - &distributed distributed>=2023.5.1 + - &dask dask>=2023.7.1 + - &distributed distributed>=2023.7.1 - &dask_cuda dask-cuda==23.10.* - &dask_cudf dask-cudf==23.10.* - &numba numba>=0.57 @@ -334,7 +334,7 @@ dependencies: packages: - aiohttp - &cupy cupy>=12.0.0 - - &dask-core dask-core>=2023.5.1 + - &dask-core dask-core>=2023.7.1 - fsspec>=0.6.0 - libcudf==23.10.* - requests diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index 52211b3ff89..f8f90b864cd 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -25,8 +25,8 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "dask-cuda==23.10.*", "dask-cudf==23.10.*", - "dask>=2023.5.1", - "distributed>=2023.5.1", + "dask>=2023.7.1", + "distributed>=2023.7.1", "numba>=0.57", "numpy>=1.21", "rmm==23.10.*", diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index 4c49ae259c1..591161774e3 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -33,8 +33,8 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "dask-cuda==23.10.*", "dask-cudf==23.10.*", - "dask>=2023.5.1", - "distributed>=2023.5.1", + "dask>=2023.7.1", + "distributed>=2023.7.1", "fsspec[http]>=0.6.0", "numba>=0.57", "pylibcugraph==23.10.*", From ec1e329b16bc8fef42f23fda865a50034e786d63 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Wed, 23 Aug 2023 19:44:24 -0500 Subject: [PATCH 18/72] Adds updates to build wheel and conda packages for `cugraph-nx` (#3793) closes #3771 closes #3772 closes #3770 This PR adds updates to build new `cugraph-nx` wheels and conda packages, updates the CI scripts to test them, and also updates the auto-versioning script used during releases to also update `cugraph-nx`. This PR also cleans up some tech debt in `dependencies.yaml` and various conda recipes (added missing meta-data). 
Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Ray Douglass (https://github.com/raydouglass) - Erik Welch (https://github.com/eriknw) - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3793 --- ci/build_python.sh | 28 ++++++--- ci/release/update-version.sh | 2 + ci/test_python.sh | 26 +++++++++ .../all_cuda-118_arch-x86_64.yaml | 3 + .../all_cuda-120_arch-x86_64.yaml | 3 + conda/recipes/cugraph-dgl/meta.yaml | 1 + conda/recipes/cugraph-nx/build.sh | 7 +++ conda/recipes/cugraph-nx/meta.yaml | 43 ++++++++++++++ conda/recipes/cugraph-pyg/meta.yaml | 1 + conda/recipes/cugraph-service/meta.yaml | 2 + conda/recipes/cugraph/meta.yaml | 1 + conda/recipes/libcugraph/meta.yaml | 3 + conda/recipes/pylibcugraph/meta.yaml | 1 + dependencies.yaml | 58 ++++++++++++++++--- python/cugraph-nx/README.md | 2 - .../cugraph_nx/tests/test_match_api.py | 27 ++++++++- python/cugraph-nx/pyproject.toml | 12 ++-- python/cugraph-nx/run_nx_tests.sh | 42 +++++++++----- python/pylibcugraph/pyproject.toml | 1 - 19 files changed, 224 insertions(+), 39 deletions(-) create mode 100644 conda/recipes/cugraph-nx/build.sh create mode 100644 conda/recipes/cugraph-nx/meta.yaml diff --git a/ci/build_python.sh b/ci/build_python.sh index 5125e86d53a..595eedf9e46 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -26,17 +26,29 @@ rapids-mamba-retry mambabuild \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cugraph +# NOTE: nothing in cugraph-nx is CUDA-specific, but it is built on each CUDA +# platform to ensure it is included in each set of artifacts, since test +# scripts only install from one set of artifacts based on the CUDA version used +# for the test run. +rapids-mamba-retry mambabuild \ + --no-test \ + --channel "${CPP_CHANNEL}" \ + --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ + conda/recipes/cugraph-nx + +# NOTE: nothing in the cugraph-service packages are CUDA-specific, but they are +# built on each CUDA platform to ensure they are included in each set of +# artifacts, since test scripts only install from one set of artifacts based on +# the CUDA version used for the test run. +rapids-mamba-retry mambabuild \ + --no-test \ + --channel "${CPP_CHANNEL}" \ + --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ + conda/recipes/cugraph-service + RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then - # Only one CUDA configuration is needed, so we choose CUDA 11 arbitrarily. - # Nothing in the cugraph-service packages is CUDA-specific. - rapids-mamba-retry mambabuild \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cugraph-service - # Only CUDA 11 is supported right now due to PyTorch requirement. 
rapids-mamba-retry mambabuild \ --no-test \ diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 71fc7b1447e..f9a78b275ae 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -61,6 +61,7 @@ sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugr sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/client/cugraph_service_client/__init__.py sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/server/cugraph_service_server/__init__.py sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/pylibcugraph/pylibcugraph/__init__.py +sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugraph-nx/cugraph_nx/__init__.py # Python pyproject.toml updates sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph/pyproject.toml @@ -69,6 +70,7 @@ sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph-pyg sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/client/pyproject.toml sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/server/pyproject.toml sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pylibcugraph/pyproject.toml +sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph-nx/pyproject.toml # Wheel testing script sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cugraph.sh diff --git a/ci/test_python.sh b/ci/test_python.sh index 3a23f521734..dd205b16049 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -34,6 +34,7 @@ rapids-mamba-retry install \ libcugraph \ pylibcugraph \ cugraph \ + cugraph-nx \ cugraph-service-server \ cugraph-service-client @@ -88,6 +89,31 @@ pytest \ cugraph/pytest-based/bench_algos.py popd +rapids-logger "pytest cugraph-nx" +pushd python/cugraph-nx/cugraph_nx +pytest \ + --capture=no \ + --verbose \ + --cache-clear \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-nx.xml" \ + --cov-config=../../.coveragerc \ + --cov=cugraph_nx \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-nx-coverage.xml" \ + --cov-report=term \ + --benchmark-disable \ + tests +popd + +rapids-logger "pytest networkx using cugraph-nx backend" +pushd python/cugraph-nx +./run_nx_tests.sh +# run_nx_tests.sh outputs coverage data, so check that total coverage is >0.0% +# in case cugraph-nx failed to load but fallback mode allowed the run to pass. 
+_coverage=$(coverage report|grep "^TOTAL") +echo "cugraph-nx coverage from networkx tests: $_coverage" +echo $_coverage | awk '{ if ($NF == "0.0%") exit 1 }' +popd + rapids-logger "pytest cugraph-service (single GPU)" pushd python/cugraph-service pytest \ diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 1fb267b2986..075cf231725 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -38,6 +38,7 @@ dependencies: - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 +- networkx>=3.0 - ninja - notebook>=0.5.0 - numba>=0.57 @@ -45,6 +46,7 @@ dependencies: - numpydoc - nvcc_linux-64=11.8 - openmpi +- packaging>=21 - pandas - pre-commit - pydata-sphinx-theme @@ -53,6 +55,7 @@ dependencies: - pytest - pytest-benchmark - pytest-cov +- pytest-mpl - pytest-xdist - python-louvain - raft-dask==23.10.* diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 6a06e453b84..eacafbfd6c4 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -38,12 +38,14 @@ dependencies: - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 +- networkx>=3.0 - ninja - notebook>=0.5.0 - numba>=0.57 - numpy>=1.21 - numpydoc - openmpi +- packaging>=21 - pandas - pre-commit - pydata-sphinx-theme @@ -52,6 +54,7 @@ dependencies: - pytest - pytest-benchmark - pytest-cov +- pytest-mpl - pytest-xdist - python-louvain - raft-dask==23.10.* diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index 96d25da45fb..2fbc6360c04 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -35,6 +35,7 @@ tests: about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_file: ../../../LICENSE summary: cuGraph library diff --git a/conda/recipes/cugraph-nx/build.sh b/conda/recipes/cugraph-nx/build.sh new file mode 100644 index 00000000000..31ad477a73e --- /dev/null +++ b/conda/recipes/cugraph-nx/build.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Copyright (c) 2023, NVIDIA CORPORATION. + +# This assumes the script is executed from the root of the repo directory + +./build.sh cugraph-nx diff --git a/conda/recipes/cugraph-nx/meta.yaml b/conda/recipes/cugraph-nx/meta.yaml new file mode 100644 index 00000000000..d6b12974981 --- /dev/null +++ b/conda/recipes/cugraph-nx/meta.yaml @@ -0,0 +1,43 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cugraph-nx + version: {{ version }} + +source: + git_url: ../../.. 
+ +build: + number: {{ GIT_DESCRIBE_NUMBER }} + build: + number: {{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + +requirements: + host: + - python + run: + - pylibcugraph ={{ version }} + - networkx >=3.0 + - cupy >=12.0.0 + - python + +tests: + imports: + - cugraph_nx + commands: + - pip check + requires: + - pip + +about: + home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph + license: Apache-2.0 + license_file: ../../../LICENSE + summary: cuGraph backend for GPU-accelerated NetworkX diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 847a5e9bf85..4d3d7c44093 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -41,6 +41,7 @@ tests: about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_file: ../../../LICENSE summary: cuGraph-pyg library diff --git a/conda/recipes/cugraph-service/meta.yaml b/conda/recipes/cugraph-service/meta.yaml index 64d967a5d1f..f3229c27364 100644 --- a/conda/recipes/cugraph-service/meta.yaml +++ b/conda/recipes/cugraph-service/meta.yaml @@ -19,6 +19,7 @@ outputs: string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_family: APACHE license_file: ../../../LICENSE @@ -39,6 +40,7 @@ outputs: string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_family: APACHE license_file: ../../../LICENSE diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index bf574c44dd6..ad5965ad20c 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -97,6 +97,7 @@ tests: about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_file: ../../../LICENSE summary: cuGraph library diff --git a/conda/recipes/libcugraph/meta.yaml b/conda/recipes/libcugraph/meta.yaml index d52d81366d7..83c82adf703 100644 --- a/conda/recipes/libcugraph/meta.yaml +++ b/conda/recipes/libcugraph/meta.yaml @@ -118,6 +118,7 @@ outputs: - ucx-proc=*=gpu about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_file: ../../../LICENSE summary: libcugraph library @@ -146,6 +147,7 @@ outputs: - librmm ={{ minor_version }} about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_file: ../../../LICENSE summary: libcugraph_etl library @@ -175,6 +177,7 @@ outputs: - gtest {{ gtest_version }} about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_file: ../../../LICENSE summary: libcugraph test & benchmark executables diff --git a/conda/recipes/pylibcugraph/meta.yaml b/conda/recipes/pylibcugraph/meta.yaml index 4ac3bb2dde1..083998be053 100644 --- a/conda/recipes/pylibcugraph/meta.yaml +++ b/conda/recipes/pylibcugraph/meta.yaml @@ -76,6 +76,7 @@ tests: about: home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph license: Apache-2.0 license_file: ../../../LICENSE summary: pylibcugraph library diff --git a/dependencies.yaml b/dependencies.yaml index 04c6c21df19..22579425898 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -14,12 +14,14 @@ 
files: - python_build_cythonize - python_run_cugraph - python_run_pylibcugraph + - python_run_cugraph_nx - python_run_cugraph_dgl - python_run_cugraph_pyg - test_notebook - test_python_common - test_python_cugraph - test_python_pylibcugraph + - test_python_cugraph_nx checks: output: none includes: @@ -103,6 +105,29 @@ files: includes: - test_python_common - test_python_pylibcugraph + py_build_cugraph_nx: + output: pyproject + pyproject_dir: python/cugraph-nx + extras: + table: build-system + includes: + - python_build_wheel + py_run_cugraph_nx: + output: pyproject + pyproject_dir: python/cugraph-nx + extras: + table: project + includes: + - python_run_cugraph_nx + py_test_cugraph_nx: + output: pyproject + pyproject_dir: python/cugraph-nx + extras: + table: project.optional-dependencies + key: test + includes: + - test_python_common + - test_python_cugraph_nx py_build_cugraph_dgl: output: pyproject pyproject_dir: python/cugraph-dgl @@ -166,9 +191,6 @@ files: table: project.optional-dependencies key: test includes: - # TODO: I think that the contents of the server's pyproject.toml - # dependencies were just copied from cugraph, so I'm not sure if this - # list is really minimal or if it is a superset. - test_python_common - test_python_cugraph cugraph_dgl_dev: @@ -316,7 +338,7 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - pylibcugraph==23.10.* + - &pylibcugraph pylibcugraph==23.10.* python_run_cugraph: common: - output_types: [conda, pyproject] @@ -343,16 +365,28 @@ dependencies: - output_types: pyproject packages: - &cupy_pip cupy-cuda11x>=12.0.0 - # cudf uses fsspec but is protocol independent. cugraph + # cudf uses fsspec but is protocol independent. cugraph # dataset APIs require [http] extras for use with cudf. - fsspec[http]>=0.6.0 - - pylibcugraph==23.10.* + - *pylibcugraph python_run_pylibcugraph: common: - output_types: [conda, pyproject] packages: - *pylibraft - *rmm + python_run_cugraph_nx: + common: + - output_types: [conda, pyproject] + packages: + - networkx>=3.0 + - output_types: conda + packages: + - *cupy + - output_types: pyproject + packages: + - *cupy_pip + - *pylibcugraph python_run_cugraph_dgl: common: - output_types: [conda, pyproject] @@ -428,8 +462,6 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - networkx>=2.5.1 - - *numpy - pandas - pytest - pytest-benchmark @@ -440,6 +472,8 @@ dependencies: common: - output_types: [conda, pyproject] packages: + - networkx>=2.5.1 + - *numpy - python-louvain - scikit-learn>=0.23.1 test_python_pylibcugraph: @@ -447,6 +481,14 @@ dependencies: - output_types: [conda, pyproject] packages: - *cudf + - *numpy + test_python_cugraph_nx: + common: + - output_types: [conda, pyproject] + packages: + - packaging>=21 + # not needed by cugraph-nx tests, but is required for running networkx tests + - pytest-mpl cugraph_dgl_dev: common: - output_types: [conda] diff --git a/python/cugraph-nx/README.md b/python/cugraph-nx/README.md index 80dc473f20b..2137fdb6472 100644 --- a/python/cugraph-nx/README.md +++ b/python/cugraph-nx/README.md @@ -4,8 +4,6 @@ [RAPIDS](https://rapids.ai) cugraph-nx is a [backend to NetworkX](https://networkx.org/documentation/stable/reference/classes/index.html#backends) with minimal dependencies (`networkx`, `cupy`, and `pylibcugraph`) to run graph algorithms on the GPU. 
-_Nightly conda packages and pip wheels coming soon._ - ### Contribute Follow instructions for [contributing to cugraph](https://github.com/rapidsai/cugraph/blob/branch-23.10/readme_pages/CONTRIBUTING.md) diff --git a/python/cugraph-nx/cugraph_nx/tests/test_match_api.py b/python/cugraph-nx/cugraph_nx/tests/test_match_api.py index f2b88c7f137..2a2e33ec2f4 100644 --- a/python/cugraph-nx/cugraph_nx/tests/test_match_api.py +++ b/python/cugraph-nx/cugraph_nx/tests/test_match_api.py @@ -23,15 +23,36 @@ def test_match_signature_and_names(): for name, func in vars(cnx.interface.BackendInterface).items(): if not isinstance(func, networkx_algorithm): continue - dispatchable_func = nx.utils.backends._registered_algorithms[name] - orig_func = dispatchable_func.orig_func + + # nx version >=3.2 uses utils.backends, version >=3.0,<3.2 uses classes.backends + nx_backends = getattr( + nx.utils, "backends", getattr(nx.classes, "backends", None) + ) + if nx_backends is None: + raise AttributeError( + f"imported networkx version {nx.__version__} is not " + "supported, must be >= 3.0" + ) + + dispatchable_func = nx_backends._registered_algorithms[name] + # nx version >=3.2 uses orig_func, version >=3.0,<3.2 uses _orig_func + orig_func = getattr( + dispatchable_func, "orig_func", getattr(dispatchable_func, "_orig_func") + ) + # Matching signatures? sig = inspect.signature(orig_func) assert sig == inspect.signature(func) + # Matching function names? assert func.__name__ == dispatchable_func.__name__ == orig_func.__name__ + # Matching dispatch names? - assert func.name == dispatchable_func.name + # nx version >=3.2 uses name, version >=3.0,<3.2 uses dispatchname + assert func.name == getattr( + dispatchable_func, "name", getattr(dispatchable_func, "dispatchname") + ) + # Matching modules (i.e., where function defined)? assert ( "networkx." + func.__module__.split(".", 1)[1] diff --git a/python/cugraph-nx/pyproject.toml b/python/cugraph-nx/pyproject.toml index df1e7a7a9ab..e8c4f670444 100644 --- a/python/cugraph-nx/pyproject.toml +++ b/python/cugraph-nx/pyproject.toml @@ -31,17 +31,21 @@ classifiers = [ ] dependencies = [ "cupy-cuda11x>=12.0.0", + "networkx>=3.0", "pylibcugraph==23.10.*", - "networkx >=3.0", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] test = [ + "packaging>=21", + "pandas", "pytest", "pytest-benchmark", + "pytest-cov", "pytest-mpl", - "packaging >=21", -] + "pytest-xdist", + "scipy", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cugraph" diff --git a/python/cugraph-nx/run_nx_tests.sh b/python/cugraph-nx/run_nx_tests.sh index 8736d77010f..7ea2348eaff 100755 --- a/python/cugraph-nx/run_nx_tests.sh +++ b/python/cugraph-nx/run_nx_tests.sh @@ -1,20 +1,36 @@ #!/usr/bin/env bash # # Copyright (c) 2023, NVIDIA CORPORATION. - -# NETWORKX_GRAPH_CONVERT=cugraph is necessary to test our backend. # -# NETWORKX_TEST_FALLBACK_TO_NX=True is optional -# With this set, input graphs will not be converted to cugraph-nx and the networkx algorithm -# will be called for algorithms that we don't implement or if we raise NotImplementedError. -# This is sometimes helpful to get increased testing and coverage, but testing takes longer. 
-# Without it, tests will xfail when encountering a function that we don't implement. +# NETWORKX_GRAPH_CONVERT=cugraph +# Used by networkx versions 3.0 and 3.1 +# Must be set to "cugraph" to test the cugraph-nx backend. +# +# NETWORKX_TEST_BACKEND=cugraph +# Replaces NETWORKX_GRAPH_CONVERT for networkx versions >=3.2 +# Must be set to "cugraph" to test the cugraph-nx backend. # -# Coverage of `cugraph_nx.algorithms` is reported and is a good sanity check that algorithms run. +# NETWORKX_FALLBACK_TO_NX=True (optional) +# Used by networkx versions >=3.2. With this set, input graphs will not be +# converted to cugraph-nx and the networkx algorithm will be called for +# algorithms that we don't implement or if we raise NotImplementedError. +# This is sometimes helpful to get increased testing and coverage, but +# testing takes longer. Without it, tests will xfail when encountering a +# function that we don't implement. +# +# Coverage of `cugraph_nx.algorithms` is reported and is a good sanity check +# that algorithms run. + +# Warning: cugraph has a .coveragerc file in the /python directory, +# so be mindful of its contents and the CWD when running. +# FIXME: should something be added to detect/prevent the above? -NETWORKX_GRAPH_CONVERT=cugraph NETWORKX_BACKEND_TEST_EXHAUSTIVE=True \ -NETWORKX_TEST_BACKEND=cugraph NETWORKX_TEST_FALLBACK_TO_NX=True \ - pytest --pyargs networkx \ - --cov=cugraph_nx/algorithms \ - --cov-report term-missing --no-cov-on-fail \ +NETWORKX_GRAPH_CONVERT=cugraph \ +NETWORKX_TEST_BACKEND=cugraph \ +NETWORKX_FALLBACK_TO_NX=True \ + pytest \ + --pyargs networkx \ + --cov=cugraph_nx.algorithms \ + --cov-report term-missing \ + --no-cov-on-fail \ "$@" diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml index 8301c25a11b..191bdf41920 100644 --- a/python/pylibcugraph/pyproject.toml +++ b/python/pylibcugraph/pyproject.toml @@ -41,7 +41,6 @@ classifiers = [ [project.optional-dependencies] test = [ "cudf==23.10.*", - "networkx>=2.5.1", "numpy>=1.21", "pandas", "pytest", From cafded113c9545e5e7211cc965f53c00939307c0 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Fri, 25 Aug 2023 14:49:23 +0100 Subject: [PATCH 19/72] Update the docstrings of the similarity algorithms (#3817) A [PR](https://github.com/rapidsai/cugraph/pull/3002) updating the vertex pair column names was merged a few releases ago; however, a few of the docstrings were not updated accordingly. This PR updates the docstrings for Jaccard and Sorensen. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3817 --- python/cugraph/cugraph/link_prediction/jaccard.py | 14 +++++++------- python/cugraph/cugraph/link_prediction/sorensen.py | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index f1b488c8cca..334d57f9d80 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -96,13 +96,13 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): relative to the adjacency list, or that given by the specified vertex pairs. - df['source'] : cudf.Series - The source vertex ID (will be identical to first if specified).
- df['destination'] : cudf.Series - The destination vertex ID (will be identical to second if + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + the second vertex ID of each pair (will be identical to second if specified). df['jaccard_coeff'] : cudf.Series - The computed jaccard coefficient between the first and the second + The computed Jaccard coefficient between the first and the second vertex ID. Examples @@ -182,8 +182,8 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): the second vertex ID of each pair (will be identical to second if specified). df['jaccard_coeff'] : cudf.Series - The computed Jaccard coefficient between the source and destination - vertices. + The computed Jaccard coefficient between the first and the second + vertex ID. Examples -------- diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 1d43adb51cd..ef2bd8d674d 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -74,8 +74,8 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): specified) df['sorensen_coeff'] : cudf.Series - The computed Sorensen coefficient between the source and - destination vertices + The computed Sorensen coefficient between the first and the second + vertex ID. Examples -------- @@ -149,13 +149,13 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): relative to the adjacency list, or that given by the specified vertex pairs. - df['source'] : cudf.Series - The source vertex ID (will be identical to first if specified). - df['destination'] : cudf.Series - The destination vertex ID (will be identical to second if + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + the second vertex ID of each pair (will be identical to second if specified). df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second + The computed Sorensen coefficient between the first and the second vertex ID. Examples From a7d988cb20e644e813267c7f0d82c9fa19d48999 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 29 Aug 2023 10:48:27 -0400 Subject: [PATCH 20/72] Use `copy-pr-bot` (#3827) This PR replaces the `copy_prs` functionality from the `ops-bot` with the new dedicated `copy-pr-bot` GitHub application. Thorough documentation for the new `copy-pr-bot` application can be viewed below. - https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ **Important**: `copy-pr-bot` enforces signed commits. If an organization member opens a PR that contains unsigned commits, it will be deemed untrusted and therefore require an `/ok to test` comment. See the GitHub docs [here](https://docs.github.com/en/authentication/managing-commit-signature-verification/about-commit-signature-verification) for information on how to set up commit signing. Any time a PR is deemed untrusted, it will receive a comment that looks like this: https://github.com/rapidsai/ci-imgs/pull/63#issuecomment-1688973208. Every subsequent commit on an untrusted PR will require an additional `/ok to test` comment. 
Any existing PRs that have unsigned commits after this change is merged will require an `/ok to test` comment for each subsequent commit _or_ the PR can be rebased to include signed commits as mentioned in the docs below: https://docs.gha-runners.nvidia.com/cpr/contributors. This information is all included on the documentation page linked above. _I've skipped CI on this PR since it's not a change that is tested._ [skip ci] --- .github/copy-pr-bot.yaml | 4 ++++ .github/ops-bot.yaml | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .github/copy-pr-bot.yaml diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 00000000000..895ba83ee54 --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 2d1444c595d..9a0b4155035 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,5 +5,4 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -copy_prs: true recently_updated: true From 4656d3ebdf525eed45e821cf29d88e924299cb03 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Wed, 30 Aug 2023 02:08:19 +0200 Subject: [PATCH 21/72] Disable mg tests (#3833) This PR is on top of the changes from #3831. It temporarily disables single-GPU "MG" tests in CI until https://github.com/rapidsai/cugraph/issues/3790 is closed. This will unblock CI for PRs unrelated to the issue in https://github.com/rapidsai/cugraph/issues/3790, at the risk of losing coverage for MG code paths. Hopefully nightly MG testing will minimize the risk. A followup PR will be submitted that re-enables the tests and must be merged prior to 23.10 burndown.
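The test renames in the diff below are what make the new `-k "not _mg"` filters in `ci/test_python.sh` and `ci/test_wheel.sh` effective: every multi-GPU test now carries `_mg` in its name, so pytest can deselect the whole group by name. A hypothetical file (not part of this PR) showing how that selection behaves:

```python
# illustration_mg_filter.py -- hypothetical example, only to show the -k selection.
# Running `pytest -k "not _mg" illustration_mg_filter.py` keeps the first test and
# deselects the second, mirroring how the renamed cugraph tests are skipped in CI.

def test_sg_k_core():
    # Single-GPU style test name: selected by -k "not _mg".
    assert True

def test_dask_mg_k_core():
    # Multi-GPU style test name: deselected by -k "not _mg".
    assert True
```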
Authors: - Naim (https://github.com/naimnv) - Rick Ratzel (https://github.com/rlratzel) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/3833 --- ci/test_python.sh | 6 +++++- ci/test_wheel.sh | 4 +++- .../tests/centrality/test_betweenness_centrality_mg.py | 2 +- .../tests/centrality/test_edge_betweenness_centrality_mg.py | 2 +- .../tests/centrality/test_eigenvector_centrality_mg.py | 4 ++-- .../cugraph/tests/centrality/test_katz_centrality_mg.py | 6 +++--- python/cugraph/cugraph/tests/comms/test_comms_mg.py | 2 +- .../cugraph/tests/components/test_connectivity_mg.py | 2 +- python/cugraph/cugraph/tests/core/test_k_core_mg.py | 4 ++-- python/cugraph/cugraph/tests/internals/test_renumber_mg.py | 2 +- python/cugraph/cugraph/tests/link_analysis/test_hits_mg.py | 4 ++-- .../cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py | 4 ++-- .../cugraph/tests/link_prediction/test_jaccard_mg.py | 4 ++-- .../cugraph/tests/link_prediction/test_overlap_mg.py | 4 ++-- .../cugraph/tests/link_prediction/test_sorensen_mg.py | 4 ++-- python/cugraph/cugraph/tests/sampling/test_egonet_mg.py | 2 +- .../cugraph/cugraph/tests/sampling/test_random_walks_mg.py | 2 +- python/cugraph/cugraph/tests/traversal/test_bfs_mg.py | 6 +++--- python/cugraph/cugraph/tests/traversal/test_sssp_mg.py | 2 +- 19 files changed, 36 insertions(+), 30 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index dd205b16049..e650630fa47 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -63,6 +63,10 @@ pytest \ tests popd +# FIXME: TEMPORARILY disable single-GPU "MG" testing until +# https://github.com/rapidsai/cugraph/issues/3790 is closed +# When closed, replace -k "not _mg" with +# -k "not test_property_graph_mg" \ rapids-logger "pytest cugraph" pushd python/cugraph/cugraph export DASK_WORKER_DEVICES="0" @@ -75,7 +79,7 @@ pytest \ --cov=cugraph \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-coverage.xml" \ --cov-report=term \ - -k "not test_property_graph_mg" \ + -k "not _mg" \ tests popd diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 146186ae2e7..b62635d08b4 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -18,5 +18,7 @@ arch=$(uname -m) if [[ "${arch}" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then python ./ci/wheel_smoke_test_${package_name}.py else - RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest ./python/${package_name}/${package_name}/tests + # FIXME: TEMPORARILY disable single-GPU "MG" testing until + # https://github.com/rapidsai/cugraph/issues/3790 is closed + RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest -k "not _mg" ./python/${package_name}/${package_name}/tests fi diff --git a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality_mg.py index 930f80c1bfa..1e20287d1e5 100644 --- a/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_betweenness_centrality_mg.py @@ -159,7 +159,7 @@ def input_expected_output(input_combo): @pytest.mark.mg -def test_dask_betweenness_centrality(dask_client, benchmark, input_expected_output): +def test_dask_mg_betweenness_centrality(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] k = input_expected_output["k"] diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py 
b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py index 97e503e5428..4277f94a396 100644 --- a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py @@ -179,7 +179,7 @@ def input_expected_output(input_combo): # is_single_gpu(), reason="skipping MG testing on Single GPU system" # ) @pytest.mark.mg -def test_dask_edge_betweenness_centrality( +def test_dask_mg_edge_betweenness_centrality( dask_client, benchmark, input_expected_output ): if input_expected_output is not None: diff --git a/python/cugraph/cugraph/tests/centrality/test_eigenvector_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_eigenvector_centrality_mg.py index 6828dd3cbd2..e2ce7d2c341 100644 --- a/python/cugraph/cugraph/tests/centrality/test_eigenvector_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_eigenvector_centrality_mg.py @@ -39,7 +39,7 @@ def setup_function(): @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") @pytest.mark.parametrize("directed", IS_DIRECTED) @pytest.mark.parametrize("input_data_path", DATASETS) -def test_dask_eigenvector_centrality(dask_client, directed, input_data_path): +def test_dask_mg_eigenvector_centrality(dask_client, directed, input_data_path): input_data_path = input_data_path.as_posix() print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) @@ -86,7 +86,7 @@ def test_dask_eigenvector_centrality(dask_client, directed, input_data_path): @pytest.mark.mg -def test_dask_eigenvector_centrality_transposed_false(dask_client): +def test_dask_mg_eigenvector_centrality_transposed_false(dask_client): input_data_path = DATASETS[0] chunksize = dcg.get_chunksize(input_data_path) diff --git a/python/cugraph/cugraph/tests/centrality/test_katz_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_katz_centrality_mg.py index 1dcbcbae3cd..72b81ce50bb 100644 --- a/python/cugraph/cugraph/tests/centrality/test_katz_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_katz_centrality_mg.py @@ -38,7 +38,7 @@ def setup_function(): @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_katz_centrality(dask_client, directed): +def test_dask_mg_katz_centrality(dask_client, directed): input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() print(f"dataset={input_data_path}") @@ -96,7 +96,7 @@ def test_dask_katz_centrality(dask_client, directed): @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_katz_centrality_nstart(dask_client, directed): +def test_dask_mg_katz_centrality_nstart(dask_client, directed): input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() print(f"dataset={input_data_path}") chunksize = dcg.get_chunksize(input_data_path) @@ -141,7 +141,7 @@ def test_dask_katz_centrality_nstart(dask_client, directed): @pytest.mark.mg -def test_dask_katz_centrality_transposed_false(dask_client): +def test_dask_mg_katz_centrality_transposed_false(dask_client): input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() chunksize = dcg.get_chunksize(input_data_path) diff --git a/python/cugraph/cugraph/tests/comms/test_comms_mg.py b/python/cugraph/cugraph/tests/comms/test_comms_mg.py 
index d4b33641c1a..747ef935e01 100644 --- a/python/cugraph/cugraph/tests/comms/test_comms_mg.py +++ b/python/cugraph/cugraph/tests/comms/test_comms_mg.py @@ -38,7 +38,7 @@ def setup_function(): # ) @pytest.mark.mg @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_pagerank(dask_client, directed): +def test_dask_mg_pagerank(dask_client, directed): # Initialize and run pagerank on two distributed graphs # with same communicator diff --git a/python/cugraph/cugraph/tests/components/test_connectivity_mg.py b/python/cugraph/cugraph/tests/components/test_connectivity_mg.py index e809ab66438..217c9f0f09f 100644 --- a/python/cugraph/cugraph/tests/components/test_connectivity_mg.py +++ b/python/cugraph/cugraph/tests/components/test_connectivity_mg.py @@ -40,7 +40,7 @@ def setup_function(): # ) @pytest.mark.mg @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_wcc(dask_client, directed): +def test_dask_mg_wcc(dask_client, directed): input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "netscience.csv").as_posix() print(f"dataset={input_data_path}") diff --git a/python/cugraph/cugraph/tests/core/test_k_core_mg.py b/python/cugraph/cugraph/tests/core/test_k_core_mg.py index 7f4eeeb69d5..32c4f4553a2 100644 --- a/python/cugraph/cugraph/tests/core/test_k_core_mg.py +++ b/python/cugraph/cugraph/tests/core/test_k_core_mg.py @@ -137,7 +137,7 @@ def test_sg_k_core(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_k_core(dask_client, benchmark, input_expected_output): +def test_dask_mg_k_core(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] core_number = input_expected_output["core_number"] @@ -159,7 +159,7 @@ def test_dask_k_core(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_k_core_invalid_input(dask_client): +def test_dask_mg_k_core_invalid_input(dask_client): input_data_path = datasets[0] chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/cugraph/tests/internals/test_renumber_mg.py b/python/cugraph/cugraph/tests/internals/test_renumber_mg.py index c0abc61b050..e9521f16594 100644 --- a/python/cugraph/cugraph/tests/internals/test_renumber_mg.py +++ b/python/cugraph/cugraph/tests/internals/test_renumber_mg.py @@ -132,7 +132,7 @@ def test_mg_renumber_add_internal_vertex_id(graph_file, dask_client): @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_pagerank(dask_client, directed): +def test_dask_mg_pagerank(dask_client, directed): pandas.set_option("display.max_rows", 10000) input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() diff --git a/python/cugraph/cugraph/tests/link_analysis/test_hits_mg.py b/python/cugraph/cugraph/tests/link_analysis/test_hits_mg.py index bfb33ccd619..5590eb17401 100644 --- a/python/cugraph/cugraph/tests/link_analysis/test_hits_mg.py +++ b/python/cugraph/cugraph/tests/link_analysis/test_hits_mg.py @@ -111,7 +111,7 @@ def input_expected_output(input_combo): # is_single_gpu(), reason="skipping MG testing on Single GPU system" # ) @pytest.mark.mg -def test_dask_hits(dask_client, benchmark, input_expected_output): +def test_dask_mg_hits(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] @@ -155,7 +155,7 @@ def test_dask_hits(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_hits_transposed_false(dask_client): +def 
test_dask_mg_hits_transposed_false(dask_client): input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix() chunksize = dcg.get_chunksize(input_data_path) diff --git a/python/cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py b/python/cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py index d68aeda4a2f..63dbf31ca5e 100644 --- a/python/cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py +++ b/python/cugraph/cugraph/tests/link_analysis/test_pagerank_mg.py @@ -89,7 +89,7 @@ def setup_function(): @pytest.mark.parametrize("directed", IS_DIRECTED) @pytest.mark.parametrize("has_precomputed_vertex_out_weight", HAS_PRECOMPUTED) @pytest.mark.parametrize("has_guess", HAS_GUESS) -def test_dask_pagerank( +def test_dask_mg_pagerank( dask_client, personalization_perc, directed, @@ -215,7 +215,7 @@ def test_pagerank_invalid_personalization_dtype(dask_client): @pytest.mark.mg -def test_dask_pagerank_transposed_false(dask_client): +def test_dask_mg_pagerank_transposed_false(dask_client): dg = create_distributed_karate_graph(store_transposed=False) warning_msg = ( diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index 1f7c0a9cadb..b56a6baae2b 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -119,7 +119,7 @@ def input_expected_output(input_combo): @pytest.mark.mg -def test_dask_jaccard(dask_client, benchmark, input_expected_output): +def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] @@ -154,7 +154,7 @@ def test_dask_jaccard(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_weighted_jaccard(dask_client): +def test_dask_mg_weighted_jaccard(dask_client): input_data_path = datasets[0] chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index 220b90cbb47..ce4bf619f47 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -122,7 +122,7 @@ def input_expected_output(input_combo): # is_single_gpu(), reason="skipping MG testing on Single GPU system" # ) @pytest.mark.mg -def test_dask_overlap(dask_client, benchmark, input_expected_output): +def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] @@ -157,7 +157,7 @@ def test_dask_overlap(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_weighted_overlap(): +def test_dask_mg_weighted_overlap(): input_data_path = datasets[0] chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index d9d013c7e35..af6b60771a0 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -121,7 +121,7 @@ def input_expected_output(input_combo): @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") -def test_dask_sorensen(dask_client, benchmark, input_expected_output): +def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): 
dg = input_expected_output["MGGraph"] @@ -156,7 +156,7 @@ def test_dask_sorensen(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_weighted_sorensen(dask_client): +def test_dask_mg_weighted_sorensen(dask_client): input_data_path = datasets[0] chunksize = dcg.get_chunksize(input_data_path) ddf = dask_cudf.read_csv( diff --git a/python/cugraph/cugraph/tests/sampling/test_egonet_mg.py b/python/cugraph/cugraph/tests/sampling/test_egonet_mg.py index 7f5891abdd3..e2f77700958 100644 --- a/python/cugraph/cugraph/tests/sampling/test_egonet_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_egonet_mg.py @@ -118,7 +118,7 @@ def input_expected_output(input_combo): @pytest.mark.mg @pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") -def test_dask_ego_graphs(dask_client, benchmark, input_expected_output): +def test_dask_mg_ego_graphs(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] diff --git a/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py b/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py index a8aa34710ec..03658c7a06e 100644 --- a/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py @@ -202,7 +202,7 @@ def input_graph(request): @pytest.mark.mg @pytest.mark.cugraph_ops -def test_dask_random_walks(dask_client, benchmark, input_graph): +def test_dask_mg_random_walks(dask_client, benchmark, input_graph): path_data, seeds, max_depth = calc_random_walks(input_graph) df_G = input_graph.input_df.compute().reset_index(drop=True) check_random_walks(input_graph, path_data, seeds, max_depth, df_G) diff --git a/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py b/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py index 38b5a2734d6..8ffbecea4fc 100644 --- a/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py +++ b/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py @@ -38,7 +38,7 @@ def setup_function(): # ) @pytest.mark.mg @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_bfs(dask_client, directed): +def test_dask_mg_bfs(dask_client, directed): input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "netscience.csv").as_posix() @@ -102,7 +102,7 @@ def modify_dataset(df): # ) @pytest.mark.mg @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_bfs_invalid_start(dask_client, directed): +def test_dask_mg_bfs_invalid_start(dask_client, directed): source_vertex = 10 input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "netscience.csv").as_posix() @@ -138,7 +138,7 @@ def test_dask_bfs_invalid_start(dask_client, directed): # ) @pytest.mark.mg @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_bfs_multi_column_depthlimit(dask_client, directed): +def test_dask_mg_bfs_multi_column_depthlimit(dask_client, directed): gc.collect() input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "netscience.csv").as_posix() diff --git a/python/cugraph/cugraph/tests/traversal/test_sssp_mg.py b/python/cugraph/cugraph/tests/traversal/test_sssp_mg.py index 1720a051ee7..55bd320c2f1 100644 --- a/python/cugraph/cugraph/tests/traversal/test_sssp_mg.py +++ b/python/cugraph/cugraph/tests/traversal/test_sssp_mg.py @@ -39,7 +39,7 @@ def setup_function(): # ) @pytest.mark.mg @pytest.mark.parametrize("directed", IS_DIRECTED) -def test_dask_sssp(dask_client, directed): +def test_dask_mg_sssp(dask_client, directed): input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "netscience.csv").as_posix() 
print(f"dataset={input_data_path}") From b5d8cbefd28d5560af23a3baa8f0b3bd93627c31 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Tue, 29 Aug 2023 23:42:52 -0700 Subject: [PATCH 22/72] Fix OD shortest distance matrix computation test failures. (#3813) Closing https://github.com/rapidsai/cugraph/issues/3801 I also submitted a minimum reproducer to the slack thrust channel. Authors: - Seunghwa Kang (https://github.com/seunghwak) - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Naim (https://github.com/naimnv) - Joseph Nke (https://github.com/jnke2016) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3813 --- .../traversal/od_shortest_distances_impl.cuh | 30 ++++++++++--------- .../traversal/od_shortest_distances_test.cpp | 8 ++--- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 09e41466393..6a0c5a4a675 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -210,12 +210,17 @@ size_t compute_kv_store_capacity(size_t new_min_size, int32_t constexpr multi_partition_copy_block_size = 512; // tuning parameter -template +template __global__ void multi_partition_copy( InputIterator input_first, InputIterator input_last, raft::device_span output_buffer_ptrs, PartitionOp partition_op, // returns max_num_partitions to discard + KeyOp key_op, raft::device_span partition_counters) { static_assert(max_num_partitions <= static_cast(std::numeric_limits::max())); @@ -283,7 +288,7 @@ __global__ void multi_partition_copy( if (partition != static_cast(max_num_partitions)) { auto offset = block_start_offsets[partition] + static_cast(tmp_intra_block_offsets[partition] + tmp_offsets[i]); - *(output_buffer_ptrs[partition] + offset) = thrust::get<0>(*(input_first + tmp_idx)); + *(output_buffer_ptrs[partition] + offset) = key_op(*(input_first + tmp_idx)); } } tmp_idx += gridDim.x * blockDim.x; @@ -794,6 +799,7 @@ rmm::device_uvector od_shortest_distances( split_thresholds.end(), thrust::get<1>(pair)))); }, + [] __device__(auto pair) { return thrust::get<0>(pair); }, raft::device_span(d_counters.data(), d_counters.size())); std::vector h_counters(d_counters.size()); @@ -912,13 +918,6 @@ rmm::device_uvector od_shortest_distances( thrust::fill( handle.get_thrust_policy(), d_counters.begin(), d_counters.end(), size_t{0}); if (tmp_buffer.size() > 0) { - auto distance_first = thrust::make_transform_iterator( - tmp_buffer.begin(), - [key_to_dist_map = detail::kv_cuco_store_find_device_view_t( - key_to_dist_map.view())] __device__(auto key) { - return key_to_dist_map.find(key); - }); - auto input_first = thrust::make_zip_iterator(tmp_buffer.begin(), distance_first); raft::grid_1d_thread_t update_grid(tmp_buffer.size(), multi_partition_copy_block_size, handle.get_device_properties().maxGridSize[0]); @@ -926,13 +925,15 @@ rmm::device_uvector od_shortest_distances( static_cast(1 /* near queue */ + num_far_buffers); multi_partition_copy <<>>( - input_first, - input_first + tmp_buffer.size(), + tmp_buffer.begin(), + tmp_buffer.end(), raft::device_span(d_buffer_ptrs.data(), d_buffer_ptrs.size()), - [split_thresholds = raft::device_span( + [key_to_dist_map = + detail::kv_cuco_store_find_device_view_t(key_to_dist_map.view()), + split_thresholds = raft::device_span( d_split_thresholds.data(), d_split_thresholds.size()), - 
invalid_threshold] __device__(auto pair) { - auto dist = thrust::get<1>(pair); + invalid_threshold] __device__(auto key) { + auto dist = key_to_dist_map.find(key); return static_cast( (dist < invalid_threshold) ? max_num_partitions /* discard */ @@ -942,6 +943,7 @@ rmm::device_uvector od_shortest_distances( split_thresholds.end(), dist))); }, + thrust::identity{}, raft::device_span(d_counters.data(), d_counters.size())); } std::vector h_counters(d_counters.size()); diff --git a/cpp/tests/traversal/od_shortest_distances_test.cpp b/cpp/tests/traversal/od_shortest_distances_test.cpp index e4fbbdf9275..cc283f24dfd 100644 --- a/cpp/tests/traversal/od_shortest_distances_test.cpp +++ b/cpp/tests/traversal/od_shortest_distances_test.cpp @@ -225,27 +225,27 @@ class Tests_ODShortestDistances using Tests_ODShortestDistances_File = Tests_ODShortestDistances; using Tests_ODShortestDistances_Rmat = Tests_ODShortestDistances; -TEST_P(Tests_ODShortestDistances_File, DISABLED_CheckInt32Int32Float) +TEST_P(Tests_ODShortestDistances_File, CheckInt32Int32Float) { auto param = GetParam(); run_current_test(std::get<0>(param), std::get<1>(param)); } -TEST_P(Tests_ODShortestDistances_Rmat, DISABLED_CheckInt32Int32Float) +TEST_P(Tests_ODShortestDistances_Rmat, CheckInt32Int32Float) { auto param = GetParam(); run_current_test( std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_ODShortestDistances_Rmat, DISABLED_CheckInt32Int64Float) +TEST_P(Tests_ODShortestDistances_Rmat, CheckInt32Int64Float) { auto param = GetParam(); run_current_test( std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); } -TEST_P(Tests_ODShortestDistances_Rmat, DISABLED_CheckInt64Int64Float) +TEST_P(Tests_ODShortestDistances_Rmat, CheckInt64Int64Float) { auto param = GetParam(); run_current_test( From 5120c630f33af108c71de1d7bb44f6089e6aee77 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Wed, 30 Aug 2023 20:16:23 +0100 Subject: [PATCH 23/72] Remove legacy betweenness centrality (#3829) The python API is now leveraging the CAPI for both [betweenness](https://github.com/rapidsai/cugraph/pull/2971) and [edge betweenness centrality](https://github.com/rapidsai/cugraph/pull/3672) therefore, the legacy code is no longer used anywhere. This PR cleanup the C++ API. 
closes #2651 closes #3272 Authors: - Joseph Nke (https://github.com/jnke2016) - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3829 --- cpp/CMakeLists.txt | 2 - .../legacy/betweenness_centrality.cu | 564 -------- .../legacy/betweenness_centrality.cuh | 148 --- .../legacy/betweenness_centrality_kernels.cuh | 120 -- cpp/src/traversal/legacy/bfs.cu | 575 -------- cpp/src/traversal/legacy/bfs.cuh | 109 -- cpp/src/traversal/legacy/bfs_kernels.cuh | 1163 ----------------- cpp/src/traversal/legacy/traversal_common.cuh | 480 ------- cpp/tests/CMakeLists.txt | 9 - .../legacy/betweenness_centrality_test.cu | 450 ------- .../edge_betweenness_centrality_test.cu | 349 ----- cpp/tests/traversal/legacy/bfs_ref.h | 73 -- cpp/tests/traversal/legacy/bfs_test.cu | 238 ---- 13 files changed, 4280 deletions(-) delete mode 100644 cpp/src/centrality/legacy/betweenness_centrality.cu delete mode 100644 cpp/src/centrality/legacy/betweenness_centrality.cuh delete mode 100644 cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh delete mode 100644 cpp/src/traversal/legacy/bfs.cu delete mode 100644 cpp/src/traversal/legacy/bfs.cuh delete mode 100644 cpp/src/traversal/legacy/bfs_kernels.cuh delete mode 100644 cpp/src/traversal/legacy/traversal_common.cuh delete mode 100644 cpp/tests/centrality/legacy/betweenness_centrality_test.cu delete mode 100644 cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu delete mode 100644 cpp/tests/traversal/legacy/bfs_ref.h delete mode 100644 cpp/tests/traversal/legacy/bfs_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 87d26bfd848..63a91d4971f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -195,7 +195,6 @@ set(CUGRAPH_SOURCES src/utilities/path_retrieval.cu src/structure/legacy/graph.cu src/linear_assignment/legacy/hungarian.cu - src/traversal/legacy/bfs.cu src/link_prediction/legacy/jaccard.cu src/link_prediction/legacy/overlap.cu src/link_prediction/jaccard_sg.cu @@ -234,7 +233,6 @@ set(CUGRAPH_SOURCES src/cores/k_core_sg.cu src/cores/k_core_mg.cu src/components/legacy/connectivity.cu - src/centrality/legacy/betweenness_centrality.cu src/generators/generate_rmat_edgelist.cu src/generators/generate_bipartite_rmat_edgelist.cu src/generators/generator_tools.cu diff --git a/cpp/src/centrality/legacy/betweenness_centrality.cu b/cpp/src/centrality/legacy/betweenness_centrality.cu deleted file mode 100644 index cd274a408e1..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality.cu +++ /dev/null @@ -1,564 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include - -#include "betweenness_centrality.cuh" -#include "betweenness_centrality_kernels.cuh" -#include - -namespace cugraph { -namespace detail { -namespace { -template -void betweenness_centrality_impl(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t number_of_sources, - vertex_t const* sources, - vertex_t total_number_of_sources) -{ - // Current Implementation relies on BFS - // FIXME: For SSSP version - // Brandes Algorithm expects non negative weights for the accumulation - bool is_edge_betweenness = false; - verify_betweenness_centrality_input( - result, is_edge_betweenness, normalize, endpoints, weight, number_of_sources, sources); - cugraph::detail::BC bc(handle, graph); - bc.configure( - result, is_edge_betweenness, normalize, endpoints, weight, sources, number_of_sources); - bc.compute(); - bc.rescale_by_total_sources_used(total_number_of_sources); -} - -template -void edge_betweenness_centrality_impl(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t number_of_sources, - vertex_t const* sources, - vertex_t /* total_number_of_sources */) -{ - // Current Implementation relies on BFS - // FIXME: For SSSP version - // Brandes Algorithm expects non negative weights for the accumulation - bool is_edge_betweenness = true; - bool endpoints = false; - verify_betweenness_centrality_input( - result, is_edge_betweenness, normalize, endpoints, weight, number_of_sources, sources); - cugraph::detail::BC bc(handle, graph); - bc.configure( - result, is_edge_betweenness, normalize, endpoints, weight, sources, number_of_sources); - bc.compute(); - // NOTE: As of 07/2020 NetworkX does not apply rescaling based on number - // of sources - // bc.rescale_by_total_sources_used(total_number_of_sources); -} -template -vertex_t get_total_number_of_sources(raft::handle_t const& handle, vertex_t local_number_of_sources) -{ - vertex_t total_number_of_sources_used = local_number_of_sources; - if (handle.comms_initialized()) { - rmm::device_scalar d_number_of_sources(local_number_of_sources, handle.get_stream()); - handle.get_comms().allreduce(d_number_of_sources.data(), - d_number_of_sources.data(), - 1, - raft::comms::op_t::SUM, - handle.get_stream()); - total_number_of_sources_used = d_number_of_sources.value(handle.get_stream()); - // RAFT_CUDA_TRY( - // cudaMemcpy(&total_number_of_sources_used, data, sizeof(vertex_t), cudaMemcpyDeviceToHost)); - } - return total_number_of_sources_used; -} -} // namespace - -template -void verify_betweenness_centrality_input(result_t* result, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weights, - vertex_t const number_of_sources, - vertex_t const* sources) -{ - static_assert(std::is_same::value, "vertex_t should be int"); - static_assert(std::is_same::value, "edge_t should be int"); - static_assert(std::is_same::value || std::is_same::value, - "weight_t should be float or double"); - static_assert(std::is_same::value || std::is_same::value, - "result_t should be float or double"); - - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: betwenness pointer is NULL"); - CUGRAPH_EXPECTS(number_of_sources >= 0, "Number of sources must be positive or equal to 0."); - if (number_of_sources 
!= 0) { - CUGRAPH_EXPECTS(sources != nullptr, - "Sources cannot be NULL if number_of_source is different from 0."); - } - if (is_edge_betweenness) { - CUGRAPH_EXPECTS(!endpoints, "Endpoints is not supported for edge betweenness centrality."); - } -} - -template -void BC::setup() -{ - number_of_vertices_ = graph_.number_of_vertices; - number_of_edges_ = graph_.number_of_edges; - offsets_ptr_ = graph_.offsets; - indices_ptr_ = graph_.indices; -} - -template -void BC::configure(result_t* betweenness, - bool is_edge_betweenness, - bool normalized, - bool endpoints, - weight_t const* weights, - vertex_t const* sources, - vertex_t number_of_sources) -{ - // --- Bind betweenness output vector to internal --- - betweenness_ = betweenness; - normalized_ = normalized; - endpoints_ = endpoints; - sources_ = sources; - number_of_sources_ = number_of_sources; - edge_weights_ptr_ = weights; - is_edge_betweenness_ = is_edge_betweenness; - - // --- Working data allocation --- - initialize_work_vectors(); - initialize_pointers_to_vectors(); - - // --- Get Device Information --- - initialize_device_information(); - - // --- Confirm that configuration went through --- - configured_ = true; -} - -template -void BC::initialize_work_vectors() -{ - distances_vec_.resize(number_of_vertices_); - predecessors_vec_.resize(number_of_vertices_); - sp_counters_vec_.resize(number_of_vertices_); - deltas_vec_.resize(number_of_vertices_); -} - -template -void BC::initialize_pointers_to_vectors() -{ - distances_ = distances_vec_.data().get(); - predecessors_ = predecessors_vec_.data().get(); - sp_counters_ = sp_counters_vec_.data().get(); - deltas_ = deltas_vec_.data().get(); -} - -template -void BC::initialize_device_information() -{ - max_grid_dim_1D_ = handle_.get_device_properties().maxGridSize[0]; - max_block_dim_1D_ = handle_.get_device_properties().maxThreadsDim[0]; -} - -template -void BC::compute() -{ - CUGRAPH_EXPECTS(configured_, "BC must be configured before computation"); - if (sources_) { - for (vertex_t source_idx = 0; source_idx < number_of_sources_; ++source_idx) { - vertex_t source_vertex = sources_[source_idx]; - compute_single_source(source_vertex); - } - } else { - for (vertex_t source_vertex = 0; source_vertex < number_of_vertices_; ++source_vertex) { - compute_single_source(source_vertex); - } - } - rescale(); -} - -template -void BC::compute_single_source(vertex_t source_vertex) -{ - // Step 1) Singe-source shortest-path problem - cugraph::bfs(handle_, - graph_, - distances_, - predecessors_, - sp_counters_, - source_vertex, - graph_.prop.directed, - true); - - // FIXME: Remove that with a BC specific class to gather - // information during traversal - - // Numeric max value is replaced by -1 as we look for the maximal depth of - // the traversal, this value is avalaible within the bfs implementation and - // there could be a way to access it directly and avoid both replace and the - // max - thrust::replace(handle_.get_thrust_policy(), - distances_, - distances_ + number_of_vertices_, - std::numeric_limits::max(), - static_cast(-1)); - auto current_max_depth = - thrust::max_element(handle_.get_thrust_policy(), distances_, distances_ + number_of_vertices_); - vertex_t max_depth = 0; - RAFT_CUDA_TRY( - cudaMemcpy(&max_depth, current_max_depth, sizeof(vertex_t), cudaMemcpyDeviceToHost)); - // Step 2) Dependency accumulation - accumulate(source_vertex, max_depth); -} - -template -void BC::accumulate(vertex_t source_vertex, - vertex_t max_depth) -{ - dim3 grid_configuration, block_configuration; - 
block_configuration.x = max_block_dim_1D_; - grid_configuration.x = min(max_grid_dim_1D_, (number_of_edges_ / block_configuration.x + 1)); - - initialize_dependencies(); - - if (is_edge_betweenness_) { - accumulate_edges(max_depth, grid_configuration, block_configuration); - } else if (endpoints_) { - accumulate_vertices_with_endpoints( - source_vertex, max_depth, grid_configuration, block_configuration); - } else { - accumulate_vertices(max_depth, grid_configuration, block_configuration); - } -} - -template -void BC::initialize_dependencies() -{ - thrust::fill( - handle_.get_thrust_policy(), deltas_, deltas_ + number_of_vertices_, static_cast(0)); -} -template -void BC::accumulate_edges(vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth >= 0; --depth) { - edges_accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } -} - -template -void BC::accumulate_vertices_with_endpoints( - vertex_t source_vertex, vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth > 0; --depth) { - endpoints_accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } - add_reached_endpoints_to_source_betweenness(source_vertex); - add_vertices_dependencies_to_betweenness(); -} - -// Distances should contain -1 for unreached nodes, - -// FIXME: There might be a cleaner way to add a value to a single -// score in the betweenness vector -template -void BC::add_reached_endpoints_to_source_betweenness( - vertex_t source_vertex) -{ - vertex_t number_of_unvisited_vertices = - thrust::count(handle_.get_thrust_policy(), distances_, distances_ + number_of_vertices_, -1); - vertex_t number_of_visited_vertices_except_source = - number_of_vertices_ - number_of_unvisited_vertices - 1; - rmm::device_vector buffer(1); - buffer[0] = number_of_visited_vertices_except_source; - thrust::transform(handle_.get_thrust_policy(), - buffer.begin(), - buffer.end(), - betweenness_ + source_vertex, - betweenness_ + source_vertex, - thrust::plus()); -} - -template -void BC::add_vertices_dependencies_to_betweenness() -{ - thrust::transform(handle_.get_thrust_policy(), - deltas_, - deltas_ + number_of_vertices_, - betweenness_, - betweenness_, - thrust::plus()); -} - -template -void BC::accumulate_vertices(vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration) -{ - for (vertex_t depth = max_depth; depth > 0; --depth) { - accumulation_kernel - <<>>(betweenness_, - number_of_vertices_, - graph_.indices, - graph_.offsets, - distances_, - sp_counters_, - deltas_, - depth); - } - add_vertices_dependencies_to_betweenness(); -} - -template -void BC::rescale() -{ - bool modified = false; - result_t rescale_factor = static_cast(1); - if (normalized_) { - if (is_edge_betweenness_) { - std::tie(rescale_factor, modified) = - rescale_edges_betweenness_centrality(rescale_factor, modified); - } else { - std::tie(rescale_factor, modified) = - rescale_vertices_betweenness_centrality(rescale_factor, modified); - } - } else { - if (!graph_.prop.directed) { - rescale_factor /= static_cast(2); - modified = true; - } - } - apply_rescale_factor_to_betweenness(rescale_factor); -} - -template -std::tuple -BC::rescale_edges_betweenness_centrality( - result_t rescale_factor, bool modified) -{ - result_t casted_number_of_vertices_ = 
static_cast(number_of_vertices_); - if (number_of_vertices_ > 1) { - rescale_factor /= ((casted_number_of_vertices_) * (casted_number_of_vertices_ - 1)); - modified = true; - } - return std::make_tuple(rescale_factor, modified); -} - -template -std::tuple -BC::rescale_vertices_betweenness_centrality( - result_t rescale_factor, bool modified) -{ - result_t casted_number_of_vertices = static_cast(number_of_vertices_); - if (number_of_vertices_ > 2) { - if (endpoints_) { - rescale_factor /= (casted_number_of_vertices * (casted_number_of_vertices - 1)); - } else { - rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); - } - modified = true; - } - return std::make_tuple(rescale_factor, modified); -} - -template -void BC::apply_rescale_factor_to_betweenness( - result_t rescale_factor) -{ - size_t result_size = number_of_vertices_; - if (is_edge_betweenness_) result_size = number_of_edges_; - thrust::transform(handle_.get_thrust_policy(), - betweenness_, - betweenness_ + result_size, - thrust::make_constant_iterator(rescale_factor), - betweenness_, - thrust::multiplies()); -} - -template -void BC::rescale_by_total_sources_used( - vertex_t total_number_of_sources_used) -{ - result_t rescale_factor = static_cast(1); - result_t casted_total_number_of_sources_used = - static_cast(total_number_of_sources_used); - result_t casted_number_of_vertices = static_cast(number_of_vertices_); - - if (normalized_) { - if (number_of_vertices_ > 2 && total_number_of_sources_used > 0) { - rescale_factor *= (casted_number_of_vertices / casted_total_number_of_sources_used); - } - } else if (!graph_.prop.directed) { - if (number_of_vertices_ > 2 && total_number_of_sources_used > 0) { - rescale_factor *= (casted_number_of_vertices / casted_total_number_of_sources_used); - } - } - apply_rescale_factor_to_betweenness(rescale_factor); -} -} // namespace detail - -template -void betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t k, - vertex_t const* vertices) -{ - vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); - if (handle.comms_initialized()) { - rmm::device_vector betweenness(graph.number_of_vertices, 0); - detail::betweenness_centrality_impl(handle, - graph, - betweenness.data().get(), - normalize, - endpoints, - weight, - k, - vertices, - total_number_of_sources_used); - handle.get_comms().reduce(betweenness.data().get(), - result, - betweenness.size(), - raft::comms::op_t::SUM, - 0, - handle.get_stream()); - } else { - detail::betweenness_centrality_impl(handle, - graph, - result, - normalize, - endpoints, - weight, - k, - vertices, - total_number_of_sources_used); - } -} - -template void betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - float*, - bool, - bool, - float const*, - int, - int const*); -template void betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - double*, - bool, - bool, - double const*, - int, - int const*); - -template -void edge_betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t k, - vertex_t const* vertices) -{ - vertex_t total_number_of_sources_used = detail::get_total_number_of_sources(handle, k); - if (handle.comms_initialized()) { - rmm::device_vector betweenness(graph.number_of_edges, 0); - 
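
To make the normalization rules above easier to scan, here is an illustrative host-side restatement of the vertex rescaling, assuming result_t is double; the helper below is not part of the deleted file.

    #include <cstdint>

    // Factor multiplied into every score by apply_rescale_factor_to_betweenness().
    double vertex_bc_rescale_factor(int64_t n, bool normalized, bool endpoints, bool directed)
    {
      if (normalized) {
        if (n <= 2) return 1.0;
        return endpoints ? 1.0 / (static_cast<double>(n) * (n - 1))
                         : 1.0 / (static_cast<double>(n - 1) * (n - 2));
      }
      return directed ? 1.0 : 0.5;  // unnormalized, undirected scores are halved
    }

Edge betweenness divides by n * (n - 1) when normalized and n > 1, and when only k of the n possible sources were used the scores are additionally scaled by n / k (see rescale_by_total_sources_used above).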
detail::edge_betweenness_centrality_impl(handle, - graph, - betweenness.data().get(), - normalize, - weight, - k, - vertices, - total_number_of_sources_used); - handle.get_comms().reduce(betweenness.data().get(), - result, - betweenness.size(), - raft::comms::op_t::SUM, - 0, - handle.get_stream()); - } else { - detail::edge_betweenness_centrality_impl( - handle, graph, result, normalize, weight, k, vertices, total_number_of_sources_used); - } -} - -template void edge_betweenness_centrality( - const raft::handle_t&, - legacy::GraphCSRView const&, - float*, - bool, - float const*, - int, - int const*); - -template void edge_betweenness_centrality( - raft::handle_t const& handle, - legacy::GraphCSRView const&, - double*, - bool, - double const*, - int, - int const*); -} // namespace cugraph diff --git a/cpp/src/centrality/legacy/betweenness_centrality.cuh b/cpp/src/centrality/legacy/betweenness_centrality.cuh deleted file mode 100644 index 43f095d634f..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality.cuh +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Author: Xavier Cadet xcadet@nvidia.com - -#pragma once -#include - -namespace cugraph { -namespace detail { -template -void betweenness_centrality(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -void edge_betweenness_centrality(legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - weight_t const* weight, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -void verify_betweenness_centrality_input(result_t* result, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weights, - vertex_t const number_of_sources, - vertex_t const* sources); - -template -class BC { - public: - virtual ~BC(void) {} - BC(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - cudaStream_t stream = 0) - : handle_(handle), graph_(graph) - { - setup(); - } - void configure(result_t* betweenness, - bool is_edge_betweenness, - bool normalize, - bool endpoints, - weight_t const* weight, - vertex_t const* sources, - vertex_t const number_of_sources); - - void configure_edge(result_t* betweenness, - bool normalize, - weight_t const* weight, - vertex_t const* sources, - vertex_t const number_of_sources); - void compute(); - void rescale_by_total_sources_used(vertex_t total_number_of_sources_used); - - private: - // --- RAFT handle --- - raft::handle_t const& handle_; - // --- Information concerning the graph --- - const legacy::GraphCSRView& graph_; - // --- These information are extracted on setup --- - vertex_t number_of_vertices_; // Number of vertices in the graph - vertex_t number_of_edges_; // Number of edges in the graph - edge_t const* offsets_ptr_; // Pointer to the offsets - vertex_t 
const* indices_ptr_; // Pointers to the indices - - // --- Information from configuration --- - bool configured_ = false; // Flag to ensure configuration was called - bool normalized_ = false; // If True normalize the betweenness - bool is_edge_betweenness_ = false; // If True compute edge_betweeness - - // FIXME: For weighted version - weight_t const* edge_weights_ptr_ = nullptr; // Pointer to the weights - bool endpoints_ = false; // If True normalize the betweenness - vertex_t const* sources_ = nullptr; // Subset of vertices to gather information from - vertex_t number_of_sources_; // Number of vertices in sources - - // --- Output ---- - // betweenness is set/read by users - using Vectors - result_t* betweenness_ = nullptr; - - // --- Data required to perform computation ---- - rmm::device_vector distances_vec_; - rmm::device_vector predecessors_vec_; - rmm::device_vector sp_counters_vec_; - rmm::device_vector deltas_vec_; - - vertex_t* distances_ = - nullptr; // array(|V|) stores the distances gathered by the latest SSSP - vertex_t* predecessors_ = - nullptr; // array(|V|) stores the predecessors of the latest SSSP - double* sp_counters_ = - nullptr; // array(|V|) stores the shortest path counter for the latest SSSP - double* deltas_ = nullptr; // array(|V|) stores the dependencies for the latest SSSP - - int max_grid_dim_1D_ = 0; - int max_block_dim_1D_ = 0; - - void setup(); - - void initialize_work_vectors(); - void initialize_pointers_to_vectors(); - void initialize_device_information(); - - void compute_single_source(vertex_t source_vertex); - - void accumulate(vertex_t source_vertex, vertex_t max_depth); - void initialize_dependencies(); - void accumulate_edges(vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration); - void accumulate_vertices_with_endpoints(vertex_t source_vertex, - vertex_t max_depth, - dim3 grid_configuration, - dim3 block_configuration); - void accumulate_vertices(vertex_t max_depth, dim3 grid_configuration, dim3 block_configuration); - void add_reached_endpoints_to_source_betweenness(vertex_t source_vertex); - void add_vertices_dependencies_to_betweenness(); - - void rescale(); - std::tuple rescale_vertices_betweenness_centrality(result_t rescale_factor, - bool modified); - std::tuple rescale_edges_betweenness_centrality(result_t rescale_factor, - bool modified); - void apply_rescale_factor_to_betweenness(result_t scaling_factor); -}; -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh b/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh deleted file mode 100644 index b0ccb669376..00000000000 --- a/cpp/src/centrality/legacy/betweenness_centrality_kernels.cuh +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -namespace cugraph { -namespace detail { -// Dependecy Accumulation: based on McLaughlin and Bader, 2018 -// FIXME: Accumulation kernel mights not scale well, as each thread is handling -// all the edges for each node, an approach similar to the traversal -// bucket (i.e. BFS / SSSP) system might enable speed up. -// Should look into forAllEdge type primitive for different -// load balancing -template -__global__ void edges_accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - double coefficient = vertex_sigma * factor; - - vertex_delta += coefficient; - betweenness[edge_idx] += coefficient; - } - } - deltas[vertex] = vertex_delta; - } - } -} - -template -__global__ void endpoints_accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - vertex_delta += vertex_sigma * factor; - } - } - betweenness[vertex] += 1; - deltas[vertex] = vertex_delta; - } - } -} -template -__global__ void accumulation_kernel(result_t* betweenness, - vertex_t number_vertices, - vertex_t const* indices, - edge_t const* offsets, - vertex_t* distances, - double* sp_counters, - double* deltas, - vertex_t depth) -{ - for (int thread_idx = blockIdx.x * blockDim.x + threadIdx.x; thread_idx < number_vertices; - thread_idx += gridDim.x * blockDim.x) { - vertex_t vertex = thread_idx; - double vertex_delta = 0; - double vertex_sigma = sp_counters[vertex]; - if (distances[vertex] == depth) { - edge_t first_edge_idx = offsets[vertex]; - edge_t last_edge_idx = offsets[vertex + 1]; - for (edge_t edge_idx = first_edge_idx; edge_idx < last_edge_idx; ++edge_idx) { - vertex_t successor = indices[edge_idx]; - if (distances[successor] == distances[vertex] + 1) { - double factor = (static_cast(1) + deltas[successor]) / sp_counters[successor]; - vertex_delta += vertex_sigma * factor; - } - } - deltas[vertex] = vertex_delta; - } - } -} -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs.cu b/cpp/src/traversal/legacy/bfs.cu deleted file mode 100644 index 
a0fb11a98d9..00000000000 --- a/cpp/src/traversal/legacy/bfs.cu +++ /dev/null @@ -1,575 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -#include "bfs.cuh" -#include -#include -#include - -#include - -#include "bfs_kernels.cuh" -#include "traversal_common.cuh" -#include -#include - -namespace cugraph { -namespace detail { -enum BFS_ALGO_STATE { TOPDOWN, BOTTOMUP }; - -template -void BFS::setup() -{ - // Determinism flag, false by default - deterministic = false; - - // Working data - // Each vertex can be in the frontier at most once - // We will update frontier during the execution - // We need the orig to reset frontier, or ALLOC_FREE_TRY - original_frontier.resize(number_of_vertices); - frontier = original_frontier.data().get(); - - // size of bitmaps for vertices - vertices_bmap_size = (number_of_vertices / (8 * sizeof(int)) + 1); - // ith bit of visited_bmap is set <=> ith vertex is visited - - visited_bmap.resize(vertices_bmap_size); - - // ith bit of isolated_bmap is set <=> degree of ith vertex = 0 - isolated_bmap.resize(vertices_bmap_size); - - // vertices_degree[i] = degree of vertex i - vertex_degree.resize(number_of_vertices); - - // We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it - // since those uses are mutually exclusive - buffer_np1_1.resize(number_of_vertices + 1); - buffer_np1_2.resize(number_of_vertices + 1); - - // Using buffers : top down - - // frontier_vertex_degree[i] is the degree of vertex frontier[i] - frontier_vertex_degree = buffer_np1_1.data().get(); - // exclusive sum of frontier_vertex_degree - exclusive_sum_frontier_vertex_degree = buffer_np1_2.data().get(); - - // Using buffers : bottom up - // contains list of unvisited vertices - unvisited_queue = buffer_np1_1.data().get(); - // size of the "last" unvisited queue : size_last_unvisited_queue - // refers to the size of unvisited_queue - // which may not be up to date (the queue may contains vertices that are now - // visited) - - // We may leave vertices unvisited after bottom up main kernels - storing them - // here - left_unvisited_queue = buffer_np1_2.data().get(); - - // We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
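
As a side note on the bitmap layout described in setup() below (one bit per vertex, packed into 32-bit ints), here is a minimal host-side sketch of the indexing arithmetic the kernels rely on. It is purely illustrative; the device code performs the updates with atomicOr and memcpys.

    #include <cstdint>
    #include <vector>

    constexpr int INT_SIZE = 8 * static_cast<int>(sizeof(int));  // 32 bits per bitmap word

    inline bool is_visited(std::vector<int> const& bmap, int32_t v)
    {
      return (bmap[v / INT_SIZE] >> (v % INT_SIZE)) & 1;
    }

    inline void set_visited(std::vector<int>& bmap, int32_t v)
    {
      bmap[v / INT_SIZE] |= (1 << (v % INT_SIZE));  // the GPU kernels use atomicOr here
    }

    // Sizing matches setup(): one extra word covers the remainder bits.
    inline std::size_t bitmap_words(int32_t number_of_vertices)
    {
      return number_of_vertices / INT_SIZE + 1;
    }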
- // frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of - // the first edge of the bucket See top down kernels for more details - exclusive_sum_frontier_vertex_buckets_offsets.resize( - ((number_of_edges / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2)); - - // Init device-side counters - // Those counters must be/can be reset at each bfs iteration - // Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the - // current bottleneck - d_counters_pad.resize(4); - - d_new_frontier_cnt = d_counters_pad.data().get(); - d_mu = d_counters_pad.data().get() + 1; - d_unvisited_cnt = d_counters_pad.data().get() + 2; - d_left_unvisited_cnt = d_counters_pad.data().get() + 3; - - // Lets use this int* for the next 3 lines - // Its dereferenced value is not initialized - so we dont care about what we - // put in it - IndexType* d_nisolated = d_new_frontier_cnt; - cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); - - // Computing isolated_bmap - // Only dependent on graph - not source vertex - done once - traversal::flag_isolated_vertices(number_of_vertices, - isolated_bmap.data().get(), - row_offsets, - vertex_degree.data().get(), - d_nisolated, - stream); - cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - // We need nisolated to be ready to use - cudaStreamSynchronize(stream); -} - -template -void BFS::configure(IndexType* _distances, - IndexType* _predecessors, - double* _sp_counters, - int* _edge_mask) -{ - distances = _distances; - predecessors = _predecessors; - edge_mask = _edge_mask; - sp_counters = _sp_counters; - - useEdgeMask = (edge_mask != NULL); - computeDistances = (distances != NULL); - computePredecessors = (predecessors != NULL); - - // We need distances to use bottom up - if (directed && !computeDistances) { - distances_vals.resize(number_of_vertices); - distances = distances_vals.data().get(); - } - - // In case the shortest path counters is required, previous_bmap has to be allocated - if (sp_counters) { previous_visited_bmap.resize(vertices_bmap_size); } -} - -template -void BFS::traverse(IndexType source_vertex) -{ - // Init visited_bmap - // If the graph is undirected, we not that - // we will never discover isolated vertices (in degree = out degree = 0) - // we avoid a lot of work by flagging them now - // in g500 graphs they represent ~25% of total vertices - // more than that for wiki and twitter graphs - - if (directed) { - cudaMemsetAsync(visited_bmap.data().get(), 0, vertices_bmap_size * sizeof(int), stream); - } else { - cudaMemcpyAsync(visited_bmap.data().get(), - isolated_bmap.data().get(), - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - } - - // If needed, setting all vertices as undiscovered (inf distance) - // We dont use computeDistances here - // if the graph is undirected, we may need distances even if - // computeDistances is false - if (distances) - traversal::fill_vec(distances, number_of_vertices, traversal::vec_t::max, stream); - - // If needed, setting all predecessors to non-existent (-1) - if (computePredecessors) { - cudaMemsetAsync(predecessors, -1, number_of_vertices * sizeof(IndexType), stream); - } - - if (sp_counters) { - cudaMemsetAsync(sp_counters, 0, number_of_vertices * sizeof(double), stream); - double value = 1; - cudaMemcpyAsync(sp_counters + source_vertex, &value, sizeof(double), cudaMemcpyHostToDevice); - } - - // - // Initial frontier - // - - frontier = 
original_frontier.data().get(); - - if (distances) { cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); } - - // Setting source_vertex as visited - // There may be bit already set on that bmap (isolated vertices) - if the - // graph is undirected - int current_visited_bmap_source_vert = 0; - - if (!directed) { - cudaMemcpyAsync(¤t_visited_bmap_source_vert, - visited_bmap.data().get() + (source_vertex / INT_SIZE), - sizeof(int), - cudaMemcpyDeviceToHost); - // We need current_visited_bmap_source_vert - cudaStreamSynchronize(stream); - } - - int m = (1 << (source_vertex % INT_SIZE)); - - // In that case, source is isolated, done now - if (!directed && (m & current_visited_bmap_source_vert)) { - // Init distances and predecessors are done, (cf Streamsync in previous if) - return; - } - - m |= current_visited_bmap_source_vert; - - cudaMemcpyAsync(visited_bmap.data().get() + (source_vertex / INT_SIZE), - &m, - sizeof(int), - cudaMemcpyHostToDevice, - stream); - - // Adding source_vertex to init frontier - cudaMemcpyAsync(&frontier[0], &source_vertex, sizeof(IndexType), cudaMemcpyHostToDevice, stream); - - // mf : edges in frontier - // nf : vertices in frontier - // mu : edges undiscovered - // nu : nodes undiscovered - // lvl : current frontier's depth - IndexType mf, nf, mu, nu; - bool growing; - IndexType lvl = 1; - - // Frontier has one vertex - nf = 1; - - // all edges are undiscovered (by def isolated vertices have 0 edges) - mu = number_of_edges; - - // all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) - // That number is wrong if source_vertex is also isolated - but it's not important - nu = number_of_vertices - nisolated - nf; - - // Last frontier was 0, now it is 1 - growing = true; - - IndexType size_last_left_unvisited_queue = number_of_vertices; // we just need value > 0 - IndexType size_last_unvisited_queue = 0; // queue empty - - // Typical pre-top down workflow. set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - // We need mf - cudaStreamSynchronize(stream); - - // At first we know we have to use top down - BFS_ALGO_STATE algo_state = TOPDOWN; - - // useDistances : we check if a vertex is a parent using distances in bottom up - distances become - // working data undirected g : need parents to be in children's neighbors - - // In case the shortest path counters need to be computeed, the bottom_up approach cannot be used - // bool can_use_bottom_up = (!sp_counters && !directed && distances); - bool can_use_bottom_up = false; - - while (nf > 0) { - new_frontier = frontier + nf; - IndexType old_nf = nf; - resetDevicePointers(); - - if (can_use_bottom_up) { - // Choosing algo - // Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf - - switch (algo_state) { - case TOPDOWN: - if (mf > mu / alpha) algo_state = BOTTOMUP; - break; - case BOTTOMUP: - if (!growing && nf < number_of_vertices / beta) { - // We need to prepare the switch back to top down - // We couldnt keep track of mu during bottom up - because we dont know what mf is. 
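
The alpha/beta state machine used in the loop above (after Beamer's direction-optimizing BFS) can be summarized as the small helper below. This restatement is illustrative only; note that this legacy file hard-codes can_use_bottom_up to false, so in practice only the top-down branch ever runs.

    enum class Direction { TOPDOWN, BOTTOMUP };

    template <typename IndexType>
    Direction next_direction(Direction current,
                             IndexType mf,   // edges incident to the current frontier
                             IndexType mu,   // edges incident to still-undiscovered vertices
                             IndexType nf,   // vertices in the current frontier
                             IndexType n,    // total number of vertices
                             bool growing,   // frontier grew since the previous level
                             IndexType alpha,
                             IndexType beta)
    {
      if (current == Direction::TOPDOWN && mf > mu / alpha) return Direction::BOTTOMUP;
      if (current == Direction::BOTTOMUP && !growing && nf < n / beta) return Direction::TOPDOWN;
      return current;
    }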
- // Computing mu here - bfs_kernels::count_unvisited_edges(unvisited_queue, - size_last_unvisited_queue, - visited_bmap.data().get(), - vertex_degree.data().get(), - d_mu, - stream); - - // Typical pre-top down workflow. set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - // We will need mf and mu - cudaStreamSynchronize(stream); - algo_state = TOPDOWN; - } - break; - } - } - - // Executing algo - - switch (algo_state) { - case TOPDOWN: - // This step is only required if sp_counters is not nullptr - if (sp_counters) { - cudaMemcpyAsync(previous_visited_bmap.data().get(), - visited_bmap.data().get(), - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - // We need to copy the visited_bmap before doing the traversal - cudaStreamSynchronize(stream); - } - traversal::compute_bucket_offsets( - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets.data().get(), - nf, - mf, - stream); - bfs_kernels::frontier_expand(row_offsets, - col_indices, - frontier, - nf, - mf, - lvl, - new_frontier, - d_new_frontier_cnt, - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets.data().get(), - previous_visited_bmap.data().get(), - visited_bmap.data().get(), - distances, - predecessors, - sp_counters, - edge_mask, - isolated_bmap.data().get(), - directed, - stream, - deterministic); - - mu -= mf; - - cudaMemcpyAsync(&nf, d_new_frontier_cnt, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - RAFT_CHECK_CUDA(stream); - - // We need nf - cudaStreamSynchronize(stream); - - if (nf) { - // Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan - traversal::set_frontier_degree( - frontier_vertex_degree, new_frontier, vertex_degree.data().get(), nf, stream); - traversal::exclusive_sum( - frontier_vertex_degree, exclusive_sum_frontier_vertex_degree, nf + 1, stream); - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - // We need mf - cudaStreamSynchronize(stream); - } - break; - - case BOTTOMUP: - bfs_kernels::fill_unvisited_queue(visited_bmap.data().get(), - vertices_bmap_size, - number_of_vertices, - unvisited_queue, - d_unvisited_cnt, - stream, - deterministic); - - size_last_unvisited_queue = nu; - - bfs_kernels::bottom_up_main(unvisited_queue, - size_last_unvisited_queue, - left_unvisited_queue, - d_left_unvisited_cnt, - visited_bmap.data().get(), - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - - // The number of vertices left unvisited decreases - // If it wasnt necessary last time, it wont be this time - if (size_last_left_unvisited_queue) { - cudaMemcpyAsync(&size_last_left_unvisited_queue, - d_left_unvisited_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - RAFT_CHECK_CUDA(stream); - // We need last_left_unvisited_size - cudaStreamSynchronize(stream); - bfs_kernels::bottom_up_large(left_unvisited_queue, - size_last_left_unvisited_queue, - visited_bmap.data().get(), - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - } - cudaMemcpyAsync(&nf, d_new_frontier_cnt, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - RAFT_CHECK_CUDA(stream); - - // We will need nf - cudaStreamSynchronize(stream); - break; - } - - // Updating undiscovered edges count - nu -= nf; - - // Using new frontier - frontier = new_frontier; - growing = (nf > old_nf); - - ++lvl; - } -} - -template -void BFS::resetDevicePointers() -{ - cudaMemsetAsync(d_counters_pad.data().get(), 0, 4 * sizeof(IndexType), stream); -} - -template -void BFS::clean() -{ - // the vectors have a destructor that takes care of cleaning -} - -// Explicit Instantiation -template class BFS; -template class BFS; -template class BFS; - -} // namespace detail - -// NOTE: SP counter increase extremely fast on large graph -// It can easily reach 1e40~1e70 on GAP-road.mtx -template -void bfs(raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - VT* distances, - VT* predecessors, - double* sp_counters, - const VT start_vertex, - bool directed, - bool mg_batch) -{ - static_assert(std::is_integral::value && sizeof(VT) >= sizeof(int32_t), - "Unsupported vertex id data type. Use integral types of size >= sizeof(int32_t)"); - static_assert(std::is_same::value, - "VT and ET should be the same time for the current BFS implementation"); - static_assert(std::is_floating_point::value, - "Unsupported edge weight type. 
Use floating point types"); // actually, this is - // unnecessary for BFS - if (handle.comms_initialized() && !mg_batch) { - CUGRAPH_FAIL("NO LONGER SUPPORTED"); - } else { - VT number_of_vertices = graph.number_of_vertices; - ET number_of_edges = graph.number_of_edges; - - const VT* indices_ptr = graph.indices; - const ET* offsets_ptr = graph.offsets; - - int alpha = 15; - int beta = 18; - // FIXME: Use VT and ET in the BFS detail - cugraph::detail::BFS bfs( - number_of_vertices, number_of_edges, offsets_ptr, indices_ptr, directed, alpha, beta); - bfs.configure(distances, predecessors, sp_counters, nullptr); - bfs.traverse(start_vertex); - } -} - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - uint32_t* distances, - uint32_t* predecessors, - double* sp_counters, - const uint32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - uint32_t* distances, - uint32_t* predecessors, - double* sp_counters, - const uint32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int32_t* distances, - int32_t* predecessors, - double* sp_counters, - const int32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int32_t* distances, - int32_t* predecessors, - double* sp_counters, - const int32_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int64_t* distances, - int64_t* predecessors, - double* sp_counters, - const int64_t source_vertex, - bool directed, - bool mg_batch); - -// Explicit Instantiation -template void bfs( - raft::handle_t const& handle, - legacy::GraphCSRView const& graph, - int64_t* distances, - int64_t* predecessors, - double* sp_counters, - const int64_t source_vertex, - bool directed, - bool mg_batch); - -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs.cuh b/cpp/src/traversal/legacy/bfs.cuh deleted file mode 100644 index dd636a2c97c..00000000000 --- a/cpp/src/traversal/legacy/bfs.cuh +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ - -#pragma once - -#include -#include - -#define TRAVERSAL_DEFAULT_ALPHA 15 - -#define TRAVERSAL_DEFAULT_BETA 18 - -namespace cugraph { -namespace detail { -// FIXME: Differentiate IndexType for vertices and edges -template -class BFS { - private: - IndexType number_of_vertices, number_of_edges; - const IndexType* row_offsets = nullptr; - const IndexType* col_indices = nullptr; - - bool directed; - bool deterministic; - - // edgemask, distances, predecessors are set/read by users - using Vectors - bool useEdgeMask; - bool computeDistances; - bool computePredecessors; - rmm::device_vector distances_vals; - IndexType* distances = nullptr; - IndexType* predecessors = nullptr; - double* sp_counters = nullptr; - int* edge_mask = nullptr; - - rmm::device_vector original_frontier; - rmm::device_vector visited_bmap; - rmm::device_vector isolated_bmap; - rmm::device_vector previous_visited_bmap; - rmm::device_vector vertex_degree; - rmm::device_vector buffer_np1_1; - rmm::device_vector buffer_np1_2; - rmm::device_vector exclusive_sum_frontier_vertex_buckets_offsets; - rmm::device_vector d_counters_pad; - // Working data - // For complete description of each, go to bfs.cu - IndexType nisolated; - IndexType* frontier = nullptr; - IndexType* new_frontier = nullptr; - IndexType* frontier_vertex_degree = nullptr; - IndexType* exclusive_sum_frontier_vertex_degree = nullptr; - IndexType* unvisited_queue = nullptr; - IndexType* left_unvisited_queue = nullptr; - IndexType* d_new_frontier_cnt = nullptr; - IndexType* d_mu = nullptr; - IndexType* d_unvisited_cnt = nullptr; - IndexType* d_left_unvisited_cnt = nullptr; - - IndexType vertices_bmap_size; - - // Parameters for direction optimizing - IndexType alpha, beta; - cudaStream_t stream; - - // resets pointers defined by d_counters_pad (see implem) - void resetDevicePointers(); - void setup(); - void clean(); - - public: - virtual ~BFS(void) { clean(); } - - BFS(IndexType _number_of_vertices, - IndexType _number_of_edges, - const IndexType* _row_offsets, - const IndexType* _col_indices, - bool _directed, - IndexType _alpha, - IndexType _beta, - cudaStream_t _stream = 0) - : number_of_vertices(_number_of_vertices), - number_of_edges(_number_of_edges), - row_offsets(_row_offsets), - col_indices(_col_indices), - directed(_directed), - alpha(_alpha), - beta(_beta), - stream(_stream) - { - setup(); - } - - void configure(IndexType* distances, - IndexType* predecessors, - double* sp_counters, - int* edge_mask); - - void traverse(IndexType source_vertex); -}; -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/bfs_kernels.cuh b/cpp/src/traversal/legacy/bfs_kernels.cuh deleted file mode 100644 index a0c49e9601a..00000000000 --- a/cpp/src/traversal/legacy/bfs_kernels.cuh +++ /dev/null @@ -1,1163 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include - -#include -#include - -#include "traversal_common.cuh" -#include - -namespace cugraph { -namespace detail { -namespace bfs_kernels { -// -// ------------------------- Bottom up ------------------------- -// - -// -// fill_unvisited_queue_kernel -// -// Finding unvisited vertices in the visited_bmap, and putting them in the queue -// Vertices represented by the same int in the bitmap are adjacent in the queue, -// and sorted For instance, the queue can look like this : 34 38 45 58 61 4 18 -// 24 29 71 84 85 90 Because they are represented by those ints in the bitmap : -// [34 38 45 58 61] [4 18 24 29] [71 84 85 90] - -// visited_bmap_nints = the visited_bmap is made of that number of ints - -template -__global__ void fill_unvisited_queue_kernel(int* visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType* unvisited, - IndexType* unvisited_cnt) -{ - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - // When filling the "unvisited" queue, we use "unvisited_cnt" to know where to - // write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) - // We will actually do only one atomicAdd per block - we first do a scan, then - // call one atomicAdd, and store the common offset for the block in - // unvisited_common_block_offset - __shared__ IndexType unvisited_common_block_offset; - - // We don't want threads divergence in the loop (we're going to call - // __syncthreads) Using a block-only dependent in the condition of the loop - for (IndexType block_v_idx = blockIdx.x * blockDim.x; block_v_idx < visited_bmap_nints; - block_v_idx += blockDim.x * gridDim.x) { - // Index of visited_bmap that this thread will compute - IndexType v_idx = block_v_idx + threadIdx.x; - - int thread_visited_int = (v_idx < visited_bmap_nints) - ? 
visited_bmap[v_idx] - : (~0); // will be neutral in the next lines - // (virtual vertices all visited) - - // The last int can only be partially valid - // If we are indeed taking care of the last visited int in this thread, - // We need to first disable (ie set as "visited") the inactive bits - // (vertices >= n) - if (v_idx == (visited_bmap_nints - 1)) { - int active_bits = n - (INT_SIZE * v_idx); - int inactive_bits = INT_SIZE - active_bits; - int mask = traversal::getMaskNLeftmostBitSet(inactive_bits); - thread_visited_int |= mask; // Setting inactive bits as visited - } - - // Counting number of unvisited vertices represented by this int - int n_unvisited_in_int = __popc(~thread_visited_int); - int unvisited_thread_offset; - - // We will need to write n_unvisited_in_int unvisited vertices to the - // unvisited queue We ask for that space when computing the block scan, that - // will tell where to write those vertices in the queue, using the common - // offset of the block (see below) - BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); - - // Last thread knows how many vertices will be written to the queue by this - // block Asking for that space in the queue using the global count, and - // saving the common offset - if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { - IndexType total = unvisited_thread_offset + n_unvisited_in_int; - unvisited_common_block_offset = traversal::atomicAdd(unvisited_cnt, total); - } - - // syncthreads for two reasons : - // - we need to broadcast unvisited_common_block_offset - // - we will reuse scan_temp_storage (cf CUB doc) - __syncthreads(); - - IndexType current_unvisited_index = unvisited_common_block_offset + unvisited_thread_offset; - int nvertices_to_write = n_unvisited_in_int; - - // getNextZeroBit uses __ffs, which gives least significant bit set - // which means that as long as n_unvisited_in_int is valid, - // we will use valid bits - - while (nvertices_to_write > 0) { - if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { - typename traversal::vec_t::vec4 vec_v; - - vec_v.x = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.z = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.w = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - typename traversal::vec_t::vec4* unvisited_i4 = - reinterpret_cast::vec4*>( - &unvisited[current_unvisited_index]); - *unvisited_i4 = vec_v; - - current_unvisited_index += 4; - nvertices_to_write -= 4; - } else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { - typename traversal::vec_t::vec2 vec_v; - - vec_v.x = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - typename traversal::vec_t::vec2* unvisited_i2 = - reinterpret_cast::vec2*>( - &unvisited[current_unvisited_index]); - *unvisited_i2 = vec_v; - - current_unvisited_index += 2; - nvertices_to_write -= 2; - } else { - IndexType v = v_idx * INT_SIZE + traversal::getNextZeroBit(thread_visited_int); - - unvisited[current_unvisited_index] = v; - - current_unvisited_index += 1; - nvertices_to_write -= 1; - } - } - } -} - -// Wrapper -template -void fill_unvisited_queue(int* visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType* unvisited, - IndexType* unvisited_cnt, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, 
block; - block.x = FILL_UNVISITED_QUEUE_DIMX; - - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(visited_bmap_nints) + block.x - 1) / block.x); - - fill_unvisited_queue_kernel<<>>( - visited_bmap, visited_bmap_nints, n, unvisited, unvisited_cnt); - RAFT_CHECK_CUDA(m_stream); -} - -// -// count_unvisited_edges_kernel -// Couting the total number of unvisited edges in the graph - using an -// potentially unvisited queue We need the current unvisited vertices to be in -// the unvisited queue But visited vertices can be in the potentially_unvisited -// queue We first check if the vertex is still unvisited before using it Useful -// when switching from "Bottom up" to "Top down" -// - -template -__global__ void count_unvisited_edges_kernel(const IndexType* potentially_unvisited, - const IndexType potentially_unvisited_size, - const int* visited_bmap, - IndexType* degree_vertices, - IndexType* mu) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_temp_storage; - - // number of undiscovered edges counted by this thread - IndexType thread_unvisited_edges_count = 0; - - for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; idx < potentially_unvisited_size; - idx += blockDim.x * gridDim.x) { - IndexType u = potentially_unvisited[idx]; - int u_visited_bmap = visited_bmap[u / INT_SIZE]; - int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); - - if (!is_visited) thread_unvisited_edges_count += degree_vertices[u]; - } - - // We need all thread_unvisited_edges_count to be ready before reducing - __syncthreads(); - - IndexType block_unvisited_edges_count = - BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); - - // block_unvisited_edges_count is only defined is th.x == 0 - if (threadIdx.x == 0) traversal::atomicAdd(mu, block_unvisited_edges_count); -} - -// Wrapper -template -void count_unvisited_edges(const IndexType* potentially_unvisited, - const IndexType potentially_unvisited_size, - const int* visited_bmap, - IndexType* node_degree, - IndexType* mu, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = COUNT_UNVISITED_EDGES_DIMX; - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(potentially_unvisited_size) + block.x - 1) / block.x); - - count_unvisited_edges_kernel<<>>( - potentially_unvisited, potentially_unvisited_size, visited_bmap, node_degree, mu); - RAFT_CHECK_CUDA(m_stream); -} - -// -// Main Bottom Up kernel -// Here we will start to process unvisited vertices in the unvisited queue -// We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges -// If it's not possible to define a valid parent using only those edges, -// add it to the "left_unvisited_queue" -// - -// -// We will use the "vertices represented by the same int in the visited bmap are -// adjacents and sorted in the unvisited queue" property It is used to do a -// reduction locally and fully build the new visited_bmap -// - -template -__global__ void main_bottomup_kernel(const IndexType* unvisited, - const IndexType unvisited_size, - IndexType* left_unvisited, - IndexType* left_unvisited_cnt, - int* visited_bmap, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - IndexType* distances, - IndexType* predecessors, - int* edge_mask) -{ - typedef cub::BlockDiscontinuity BlockDiscontinuity; - typedef cub::WarpReduce WarpReduce; - typedef cub::BlockScan BlockScan; - - __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; - 
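
Relating back to fill_unvisited_queue_kernel above: the per-word extraction of unvisited vertices boils down to a popcount of the complemented bitmap word followed by repeated find-first-set. Below is a host-side sketch using compiler builtins; it is illustrative only, while the device code uses __popc and getNextZeroBit and writes the results through vectorized stores.

    #include <cstdint>
    #include <vector>

    // Append the vertices whose bits are 0 in one 32-bit bitmap word.
    void append_unvisited_from_word(int word,             // 32 visited bits
                                    int32_t word_index,   // index of this word in the bitmap
                                    std::vector<int32_t>& unvisited)
    {
      int remaining = __builtin_popcount(~static_cast<unsigned>(word));  // zero bits = unvisited
      while (remaining-- > 0) {
        int bit = __builtin_ffs(~word) - 1;          // least-significant zero bit of `word`
        unvisited.push_back(word_index * 32 + bit);  // 32 = bits per bitmap word
        word |= (1 << bit);                          // consume the bit so it is not found again
      }
    }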
__shared__ typename WarpReduce::TempStorage reduce_temp_storage; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - // To write vertices in the frontier, - // We will use a block scan to locally compute the offsets - // frontier_common_block_offset contains the common offset for the block - __shared__ IndexType frontier_common_block_offset; - - // When building the new visited_bmap, we reduce (using a bitwise and) the - // visited_bmap ints from the vertices represented by the same int (for - // instance vertices 1, 5, 9, 13, 23) vertices represented by the same int - // will be designed as part of the same "group" To detect the deliminations - // between those groups, we use BlockDiscontinuity Then we need to create the - // new "visited_bmap" within those group. We use a warp reduction that takes - // into account limits between groups to do it But a group can be cut in two - // different warps : in that case, the second warp put the result of its local - // reduction in local_visited_bmap_warp_head the first warp will then read it - // and finish the reduction - - __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; - - const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - // we will call __syncthreads inside the loop - // we need to keep complete block active - for (IndexType block_off = blockIdx.x * blockDim.x; block_off < unvisited_size; - block_off += blockDim.x * gridDim.x) { - IndexType idx = block_off + threadIdx.x; - - // This thread will take care of unvisited_vertex - // in the visited_bmap, it is represented by the int at index - // visited_bmap_index = unvisited_vertex/INT_SIZE - // it will be used by BlockDiscontinuity - // to flag the separation between groups of vertices (vertices represented - // by different in in visited_bmap) - IndexType visited_bmap_index[1]; // this is an array of size 1 because CUB - // needs one - - visited_bmap_index[0] = invalid_idx; - IndexType unvisited_vertex = invalid_idx; - - // local_visited_bmap gives info on the visited bit of unvisited_vertex - // - // By default, everything is visited - // This is because we only take care of unvisited vertices here, - // The other are by default unvisited - // If a vertex remain unvisited, we will notice it here - // That's why by default we consider everything visited ( ie ~0 ) - // If we fail to assign one parent to an unvisited vertex, we will - // explicitly unset the bit - int local_visited_bmap = (~0); - int found = 0; - int more_to_visit = 0; - IndexType valid_parent; - IndexType left_unvisited_off; - - if (idx < unvisited_size) { - // Processing first STPV edges of unvisited v - // If bigger than that, push to left_unvisited queue - unvisited_vertex = unvisited[idx]; - - IndexType edge_begin = row_ptr[unvisited_vertex]; - IndexType edge_end = row_ptr[unvisited_vertex + 1]; - - visited_bmap_index[0] = unvisited_vertex / INT_SIZE; - - IndexType degree = edge_end - edge_begin; - - for (IndexType edge = edge_begin; - edge < min(static_cast(edge_end), - static_cast(edge_begin) + MAIN_BOTTOMUP_MAX_EDGES); - ++edge) { - if (edge_mask && !edge_mask[edge]) continue; - - IndexType parent_candidate = col_ind[edge]; - - if (distances[parent_candidate] == (lvl - 1)) { - 
found = 1; - valid_parent = parent_candidate; - break; - } - } - - // This vertex will remain unvisited at the end of this kernel - // Explicitly say it - if (!found) - local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); // let this one unvisited - else { - if (distances) distances[unvisited_vertex] = lvl; - if (predecessors) predecessors[unvisited_vertex] = valid_parent; - } - - // If we haven't found a parent and there's more edge to check - if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) { - left_unvisited_off = traversal::atomicAdd(left_unvisited_cnt, static_cast(1)); - more_to_visit = 1; - } - } - - // - // We will separate vertices in group - // Two vertices are in the same group if represented by same int in - // visited_bmap ie u and v in same group <=> u/32 == v/32 - // - // We will now flag the head of those group (first element of each group) - // - // 1) All vertices within the same group are adjacent in the queue (cf - // fill_unvisited_queue) 2) A group is of size <= 32, so a warp will contain - // at least one head, and a group will be contained at most by two warps - - int is_head_a[1]; // CUB need an array - BlockDiscontinuity(discontinuity_temp_storage) - .FlagHeads(is_head_a, visited_bmap_index, cub::Inequality()); - int is_head = is_head_a[0]; - - // Computing the warp reduce within group - // This primitive uses the is_head flags to know where the limits of the - // groups are We use bitwise and as operator, because of the fact that 1 is - // the default value If a vertex is unvisited, we have to explicitly ask for - // it - int local_bmap_agg = - WarpReduce(reduce_temp_storage) - .HeadSegmentedReduce(local_visited_bmap, is_head, traversal::BitwiseAnd()); - - // We need to take care of the groups cut in two in two different warps - // Saving second part of the reduce here, then applying it on the first part - // bellow Corner case : if the first thread of the warp is a head, then this - // group is not cut in two and then we have to be neutral (for an bitwise - // and, it's an ~0) - if (laneid == 0) { local_visited_bmap_warp_head[warpid] = (is_head) ? 
(~0) : local_bmap_agg; } - - // broadcasting local_visited_bmap_warp_head - __syncthreads(); - - int head_ballot = __ballot_sync(raft::warp_full_mask(), is_head); - - // As long as idx < unvisited_size, we know there's at least one head per - // warp - int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); - - int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); - - // if laneid == 0 && is_last_head_in_warp, it's a special case where - // a group of size 32 starts exactly at lane 0 - // in that case, nothing to do (this group is not cut by a warp - // delimitation) we also have to make sure that a warp actually exists after - // this one (this corner case is handled after) - if (laneid != 0 && (is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS)) { - local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; - } - - // Three cases : - // -> This is the first group of the block - it may be cut in two (with - // previous block) - // -> This is the last group of the block - same thing - // -> This group is completely contained in this block - - if (warpid == 0 && laneid == 0) { - // The first elt of this group considered in this block is - // unvisited_vertex We know that's the case because elts are sorted in a - // group, and we are at laneid == 0 We will do an atomicOr - we have to be - // neutral about elts < unvisited_vertex - int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid - int mask = traversal::getMaskNLeftmostBitSet(INT_SIZE - iv); - local_bmap_agg &= mask; // we have to be neutral for elts < unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && - laneid >= laneid_last_head_in_warp && // We need the other ones - // to go in else case - idx < unvisited_size // we could be out - ) { - // Last head of the block - // We don't know if this group is complete - - // last_v is the last unvisited_vertex of the group IN THIS block - // we dont know about the rest - we have to be neutral about elts > last_v - - // the destination thread of the __shfl is active - int laneid_max = - min(static_cast(WARP_SIZE - 1), (unvisited_size - (block_off + 32 * warpid))); - IndexType last_v = __shfl_sync(__activemask(), unvisited_vertex, laneid_max, WARP_SIZE); - - if (is_last_head_in_warp) { - int ilast_v = last_v % INT_SIZE + 1; - int mask = traversal::getMaskNRightmostBitSet(ilast_v); - local_bmap_agg &= mask; // we have to be neutral for elts > last_unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - } else { - // group completely in block - if (is_head && idx < unvisited_size) { - visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; // no atomics needed, we know - // everything about this int - } - } - - // Saving in frontier - - int thread_frontier_offset; - BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); - IndexType inclusive_sum = thread_frontier_offset + found; - if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) { - frontier_common_block_offset = traversal::atomicAdd(new_frontier_cnt, inclusive_sum); - } - - // 1) Broadcasting frontier_common_block_offset - // 2) we want to reuse the *_temp_storage - __syncthreads(); - - if (found) - new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; - if (more_to_visit) left_unvisited[left_unvisited_off] = unvisited_vertex; - } -} - -template -void bottom_up_main(IndexType* unvisited, - 
IndexType unvisited_size, - IndexType* left_unvisited, - IndexType* d_left_unvisited_idx, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_idx, - IndexType* distances, - IndexType* predecessors, - int* edge_mask, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, block; - block.x = MAIN_BOTTOMUP_DIMX; - - grid.x = std::min(static_cast(MAXBLOCKS), - (static_cast(unvisited_size) + block.x) / block.x); - - main_bottomup_kernel<<>>(unvisited, - unvisited_size, - left_unvisited, - d_left_unvisited_idx, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - RAFT_CHECK_CUDA(m_stream); -} - -// -// bottom_up_large_degree_kernel -// finishing the work started in main_bottomup_kernel for vertex with degree > -// MAIN_BOTTOMUP_MAX_EDGES && no parent found -// -template -__global__ void bottom_up_large_degree_kernel(IndexType* left_unvisited, - IndexType left_unvisited_size, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - IndexType* distances, - IndexType* predecessors, - int* edge_mask) -{ - int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - // Inactive threads are not a pb for __ballot (known behaviour) - for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; - idx < left_unvisited_size; - idx += gridDim.x * logical_warps_per_block) { - // Unvisited vertices - potentially in the next frontier - IndexType v = left_unvisited[idx]; - - // Used only with symmetric graphs - // Parents are included in v's neighbors - IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; // we already have checked the - // first MAIN_BOTTOMUP_MAX_EDGES - // edges in find_unvisited - - IndexType end_i_edge = row_ptr[v + 1]; - - // We can have warp divergence in the next loop - // It's not a pb because the behaviour of __ballot - // is know with inactive threads - for (IndexType i_edge = first_i_edge + logical_lane_id; i_edge < end_i_edge; - i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { - IndexType valid_parent = invalid_idx; - - if (!edge_mask || edge_mask[i_edge]) { - IndexType u = col_ind[i_edge]; - IndexType lvl_u = distances[u]; - - if (lvl_u == (lvl - 1)) { valid_parent = u; } - } - - unsigned int warp_valid_p_ballot = - __ballot_sync(raft::warp_full_mask(), valid_parent != invalid_idx); - - int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; - unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; - unsigned int logical_warp_valid_p_ballot = - warp_valid_p_ballot >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); - logical_warp_valid_p_ballot &= mask; - - int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; - - if (chosen_thread == logical_lane_id) { - // Using only one valid parent (reduce bw) - IndexType off = traversal::atomicAdd(new_frontier_cnt, static_cast(1)); - int m = 1 << (v % INT_SIZE); - atomicOr(&visited[v / INT_SIZE], m); - distances[v] 
= lvl; - - if (predecessors) predecessors[v] = valid_parent; - - new_frontier[off] = v; - } - - if (logical_warp_valid_p_ballot) { break; } - } - } -} - -template -void bottom_up_large(IndexType* left_unvisited, - IndexType left_unvisited_size, - int* visited, - const IndexType* row_ptr, - const IndexType* col_ind, - IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_idx, - IndexType* distances, - IndexType* predecessors, - int* edge_mask, - cudaStream_t m_stream, - bool deterministic) -{ - dim3 grid, block; - block.x = LARGE_BOTTOMUP_DIMX; - grid.x = std::min( - static_cast(MAXBLOCKS), - ((static_cast(left_unvisited_size) + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / - block.x); - - bottom_up_large_degree_kernel<<>>(left_unvisited, - left_unvisited_size, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - RAFT_CHECK_CUDA(m_stream); -} - -// -// topdown_expand_kernel -// Read current frontier and compute new one with top down paradigm -// One thread = One edge -// To know origin of edge, we have to find where is index_edge in the values of -// frontier_degrees_exclusive_sum (using a binary search, max less or equal -// than) This index k will give us the origin of this edge, which is frontier[k] -// This thread will then process the (linear_idx_thread - -// frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] -// -// To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load -// NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches -// We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we -// prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD -// * blockDim.x edges -// -// Once we have those offsets, we may still need a few values from -// frontier_degrees_exclusive_sum to compute exact index k To be able to do it, -// we will load the values that we need from frontier_degrees_exclusive_sum in -// shared memory We know that it will fit because we never add node with degree -// == 0 in the frontier, so we have an upper bound on the number of value to -// load (see below) -// -// We will then look which vertices are not visited yet : -// 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as -// visited, update distances and predecessors, and move on 2) if the unvisited -// vertex has degree > 0, we add it to the "frontier_candidates" queue -// -// We then treat the candidates queue using the threadIdx.x < ncandidates -// If we are indeed the first thread to discover that vertex (result of -// atomicOr(visited)) We add it to the new frontier -// - -template -__global__ void topdown_expand_kernel( - const IndexType* row_ptr, - const IndexType* col_ind, - const IndexType* frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType max_items_per_thread, - const IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - const IndexType* frontier_degrees_exclusive_sum, - const IndexType* frontier_degrees_exclusive_sum_buckets_offsets, - int* previous_bmap, - int* bmap, - IndexType* distances, - IndexType* predecessors, - double* sp_counters, - const int* edge_mask, - const int* isolated_bmap, - bool directed) -{ - // BlockScan - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_storage; - - // We will do a scan to know where to write in frontier - // This will contain the common offset of the block - __shared__ IndexType 
frontier_common_block_offset; - - __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; - __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; - - // - // Frontier candidates local queue - // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able - // to store everything We also save the predecessors here, because we will not - // be able to retrieve it after - // - __shared__ IndexType - shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType - shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType block_n_frontier_candidates; - - IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; - - // When this kernel is converted to support different VT and ET, this - // will likely split into invalid_vid and invalid_eid - // This is equivalent to ~IndexType(0) (i.e., all bits set to 1) - constexpr IndexType invalid_idx = cugraph::legacy::invalid_idx::value; - - IndexType n_items_per_thread_left = - (totaldegree > block_offset) - ? (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) / TOP_DOWN_EXPAND_DIMX - : 0; - - n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); - - for (; (n_items_per_thread_left > 0) && (block_offset < totaldegree); - - block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, - n_items_per_thread_left -= min( - n_items_per_thread_left, static_cast(MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD))) { - // In this loop, we will process batch_set_size batches - IndexType nitems_per_thread = - min(n_items_per_thread_left, static_cast(MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD)); - - // Loading buckets offset (see compute_bucket_offsets_kernel) - - if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) - shared_buckets_offsets[threadIdx.x] = - frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + - threadIdx.x]; - - // We will use shared_buckets_offsets - __syncthreads(); - - // - // shared_buckets_offsets gives us a range of the possible indexes - // for edge of linear_threadx, we are looking for the value k such as - // k is the max value such as frontier_degrees_exclusive_sum[k] <= - // linear_threadx - // - // we have 0 <= k < frontier_size - // but we also have : - // - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] - // <= k - // <= - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE - // + 1] - // - // To find the exact value in that range, we need a few values from - // frontier_degrees_exclusive_sum (see below) We will load them here We will - // load as much as we can - if it doesn't fit we will make multiple - // iteration of the next loop Because all vertices in frontier have degree > - // 0, we know it will fits if left + 1 = right (see below) - - // We're going to load values in frontier_degrees_exclusive_sum for batch - // [left; right[ If it doesn't fit, --right until it does, then loop It is - // excepted to fit on the first try, that's why we start right = - // nitems_per_thread - - IndexType left = 0; - IndexType right = nitems_per_thread; - - while (left < nitems_per_thread) { - // - // Values that are necessary to compute the local binary searches - // We only need those with indexes between extremes indexes of - // buckets_offsets We need the next val for the binary search, hence the - // +1 - // - - IndexType 
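The "max k such that frontier_degrees_exclusive_sum[k] <= gid" search described in the comment above can be illustrated with a small host-side sketch (the names and worked numbers are mine; the removed kernel performs the same search in shared memory, bounded by the precomputed bucket offsets).

#include <cassert>
#include <vector>

// Largest k in [low, high] with prefix[k] <= val; assumes prefix[low] <= val.
template <typename T>
T binsearch_maxle(const std::vector<T>& prefix, T val, T low, T high)
{
  while (true) {
    if (low == high) return low;
    if (low + 1 == high) return (prefix[high] <= val) ? high : low;
    T mid = low + (high - low) / 2;
    if (prefix[mid] > val) high = mid - 1; else low = mid;
  }
}

int main()
{
  // Frontier vertices with degrees {3, 1, 4} -> exclusive prefix sum {0, 3, 4}.
  std::vector<int> prefix{0, 3, 4};
  // Global edge index 5 belongs to the third frontier vertex (k = 2), and it is
  // that vertex's edge number 5 - prefix[2] = 1.
  int k = binsearch_maxle(prefix, 5, 0, static_cast<int>(prefix.size()) - 1);
  assert(k == 2 && 5 - prefix[k] == 1);
  return 0;
}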
nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - - // If left = right + 1 we are sure to have nvalues_to_load < - // TOP_DOWN_EXPAND_DIMX+1 - while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { - --right; - - nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - } - - IndexType nitems_per_thread_for_this_load = right - left; - - IndexType frontier_degrees_exclusive_sum_block_offset = - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK]; - - if (threadIdx.x < nvalues_to_load) { - shared_frontier_degrees_exclusive_sum[threadIdx.x] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + threadIdx.x]; - } - - if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { - shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + - TOP_DOWN_EXPAND_DIMX]; - } - - // shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, - // sync - __syncthreads(); - - // Now we will process the edges - // Here each thread will process nitems_per_thread_for_this_load - for (IndexType item_index = 0; item_index < nitems_per_thread_for_this_load; - item_index += TOP_DOWN_BATCH_SIZE) { - // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction - // parallism) Reduces latency - - IndexType current_max_edge_index = min( - static_cast(block_offset) + (left + nitems_per_thread_for_this_load) * blockDim.x, - static_cast(totaldegree)); - - // We will need vec_u (source of the edge) until the end if we need to - // save the predecessors For others informations, we will reuse pointers - // on the go (nvcc does not color well the registers in that case) - - IndexType vec_u[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; - - IndexType* vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; - -#pragma unroll - for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType ibatch = left + item_index + iv; - IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; - - if (gid < current_max_edge_index) { - IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) / TOP_DOWN_BUCKET_SIZE; - IndexType bucket_start = - shared_buckets_offsets[start_off_idx] - frontier_degrees_exclusive_sum_block_offset; - IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] - - frontier_degrees_exclusive_sum_block_offset; - - IndexType k = traversal::binsearch_maxle( - shared_frontier_degrees_exclusive_sum, gid, bucket_start, bucket_end) + - frontier_degrees_exclusive_sum_block_offset; - vec_u[iv] = frontier[k]; // origin of this edge - vec_frontier_degrees_exclusive_sum_index[iv] = frontier_degrees_exclusive_sum[k]; - } else { - vec_u[iv] = invalid_idx; - vec_frontier_degrees_exclusive_sum_index[iv] = invalid_idx; - } - } - - IndexType* vec_row_ptr_u = &local_buf1[0]; -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType u = vec_u[iv]; - // row_ptr for this vertex origin u - vec_row_ptr_u[iv] = (u != invalid_idx) ? 
row_ptr[u] : invalid_idx; - } - - // We won't need row_ptr after that, reusing pointer - IndexType* vec_dest_v = vec_row_ptr_u; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType thread_item_index = left + item_index + iv; - IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; - - IndexType row_ptr_u = vec_row_ptr_u[iv]; - // Need this check so that we don't use invalid values of edge to index - if (row_ptr_u != invalid_idx) { - IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; - - if (edge_mask && !edge_mask[edge]) { - // Disabling edge - row_ptr_u = invalid_idx; - } else { - // Destination of this edge - vec_dest_v[iv] = col_ind[edge]; - } - } - } - - // We don't need vec_frontier_degrees_exclusive_sum_index anymore - IndexType* vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; - - // Visited bmap need to contain information about the previous - // frontier if we actually process every edge (shortest path counting) - // otherwise we can read and update from the same bmap -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_dest_v[iv]; - vec_v_visited_bmap[iv] = - (v != invalid_idx) ? previous_bmap[v / INT_SIZE] : (~int(0)); // will look visited - } - - // From now on we will consider v as a frontier candidate - // If for some reason vec_candidate[iv] should be put in the - // new_frontier Then set vec_candidate[iv] = -1 - IndexType* vec_frontier_candidate = vec_dest_v; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - - int is_visited = vec_v_visited_bmap[iv] & m; - - if (is_visited) vec_frontier_candidate[iv] = invalid_idx; - } - - // Each source should update the destination shortest path counter - // if the destination has not been visited in the *previous* frontier - if (sp_counters) { -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType dst = vec_frontier_candidate[iv]; - if (dst != invalid_idx) { - IndexType src = vec_u[iv]; - atomicAdd(&sp_counters[dst], sp_counters[src]); - } - } - } - - if (directed) { - // vec_v_visited_bmap is available - IndexType* vec_is_isolated_bmap = vec_v_visited_bmap; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - vec_is_isolated_bmap[iv] = (v != invalid_idx) ? isolated_bmap[v / INT_SIZE] : ~int(0); - } - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - int is_isolated = vec_is_isolated_bmap[iv] & m; - - // If v is isolated, we will not add it to the frontier (it's not a - // frontier candidate) 1st reason : it's useless 2nd reason : it - // will make top down algo fail we need each node in frontier to - // have a degree > 0 If it is isolated, we just need to mark it as - // visited, and save distance and predecessor here. 
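The shortest-path counting rule applied just above (every frontier parent adds its own counter to the destination, provided the destination was not visited in the previous frontier) is the sigma recurrence of Brandes' algorithm. A minimal worked check, with the BFS products hard-coded for a small diamond graph (illustrative only):

#include <cassert>

int main()
{
  // Diamond 0->1, 0->2, 1->3, 2->3, source 0: vertex 3 is discovered at level 2
  // through two parents, so its path counter must end up at 2.
  double sp_counters[4] = {1.0, 0.0, 0.0, 0.0};  // one path from the source to itself
  sp_counters[1] += sp_counters[0];              // level-1 expansion
  sp_counters[2] += sp_counters[0];
  sp_counters[3] += sp_counters[1];              // level-2 expansion: both parents contribute
  sp_counters[3] += sp_counters[2];
  assert(sp_counters[3] == 2.0);
  return 0;
}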
Not need to - // check return value of atomicOr - - if (is_isolated && v != invalid_idx) { - int m = 1 << (v % INT_SIZE); - atomicOr(&bmap[v / INT_SIZE], m); - if (distances) distances[v] = lvl; - - if (predecessors) predecessors[v] = vec_u[iv]; - - // This is no longer a candidate, neutralize it - vec_frontier_candidate[iv] = invalid_idx; - } - } - } - - // Number of successor candidate hold by this thread - IndexType thread_n_frontier_candidates = 0; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - if (v != invalid_idx) ++thread_n_frontier_candidates; - } - - // We need to have all nfrontier_candidates to be ready before doing the - // scan - __syncthreads(); - - // We will put the frontier candidates in a local queue - // Computing offsets - IndexType thread_frontier_candidate_offset = 0; // offset inside block - BlockScan(scan_storage) - .ExclusiveSum(thread_n_frontier_candidates, thread_frontier_candidate_offset); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - // May have bank conflicts - IndexType frontier_candidate = vec_frontier_candidate[iv]; - - if (frontier_candidate != invalid_idx) { - shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = - frontier_candidate; - shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = vec_u[iv]; - ++thread_frontier_candidate_offset; - } - } - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - // No need to add nsuccessor_candidate, even if its an - // exclusive sum - // We incremented the thread_frontier_candidate_offset - block_n_frontier_candidates = thread_frontier_candidate_offset; - } - - // broadcast block_n_frontier_candidates - __syncthreads(); - - IndexType naccepted_vertices = 0; - // We won't need vec_frontier_candidate after that - IndexType* vec_frontier_accepted_vertex = vec_frontier_candidate; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - vec_frontier_accepted_vertex[iv] = invalid_idx; - - if (idx_shared < block_n_frontier_candidates) { - IndexType v = shared_local_new_frontier_candidates[idx_shared]; // popping - // queue - int m = 1 << (v % INT_SIZE); - int q = atomicOr(&bmap[v / INT_SIZE], m); // atomicOr returns old - - if (!(m & q)) { // if this thread was the first to discover this node - if (distances) distances[v] = lvl; - - if (predecessors) { - IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; - predecessors[v] = pred; - } - - vec_frontier_accepted_vertex[iv] = v; - ++naccepted_vertices; - } - } - } - - // We need naccepted_vertices to be ready - __syncthreads(); - - IndexType thread_new_frontier_offset; - - BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; - // for this thread, thread_new_frontier_offset + has_successor - // (exclusive sum) - if (inclusive_sum) - frontier_common_block_offset = traversal::atomicAdd(new_frontier_cnt, inclusive_sum); - } - - // Broadcasting frontier_common_block_offset - __syncthreads(); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - if (idx_shared < block_n_frontier_candidates) { - IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; - - if (new_frontier_vertex != invalid_idx) { - IndexType off = 
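The two-level compaction pattern used above (a block-wide exclusive scan to place each thread's accepted vertices, then a single atomicAdd by the last thread to reserve the block's slice of the global queue) can be sketched as a stand-alone helper. This is illustrative only; the name is hypothetical, and it assumes CUB is available and that every thread of a DIMX-wide block calls it.

#include <cub/cub.cuh>

// Must be called by all DIMX threads of the block; returns the global slot at
// which this thread may start writing its my_count items.
template <int DIMX>
__device__ int reserve_queue_slots(int my_count, int* global_count)
{
  typedef cub::BlockScan<int, DIMX> BlockScan;
  __shared__ typename BlockScan::TempStorage scan_storage;
  __shared__ int block_base;

  int my_offset = 0;
  BlockScan(scan_storage).ExclusiveSum(my_count, my_offset);

  if (threadIdx.x == DIMX - 1) {
    int block_total = my_offset + my_count;  // the last thread holds the inclusive total
    block_base = block_total ? atomicAdd(global_count, block_total) : 0;
  }
  __syncthreads();  // broadcast block_base to the whole block

  return block_base + my_offset;
}

Issuing one atomicAdd per block instead of one per accepted vertex is what keeps contention on the global frontier counter low.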
frontier_common_block_offset + thread_new_frontier_offset++; - new_frontier[off] = new_frontier_vertex; - } - } - } - } - - // We need to keep shared_frontier_degrees_exclusive_sum coherent - __syncthreads(); - - // Preparing for next load - left = right; - right = nitems_per_thread; - } - - // we need to keep shared_buckets_offsets coherent - __syncthreads(); - } -} - -template -void frontier_expand(const IndexType* row_ptr, - const IndexType* col_ind, - const IndexType* frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType lvl, - IndexType* new_frontier, - IndexType* new_frontier_cnt, - const IndexType* frontier_degrees_exclusive_sum, - const IndexType* frontier_degrees_exclusive_sum_buckets_offsets, - int* previous_visited_bmap, - int* visited_bmap, - IndexType* distances, - IndexType* predecessors, - double* sp_counters, - const int* edge_mask, - const int* isolated_bmap, - bool directed, - cudaStream_t m_stream, - bool deterministic) -{ - if (!totaldegree) return; - - dim3 block; - block.x = TOP_DOWN_EXPAND_DIMX; - - IndexType max_items_per_thread = - (static_cast(totaldegree) + MAXBLOCKS * block.x - 1) / (MAXBLOCKS * block.x); - - dim3 grid; - grid.x = std::min((static_cast(totaldegree) + max_items_per_thread * block.x - 1) / - (max_items_per_thread * block.x), - static_cast(MAXBLOCKS)); - - // Shortest Path counting (Betweenness Centrality) - // We need to keep track of the previously visited bmap - - // If the coutner of shortest path is nullptr - // The previous_visited_bmap is no longer needed (and should be nullptr on - // the first access), so it can be the same as the current visitedbmap - if (!sp_counters) { previous_visited_bmap = visited_bmap; } - topdown_expand_kernel<<>>( - row_ptr, - col_ind, - frontier, - frontier_size, - totaldegree, - max_items_per_thread, - lvl, - new_frontier, - new_frontier_cnt, - frontier_degrees_exclusive_sum, - frontier_degrees_exclusive_sum_buckets_offsets, - previous_visited_bmap, - visited_bmap, - distances, - predecessors, - sp_counters, - edge_mask, - isolated_bmap, - directed); - RAFT_CHECK_CUDA(m_stream); -} - -} // namespace bfs_kernels -} // namespace detail -} // namespace cugraph diff --git a/cpp/src/traversal/legacy/traversal_common.cuh b/cpp/src/traversal/legacy/traversal_common.cuh deleted file mode 100644 index fac80e90eb6..00000000000 --- a/cpp/src/traversal/legacy/traversal_common.cuh +++ /dev/null @@ -1,480 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -#include - -#define MAXBLOCKS 65535 -#define WARP_SIZE 32 -#define INT_SIZE 32 - -// -// Bottom up macros -// - -#define FILL_UNVISITED_QUEUE_DIMX 256 - -#define COUNT_UNVISITED_EDGES_DIMX 256 - -#define MAIN_BOTTOMUP_DIMX 256 -#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX / WARP_SIZE) - -#define LARGE_BOTTOMUP_DIMX 256 - -// Number of edges processed in the main bottom up kernel -#define MAIN_BOTTOMUP_MAX_EDGES 6 - -// Power of 2 < 32 (strict <) -#define BOTTOM_UP_LOGICAL_WARP_SIZE 4 - -// -// Top down macros -// - -// We will precompute the results the binsearch_maxle every -// TOP_DOWN_BUCKET_SIZE edges -#define TOP_DOWN_BUCKET_SIZE 32 - -// DimX of the kernel -#define TOP_DOWN_EXPAND_DIMX 256 - -// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets -#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX / TOP_DOWN_BUCKET_SIZE) - -// How many items_per_thread we can process with one bucket_offset loading -// the -1 is here because we need the +1 offset -#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) - -// instruction parallelism -// for how many edges will we create instruction parallelism -#define TOP_DOWN_BATCH_SIZE 2 - -#define COMPUTE_BUCKET_OFFSETS_DIMX 512 - -// Other macros - -#define FLAG_ISOLATED_VERTICES_DIMX 128 - -// Number of vertices handled by one thread -// Must be power of 2, lower than 32 -#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 - -// Number of threads involved in the "construction" of one int in the bitset -#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT \ - (INT_SIZE / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD) - -// -// Parameters of the heuristic to switch between bottomup/topdown -// Finite machine described in -// http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf -// - -namespace cugraph { -namespace detail { -namespace traversal { - -// -// gives the equivalent vectors from a type -// for the max val, would be better to use numeric_limits<>::max() once -// cpp11 is allowed in nvgraph -// - -template -struct vec_t { - typedef int4 vec4; - typedef int2 vec2; -}; - -template <> -struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - static const int max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef long4 vec4; - typedef long2 vec2; - static const long max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef uint4 vec4; - typedef uint2 vec2; - static const unsigned max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef longlong4 vec4; - typedef longlong2 vec2; - static const long long int max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef float4 vec4; - typedef float2 vec2; - static constexpr float max = std::numeric_limits::max(); -}; - -template <> -struct vec_t { - typedef double4 vec4; - typedef double2 vec2; - static constexpr double max = std::numeric_limits::max(); -}; - -// -// ------------------------- Helper device functions ------------------- -// - -__forceinline__ __device__ int getMaskNRightmostBitSet(int n) -{ - if (n == INT_SIZE) return (~0); - int mask = (1 << n) - 1; - return mask; -} - -__forceinline__ __device__ int getMaskNLeftmostBitSet(int n) -{ - if (n == 0) return 0; - int mask = ~((1 << (INT_SIZE - n)) - 1); - return mask; -} - -__forceinline__ __device__ int getNextZeroBit(int& val) -{ - int ibit = __ffs(~val) - 1; - val |= (1 << ibit); - - return ibit; -} - -struct BitwiseAnd { - template - __host__ __device__ __forceinline__ T 
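The semantics of the bit-mask helpers above are easy to misread at a glance; a host-side restatement with the intended values (using the GCC/Clang __builtin_ffs intrinsic as a stand-in for the device __ffs, an assumption of this sketch):

#include <cassert>

int mask_n_rightmost_bits(int n) { return (n == 32) ? ~0 : (1 << n) - 1; }
int mask_n_leftmost_bits(int n)  { return (n == 0) ? 0 : ~((1 << (32 - n)) - 1); }

int next_zero_bit(int& val)  // host stand-in for getNextZeroBit
{
  int ibit = __builtin_ffs(~val) - 1;  // index of the lowest clear bit
  val |= (1 << ibit);                  // mark it as taken
  return ibit;
}

int main()
{
  assert(mask_n_rightmost_bits(4) == 0x0000000F);
  assert(static_cast<unsigned>(mask_n_leftmost_bits(4)) == 0xF0000000u);
  int v = 0b1011;                      // bits 0, 1 and 3 already set
  assert(next_zero_bit(v) == 2 && v == 0b1111);
  return 0;
}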
operator()(const T& a, const T& b) const - { - return (a & b); - } -}; -struct BitwiseOr { - template - __host__ __device__ __forceinline__ T operator()(const T& a, const T& b) const - { - return (a | b); - } -}; - -template -__global__ void fill_vec_kernel(ValueType* vec, SizeType n, ValueType val) -{ - for (SizeType idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) - vec[idx] = val; -} - -template -void fill_vec(ValueType* vec, SizeType n, ValueType val, cudaStream_t stream) -{ - dim3 grid, block; - block.x = 256; - grid.x = (n + block.x - 1) / block.x; - - fill_vec_kernel<<>>(vec, n, val); - RAFT_CHECK_CUDA(stream); -} - -template -__device__ IndexType -binsearch_maxle(const IndexType* vec, const IndexType val, IndexType low, IndexType high) -{ - while (true) { - if (low == high) return low; // we know it exists - if ((low + 1) == high) return (vec[high] <= val) ? high : low; - - IndexType mid = low + (high - low) / 2; - - if (vec[mid] > val) - high = mid - 1; - else - low = mid; - } -} - -// FIXME: The atomicAdd wrappers should be moved to RAFT - -template -__device__ static __forceinline__ T atomicAdd(T* addr, T val) -{ - return ::atomicAdd(addr, val); -} - -template <> -__device__ __forceinline__ int64_t atomicAdd(int64_t* addr, int64_t val) -{ - static_assert(sizeof(int64_t) == sizeof(unsigned long long), - "sizeof(int64_t) != sizeof(unsigned long long). Can't use atomicAdd"); - - return ::atomicAdd(reinterpret_cast(addr), - static_cast(val)); -} - -__device__ static __forceinline__ float atomicMin(float* addr, float val) -{ - int* addr_as_int = (int*)addr; - int old = *addr_as_int; - int expected; - do { - expected = old; - old = - ::atomicCAS(addr_as_int, expected, __float_as_int(::fminf(val, __int_as_float(expected)))); - } while (expected != old); - return __int_as_float(old); -} - -__device__ static __forceinline__ double atomicMin(double* address, double val) -{ - unsigned long long int* address_as_ull = (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - - do { - assumed = old; - old = ::atomicCAS( - address_as_ull, assumed, __double_as_longlong(::fmin(val, __longlong_as_double(assumed)))); - - // Note: uses integer comparison to avoid hang in case of NaN (since NaN != - // NaN) - } while (assumed != old); - - return __longlong_as_double(old); -} - -template -__global__ void flag_isolated_vertices_kernel(IndexType n, - int* isolated_bmap, - const IndexType* row_ptr, - IndexType* degrees, - IndexType* nisolated) -{ - typedef cub::BlockLoad - BlockLoad; - typedef cub::BlockStore - BlockStore; - typedef cub::BlockReduce BlockReduce; - typedef cub::WarpReduce WarpReduce; - - __shared__ typename BlockLoad::TempStorage load_temp_storage; - __shared__ typename BlockStore::TempStorage store_temp_storage; - __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; - - __shared__ typename WarpReduce::TempStorage - warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; - - __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; - - for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * blockIdx.x); - block_off < n; - block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { - IndexType thread_off = block_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; - IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; - - IndexType 
thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] - - BlockLoad(load_temp_storage).Load(row_ptr + block_off, thread_row_ptr, block_valid_items, -1); - - // To compute 4 degrees, we need 5 values of row_ptr - // Saving the "5th" value in shared memory for previous thread to use - if (threadIdx.x > 0) { row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; } - - // If this is the last thread, it needs to load its row ptr tail value - if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { - row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; - } - __syncthreads(); // we may reuse temp_storage - - int local_isolated_bmap = 0; - - IndexType imax = (n > thread_off) ? (n - thread_off) : 0; - - IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - -#pragma unroll - for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { - IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; - - if (i < imax) local_isolated_bmap |= ((degree == 0) << i); - } - - if (last_node_thread < n) { - IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = - row_ptr_tail[threadIdx.x] - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; - - local_isolated_bmap |= ((degree == 0) << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); - } - - local_isolated_bmap <<= (thread_off % INT_SIZE); - - IndexType local_nisolated = __popc(local_isolated_bmap); - - // We need local_nisolated and local_isolated_bmap to be ready for next - // steps - __syncthreads(); - - IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - - if (threadIdx.x == 0 && total_nisolated) { traversal::atomicAdd(nisolated, total_nisolated); } - - int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; - - // Building int for bmap - int int_aggregate_isolated_bmap = - WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce(local_isolated_bmap, BitwiseOr()); - - int is_head_of_visited_int = ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); - if (is_head_of_visited_int && (thread_off / INT_SIZE) < (n + INT_SIZE - 1) / INT_SIZE) { - isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; - } - - BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items - 1); - } -} - -template -void flag_isolated_vertices(IndexType n, - int* isolated_bmap, - const IndexType* row_ptr, - IndexType* degrees, - IndexType* nisolated, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = FLAG_ISOLATED_VERTICES_DIMX; - - grid.x = min((IndexType)MAXBLOCKS, - (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); - - flag_isolated_vertices_kernel<<>>( - n, isolated_bmap, row_ptr, degrees, nisolated); - RAFT_CHECK_CUDA(m_stream); -} - -template -__global__ void set_frontier_degree_kernel(IndexType* frontier_degree, - IndexType* frontier, - const IndexType* degree, - IndexType n) -{ - for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; idx < n; - idx += gridDim.x * blockDim.x) { - IndexType u = frontier[idx]; - frontier_degree[idx] = degree[u]; - } -} - -template -void set_frontier_degree(IndexType* frontier_degree, - IndexType* frontier, - const IndexType* degree, - IndexType n, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType)MAXBLOCKS); - 
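What flag_isolated_vertices ultimately produces is a packed bitmap keyed by the CSR row offsets. A plain host-side sketch of the same result (illustrative only; the removed kernel builds it with cub::BlockLoad and a warp-wide bitwise-OR reduction, and also writes the per-vertex degrees and the isolated count):

#include <cstdint>
#include <vector>

std::vector<uint32_t> build_isolated_bitmap(const std::vector<int>& row_ptr)
{
  int n = static_cast<int>(row_ptr.size()) - 1;
  std::vector<uint32_t> bmap((n + 31) / 32, 0u);
  for (int v = 0; v < n; ++v)
    if (row_ptr[v + 1] == row_ptr[v])        // no outgoing edges -> isolated
      bmap[v / 32] |= 1u << (v % 32);
  return bmap;
}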
set_frontier_degree_kernel<<>>(frontier_degree, frontier, degree, n); - RAFT_CHECK_CUDA(m_stream); -} - -template -void exclusive_sum(void* d_temp_storage, - size_t temp_storage_bytes, - IndexType* d_in, - IndexType* d_out, - IndexType num_items, - cudaStream_t m_stream) -{ - if (num_items <= 1) return; // DeviceScan fails if n==1 - cub::DeviceScan::ExclusiveSum( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, m_stream); -} - -template -void exclusive_sum(IndexType* d_in, IndexType* d_out, IndexType num_items, cudaStream_t m_stream) -{ - if (num_items <= 1) return; // DeviceScan fails if n==1 - thrust::exclusive_scan(rmm::exec_policy(m_stream), d_in, d_in + num_items, d_out); -} - -// -// compute_bucket_offsets_kernel -// simply compute the position in the frontier corresponding all valid edges -// with index=TOP_DOWN_BUCKET_SIZE * k, k integer -// - -template -__global__ void compute_bucket_offsets_kernel(const IndexType* frontier_degrees_exclusive_sum, - IndexType* bucket_offsets, - const IndexType frontier_size, - IndexType total_degree) -{ - IndexType end = - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX * NBUCKETS_PER_BLOCK + 1); - - for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; bid <= end; - bid += gridDim.x * blockDim.x) { - IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); - - bucket_offsets[bid] = - binsearch_maxle(frontier_degrees_exclusive_sum, eid, (IndexType)0, frontier_size - 1); - } -} - -template -void compute_bucket_offsets(IndexType* cumul, - IndexType* bucket_offsets, - IndexType frontier_size, - IndexType total_degree, - cudaStream_t m_stream) -{ - dim3 grid, block; - block.x = COMPUTE_BUCKET_OFFSETS_DIMX; - - grid.x = - min((IndexType)MAXBLOCKS, - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX * NBUCKETS_PER_BLOCK + 1 + - block.x - 1) / - block.x); - - compute_bucket_offsets_kernel<<>>( - cumul, bucket_offsets, frontier_size, total_degree); - RAFT_CHECK_CUDA(m_stream); -} -} // namespace traversal -} // namespace detail -} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index da1e0e50919..eebd31a0030 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -239,15 +239,6 @@ ConfigureTest(GRAPH_GENERATORS_TEST generators/generators_test.cpp) # - erdos renyi graph generator tests ------------------------------------------------------------- ConfigureTest(ERDOS_RENYI_GENERATOR_TEST generators/erdos_renyi_test.cpp) -################################################################################################### -# - betweenness centrality tests ------------------------------------------------------------------ -ConfigureTest(LEGACY_BETWEENNESS_TEST centrality/legacy/betweenness_centrality_test.cu) -ConfigureTest(LEGACY_EDGE_BETWEENNESS_TEST centrality/legacy/edge_betweenness_centrality_test.cu) - -################################################################################################### -# - BFS tests ------------------------------------------------------------------------------------- -ConfigureTest(LEGACY_BFS_TEST traversal/legacy/bfs_test.cu) - ################################################################################################### # - LOUVAIN tests --------------------------------------------------------------------------------- ConfigureTest(LOUVAIN_TEST community/louvain_test.cpp) diff --git a/cpp/tests/centrality/legacy/betweenness_centrality_test.cu 
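A small worked example of compute_bucket_offsets above (the numbers are mine): with TOP_DOWN_BUCKET_SIZE = 32 and frontier degrees {40, 10, 50} (exclusive sum {0, 40, 50}, 100 edges in total), bucket_offsets[0..3] map edges 0, 32, 64 and 96 to frontier positions 0, 0, 2 and 2. A thread expanding global edge 45 therefore only binary-searches frontier positions 0 through 2 rather than the whole exclusive-sum array, which is what keeps the in-kernel binsearch_maxle cheap.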
b/cpp/tests/centrality/legacy/betweenness_centrality_test.cu deleted file mode 100644 index 2cdbe1c98e4..00000000000 --- a/cpp/tests/centrality/legacy/betweenness_centrality_test.cu +++ /dev/null @@ -1,450 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifndef TEST_EPSILON -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif - -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ -template -void ref_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w]; } - } -} - -template -void ref_endpoints_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - result[source] += S.size() - 1; - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w] + 1; } - } -} - -template -void ref_edge_accumulation(result_t* result, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - deltas[v] += (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - } - if (w != source) { result[w] += deltas[w]; } - } -} - -// Algorithm 1: Shortest-path vertex betweenness, (Brandes, 2001) -template -void reference_betweenness_centrality_impl(vertex_t* indices, - edge_t* offsets, - vertex_t const number_of_vertices, - result_t* result, - bool endpoints, - vertex_t const* sources, - vertex_t const number_of_sources) -{ - std::queue Q; - std::stack S; - // NOTE: dist is of type vertex_t not weight_t - std::vector dist(number_of_vertices); - std::vector> pred(number_of_vertices); - std::vector 
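The dependency rule in ref_accumulation above (delta[v] += sigma[v] / sigma[w] * (1 + delta[w]), applied while popping the BFS stack) can be traced by hand on the undirected path 0 - 1 - 2 with source 0. A minimal hard-coded check of that trace (illustrative only):

#include <cassert>

int main()
{
  double sigma[3]  = {1, 1, 1};   // one shortest path from source 0 to each vertex
  double delta[3]  = {0, 0, 0};
  double result[3] = {0, 0, 0};

  // pop w = 2 (pred {1}): delta[1] += 1; result[2] += delta[2] (= 0)
  delta[1] += sigma[1] / sigma[2] * (1.0 + delta[2]);
  result[2] += delta[2];
  // pop w = 1 (pred {0}): delta[0] += 2; result[1] += delta[1] (= 1)
  delta[0] += sigma[0] / sigma[1] * (1.0 + delta[1]);
  result[1] += delta[1];
  // pop w = 0: w is the source, nothing is accumulated

  assert(result[1] == 1.0 && result[0] == 0.0 && result[2] == 0.0);
  return 0;
}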
sigmas(number_of_vertices); - std::vector deltas(number_of_vertices); - - std::vector neighbors; - - if (sources) { - for (vertex_t source_idx = 0; source_idx < number_of_sources; ++source_idx) { - vertex_t s = sources[source_idx]; - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - if (endpoints) { - ref_endpoints_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } else { - ref_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } - } - } else { - for (vertex_t s = 0; s < number_of_vertices; ++s) { - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - if (endpoints) { - ref_endpoints_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } else { - ref_accumulation( - result, number_of_vertices, S, pred, sigmas, deltas, s); - } - } - } -} - -template -void reference_rescale(result_t* result, - bool directed, - bool normalize, - bool endpoints, - vertex_t const number_of_vertices, - vertex_t const number_of_sources) -{ - bool modified = false; - result_t rescale_factor = static_cast(1); - result_t casted_number_of_sources = static_cast(number_of_sources); - result_t casted_number_of_vertices = static_cast(number_of_vertices); - if (normalize) { - if (number_of_vertices > 2) { - if (endpoints) { - rescale_factor /= (casted_number_of_vertices * (casted_number_of_vertices - 1)); - } else { - rescale_factor /= ((casted_number_of_vertices - 1) * (casted_number_of_vertices - 2)); - } - modified = true; - } - } else { - if (!directed) { - rescale_factor /= static_cast(2); - modified = true; - } - } - if (modified) { - if (number_of_sources > 0) { - rescale_factor *= (casted_number_of_vertices / casted_number_of_sources); - } - } - for (auto idx = 0; idx < number_of_vertices; ++idx) { - result[idx] *= rescale_factor; - } -} - -template -void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - bool endpoints, // This is not yet implemented - vertex_t const number_of_sources, - vertex_t const* sources) -{ - vertex_t number_of_vertices = graph.number_of_vertices; - edge_t number_of_edges = graph.number_of_edges; - thrust::host_vector h_indices(number_of_edges); - thrust::host_vector h_offsets(number_of_vertices + 1); - - thrust::device_ptr d_indices((vertex_t*)&graph.indices[0]); - thrust::device_ptr d_offsets((edge_t*)&graph.offsets[0]); - - thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); - thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); - - cudaDeviceSynchronize(); - - reference_betweenness_centrality_impl(&h_indices[0], - &h_offsets[0], - number_of_vertices, - result, - endpoints, - sources, - number_of_sources); - reference_rescale( - result, graph.prop.directed, normalize, endpoints, number_of_vertices, number_of_sources); -} -// Explicit instantiation -/* FIXME!!! 
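The rescaling rules implemented by reference_rescale above condense to a single factor. A compact restatement (illustrative, assuming the same semantics: n vertices, k sampled sources, with k == 0 meaning every vertex acts as a source):

double bc_rescale_factor(bool normalize, bool endpoints, bool directed, double n, double k)
{
  double factor = 1.0;
  bool modified = false;
  if (normalize) {
    if (n > 2) {
      factor /= endpoints ? n * (n - 1) : (n - 1) * (n - 2);
      modified = true;
    }
  } else if (!directed) {
    factor /= 2.0;                           // undirected graphs count each pair twice
    modified = true;
  }
  if (modified && k > 0) factor *= n / k;    // extrapolate from the sampled sources
  return factor;
}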
-template void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const &, - float *, - bool, - bool, - const int, - int const *); -template void reference_betweenness_centrality( - cugraph::legacy::GraphCSRView const &, - double *, - bool, - bool, - const int, - int const *); -*/ - -// ============================================================================= -// Utility functions -// ============================================================================= -// Compare while allowing relatie error of epsilon -// zero_threshold indicates when we should drop comparison for small numbers -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================= -// Test Suite -// ============================================================================= -// Defines Betweenness Centrality UseCase -// SSSP's test suite code uses type of Graph parameter that could be used -// (MTX / RMAT) -typedef struct BC_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int number_of_sources_; // Starting point from the traversal - BC_Usecase_t(const std::string& config, int number_of_sources) - : config_(config), number_of_sources_(number_of_sources) - { - // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} BC_Usecase; - -class Tests_BC : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_BC() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // vertex_t vertex identifier data type - // edge_t edge identifier data type - // weight_t edge weight data type - // result_t result data type - // normalize should the result be normalized - // endpoints should the endpoints be included - template - void run_current_test(const BC_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - bool is_directed = false; - auto csr = cugraph::test::generate_graph_csr_from_mm( - is_directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = is_directed; - RAFT_CUDA_TRY(cudaGetLastError()); - std::vector result(G.number_of_vertices, 0); - std::vector expected(G.number_of_vertices, 0); - - // Step 2: Generation of sources based on configuration - // if number_of_sources_ is 0 then sources must be nullptr - // Otherwise we only use the first k values - ASSERT_TRUE(configuration.number_of_sources_ >= 0 && - configuration.number_of_sources_ <= G.number_of_vertices) - << "Number number of sources should be >= 0 and" - << " less than the number of vertices in the graph"; - std::vector sources(configuration.number_of_sources_); - thrust::sequence(thrust::host, sources.begin(), sources.end(), 0); - - vertex_t* sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - reference_betweenness_centrality( - G, 
expected.data(), normalize, endpoints, configuration.number_of_sources_, sources_ptr); - - sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - rmm::device_vector d_result(G.number_of_vertices); - cugraph::betweenness_centrality(handle, - G, - d_result.data().get(), - normalize, - endpoints, - static_cast(nullptr), - configuration.number_of_sources_, - sources_ptr); - cudaDeviceSynchronize(); - RAFT_CUDA_TRY(cudaMemcpy(result.data(), - d_result.data().get(), - sizeof(result_t) * G.number_of_vertices, - cudaMemcpyDeviceToHost)); - cudaDeviceSynchronize(); - for (int i = 0; i < G.number_of_vertices; ++i) - EXPECT_TRUE(compare_close(result[i], expected[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << result[i] - << " expected = " << expected[i]; - } -}; - -// ============================================================================ -// Tests -// ============================================================================ -// Verifiy Un-Normalized results -TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BC, CheckFP32_NO_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_BC, CheckFP64_NO_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -// Verifiy Normalized results -TEST_P(Tests_BC, CheckFP32_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_BC, CheckFP64_NORMALIZE_NO_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BC, CheckFP32_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_BC, CheckFP64_NORMALIZE_ENDPOINTS) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BC, - ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 4), - BC_Usecase("test/datasets/wiki2003.mtx", 4), - BC_Usecase("test/datasets/wiki-Talk.mtx", 4))); -#else -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BC, - ::testing::Values(BC_Usecase("test/datasets/karate.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 0), - BC_Usecase("test/datasets/netscience.mtx", 4))); -#endif - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu b/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu deleted file mode 100644 index 153993deda7..00000000000 --- a/cpp/tests/centrality/legacy/edge_betweenness_centrality_test.cu +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#ifndef TEST_EPSILON -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif - -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ - -template -edge_t get_edge_index_from_source_and_destination(vertex_t source_vertex, - vertex_t destination_vertex, - vertex_t const* indices, - edge_t const* offsets) -{ - edge_t index = -1; - edge_t first_edge_idx = offsets[source_vertex]; - edge_t last_edge_idx = offsets[source_vertex + 1]; - auto index_it = std::find(indices + first_edge_idx, indices + last_edge_idx, destination_vertex); - if (index_it != (indices + last_edge_idx)) { index = std::distance(indices, index_it); } - return index; -} - -template -void ref_accumulation(result_t* result, - vertex_t const* indices, - edge_t const* offsets, - vertex_t const number_of_vertices, - std::stack& S, - std::vector>& pred, - std::vector& sigmas, - std::vector& deltas, - vertex_t source) -{ - for (vertex_t v = 0; v < number_of_vertices; ++v) { - deltas[v] = 0; - } - while (!S.empty()) { - vertex_t w = S.top(); - S.pop(); - for (vertex_t v : pred[w]) { - edge_t edge_idx = - get_edge_index_from_source_and_destination( - v, w, indices, offsets); - double coefficient = (sigmas[v] / sigmas[w]) * (1.0 + deltas[w]); - - deltas[v] += coefficient; - result[edge_idx] += coefficient; - } - } -} - -// Algorithm 1: Shortest-path vertex betweenness, (Brandes, 2001) -template -void reference_edge_betweenness_centrality_impl(vertex_t* indices, - edge_t* offsets, - vertex_t const number_of_vertices, - result_t* result, - vertex_t const* sources, - vertex_t const number_of_sources) -{ - std::queue Q; - std::stack S; - // NOTE: dist is of type vertex_t not weight_t - std::vector dist(number_of_vertices); - std::vector> pred(number_of_vertices); - std::vector sigmas(number_of_vertices); - std::vector deltas(number_of_vertices); - - std::vector neighbors; - - if (sources) { - for (vertex_t source_idx = 0; source_idx < number_of_sources; ++source_idx) { - vertex_t s = sources[source_idx]; - // Step 1: Single-source shortest-paths problem - // a. Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - ref_accumulation( - result, indices, offsets, number_of_vertices, S, pred, sigmas, deltas, s); - } - } else { - for (vertex_t s = 0; s < number_of_vertices; ++s) { - // Step 1: Single-source shortest-paths problem - // a. 
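The CSR edge lookup above simply scans the source vertex's adjacency slice for the destination. An illustrative host-side equivalent with a small worked case (names are mine):

#include <algorithm>
#include <cassert>
#include <vector>

int edge_index(int src, int dst, const std::vector<int>& indices, const std::vector<int>& offsets)
{
  auto first = indices.begin() + offsets[src];
  auto last  = indices.begin() + offsets[src + 1];
  auto it    = std::find(first, last, dst);
  return (it == last) ? -1 : static_cast<int>(it - indices.begin());
}

int main()
{
  // offsets {0, 2, 3, 3}, indices {1, 2, 2}: vertex 0 owns edges 0..1, vertex 1 owns edge 2.
  std::vector<int> offsets{0, 2, 3, 3};
  std::vector<int> indices{1, 2, 2};
  assert(edge_index(1, 2, indices, offsets) == 2);   // edge 1 -> 2 is edge number 2
  assert(edge_index(2, 0, indices, offsets) == -1);  // missing edge
  return 0;
}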
Initialization - ref_bfs(indices, offsets, number_of_vertices, Q, S, dist, pred, sigmas, s); - // Step 2: Accumulation - // Back propagation of dependencies - ref_accumulation( - result, indices, offsets, number_of_vertices, S, pred, sigmas, deltas, s); - } - } -} - -template -void reference_rescale(result_t* result, - bool directed, - bool normalize, - vertex_t const number_of_vertices, - edge_t const number_of_edges) -{ - result_t rescale_factor = static_cast(1); - result_t casted_number_of_vertices = static_cast(number_of_vertices); - if (normalize) { - if (number_of_vertices > 1) { - rescale_factor /= ((casted_number_of_vertices) * (casted_number_of_vertices - 1)); - } - } else { - if (!directed) { rescale_factor /= static_cast(2); } - } - for (auto idx = 0; idx < number_of_edges; ++idx) { - result[idx] *= rescale_factor; - } -} - -template -void reference_edge_betweenness_centrality( - cugraph::legacy::GraphCSRView const& graph, - result_t* result, - bool normalize, - vertex_t const number_of_sources, - vertex_t const* sources) -{ - vertex_t number_of_vertices = graph.number_of_vertices; - edge_t number_of_edges = graph.number_of_edges; - thrust::host_vector h_indices(number_of_edges); - thrust::host_vector h_offsets(number_of_vertices + 1); - - thrust::device_ptr d_indices((vertex_t*)&graph.indices[0]); - thrust::device_ptr d_offsets((edge_t*)&graph.offsets[0]); - - thrust::copy(d_indices, d_indices + number_of_edges, h_indices.begin()); - thrust::copy(d_offsets, d_offsets + (number_of_vertices + 1), h_offsets.begin()); - - cudaDeviceSynchronize(); - - reference_edge_betweenness_centrality_impl( - &h_indices[0], &h_offsets[0], number_of_vertices, result, sources, number_of_sources); - reference_rescale( - result, graph.prop.directed, normalize, number_of_vertices, number_of_edges); -} - -// ============================================================================= -// Utility functions -// ============================================================================= -// Compare while allowing relatie error of epsilon -// zero_threshold indicates when we should drop comparison for small numbers -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================= -// Test Suite -// ============================================================================= -// Defines Betweenness Centrality UseCase -// SSSP's test suite code uses type of Graph parameter that could be used -// (MTX / RMAT) -typedef struct EdgeBC_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int number_of_sources_; // Starting point from the traversal - EdgeBC_Usecase_t(const std::string& config, int number_of_sources) - : config_(config), number_of_sources_(number_of_sources) - { - // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR - // FIXME: Use platform independent stuff from c++14/17 on compiler update - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} EdgeBC_Usecase; - -class Tests_EdgeBC : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_EdgeBC() {} - - 
static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // FIXME: Should normalize be part of the configuration instead? - // vertex_t vertex identifier data type - // edge_t edge identifier data type - // weight_t edge weight data type - // result_t result data type - // normalize should the result be normalized - template - void run_current_test(const EdgeBC_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - bool is_directed = false; - auto csr = cugraph::test::generate_graph_csr_from_mm( - is_directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = is_directed; - RAFT_CUDA_TRY(cudaGetLastError()); - std::vector result(G.number_of_edges, 0); - std::vector expected(G.number_of_edges, 0); - - // Step 2: Generation of sources based on configuration - // if number_of_sources_ is 0 then sources must be nullptr - // Otherwise we only use the first k values - ASSERT_TRUE(configuration.number_of_sources_ >= 0 && - configuration.number_of_sources_ <= G.number_of_vertices) - << "Number number of sources should be >= 0 and" - << " less than the number of vertices in the graph"; - std::vector sources(configuration.number_of_sources_); - thrust::sequence(thrust::host, sources.begin(), sources.end(), 0); - - vertex_t* sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - reference_edge_betweenness_centrality( - G, expected.data(), normalize, configuration.number_of_sources_, sources_ptr); - - sources_ptr = nullptr; - if (configuration.number_of_sources_ > 0) { sources_ptr = sources.data(); } - - rmm::device_vector d_result(G.number_of_edges); - cugraph::edge_betweenness_centrality(handle, - G, - d_result.data().get(), - normalize, - static_cast(nullptr), - configuration.number_of_sources_, - sources_ptr); - RAFT_CUDA_TRY(cudaMemcpy(result.data(), - d_result.data().get(), - sizeof(result_t) * G.number_of_edges, - cudaMemcpyDeviceToHost)); - for (int i = 0; i < G.number_of_edges; ++i) - EXPECT_TRUE(compare_close(result[i], expected[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << result[i] - << " expected = " << expected[i]; - } -}; - -// ============================================================================ -// Tests -// ============================================================================ -// Verifiy Un-Normalized results -TEST_P(Tests_EdgeBC, CheckFP32_NO_NORMALIZE) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -TEST_P(Tests_EdgeBC, CheckFP64_NO_NORMALIZE) -{ - run_current_test(GetParam()); -} - -// Verifiy Normalized results -TEST_P(Tests_EdgeBC, CheckFP32_NORMALIZE) -{ - run_current_test(GetParam()); -} -#endif - -TEST_P(Tests_EdgeBC, CheckFP64_NORMALIZE) -{ - run_current_test(GetParam()); -} - -#if 0 -// Temporarily disable some of the test combinations -// Full solution will be explored for issue #1555 -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_EdgeBC, - ::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 4), - EdgeBC_Usecase("test/datasets/wiki2003.mtx", 4), - EdgeBC_Usecase("test/datasets/wiki-Talk.mtx", 4))); -#else -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_EdgeBC, - 
::testing::Values(EdgeBC_Usecase("test/datasets/karate.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 0), - EdgeBC_Usecase("test/datasets/netscience.mtx", 4))); -#endif - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/traversal/legacy/bfs_ref.h b/cpp/tests/traversal/legacy/bfs_ref.h deleted file mode 100644 index 5efdce818e7..00000000000 --- a/cpp/tests/traversal/legacy/bfs_ref.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -template -void populate_neighbors(VT* indices, ET* offsets, VT w, std::vector& neighbors) -{ - ET edge_start = offsets[w]; - ET edge_end = offsets[w + 1]; - - neighbors.assign(indices + edge_start, indices + edge_end); -} - -// This implements the BFS based on (Brandes, 2001) for shortest path counting -template -void ref_bfs(VT* indices, - ET* offsets, - VT const number_of_vertices, - std::queue& Q, - std::stack& S, - std::vector& dist, - std::vector>& pred, - std::vector& sigmas, - VT source) -{ - std::vector neighbors; - pred.clear(); - pred.resize(number_of_vertices); - dist.assign(number_of_vertices, std::numeric_limits::max()); - sigmas.assign(number_of_vertices, 0); - dist[source] = 0; - sigmas[source] = 1; - Q.push(source); - // b. Traversal - while (!Q.empty()) { - VT v = Q.front(); - Q.pop(); - S.push(v); - populate_neighbors(indices, offsets, v, neighbors); - for (VT w : neighbors) { - // Path Discovery: - // Found for the first time? - if (dist[w] == std::numeric_limits::max()) { - dist[w] = dist[v] + 1; - Q.push(w); - } - // Path counting - // Edge(v, w) on a shortest path? - if (dist[w] == dist[v] + 1) { - sigmas[w] += sigmas[v]; - pred[w].push_back(v); - } - } - } -} diff --git a/cpp/tests/traversal/legacy/bfs_test.cu b/cpp/tests/traversal/legacy/bfs_test.cu deleted file mode 100644 index c6d3c96aa93..00000000000 --- a/cpp/tests/traversal/legacy/bfs_test.cu +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governin_from_mtxg permissions and - * limitations under the License. 
- */ - -#include "bfs_ref.h" - -#include -#include - -#include - -#include -#include - -#include - -#include -#include -#include - -// NOTE: This could be common to other files but we might not want the same precision -// depending on the algorithm -#ifndef TEST_EPSILON // It is currently use for relative error -#define TEST_EPSILON 0.0001 -#endif - -// NOTE: Defines under which values the difference should be discarded when -// considering values are close to zero -// i.e: Do we consider that the difference between 1.3e-9 and 8.e-12 is -// significant -#ifndef TEST_ZERO_THRESHOLD -#define TEST_ZERO_THRESHOLD 1e-10 -#endif -// ============================================================================ -// C++ Reference Implementation -// ============================================================================ -template -bool compare_close(const T& a, const T& b, const precision_t epsilon, precision_t zero_threshold) -{ - return ((zero_threshold > a && zero_threshold > b)) || - (a >= b * (1.0 - epsilon)) && (a <= b * (1.0 + epsilon)); -} - -// ============================================================================ -// Test Suite -// ============================================================================ -typedef struct BFS_Usecase_t { - std::string config_; // Path to graph file - std::string file_path_; // Complete path to graph using dataset_root_dir - int source_; // Starting point from the traversal - BFS_Usecase_t(const std::string& config, int source) : config_(config), source_(source) - { - const std::string& rapidsDatasetRootDir = cugraph::test::get_rapids_dataset_root_dir(); - if ((config_ != "") && (config_[0] != '/')) { - file_path_ = rapidsDatasetRootDir + "/" + config_; - } else { - file_path_ = config_; - } - }; -} BFS_Usecase; - -class Tests_BFS : public ::testing::TestWithParam { - raft::handle_t handle; - - public: - Tests_BFS() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - // VT vertex identifier data type - // ET edge identifier data type - // WT edge weight data type - // return_sp_counter should BFS return shortest path countner - template - void run_current_test(const BFS_Usecase& configuration) - { - // Step 1: Construction of the graph based on configuration - VT number_of_vertices; - ET number_of_edges; - bool directed = false; - auto csr = - cugraph::test::generate_graph_csr_from_mm(directed, configuration.file_path_); - cudaDeviceSynchronize(); - cugraph::legacy::GraphCSRView G = csr->view(); - G.prop.directed = directed; - - ASSERT_TRUE(configuration.source_ >= 0 && (VT)configuration.source_ < G.number_of_vertices) - << "Starting sources should be >= 0 and" - << " less than the number of vertices in the graph"; - - VT source = configuration.source_; - - number_of_vertices = G.number_of_vertices; - number_of_edges = G.number_of_edges; - - std::vector indices(number_of_edges); - std::vector offsets(number_of_vertices + 1); - - RAFT_CUDA_TRY( - cudaMemcpy(indices.data(), G.indices, sizeof(VT) * indices.size(), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY( - cudaMemcpy(offsets.data(), G.offsets, sizeof(ET) * offsets.size(), cudaMemcpyDeviceToHost)); - - std::queue Q; - std::stack S; - std::vector ref_bfs_dist(number_of_vertices); - std::vector> ref_bfs_pred(number_of_vertices); - std::vector ref_bfs_sigmas(number_of_vertices); - - ref_bfs(indices.data(), - offsets.data(), - number_of_vertices, - Q, - S, - ref_bfs_dist, - ref_bfs_pred, - ref_bfs_sigmas, - source); - - 
// Device data for cugraph_bfs - rmm::device_vector d_cugraph_dist(number_of_vertices); - rmm::device_vector d_cugraph_pred(number_of_vertices); - rmm::device_vector d_cugraph_sigmas(number_of_vertices); - - std::vector cugraph_dist(number_of_vertices); - std::vector cugraph_pred(number_of_vertices); - std::vector cugraph_sigmas(number_of_vertices); - - // Don't pass valid sp_sp_counter ptr unless needed because it disables - // the bottom up flow - cugraph::bfs(handle, - G, - d_cugraph_dist.data().get(), - d_cugraph_pred.data().get(), - (return_sp_counter) ? d_cugraph_sigmas.data().get() : nullptr, - source, - G.prop.directed); - RAFT_CUDA_TRY(cudaMemcpy(cugraph_dist.data(), - d_cugraph_dist.data().get(), - sizeof(VT) * d_cugraph_dist.size(), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(cugraph_pred.data(), - d_cugraph_pred.data().get(), - sizeof(VT) * d_cugraph_pred.size(), - cudaMemcpyDeviceToHost)); - - if (return_sp_counter) { - RAFT_CUDA_TRY(cudaMemcpy(cugraph_sigmas.data(), - d_cugraph_sigmas.data().get(), - sizeof(double) * d_cugraph_sigmas.size(), - cudaMemcpyDeviceToHost)); - } - - for (VT i = 0; i < number_of_vertices; ++i) { - // Check distances: should be an exact match as we use signed int 32-bit - EXPECT_EQ(cugraph_dist[i], ref_bfs_dist[i]) - << "[MISMATCH] vaid = " << i << ", cugraph = " << cugraph_sigmas[i] - << " c++ ref = " << ref_bfs_sigmas[i]; - // Check predecessor: We do not enforce the predecessor, we simply verifiy - // that the predecessor obtained with the GPU implementation is one of the - // predecessors obtained during the C++ BFS traversal - VT pred = cugraph_pred[i]; // It could be equal to -1 if the node is never reached - constexpr VT invalid_vid = cugraph::legacy::invalid_vertex_id::value; - if (pred == invalid_vid) { - EXPECT_TRUE(ref_bfs_pred[i].empty()) - << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph had not predecessor," - << "while c++ ref found at least one."; - } else { - // This can get expensive to check, we could have simply verified that based - // on the the distance from the source to the predecessor, but this ensures that there - // are no misassignations - auto it = std::find(ref_bfs_pred[i].begin(), ref_bfs_pred[i].end(), pred); - EXPECT_TRUE(it != ref_bfs_pred[i].end()) - << "[MISMATCH][PREDECESSOR] vaid = " << i << " cugraph = " << cugraph_sigmas[i] - << " , c++ ref did not consider it as a predecessor."; - } - - if (return_sp_counter) { - EXPECT_TRUE( - compare_close(cugraph_sigmas[i], ref_bfs_sigmas[i], TEST_EPSILON, TEST_ZERO_THRESHOLD)) - << "[MISMATCH] vaid = " << i << ", cugraph = " << cugraph_sigmas[i] - << " c++ ref = " << ref_bfs_sigmas[i]; - } - } - } -}; - -// ============================================================================ -// Tests -// ============================================================================ - -// We don't need to test WT for both float and double since it's anyway ignored in BFS -TEST_P(Tests_BFS, CheckUint32_NO_SP_COUNTER) -{ - run_current_test(GetParam()); -} -TEST_P(Tests_BFS, CheckInt_NO_SP_COUNTER) { run_current_test(GetParam()); } -TEST_P(Tests_BFS, CheckInt64_NO_SP_COUNTER) -{ - run_current_test(GetParam()); -} - -TEST_P(Tests_BFS, CheckUint32_SP_COUNTER) -{ - run_current_test(GetParam()); -} -TEST_P(Tests_BFS, CheckInt_SP_COUNTER) { run_current_test(GetParam()); } -TEST_P(Tests_BFS, CheckInt64_SP_COUNTER) -{ - run_current_test(GetParam()); -} - -INSTANTIATE_TEST_SUITE_P(simple_test, - Tests_BFS, - ::testing::Values(BFS_Usecase("test/datasets/karate.mtx", 0), - 
BFS_Usecase("test/datasets/polbooks.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 0), - BFS_Usecase("test/datasets/netscience.mtx", 100), - BFS_Usecase("test/datasets/wiki2003.mtx", 1000), - BFS_Usecase("test/datasets/wiki-Talk.mtx", 1000))); - -CUGRAPH_TEST_PROGRAM_MAIN() From 2fab584d85cf8844782997fa912c0ca540aff655 Mon Sep 17 00:00:00 2001 From: lmeyerov Date: Wed, 30 Aug 2023 18:23:36 -0400 Subject: [PATCH 24/72] Update README.md (#3826) Add pygraphistry to oss list ("(please post an issue if you have a project to add to this list)") Authors: - https://github.com/lmeyerov - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3826 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 33d56671a9b..3daeb0570b0 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ df_page.sort_values('pagerank', ascending=False).head(10) * ArangoDB - a free and open-source native multi-model database system - https://www.arangodb.com/ * CuPy - "NumPy/SciPy-compatible Array Library for GPU-accelerated Computing with Python" - https://cupy.dev/ * Memgraph - In-memory Graph database - https://memgraph.com/ +* PyGraphistry - free and open-source GPU graph ETL, AI, and visualization, including native RAPIDS & cuGraph support - http://github.com/graphistry/pygraphistry * ScanPy - a scalable toolkit for analyzing single-cell gene expression data - https://scanpy.readthedocs.io/en/stable/ (please post an issue if you have a project to add to this list) From 9268b08cc630338749b6377b62ab26ebac3e800e Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Wed, 30 Aug 2023 22:35:35 -0500 Subject: [PATCH 25/72] Add `louvain_communities` to cugraph-nx (#3803) See: #3773 Possible follow-up tasks: - Update to use threshold parameter exposed from C++ (#3792) - Add `max_level` argument to networkx implementation - ~Or, add `max_level` as extra`cugraph_nx`-specific argument~ (**done**) - Update PLC to handle empty graphs gracefully (#3804) - Update PLC to handle directed graphs - Add `louvain_partitions` (needs added to PLC) - https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.louvain.louvain_partitions.html This is passing many networkx tests. I don't have this as draft, b/c it's usable (and I would argue) mergable as is. 
Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3803 --- .../cugraph_nx/algorithms/__init__.py | 2 +- .../algorithms/community/__init__.py | 13 ++++ .../algorithms/community/louvain.py | 56 ++++++++++++++ python/cugraph-nx/cugraph_nx/classes/graph.py | 13 +++- python/cugraph-nx/cugraph_nx/interface.py | 20 +++++ .../cugraph_nx/tests/test_match_api.py | 77 +++++++++++++++---- .../cugraph-nx/cugraph_nx/utils/__init__.py | 1 + .../cugraph-nx/cugraph_nx/utils/decorators.py | 13 +++- python/cugraph-nx/cugraph_nx/utils/misc.py | 45 +++++++++++ python/cugraph-nx/lint.yaml | 6 +- python/cugraph-nx/pyproject.toml | 2 +- 11 files changed, 224 insertions(+), 24 deletions(-) create mode 100644 python/cugraph-nx/cugraph_nx/algorithms/community/__init__.py create mode 100644 python/cugraph-nx/cugraph_nx/algorithms/community/louvain.py create mode 100644 python/cugraph-nx/cugraph_nx/utils/misc.py diff --git a/python/cugraph-nx/cugraph_nx/algorithms/__init__.py b/python/cugraph-nx/cugraph_nx/algorithms/__init__.py index d014f7f401f..3a585452d6d 100644 --- a/python/cugraph-nx/cugraph_nx/algorithms/__init__.py +++ b/python/cugraph-nx/cugraph_nx/algorithms/__init__.py @@ -10,5 +10,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from . import centrality +from . import centrality, community from .centrality import * diff --git a/python/cugraph-nx/cugraph_nx/algorithms/community/__init__.py b/python/cugraph-nx/cugraph_nx/algorithms/community/__init__.py new file mode 100644 index 00000000000..51a4f5c195f --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/algorithms/community/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .louvain import * diff --git a/python/cugraph-nx/cugraph_nx/algorithms/community/louvain.py b/python/cugraph-nx/cugraph_nx/algorithms/community/louvain.py new file mode 100644 index 00000000000..476f7428aab --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/algorithms/community/louvain.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sys + +import pylibcugraph as plc + +from cugraph_nx.convert import _to_undirected_graph +from cugraph_nx.utils import _groupby, networkx_algorithm, not_implemented_for + +__all__ = ["louvain_communities"] + + +@not_implemented_for("directed") +@networkx_algorithm(extra_params="max_level") +def louvain_communities( + G, weight="weight", resolution=1, threshold=0.0000001, seed=None, *, max_level=None +): + """`threshold` and `seed` parameters are currently ignored. + + Extra parameter: `max_level` controls the maximum number of levels of the algorithm. + """ + # NetworkX allows both directed and undirected, but cugraph only allows undirected. + G = _to_undirected_graph(G, weight) + if G.row_indices.size == 0: + # TODO: PLC doesn't handle empty graphs gracefully! + return [{key} for key in G._nodeiter_to_iter(range(len(G)))] + if max_level is None: + max_level = sys.maxsize + vertices, clusters, modularity = plc.louvain( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + max_level=max_level, # TODO: add this parameter to NetworkX + resolution=resolution, + # threshold=threshold, # TODO: add this parameter to PLC + do_expensive_check=False, + ) + groups = _groupby(clusters, vertices) + return [set(G._nodearray_to_list(node_ids)) for node_ids in groups.values()] + + +@louvain_communities._can_run +def _( + G, weight="weight", resolution=1, threshold=0.0000001, seed=None, *, max_level=None +): + # NetworkX allows both directed and undirected, but cugraph only allows undirected. + return not G.is_directed() diff --git a/python/cugraph-nx/cugraph_nx/classes/graph.py b/python/cugraph-nx/cugraph_nx/classes/graph.py index 3d561815de6..5604f2457f8 100644 --- a/python/cugraph-nx/cugraph_nx/classes/graph.py +++ b/python/cugraph-nx/cugraph_nx/classes/graph.py @@ -24,7 +24,7 @@ import cugraph_nx as cnx if TYPE_CHECKING: - from collections.abc import Iterator + from collections.abc import Iterable, Iterator from cugraph_nx.typing import ( AttrKey, @@ -532,6 +532,17 @@ def _get_plc_graph( do_expensive_check=False, ) + def _nodeiter_to_iter(self, node_ids: Iterable[IndexValue]) -> Iterable[NodeKey]: + """Convert an iterable of node IDs to an iterable of node keys.""" + if (id_to_key := self.id_to_key) is not None: + return map(id_to_key.__getitem__, node_ids) + return node_ids + + def _nodearray_to_list(self, node_ids: cp.ndarray[IndexValue]) -> list[NodeKey]: + if self.key_to_id is None: + return node_ids.tolist() + return list(self._nodeiter_to_iter(node_ids.tolist())) + def _nodearrays_to_dict( self, node_ids: cp.ndarray[IndexValue], values: cp.ndarray[NodeValue] ) -> dict[NodeKey, NodeValue]: diff --git a/python/cugraph-nx/cugraph_nx/interface.py b/python/cugraph-nx/cugraph_nx/interface.py index ccd8d418d30..198fdd09cfc 100644 --- a/python/cugraph-nx/cugraph_nx/interface.py +++ b/python/cugraph-nx/cugraph_nx/interface.py @@ -59,8 +59,12 @@ def key(testpath): return (testname, frozenset({classname, filename})) return (testname, frozenset({filename})) + # Reasons for xfailing no_weights = "weighted implementation not currently supported" no_multigraph = "multigraphs not currently supported" + louvain_different = ( + "Louvain may be different due to RNG or unsupported threshold parameter" + ) xfail = {} @@ -69,6 +73,10 @@ def key(testpath): nxver = parse(nx.__version__) if nxver.major == 3 and nxver.minor in {0, 1}: # MAINT: networkx 3.0, 3.1 + # NetworkX 3.2 added the ability to "fallback to nx" if backend algorithms + # raise NotImplementedError or `can_run` returns False. 
The tests below + # exercise behavior we have not implemented yet, so we mark them as xfail + # for previous versions of NetworkX. xfail.update( { key( @@ -160,6 +168,18 @@ def key(testpath): ): no_multigraph, } ) + else: + xfail.update( + { + key( + "test_louvain.py:test_karate_club_partition" + ): louvain_different, + key("test_louvain.py:test_none_weight_param"): louvain_different, + key("test_louvain.py:test_multigraph"): louvain_different, + key("test_louvain.py:test_threshold"): louvain_different, + } + ) + for item in items: kset = set(item.keywords) for (test_name, keywords), reason in xfail.items(): diff --git a/python/cugraph-nx/cugraph_nx/tests/test_match_api.py b/python/cugraph-nx/cugraph_nx/tests/test_match_api.py index 2a2e33ec2f4..918c18b4ce3 100644 --- a/python/cugraph-nx/cugraph_nx/tests/test_match_api.py +++ b/python/cugraph-nx/cugraph_nx/tests/test_match_api.py @@ -10,6 +10,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import importlib import inspect import networkx as nx @@ -25,33 +26,45 @@ def test_match_signature_and_names(): continue # nx version >=3.2 uses utils.backends, version >=3.0,<3.2 uses classes.backends - nx_backends = getattr( - nx.utils, "backends", getattr(nx.classes, "backends", None) - ) - if nx_backends is None: - raise AttributeError( - f"imported networkx version {nx.__version__} is not " - "supported, must be >= 3.0" - ) + is_nx_30_or_31 = hasattr(nx.classes, "backends") + nx_backends = nx.classes.backends if is_nx_30_or_31 else nx.utils.backends + if is_nx_30_or_31 and name in {"louvain_communities"}: + continue dispatchable_func = nx_backends._registered_algorithms[name] # nx version >=3.2 uses orig_func, version >=3.0,<3.2 uses _orig_func - orig_func = getattr( - dispatchable_func, "orig_func", getattr(dispatchable_func, "_orig_func") - ) + if is_nx_30_or_31: + orig_func = dispatchable_func._orig_func + else: + orig_func = dispatchable_func.orig_func # Matching signatures? - sig = inspect.signature(orig_func) - assert sig == inspect.signature(func) + orig_sig = inspect.signature(orig_func) + func_sig = inspect.signature(func) + if not func.extra_params: + assert orig_sig == func_sig + else: + # Ignore extra parameters added to cugraph-nx algorithm + assert orig_sig == func_sig.replace( + parameters=[ + p + for name, p in func_sig.parameters.items() + if name not in func.extra_params + ] + ) + if func.can_run is not cnx.utils.decorators._default_can_run: + assert func_sig == inspect.signature(func.can_run) # Matching function names? assert func.__name__ == dispatchable_func.__name__ == orig_func.__name__ # Matching dispatch names? # nx version >=3.2 uses name, version >=3.0,<3.2 uses dispatchname - assert func.name == getattr( - dispatchable_func, "name", getattr(dispatchable_func, "dispatchname") - ) + if is_nx_30_or_31: + dispatchname = dispatchable_func.dispatchname + else: + dispatchname = dispatchable_func.name + assert func.name == dispatchname # Matching modules (i.e., where function defined)? assert ( @@ -59,3 +72,35 @@ def test_match_signature_and_names(): == dispatchable_func.__module__ == orig_func.__module__ ) + + # Matching package layout (i.e., which modules have the function)? + cnx_path = func.__module__ + name = func.__name__ + while "." 
in cnx_path: + # This only walks up the module tree and does not check sibling modules + cnx_path, mod_name = cnx_path.rsplit(".", 1) + nx_path = cnx_path.replace("cugraph_nx", "networkx") + cnx_mod = importlib.import_module(cnx_path) + nx_mod = importlib.import_module(nx_path) + # Is the function present in the current module? + present_in_cnx = hasattr(cnx_mod, name) + present_in_nx = hasattr(nx_mod, name) + if present_in_cnx is not present_in_nx: # pragma: no cover (debug) + if present_in_cnx: + raise AssertionError( + f"{name} exists in {cnx_path}, but not in {nx_path}" + ) + raise AssertionError( + f"{name} exists in {nx_path}, but not in {cnx_path}" + ) + # Is the nested module present in the current module? + present_in_cnx = hasattr(cnx_mod, mod_name) + present_in_nx = hasattr(nx_mod, mod_name) + if present_in_cnx is not present_in_nx: # pragma: no cover (debug) + if present_in_cnx: + raise AssertionError( + f"{mod_name} exists in {cnx_path}, but not in {nx_path}" + ) + raise AssertionError( + f"{mod_name} exists in {nx_path}, but not in {cnx_path}" + ) diff --git a/python/cugraph-nx/cugraph_nx/utils/__init__.py b/python/cugraph-nx/cugraph_nx/utils/__init__.py index f7ef42c8677..6df5fb60978 100644 --- a/python/cugraph-nx/cugraph_nx/utils/__init__.py +++ b/python/cugraph-nx/cugraph_nx/utils/__init__.py @@ -11,3 +11,4 @@ # See the License for the specific language governing permissions and # limitations under the License. from .decorators import * +from .misc import * diff --git a/python/cugraph-nx/cugraph_nx/utils/decorators.py b/python/cugraph-nx/cugraph_nx/utils/decorators.py index 7bda3e58b6b..619c9610c5d 100644 --- a/python/cugraph-nx/cugraph_nx/utils/decorators.py +++ b/python/cugraph-nx/cugraph_nx/utils/decorators.py @@ -28,17 +28,26 @@ def inner(func): class networkx_algorithm: - def __new__(cls, func=None, *, name=None): + def __new__(cls, func=None, *, name=None, extra_params=None): if func is None: - return partial(networkx_algorithm, name=name) + return partial(networkx_algorithm, name=name, extra_params=extra_params) instance = object.__new__(cls) # update_wrapper sets __wrapped__, which will be used for the signature update_wrapper(instance, func) instance.__defaults__ = func.__defaults__ instance.__kwdefaults__ = func.__kwdefaults__ instance.name = func.__name__ if name is None else name + # TODO: should extra_params be a dict[str, str] that describes the parameters? + if extra_params is None: + instance.extra_params = None + elif isinstance(extra_params, str): + instance.extra_params = {extra_params} + else: + instance.extra_params = set(extra_params) instance.can_run = _default_can_run setattr(BackendInterface, instance.name, instance) + # Set methods so they are in __dict__ + instance._can_run = instance._can_run return instance def _can_run(self, func): diff --git a/python/cugraph-nx/cugraph_nx/utils/misc.py b/python/cugraph-nx/cugraph_nx/utils/misc.py new file mode 100644 index 00000000000..18487a05996 --- /dev/null +++ b/python/cugraph-nx/cugraph_nx/utils/misc.py @@ -0,0 +1,45 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import cupy as cp + +__all__ = ["_groupby"] + + +def _groupby(groups: cp.ndarray, values: cp.ndarray) -> dict[int, cp.ndarray]: + """Perform a groupby operation given an array of group IDs and array of values. + + Parameters + ---------- + groups : cp.ndarray + Array that holds the group IDs. + Group IDs are assumed to be consecutive integers from 0. + values : cp.ndarray + Array of values to be grouped according to groups. + Must be the same size as groups array. + + Returns + ------- + dict with group IDs as keys and cp.ndarray as values. + """ + # It would actually be easy to support groups that aren't consecutive integers, + # but let's wait until we need it to implement it. + sorted_groups = cp.argsort(groups) + sorted_values = values[sorted_groups] + rv = {} + start = 0 + for i, end in enumerate( + [*(cp.nonzero(cp.diff(groups[sorted_groups]))[0] + 1).tolist(), groups.size] + ): + rv[i] = sorted_values[start:end] + start = end + return rv diff --git a/python/cugraph-nx/lint.yaml b/python/cugraph-nx/lint.yaml index 04747a2b49b..42c1b9657c7 100644 --- a/python/cugraph-nx/lint.yaml +++ b/python/cugraph-nx/lint.yaml @@ -26,7 +26,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.13 + rev: v0.14 hooks: - id: validate-pyproject name: Validate pyproject.toml @@ -50,7 +50,7 @@ repos: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.284 + rev: v0.0.286 hooks: - id: ruff args: [--fix-only, --show-fixes] @@ -76,7 +76,7 @@ repos: additional_dependencies: [tomli] files: ^(cugraph_nx|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.284 + rev: v0.0.286 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/cugraph-nx/pyproject.toml b/python/cugraph-nx/pyproject.toml index e8c4f670444..7384fc75007 100644 --- a/python/cugraph-nx/pyproject.toml +++ b/python/cugraph-nx/pyproject.toml @@ -116,7 +116,7 @@ omit = [] ignore_errors = false precision = 1 fail_under = 0 -skip_covered = true +skip_covered = false # Nice to see fully covered files when running `run_nx_tests.sh` skip_empty = true exclude_lines = [ "pragma: no cover", From 392250464cd123fc67056d330cd8d8dbf334c181 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Thu, 31 Aug 2023 09:49:23 -0400 Subject: [PATCH 26/72] [BUG] Fix Batch Renumbering of Empty Batches (#3823) Ensures that batches are renumbered starting from the starting batch id rather than 0. Adds an appropriate failing test, which passes with the change. 
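A minimal sketch of the renumbering behavior this fixes (values are made up for illustration; the real change is the one-line `cupy.arange` fix in `bulk_sampler_io.py` below):

```python
# Sketch only: when empty batches are dropped, the surviving batches must be
# relabeled starting from start_batch_id, not from 0.
import cupy
import cudf

start_batch_id = 4
offsets_p = cudf.DataFrame({"offsets": [0, 8, 12]})  # three non-empty batches

# Before the fix: cupy.arange(len(offsets_p)) -> [0, 1, 2] (wrong batch ids)
batch_id_range = cudf.Series(
    cupy.arange(start_batch_id, start_batch_id + len(offsets_p))
)  # -> [4, 5, 6]
```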
Closes #3819 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3823 --- .../gnn/data_loading/bulk_sampler_io.py | 4 +- .../tests/sampling/test_bulk_sampler_io.py | 82 +++++++++++++++++ .../tests/sampling/test_bulk_sampler_io_mg.py | 89 +++++++++++++++++++ 3 files changed, 174 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index 04917143030..e9e5be26fc3 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -87,7 +87,9 @@ def _write_samples_to_parquet( # renumbered to have contiguous batch ids and the empty # samples are dropped. offsets_p.drop("batch_id", axis=1, inplace=True) - batch_id_range = cudf.Series(cupy.arange(len(offsets_p))) + batch_id_range = cudf.Series( + cupy.arange(start_batch_id, start_batch_id + len(offsets_p)) + ) end_batch_id = start_batch_id + len(offsets_p) - 1 else: batch_id_range = offsets_p.batch_id diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py index 0b4b9fa73de..f71c16a8368 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py @@ -75,3 +75,85 @@ def test_bulk_sampler_io(scratch_dir): assert (df.batch_id == 1).all() shutil.rmtree(samples_path) + + +@pytest.mark.sg +def test_bulk_sampler_io_empty_batch(scratch_dir): + sources_array = [ + 0, + 0, + 1, + 2, + 2, + 2, + 3, + 4, + 5, + 5, + 6, + 7, + 9, + 9, + 12, + 13, + 29, + 29, + 31, + 14, + ] + + destinations_array = [ + 1, + 2, + 3, + 3, + 3, + 4, + 1, + 1, + 6, + 7, + 2, + 3, + 12, + 13, + 18, + 19, + 31, + 14, + 15, + 16, + ] + + hops_array = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + + results = cudf.DataFrame( + { + "sources": sources_array, + "destinations": destinations_array, + "edge_id": None, + "edge_type": None, + "weight": None, + "hop_id": hops_array, + } + ) + + # some batches are missing + offsets = cudf.DataFrame({"offsets": [0, 8, 12, 16], "batch_id": [0, 3, 4, 10]}) + + samples_path = os.path.join(scratch_dir, "test_bulk_sampler_io_empty_batch") + create_directory_with_overwrite(samples_path) + + write_samples(results, offsets, None, 2, samples_path) + + files = os.listdir(samples_path) + assert len(files) == 2 + + df0 = cudf.read_parquet(os.path.join(samples_path, "batch=0-1.parquet")) + + assert df0.batch_id.min() == 0 + assert df0.batch_id.max() == 1 + + df1 = cudf.read_parquet(os.path.join(samples_path, "batch=4-5.parquet")) + assert df1.batch_id.min() == 4 + assert df1.batch_id.max() == 5 diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py index 4115eca1c1f..41f68c08e5c 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py @@ -81,3 +81,92 @@ def test_bulk_sampler_io(scratch_dir): assert (df.batch_id == 1).all() shutil.rmtree(samples_path) + + +@pytest.mark.sg +def test_bulk_sampler_io_empty_batch(scratch_dir): + sources_array = [ + 0, + 0, + 1, + 2, + 2, + 2, + 3, + 4, + 5, + 5, + 6, + 7, + 9, + 9, + 12, + 13, + 29, + 29, + 31, + 14, + ] + + destinations_array = 
[ + 1, + 2, + 3, + 3, + 3, + 4, + 1, + 1, + 6, + 7, + 2, + 3, + 12, + 13, + 18, + 19, + 31, + 14, + 15, + 16, + ] + + hops_array = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + + results = cudf.DataFrame( + { + "sources": sources_array, + "destinations": destinations_array, + "edge_id": None, + "edge_type": None, + "weight": None, + "hop_id": hops_array, + } + ) + + results = dask_cudf.from_cudf(results, npartitions=1).repartition( + divisions=[0, 12, 19] + ) + + # some batches are missing + offsets = cudf.DataFrame({"offsets": [0, 8, 0, 4], "batch_id": [0, 3, 4, 10]}) + offsets = dask_cudf.from_cudf(offsets, npartitions=1).repartition( + divisions=[0, 2, 3] + ) + + samples_path = os.path.join(scratch_dir, "mg_test_bulk_sampler_io_empty_batch") + create_directory_with_overwrite(samples_path) + + write_samples(results, offsets, None, 2, samples_path) + + files = os.listdir(samples_path) + assert len(files) == 2 + + df0 = cudf.read_parquet(os.path.join(samples_path, "batch=0-1.parquet")) + + assert df0.batch_id.min() == 0 + assert df0.batch_id.max() == 1 + + df1 = cudf.read_parquet(os.path.join(samples_path, "batch=4-5.parquet")) + assert df1.batch_id.min() == 4 + assert df1.batch_id.max() == 5 From f13feff3b3ae00386ca2c979faaaa4f72e18d012 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 31 Aug 2023 11:28:10 -0500 Subject: [PATCH 27/72] Simplify wheel build scripts and allow alphas of RAPIDS dependencies (#3809) This PR makes a handful of changes aimed at simplifying the CI pipeline for building wheels as a precursor to switching RAPIDS nightlies to using proper alpha versions: - Inlines apply_wheel_modifications.sh in build_wheel.sh. Now that the build doesn't rely excessively on logic in shared workflows, there's no real benefit to having a separate script (previously apply_wheel_modification.sh was a special script that the shared workflow knew to execute i.e. it was a hook into an externally controlled workflow). - Consolidates the textual replacements using for loops and makes the replacements more targeted by only modifying the Python package being built in a given script. For instance, python/cugraph/pyproject.toml is no longer overwritten when building pylibcugraph. - Modifies dependency specs for RAPIDS packages to include a `>=0.0.0a0` component. This is the key change that will allow alpha dependencies to be discovered. dask-cuda is the canary here because we already upload alphas of it, so the installation of cugraph in the test job should pull the latest dask-cuda alpha now without requiring direct installation from git. 
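To illustrate why the appended `,>=0.0.0a0` clause is enough to let pre-release (alpha) builds be considered, here is a small sketch using the `packaging` library (versions are made up; the actual change is the sed logic in `ci/build_wheel.sh` below):

```python
# Sketch: a specifier set that contains a pre-release clause opts that
# requirement into pre-release candidates without changing the version bounds.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

nightly = Version("23.10.0a123")  # hypothetical nightly/alpha build

plain = SpecifierSet("==23.10.*")
with_alpha = SpecifierSet("==23.10.*,>=0.0.0a0")

print(plain.contains(nightly))       # False: pre-releases excluded by default
print(with_alpha.contains(nightly))  # True: the alpha clause allows them
```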
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3809 --- ci/build_wheel.sh | 32 ++++++++++++++++-- ci/build_wheel_cugraph.sh | 6 +++- ci/release/apply_wheel_modifications.sh | 35 -------------------- ci/test_wheel_cugraph.sh | 2 +- dependencies.yaml | 2 +- python/cugraph-dgl/pyproject.toml | 2 +- python/cugraph-nx/pyproject.toml | 2 +- python/cugraph-pyg/pyproject.toml | 2 +- python/cugraph-service/client/pyproject.toml | 2 +- python/cugraph-service/server/pyproject.toml | 2 +- python/cugraph/pyproject.toml | 2 +- python/pylibcugraph/pyproject.toml | 2 +- 12 files changed, 43 insertions(+), 48 deletions(-) delete mode 100755 ci/release/apply_wheel_modifications.sh diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index fbf1e6fce2d..3798d561126 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -15,9 +15,35 @@ version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -bash ci/release/apply_wheel_modifications.sh ${version_override} "-${RAPIDS_PY_CUDA_SUFFIX}" -echo "The package name and/or version was modified in the package source. The git diff is:" -git diff +# This is the version of the suffix with a preceding hyphen. It's used +# everywhere except in the final wheel name. +PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" + +# Patch project metadata files to include the CUDA version suffix and version override. +pyproject_file="${package_dir}/pyproject.toml" + +sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} +sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} + +# For nightlies we want to ensure that we're pulling in alphas as well. The +# easiest way to do so is to augment the spec with a constraint containing a +# min alpha version that doesn't affect the version bounds but does allow usage +# of alpha versions for that dependency without --pre +alpha_spec='' +if ! rapids-is-release-build; then + alpha_spec=',>=0.0.0a0' +fi + +for dep in rmm cudf raft-dask pylibcugraph pylibraft ucx-py; do + sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} +done + +# dask-cuda doesn't get a suffix, but it does get an alpha spec. +sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} + +if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then + sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} +fi cd "${package_dir}" diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh index 657220d372b..5b5061f67c2 100755 --- a/ci/build_wheel_cugraph.sh +++ b/ci/build_wheel_cugraph.sh @@ -5,8 +5,12 @@ set -euo pipefail RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +# Download the pylibcugraph wheel built in the previous step and make it +# available for pip to find. We must use PIP_FIND_LINKS because the package +# must be made available to the isolated build step, and there is no way to +# manually install it into that environment. 
RAPIDS_PY_WHEEL_NAME=pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibcugraph -python -m pip install --no-deps ./local-pylibcugraph/pylibcugraph*.whl +export PIP_FIND_LINKS=$(pwd)/local-pylibcugraph export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh deleted file mode 100755 index 610a603cef8..00000000000 --- a/ci/release/apply_wheel_modifications.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Usage: bash apply_wheel_modifications.sh - -VERSION=${1} -CUDA_SUFFIX=${2} - -# setup.py updates -sed -i "s/^version = .*/version = \"${VERSION}\"/g" \ - python/cugraph/pyproject.toml \ - python/cugraph-dgl/pyproject.toml \ - python/cugraph-pyg/pyproject.toml \ - python/cugraph-service/client/pyproject.toml \ - python/cugraph-service/server/pyproject.toml \ - python/pylibcugraph/pyproject.toml - -# pylibcugraph pyproject.toml cuda suffixes -sed -i "s/name = \"pylibcugraph\"/name = \"pylibcugraph${CUDA_SUFFIX}\"/g" python/pylibcugraph/pyproject.toml -sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/pylibcugraph/pyproject.toml -sed -i "s/pylibraft/pylibraft${CUDA_SUFFIX}/g" python/pylibcugraph/pyproject.toml -sed -i "s/cudf/cudf${CUDA_SUFFIX}/g" python/pylibcugraph/pyproject.toml - -# cugraph pyproject.toml cuda suffixes -sed -i "s/name = \"cugraph\"/name = \"cugraph${CUDA_SUFFIX}\"/g" python/cugraph/pyproject.toml -sed -i "s/rmm/rmm${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml -sed -i "s/cudf/cudf${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml -sed -i "s/raft-dask/raft-dask${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml -sed -i "s/pylibcugraph/pylibcugraph${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml -sed -i "s/pylibraft/pylibraft${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml -sed -i "s/ucx-py/ucx-py${CUDA_SUFFIX}/g" python/cugraph/pyproject.toml - -if [[ $CUDA_SUFFIX == "-cu12" ]]; then - sed -i "s/cupy-cuda11x/cupy-cuda12x/g" python/cugraph/pyproject.toml -fi diff --git a/ci/test_wheel_cugraph.sh b/ci/test_wheel_cugraph.sh index a117e00b8a2..4d511ac2a0f 100755 --- a/ci/test_wheel_cugraph.sh +++ b/ci/test_wheel_cugraph.sh @@ -9,7 +9,7 @@ RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-whe python -m pip install --no-deps ./local-pylibcugraph-dep/pylibcugraph*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main # Only download test data for x86 arch=$(uname -m) diff --git a/dependencies.yaml b/dependencies.yaml index 22579425898..23bd5c800b6 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -325,7 +325,7 @@ dependencies: - output_types: [conda, pyproject] packages: - wheel - - setuptools + - setuptools>=61.0.0 python_build_cythonize: common: - output_types: [conda, pyproject] diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml index 4205ac69df5..50354184133 100644 --- a/python/cugraph-dgl/pyproject.toml +++ b/python/cugraph-dgl/pyproject.toml @@ -3,7 +3,7 @@ [build-system] requires = [ - "setuptools", + "setuptools>=61.0.0", "wheel", ] # This list was generated by 
`rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. build-backend = "setuptools.build_meta" diff --git a/python/cugraph-nx/pyproject.toml b/python/cugraph-nx/pyproject.toml index 7384fc75007..1882bed251f 100644 --- a/python/cugraph-nx/pyproject.toml +++ b/python/cugraph-nx/pyproject.toml @@ -3,7 +3,7 @@ [build-system] requires = [ - "setuptools", + "setuptools>=61.0.0", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. build-backend = "setuptools.build_meta" diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index a28a4b3e905..218c09fbd1d 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -3,7 +3,7 @@ [build-system] requires = [ - "setuptools", + "setuptools>=61.0.0", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cugraph-service/client/pyproject.toml b/python/cugraph-service/client/pyproject.toml index cef9391805e..3b31a5f2e0a 100644 --- a/python/cugraph-service/client/pyproject.toml +++ b/python/cugraph-service/client/pyproject.toml @@ -3,7 +3,7 @@ [build-system] requires = [ - "setuptools", + "setuptools>=61.0.0", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. build-backend = "setuptools.build_meta" diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index f8f90b864cd..f25ea6c46e5 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -3,7 +3,7 @@ [build-system] requires = [ - "setuptools", + "setuptools>=61.0.0", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. build-backend = "setuptools.build_meta" diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index 591161774e3..cadf6879e23 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -10,7 +10,7 @@ requires = [ "pylibraft==23.10.*", "rmm==23.10.*", "scikit-build>=0.13.1", - "setuptools", + "setuptools>=61.0.0", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. build-backend = "setuptools.build_meta" diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml index 191bdf41920..806ea65ac6c 100644 --- a/python/pylibcugraph/pyproject.toml +++ b/python/pylibcugraph/pyproject.toml @@ -9,7 +9,7 @@ requires = [ "pylibraft==23.10.*", "rmm==23.10.*", "scikit-build>=0.13.1", - "setuptools", + "setuptools>=61.0.0", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
build-backend = "setuptools.build_meta" From 2b4118aee4af912d74ce1ebe7adc39cf596899ef Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Fri, 1 Sep 2023 14:03:50 -0400 Subject: [PATCH 28/72] Remove Deprecated Sampling Options (#3816) The `uniform_neighbor_sample` code is becoming increasingly difficult to maintain. This PR removes all the options that were deprecated in the previous release, and also deprecates the `with_edge_properties` option, which will be replaced by returning whatever properties are in the graph in the next release. This PR also resolves a FIXME by allowing `fanout_vals` to be a `cupy.ndarray`, `numpy.ndarray`, or `cudf.Series`. Closes #3698 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Brad Rees (https://github.com/BradReesWork) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3816 --- .../cugraph_pyg/data/cugraph_store.py | 1 + .../tests/mg/test_mg_cugraph_sampler.py | 22 +- .../cugraph_pyg/tests/test_cugraph_sampler.py | 22 +- .../dask/sampling/uniform_neighbor_sample.py | 338 +----------------- .../sampling/uniform_neighbor_sample.py | 180 +--------- .../graph_implementation/simpleGraph.py | 4 +- 6 files changed, 72 insertions(+), 495 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py index e3eb4a85a85..8d5d2fd4894 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py @@ -367,6 +367,7 @@ def __construct_graph( ------- A newly-constructed directed cugraph.MultiGraph object. """ + # Ensure the original dict is not modified. edge_info_cg = {} diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py index 93687c4a107..550852a3303 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py @@ -33,14 +33,21 @@ def test_neighbor_sample(dask_client, basic_graph_1): F, G, N = basic_graph_1 cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + batches = cudf.DataFrame( + { + "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"), + "batch": cudf.Series(cupy.zeros(5, dtype="int32")), + } + ) + sampling_results = ( uniform_neighbor_sample( cugraph_store._subgraph(), - cudf.Series([0, 1, 2, 3, 4], dtype="int64"), + batches, + with_batch_ids=True, fanout_vals=[-1], with_replacement=False, with_edge_properties=True, - batch_id_list=cudf.Series(cupy.zeros(5, dtype="int32")), random_state=62, return_offsets=False, return_hops=True, @@ -90,16 +97,23 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph F, G, N = multi_edge_multi_vertex_graph_1 cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + batches = cudf.DataFrame( + { + "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"), + "batches": cudf.Series(cupy.zeros(5, dtype="int32")), + } + ) + sampling_results = ( uniform_neighbor_sample( cugraph_store._subgraph(), - cudf.Series([0, 1, 2, 3, 4], dtype="int64"), + batches, fanout_vals=[-1], with_replacement=False, with_edge_properties=True, - batch_id_list=cudf.Series(cupy.zeros(5, dtype="int32")), random_state=62, return_offsets=False, + with_batch_ids=True, ) .sort_values(by=["sources", "destinations"]) .compute() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py 
b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py index c1949f495e4..08a8625b33b 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py @@ -31,13 +31,20 @@ def test_neighbor_sample(basic_graph_1): F, G, N = basic_graph_1 cugraph_store = CuGraphStore(F, G, N) + batches = cudf.DataFrame( + { + "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"), + "batch": cudf.Series(cupy.zeros(5, dtype="int32")), + } + ) + sampling_results = uniform_neighbor_sample( cugraph_store._subgraph(), - cudf.Series([0, 1, 2, 3, 4], dtype="int64"), + batches, fanout_vals=[-1], with_replacement=False, with_edge_properties=True, - batch_id_list=cudf.Series(cupy.zeros(5, dtype="int32")), + with_batch_ids=True, random_state=62, return_offsets=False, ).sort_values(by=["sources", "destinations"]) @@ -82,15 +89,22 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 cugraph_store = CuGraphStore(F, G, N) + batches = cudf.DataFrame( + { + "start": cudf.Series([0, 1, 2, 3, 4], dtype="int64"), + "batch": cudf.Series(cupy.zeros(5, dtype="int32")), + } + ) + sampling_results = uniform_neighbor_sample( cugraph_store._subgraph(), - cudf.Series([0, 1, 2, 3, 4], dtype="int64"), + batches, fanout_vals=[-1], with_replacement=False, with_edge_properties=True, - batch_id_list=cudf.Series(cupy.zeros(5, dtype="int32")), random_state=62, return_offsets=False, + with_batch_ids=True, ).sort_values(by=["sources", "destinations"]) out = _sampler_output_from_sampling_results( diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 88fab60120d..9e50169b4a7 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -27,10 +27,8 @@ from pylibcugraph import ResourceHandle from pylibcugraph import uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample -from pylibcugraph.utilities.api_tools import deprecated_warning_wrapper from cugraph.dask.comms import comms as Comms -from cugraph.dask.common.input_utils import get_distributed_data from cugraph.dask import get_n_workers from typing import Sequence, List, Union, Tuple @@ -287,112 +285,6 @@ def _call_plc_uniform_neighbor_sample( ) -def _call_plc_uniform_neighbor_sample_legacy( - sID, - mg_graph_x, - st_x, - label_list, - label_to_output_comm_rank, - fanout_vals, - with_replacement, - weight_t, - with_edge_properties, - random_state=None, - return_offsets=False, - return_hops=True, -): - start_list_x = st_x[start_col_name] - batch_id_list_x = st_x[batch_col_name] if batch_col_name in st_x else None - cp_arrays = pylibcugraph_uniform_neighbor_sample( - resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), - input_graph=mg_graph_x, - start_list=start_list_x, - label_list=label_list, - label_to_output_comm_rank=label_to_output_comm_rank, - h_fan_out=fanout_vals, - with_replacement=with_replacement, - do_expensive_check=False, - with_edge_properties=with_edge_properties, - batch_id_list=batch_id_list_x, - random_state=random_state, - return_hops=return_hops, - ) - - output = convert_to_cudf( - cp_arrays, weight_t, with_edge_properties, return_offsets=return_offsets - ) - - if isinstance(output, (list, tuple)) and len(output) == 1: - return output[0] - return output - - -def _mg_call_plc_uniform_neighbor_sample_legacy( - client, - session_id, 
- input_graph, - ddf, - label_list, - label_to_output_comm_rank, - fanout_vals, - with_replacement, - weight_t, - indices_t, - with_edge_properties, - random_state, - return_offsets=False, - return_hops=True, -): - result = [ - client.submit( - _call_plc_uniform_neighbor_sample_legacy, - session_id, - input_graph._plc_graph[w], - ddf[w][0], - label_list, - label_to_output_comm_rank, - fanout_vals, - with_replacement, - weight_t=weight_t, - with_edge_properties=with_edge_properties, - # FIXME accept and properly transmute a numpy/cupy random state. - random_state=hash((random_state, i)), - workers=[w], - allow_other_workers=False, - pure=False, - return_offsets=return_offsets, - return_hops=return_hops, - ) - for i, w in enumerate(Comms.get_workers()) - ] - - empty_df = ( - create_empty_df_with_edge_props( - indices_t, weight_t, return_offsets=return_offsets - ) - if with_edge_properties - else create_empty_df(indices_t, weight_t) - ) - - if return_offsets: - result = [delayed(lambda x: x, nout=2)(r) for r in result] - ddf = dask_cudf.from_delayed( - [r[0] for r in result], meta=empty_df[0], verify_meta=False - ).persist() - ddf_offsets = dask_cudf.from_delayed( - [r[1] for r in result], meta=empty_df[1], verify_meta=False - ).persist() - wait(ddf) - wait(ddf_offsets) - wait([r.release() for r in result]) - return ddf, ddf_offsets - else: - ddf = dask_cudf.from_delayed(result, meta=empty_df, verify_meta=False).persist() - wait(ddf) - wait([r.release() for r in result]) - return ddf - - def _mg_call_plc_uniform_neighbor_sample( client, session_id, @@ -501,170 +393,12 @@ def _mg_call_plc_uniform_neighbor_sample( return tuple(return_dfs) -def _uniform_neighbor_sample_legacy( - input_graph: Graph, - start_list: Sequence, - fanout_vals: List[int], - with_replacement: bool = True, - with_edge_properties: bool = False, - batch_id_list: Sequence = None, - label_list: Sequence = None, - label_to_output_comm_rank: bool = None, - random_state: int = None, - return_offsets: bool = False, - return_hops: bool = False, - _multiple_clients: bool = False, -) -> Union[dask_cudf.DataFrame, Tuple[dask_cudf.DataFrame, dask_cudf.DataFrame]]: - warnings.warn( - "The batch_id_list, label_list, and label_to_output_comm_rank " - "parameters are deprecated. Consider using with_batch_ids, " - "keep_batches_together, min_batch_id, and max_batch_id instead." - ) - - if isinstance(start_list, int): - start_list = [start_list] - - if isinstance(start_list, list): - start_list = cudf.Series( - start_list, - dtype=input_graph.edgelist.edgelist_df[ - input_graph.renumber_map.renumbered_src_col_name - ].dtype, - ) - - elif with_edge_properties and batch_id_list is None: - batch_id_list = cudf.Series(cp.zeros(len(start_list), dtype="int32")) - - # fanout_vals must be a host array! - # FIXME: ensure other sequence types (eg. cudf Series) can be handled. 
- if isinstance(fanout_vals, list): - fanout_vals = numpy.asarray(fanout_vals, dtype="int32") - else: - raise TypeError("fanout_vals must be a list, " f"got: {type(fanout_vals)}") - - if "value" in input_graph.edgelist.edgelist_df: - weight_t = input_graph.edgelist.edgelist_df["value"].dtype - else: - weight_t = "float32" - - if "_SRC_" in input_graph.edgelist.edgelist_df: - indices_t = input_graph.edgelist.edgelist_df["_SRC_"].dtype - elif src_n in input_graph.edgelist.edgelist_df: - indices_t = input_graph.edgelist.edgelist_df[src_n].dtype - else: - indices_t = numpy.int32 - - start_list = start_list.rename(start_col_name) - if batch_id_list is not None: - batch_id_list = batch_id_list.rename(batch_col_name) - if hasattr(start_list, "compute"): - # mg input - start_list = start_list.to_frame() - batch_id_list = batch_id_list.to_frame() - ddf = start_list.merge( - batch_id_list, - how="left", - left_index=True, - right_index=True, - ) - else: - # sg input - ddf = cudf.concat( - [ - start_list, - batch_id_list, - ], - axis=1, - ) - else: - ddf = start_list.to_frame() - - if input_graph.renumbered: - ddf = input_graph.lookup_internal_vertex_id(ddf, column_name=start_col_name) - - if hasattr(ddf, "compute"): - ddf = get_distributed_data(ddf) - wait(ddf) - ddf = ddf.worker_to_parts - else: - splits = cp.array_split(cp.arange(len(ddf)), len(Comms.get_workers())) - ddf = {w: [ddf.iloc[splits[i]]] for i, w in enumerate(Comms.get_workers())} - - client = get_client() - session_id = Comms.get_session_id() - if _multiple_clients: - # Distributed centralized lock to allow - # two disconnected processes (clients) to coordinate a lock - # https://docs.dask.org/en/stable/futures.html?highlight=lock#distributed.Lock - lock = Lock("plc_graph_access") - if lock.acquire(timeout=100): - try: - ddf = _mg_call_plc_uniform_neighbor_sample_legacy( - client=client, - session_id=session_id, - input_graph=input_graph, - ddf=ddf, - label_list=label_list, - label_to_output_comm_rank=label_to_output_comm_rank, - fanout_vals=fanout_vals, - with_replacement=with_replacement, - weight_t=weight_t, - indices_t=indices_t, - with_edge_properties=with_edge_properties, - random_state=random_state, - return_offsets=return_offsets, - return_hops=return_hops, - ) - finally: - lock.release() - else: - raise RuntimeError( - "Failed to acquire lock(plc_graph_access) while trying to sampling" - ) - else: - ddf = _mg_call_plc_uniform_neighbor_sample_legacy( - client=client, - session_id=session_id, - input_graph=input_graph, - ddf=ddf, - label_list=label_list, - label_to_output_comm_rank=label_to_output_comm_rank, - fanout_vals=fanout_vals, - with_replacement=with_replacement, - weight_t=weight_t, - indices_t=indices_t, - with_edge_properties=with_edge_properties, - random_state=random_state, - return_offsets=return_offsets, - return_hops=return_hops, - ) - - if return_offsets: - ddf, offsets_ddf = ddf - if input_graph.renumbered: - ddf = input_graph.unrenumber(ddf, "sources", preserve_order=True) - ddf = input_graph.unrenumber(ddf, "destinations", preserve_order=True) - - if return_offsets: - return ddf, offsets_ddf - - return ddf - - -uniform_neighbor_sample_legacy = deprecated_warning_wrapper( - _uniform_neighbor_sample_legacy -) - - def uniform_neighbor_sample( input_graph: Graph, start_list: Sequence, fanout_vals: List[int], with_replacement: bool = True, - with_edge_properties: bool = False, - batch_id_list: Sequence = None, # deprecated - label_list: Sequence = None, # deprecated - label_to_output_comm_rank: bool = None, # 
deprecated + with_edge_properties: bool = False, # deprecated with_batch_ids: bool = False, keep_batches_together=False, min_batch_id=None, @@ -698,27 +432,10 @@ def uniform_neighbor_sample( Flag to specify if the random sampling is done with replacement with_edge_properties: bool, optional (default=False) + Deprecated. Flag to specify whether to return edge properties (weight, edge id, edge type, batch id, hop id) with the sampled edges. - batch_id_list: cudf.Series or dask_cudf.Series (int32), optional (default=None) - Deprecated. - List of batch ids that will be returned with the sampled edges if - with_edge_properties is set to True. - - label_list: cudf.Series or dask_cudf.Series (int32), optional (default=None) - Deprecated. - List of unique batch id labels. Used along with - label_to_output_comm_rank to assign batch ids to GPUs. - - label_to_out_comm_rank: cudf.Series or dask_cudf.Series (int32), - optional (default=None) - Deprecated. - List of output GPUs (by rank) corresponding to batch - id labels in the label list. Used to assign each batch - id to a GPU. - Must be in ascending order (i.e. [0, 0, 1, 2]). - with_batch_ids: bool, optional (default=False) Flag to specify whether batch ids are present in the start_list @@ -831,40 +548,12 @@ def uniform_neighbor_sample( Contains the batch offsets for the renumber maps """ - if ( - batch_id_list is not None - or label_list is not None - or label_to_output_comm_rank is not None - ): - if prior_sources_behavior or deduplicate_sources: - raise ValueError( - "unique sources, carry_over_sources, and deduplicate_sources" - " are not supported with batch_id_list, label_list, and" - " label_to_output_comm_rank. Consider using with_batch_ids" - " and keep_batches_together instead." - ) - - if renumber: - raise ValueError( - "renumber is not supported with batch_id_list, label_list, " - "and label_to_output_comm_rank. Consider using " - "with_batch_ids and keep_batches_together instead." - ) - - return uniform_neighbor_sample_legacy( - input_graph, - start_list, - fanout_vals, - with_replacement=with_replacement, - with_edge_properties=with_edge_properties, - batch_id_list=batch_id_list, - label_list=label_list, - label_to_output_comm_rank=label_to_output_comm_rank, - random_state=random_state, - return_offsets=return_offsets, - return_hops=return_hops, - _multiple_clients=_multiple_clients, + if with_edge_properties: + warning_msg = ( + "The with_edge_properties flag is deprecated" + " and will be removed in the next release." ) + warnings.warn(warning_msg, DeprecationWarning) if isinstance(start_list, int): start_list = [start_list] @@ -906,12 +595,17 @@ def uniform_neighbor_sample( "when performing renumbering." ) - # fanout_vals must be a host array! - # FIXME: ensure other sequence types (eg. cudf Series) can be handled. 
- if isinstance(fanout_vals, list): + # fanout_vals must be passed to pylibcugraph as a host array + if isinstance(fanout_vals, numpy.ndarray): + fanout_vals = fanout_vals.astype("int32") + elif isinstance(fanout_vals, list): fanout_vals = numpy.asarray(fanout_vals, dtype="int32") + elif isinstance(fanout_vals, cp.ndarray): + fanout_vals = fanout_vals.get().astype("int32") + elif isinstance(fanout_vals, cudf.Series): + fanout_vals = fanout_vals.values_host.astype("int32") else: - raise TypeError("fanout_vals must be a list, " f"got: {type(fanout_vals)}") + raise TypeError("fanout_vals must be a sequence, " f"got: {type(fanout_vals)}") if "value" in input_graph.edgelist.edgelist_df: weight_t = input_graph.edgelist.edgelist_df["value"].dtype diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index 96f40090a34..219854bb002 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -15,7 +15,6 @@ from pylibcugraph import ResourceHandle from pylibcugraph import uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample -from pylibcugraph.utilities.api_tools import deprecated_warning_wrapper import numpy @@ -55,143 +54,12 @@ def ensure_valid_dtype(input_graph, start_list): return start_list -def _uniform_neighbor_sample_legacy( - G: Graph, - start_list: Sequence, - fanout_vals: List[int], - with_replacement: bool = True, - with_edge_properties: bool = False, - batch_id_list: Sequence = None, - random_state: int = None, - return_offsets: bool = False, - return_hops: bool = True, -) -> Union[cudf.DataFrame, Tuple[cudf.DataFrame, cudf.DataFrame]]: - - warnings.warn( - "The batch_id_list parameter is deprecated. " - "Consider passing a DataFrame where the last column " - "is the batch ids and setting with_batch_ids=True" - ) - - if isinstance(start_list, int): - start_list = [start_list] - - if isinstance(start_list, list): - start_list = cudf.Series( - start_list, dtype=G.edgelist.edgelist_df[G.srcCol].dtype - ) - - if with_edge_properties and batch_id_list is None: - batch_id_list = cp.zeros(len(start_list), dtype="int32") - - # fanout_vals must be a host array! - # FIXME: ensure other sequence types (eg. cudf Series) can be handled. 
- if isinstance(fanout_vals, list): - fanout_vals = numpy.asarray(fanout_vals, dtype="int32") - else: - raise TypeError("fanout_vals must be a list, " f"got: {type(fanout_vals)}") - - if "weights" in G.edgelist.edgelist_df: - weight_t = G.edgelist.edgelist_df["weights"].dtype - else: - weight_t = "float32" - - start_list = ensure_valid_dtype(G, start_list) - - if G.renumbered is True: - if isinstance(start_list, cudf.DataFrame): - start_list = G.lookup_internal_vertex_id(start_list, start_list.columns) - else: - start_list = G.lookup_internal_vertex_id(start_list) - - sampling_result = pylibcugraph_uniform_neighbor_sample( - resource_handle=ResourceHandle(), - input_graph=G._plc_graph, - start_list=start_list, - h_fan_out=fanout_vals, - with_replacement=with_replacement, - do_expensive_check=False, - with_edge_properties=with_edge_properties, - batch_id_list=batch_id_list, - return_hops=return_hops, - random_state=random_state, - ) - - df = cudf.DataFrame() - - if with_edge_properties: - ( - sources, - destinations, - weights, - edge_ids, - edge_types, - batch_ids, - offsets, - hop_ids, - ) = sampling_result - - df["sources"] = sources - df["destinations"] = destinations - df["weight"] = weights - df["edge_id"] = edge_ids - df["edge_type"] = edge_types - df["hop_id"] = hop_ids - - if return_offsets: - offsets_df = cudf.DataFrame( - { - "batch_id": batch_ids, - "offsets": offsets[:-1], - } - ) - - else: - if len(batch_ids) > 0: - batch_ids = cudf.Series(batch_ids).repeat(cp.diff(offsets)) - batch_ids.reset_index(drop=True, inplace=True) - - df["batch_id"] = batch_ids - - else: - sources, destinations, indices = sampling_result - - df["sources"] = sources - df["destinations"] = destinations - - if indices is None: - df["indices"] = None - else: - df["indices"] = indices - if weight_t == "int32": - df["indices"] = indices.astype("int32") - elif weight_t == "int64": - df["indices"] = indices.astype("int64") - else: - df["indices"] = indices - - if G.renumbered: - df = G.unrenumber(df, "sources", preserve_order=True) - df = G.unrenumber(df, "destinations", preserve_order=True) - - if return_offsets: - return df, offsets_df - - return df - - -uniform_neighbor_sample_legacy = deprecated_warning_wrapper( - _uniform_neighbor_sample_legacy -) - - def uniform_neighbor_sample( G: Graph, start_list: Sequence, fanout_vals: List[int], with_replacement: bool = True, - with_edge_properties: bool = False, - batch_id_list: Sequence = None, # deprecated + with_edge_properties: bool = False, # deprecated with_batch_ids: bool = False, random_state: int = None, return_offsets: bool = False, @@ -221,14 +89,10 @@ def uniform_neighbor_sample( Flag to specify if the random sampling is done with replacement with_edge_properties: bool, optional (default=False) + Deprecated. Flag to specify whether to return edge properties (weight, edge id, edge type, batch id, hop id) with the sampled edges. - batch_id_list: list (int32) - Deprecated. - List of batch ids that will be returned with the sampled edges if - with_edge_properties is set to True. - with_batch_ids: bool, optional (default=False) Flag to specify whether batch ids are present in the start_list Assumes they are the last column in the start_list dataframe @@ -329,29 +193,12 @@ def uniform_neighbor_sample( Contains the batch offsets for the renumber maps """ - if batch_id_list is not None: - if prior_sources_behavior or deduplicate_sources: - raise ValueError( - "prior_sources_behavior and deduplicate_sources" - " are not supported with batch_id_list." 
- " Consider using with_batch_ids instead." - ) - if renumber: - raise ValueError( - "renumber is not supported with batch_id_list." - " Consider using with_batch_ids instead." - ) - return uniform_neighbor_sample_legacy( - G, - start_list, - fanout_vals, - with_replacement=with_replacement, - with_edge_properties=with_edge_properties, - batch_id_list=batch_id_list, - random_state=random_state, - return_offsets=return_offsets, - return_hops=return_hops, + if with_edge_properties: + warning_msg = ( + "The with_edge_properties flag is deprecated" + " and will be removed in the next release." ) + warnings.warn(warning_msg, DeprecationWarning) if isinstance(start_list, int): start_list = [start_list] @@ -369,12 +216,17 @@ def uniform_neighbor_sample( cp.zeros(len(start_list), dtype="int32") ) - # fanout_vals must be a host array! - # FIXME: ensure other sequence types (eg. cudf Series) can be handled. - if isinstance(fanout_vals, list): + # fanout_vals must be passed to pylibcugraph as a host array + if isinstance(fanout_vals, numpy.ndarray): + fanout_vals = fanout_vals.astype("int32") + elif isinstance(fanout_vals, list): fanout_vals = numpy.asarray(fanout_vals, dtype="int32") + elif isinstance(fanout_vals, cp.ndarray): + fanout_vals = fanout_vals.get().astype("int32") + elif isinstance(fanout_vals, cudf.Series): + fanout_vals = fanout_vals.values_host.astype("int32") else: - raise TypeError("fanout_vals must be a list, " f"got: {type(fanout_vals)}") + raise TypeError("fanout_vals must be a sequence, " f"got: {type(fanout_vals)}") if "weights" in G.edgelist.edgelist_df: weight_t = G.edgelist.edgelist_df["weights"].dtype diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 2690ab88c13..2b23d3a26b7 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -136,7 +136,9 @@ def __from_edgelist( warning_msg = ( "The parameter 'legacy_renum_only' is deprecated and will be removed." 
) - warnings.warn(warning_msg, DeprecationWarning) + warnings.warn( + warning_msg, + ) # Verify column names present in input DataFrame s_col = source From a5e08c09bcb3e51205fdf78494a221fe755e8a9f Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 6 Sep 2023 10:45:29 -0400 Subject: [PATCH 29/72] Use new `raft::compiled_static` targets (#3842) https://github.com/rapidsai/raft/pull/1746 added static targets for RAFT that can be used directly now instead of building RAFT static explicitly Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3842 --- cpp/CMakeLists.txt | 20 ++++++++++++++------ cpp/cmake/thirdparty/get_raft.cmake | 12 +++++++----- python/cugraph/CMakeLists.txt | 2 +- python/pylibcugraph/CMakeLists.txt | 2 +- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 63a91d4971f..370e665106d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,11 +346,19 @@ target_include_directories(cugraph "$" ) +set(COMPILED_RAFT_LIB "") if(CUDA_STATIC_RUNTIME) get_target_property(_includes raft::raft INTERFACE_INCLUDE_DIRECTORIES) target_include_directories(cugraph PUBLIC ${_includes}) # Add CTK include paths because we make our CTK library links private below target_include_directories(cugraph SYSTEM PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + if(CUGRAPH_COMPILE_RAFT_LIB) + set(COMPILED_RAFT_LIB raft::compiled_static) + endif() +else() + if(CUGRAPH_COMPILE_RAFT_LIB) + set(COMPILED_RAFT_LIB raft::compiled) + endif() endif() ################################################################################ @@ -361,10 +369,10 @@ if (USE_CUGRAPH_OPS) rmm::rmm cugraph-ops::cugraph-ops++ $<$>:raft::raft> - $<$>:raft::compiled> + $<$>:${COMPILED_RAFT_LIB}> PRIVATE $<$:raft::raft> - $<$:raft::compiled> + $<$:${COMPILED_RAFT_LIB}> cuco::cuco cugraph::cuHornet NCCL::NCCL @@ -374,10 +382,10 @@ else() PUBLIC rmm::rmm $<$>:raft::raft> - $<$>:raft::compiled> + $<$>:${COMPILED_RAFT_LIB}> PRIVATE $<$:raft::raft> - $<$:raft::compiled> + $<$:${COMPILED_RAFT_LIB}> cuco::cuco cugraph::cuHornet NCCL::NCCL @@ -481,12 +489,12 @@ target_link_libraries(cugraph_c CUDA::cusparse${_ctk_static_suffix} rmm::rmm $<$>:raft::raft> - $<$>:raft::compiled> + $<$>:${COMPILED_RAFT_LIB}> PRIVATE cuco::cuco cugraph::cugraph $<$:raft::raft> - $<$:raft::compiled> + $<$:${COMPILED_RAFT_LIB}> ) ################################################################################ diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index aa341d0d70f..015b5b07920 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -30,16 +30,19 @@ function(find_and_configure_raft) set(CPM_DOWNLOAD_raft ON) endif() - set(BUILD_RAFT_SHARED ON) - if(PKG_USE_RAFT_STATIC) - set(BUILD_RAFT_SHARED OFF) + if(PKG_COMPILE_RAFT_LIB) + if(NOT PKG_USE_RAFT_STATIC) + string(APPEND RAFT_COMPONENTS " compiled") + else() + string(APPEND RAFT_COMPONENTS " compiled_static") + endif() endif() rapids_cpm_find(raft ${PKG_VERSION} GLOBAL_TARGETS raft::raft BUILD_EXPORT_SET cugraph-exports INSTALL_EXPORT_SET cugraph-exports - COMPONENTS compiled + COMPONENTS ${RAFT_COMPONENTS} CPM_ARGS EXCLUDE_FROM_ALL TRUE GIT_REPOSITORY https://github.com/${PKG_FORK}/raft.git @@ -49,7 +52,6 @@ function(find_and_configure_raft) "RAFT_COMPILE_LIBRARY ${PKG_COMPILE_RAFT_LIB}" "BUILD_TESTS OFF" "BUILD_BENCH OFF" - "BUILD_SHARED_LIBS ${BUILD_RAFT_SHARED}" ) 
if(raft_ADDED) diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index fc58c4de89c..f3b28623b12 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -65,7 +65,7 @@ if(NOT cugraph_FOUND) # Statically link dependencies if building wheels set(CUDA_STATIC_RUNTIME ON) set(USE_RAFT_STATIC ON) - set(CUGRAPH_COMPILE_RAFT_DIST_LIBS OFF) + set(CUGRAPH_COMPILE_RAFT_LIB ON) set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) set(ALLOW_CLONE_CUGRAPH_OPS ON) diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt index 65ccdec1af8..b5b564e6881 100644 --- a/python/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/CMakeLists.txt @@ -65,7 +65,7 @@ if (NOT cugraph_FOUND) # Statically link dependencies if building wheels set(CUDA_STATIC_RUNTIME ON) set(USE_RAFT_STATIC ON) - set(CUGRAPH_COMPILE_RAFT_DIST_LIBS OFF) + set(CUGRAPH_COMPILE_RAFT_LIB ON) set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) set(ALLOW_CLONE_CUGRAPH_OPS ON) From 5b5001a8de5adb0b58d32fe2811767e68a5fdb8d Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Wed, 6 Sep 2023 13:44:33 -0400 Subject: [PATCH 30/72] [IMP] Add ability to get batch size from the loader in cuGraph-PyG (#3846) Add a property getter for batch size. Requested by JoC. Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3846 --- python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py index fcec341d1db..8d79685965f 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py @@ -351,6 +351,10 @@ def __init__( self.__input_nodes = input_nodes self.inner_loader_args = kwargs + @property + def batch_size(self) -> int: + return self.__batch_size + def __iter__(self): self.current_loader = EXPERIMENTAL__BulkSampleLoader( self.__feature_store, From b27d99f76084173f7048010114e9543c0f9b2049 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Wed, 6 Sep 2023 13:04:23 -0500 Subject: [PATCH 31/72] Rename `cugraph-nx` to `nx-cugraph` (#3840) We decided to rename `cugraph-nx` to `nx-cugraph` to follow (and help establish) conventions for names of networkx backends. See: https://github.com/networkx/networkx/discussions/6883 This PR was created from the following commands: ```sh mv notebooks/ ../notebooks-bak find * -type f -print0 | xargs -0 sed -i 's/cugraph_nx/nx_cugraph/g' find * -type f -print0 | xargs -0 sed -i 's/cugraph-nx/nx-cugraph/g' git mv ./conda/recipes/cugraph-nx ./conda/recipes/nx-cugraph git mv ./python/cugraph-nx ./python/nx-cugraph git mv ./python/nx-cugraph/cugraph_nx ./python/nx-cugraph/nx_cugraph mv ../notebooks-bak/ notebooks ``` (a more reliable bash script would ensure the destination of `git mv` does not exist yet, b/c if the destination is a directory, it will happily--and incorrectly--move the target _into_ the directory) ```sh # Make sure everything got renamed correctly git grep -i 'cugraph.nx' find . -iname '*cugraph*nx*' -print ``` Should we remove `cugraph-nx` nightlies once this is merged? 
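As an aside on the renaming commands above: the caveat about `git mv` refers to the fact that moving onto an existing directory silently nests the source inside it. A minimal shell sketch of the kind of guard described there (a hypothetical `safe_git_mv` helper, not part of this PR) could look like:

```sh
# Hypothetical helper sketching the guard described above: refuse to run
# `git mv` when the destination already exists, so an existing directory
# cannot silently swallow the source. Not part of this PR.
safe_git_mv() {
    src="$1"
    dst="$2"
    if [ -e "$dst" ]; then
        echo "refusing to move '$src': destination '$dst' already exists" >&2
        return 1
    fi
    git mv "$src" "$dst"
}

# Usage sketch:
#   safe_git_mv ./python/cugraph-nx ./python/nx-cugraph
```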
Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/3840 --- build.sh | 14 +- ci/build_python.sh | 4 +- ci/release/update-version.sh | 4 +- ci/test_python.sh | 20 +- .../{cugraph-nx => nx-cugraph}/build.sh | 2 +- .../{cugraph-nx => nx-cugraph}/meta.yaml | 4 +- dependencies.yaml | 26 +-- .../source/installation/source_build.md | 2 +- .../cugraph_nx/tests/test_convert.py | 203 ------------------ .../tests/link_analysis/test_pagerank.py | 4 +- python/{cugraph-nx => nx-cugraph}/.flake8 | 2 +- python/{cugraph-nx => nx-cugraph}/LICENSE | 0 python/{cugraph-nx => nx-cugraph}/Makefile | 0 python/{cugraph-nx => nx-cugraph}/README.md | 12 +- python/{cugraph-nx => nx-cugraph}/conftest.py | 0 python/{cugraph-nx => nx-cugraph}/lint.yaml | 2 +- .../nx_cugraph}/__init__.py | 0 .../nx_cugraph}/algorithms/__init__.py | 0 .../algorithms/centrality/__init__.py | 0 .../algorithms/centrality/betweenness.py | 5 +- .../algorithms/community/__init__.py | 0 .../algorithms/community/louvain.py | 10 +- .../nx_cugraph}/classes/__init__.py | 0 .../nx_cugraph}/classes/digraph.py | 6 +- .../nx_cugraph}/classes/graph.py | 16 +- .../nx_cugraph}/convert.py | 48 ++--- .../nx_cugraph}/interface.py | 8 +- .../nx_cugraph}/tests/__init__.py | 0 .../nx_cugraph}/tests/bench_convert.py | 17 +- .../nx_cugraph}/tests/conftest.py | 0 .../nx_cugraph/tests/test_convert.py | 203 ++++++++++++++++++ .../nx_cugraph}/tests/test_match_api.py | 40 ++-- .../nx_cugraph}/typing.py | 0 .../nx_cugraph}/utils/__init__.py | 0 .../nx_cugraph}/utils/decorators.py | 2 +- .../nx_cugraph}/utils/misc.py | 17 +- .../{cugraph-nx => nx-cugraph}/pyproject.toml | 16 +- .../run_nx_tests.sh | 10 +- python/{cugraph-nx => nx-cugraph}/setup.py | 0 39 files changed, 361 insertions(+), 336 deletions(-) rename conda/recipes/{cugraph-nx => nx-cugraph}/build.sh (86%) rename conda/recipes/{cugraph-nx => nx-cugraph}/meta.yaml (96%) delete mode 100644 python/cugraph-nx/cugraph_nx/tests/test_convert.py rename python/{cugraph-nx => nx-cugraph}/.flake8 (88%) rename python/{cugraph-nx => nx-cugraph}/LICENSE (100%) rename python/{cugraph-nx => nx-cugraph}/Makefile (100%) rename python/{cugraph-nx => nx-cugraph}/README.md (72%) rename python/{cugraph-nx => nx-cugraph}/conftest.py (100%) rename python/{cugraph-nx => nx-cugraph}/lint.yaml (98%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/__init__.py (100%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/algorithms/__init__.py (100%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/algorithms/centrality/__init__.py (100%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/algorithms/centrality/betweenness.py (94%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/algorithms/community/__init__.py (100%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/algorithms/community/louvain.py (89%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/classes/__init__.py (100%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/classes/digraph.py (92%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/classes/graph.py (97%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/convert.py (94%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/interface.py (97%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/tests/__init__.py (100%) rename 
python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/tests/bench_convert.py (92%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/tests/conftest.py (100%) create mode 100644 python/nx-cugraph/nx_cugraph/tests/test_convert.py rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/tests/test_match_api.py (75%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/typing.py (100%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/utils/__init__.py (100%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/utils/decorators.py (97%) rename python/{cugraph-nx/cugraph_nx => nx-cugraph/nx_cugraph}/utils/misc.py (78%) rename python/{cugraph-nx => nx-cugraph}/pyproject.toml (96%) rename python/{cugraph-nx => nx-cugraph}/run_nx_tests.sh (79%) rename python/{cugraph-nx => nx-cugraph}/setup.py (100%) diff --git a/build.sh b/build.sh index 74bdb3c6a2f..8dca89aeedd 100755 --- a/build.sh +++ b/build.sh @@ -30,7 +30,7 @@ VALIDARGS=" cpp-mgtests cugraph-pyg cugraph-dgl - cugraph-nx + nx-cugraph docs -v -g @@ -54,7 +54,7 @@ HELP="$0 [ ...] [ ...] pylibcugraph - build the pylibcugraph Python package cugraph-pyg - build the cugraph-pyg Python package cugraph - build the cugraph Python package - cugraph-nx - build the cugraph-nx Python package + nx-cugraph - build the nx-cugraph Python package cugraph-service - build the cugraph-service_client and cugraph-service_server Python package cpp-mgtests - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency. cugraph-dgl - build the cugraph-dgl extensions for DGL @@ -209,7 +209,7 @@ if hasArg uninstall; then # removes the latest one and leaves the others installed. build.sh uninstall # can be run multiple times to remove all of them, but that is not obvious. pip uninstall -y pylibcugraph cugraph cugraph-service-client cugraph-service-server \ - cugraph-dgl cugraph-pyg cugraph-nx + cugraph-dgl cugraph-pyg nx-cugraph fi if hasArg clean; then @@ -382,12 +382,12 @@ if hasArg cugraph-dgl; then fi fi -# Build and install the cugraph-nx Python package -if hasArg cugraph-nx; then +# Build and install the nx-cugraph Python package +if hasArg nx-cugraph; then if hasArg --clean; then - cleanPythonDir ${REPODIR}/python/cugraph-nx + cleanPythonDir ${REPODIR}/python/nx-cugraph else - python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-nx + python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/nx-cugraph fi fi diff --git a/ci/build_python.sh b/ci/build_python.sh index 595eedf9e46..429ba649d1d 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -26,7 +26,7 @@ rapids-mamba-retry mambabuild \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cugraph -# NOTE: nothing in cugraph-nx is CUDA-specific, but it is built on each CUDA +# NOTE: nothing in nx-cugraph is CUDA-specific, but it is built on each CUDA # platform to ensure it is included in each set of artifacts, since test # scripts only install from one set of artifacts based on the CUDA version used # for the test run. 
@@ -34,7 +34,7 @@ rapids-mamba-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cugraph-nx + conda/recipes/nx-cugraph # NOTE: nothing in the cugraph-service packages are CUDA-specific, but they are # built on each CUDA platform to ensure they are included in each set of diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index f9a78b275ae..2c8735079f0 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -61,7 +61,7 @@ sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugr sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/client/cugraph_service_client/__init__.py sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/server/cugraph_service_server/__init__.py sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/pylibcugraph/pylibcugraph/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugraph-nx/cugraph_nx/__init__.py +sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/nx-cugraph/nx_cugraph/__init__.py # Python pyproject.toml updates sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph/pyproject.toml @@ -70,7 +70,7 @@ sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph-pyg sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/client/pyproject.toml sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/server/pyproject.toml sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/pylibcugraph/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph-nx/pyproject.toml +sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/nx-cugraph/pyproject.toml # Wheel testing script sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cugraph.sh diff --git a/ci/test_python.sh b/ci/test_python.sh index e650630fa47..14886909fc9 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -34,7 +34,7 @@ rapids-mamba-retry install \ libcugraph \ pylibcugraph \ cugraph \ - cugraph-nx \ + nx-cugraph \ cugraph-service-server \ cugraph-service-client @@ -93,28 +93,28 @@ pytest \ cugraph/pytest-based/bench_algos.py popd -rapids-logger "pytest cugraph-nx" -pushd python/cugraph-nx/cugraph_nx +rapids-logger "pytest nx-cugraph" +pushd python/nx-cugraph/nx_cugraph pytest \ --capture=no \ --verbose \ --cache-clear \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-nx.xml" \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-nx-cugraph.xml" \ --cov-config=../../.coveragerc \ - --cov=cugraph_nx \ - --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-nx-coverage.xml" \ + --cov=nx_cugraph \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/nx-cugraph-coverage.xml" \ --cov-report=term \ --benchmark-disable \ tests popd -rapids-logger "pytest networkx using cugraph-nx backend" -pushd python/cugraph-nx +rapids-logger "pytest networkx using nx-cugraph backend" +pushd python/nx-cugraph ./run_nx_tests.sh # run_nx_tests.sh outputs coverage data, so check that total coverage is >0.0% -# in case cugraph-nx failed to load but fallback mode allowed the run to pass. +# in case nx-cugraph failed to load but fallback mode allowed the run to pass. 
_coverage=$(coverage report|grep "^TOTAL") -echo "cugraph-nx coverage from networkx tests: $_coverage" +echo "nx-cugraph coverage from networkx tests: $_coverage" echo $_coverage | awk '{ if ($NF == "0.0%") exit 1 }' popd diff --git a/conda/recipes/cugraph-nx/build.sh b/conda/recipes/nx-cugraph/build.sh similarity index 86% rename from conda/recipes/cugraph-nx/build.sh rename to conda/recipes/nx-cugraph/build.sh index 31ad477a73e..26665c1e76a 100644 --- a/conda/recipes/cugraph-nx/build.sh +++ b/conda/recipes/nx-cugraph/build.sh @@ -4,4 +4,4 @@ # This assumes the script is executed from the root of the repo directory -./build.sh cugraph-nx +./build.sh nx-cugraph diff --git a/conda/recipes/cugraph-nx/meta.yaml b/conda/recipes/nx-cugraph/meta.yaml similarity index 96% rename from conda/recipes/cugraph-nx/meta.yaml rename to conda/recipes/nx-cugraph/meta.yaml index d6b12974981..556d72e8548 100644 --- a/conda/recipes/cugraph-nx/meta.yaml +++ b/conda/recipes/nx-cugraph/meta.yaml @@ -6,7 +6,7 @@ {% set date_string = environ['RAPIDS_DATE_STRING'] %} package: - name: cugraph-nx + name: nx-cugraph version: {{ version }} source: @@ -29,7 +29,7 @@ requirements: tests: imports: - - cugraph_nx + - nx_cugraph commands: - pip check requires: diff --git a/dependencies.yaml b/dependencies.yaml index 23bd5c800b6..e8692cd670f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -14,14 +14,14 @@ files: - python_build_cythonize - python_run_cugraph - python_run_pylibcugraph - - python_run_cugraph_nx + - python_run_nx_cugraph - python_run_cugraph_dgl - python_run_cugraph_pyg - test_notebook - test_python_common - test_python_cugraph - test_python_pylibcugraph - - test_python_cugraph_nx + - test_python_nx_cugraph checks: output: none includes: @@ -105,29 +105,29 @@ files: includes: - test_python_common - test_python_pylibcugraph - py_build_cugraph_nx: + py_build_nx_cugraph: output: pyproject - pyproject_dir: python/cugraph-nx + pyproject_dir: python/nx-cugraph extras: table: build-system includes: - python_build_wheel - py_run_cugraph_nx: + py_run_nx_cugraph: output: pyproject - pyproject_dir: python/cugraph-nx + pyproject_dir: python/nx-cugraph extras: table: project includes: - - python_run_cugraph_nx - py_test_cugraph_nx: + - python_run_nx_cugraph + py_test_nx_cugraph: output: pyproject - pyproject_dir: python/cugraph-nx + pyproject_dir: python/nx-cugraph extras: table: project.optional-dependencies key: test includes: - test_python_common - - test_python_cugraph_nx + - test_python_nx_cugraph py_build_cugraph_dgl: output: pyproject pyproject_dir: python/cugraph-dgl @@ -375,7 +375,7 @@ dependencies: packages: - *pylibraft - *rmm - python_run_cugraph_nx: + python_run_nx_cugraph: common: - output_types: [conda, pyproject] packages: @@ -482,12 +482,12 @@ dependencies: packages: - *cudf - *numpy - test_python_cugraph_nx: + test_python_nx_cugraph: common: - output_types: [conda, pyproject] packages: - packaging>=21 - # not needed by cugraph-nx tests, but is required for running networkx tests + # not needed by nx-cugraph tests, but is required for running networkx tests - pytest-mpl cugraph_dgl_dev: common: diff --git a/docs/cugraph/source/installation/source_build.md b/docs/cugraph/source/installation/source_build.md index 453149d6cea..7782591f1ce 100644 --- a/docs/cugraph/source/installation/source_build.md +++ b/docs/cugraph/source/installation/source_build.md @@ -84,7 +84,7 @@ build.sh [ ...] [ ...] 
libcugraph_etl - build libcugraph_etl.so and SG test binaries pylibcugraph - build the pylibcugraph Python package cugraph - build the cugraph Python package - cugraph-nx - build the cugraph-nx Python package + nx-cugraph - build the nx-cugraph Python package cugraph-service - build the cugraph-service_client and cugraph-service_server Python package cpp-mgtests - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency. cugraph-dgl - build the cugraph-dgl extensions for DGL diff --git a/python/cugraph-nx/cugraph_nx/tests/test_convert.py b/python/cugraph-nx/cugraph_nx/tests/test_convert.py deleted file mode 100644 index 7efba9ea555..00000000000 --- a/python/cugraph-nx/cugraph_nx/tests/test_convert.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import cupy as cp -import networkx as nx -import pytest - -import cugraph_nx as cnx -from cugraph_nx import interface - - -@pytest.mark.parametrize("graph_class", [nx.Graph, nx.DiGraph]) -@pytest.mark.parametrize( - "kwargs", - [ - {}, - {"preserve_edge_attrs": True}, - {"preserve_node_attrs": True}, - {"preserve_all_attrs": True}, - {"edge_attrs": {"x": 0}}, - {"edge_attrs": {"x": None}}, - {"edge_attrs": {"x": cnx.convert.REQUIRED}}, - {"edge_attrs": {"x": ...}}, # sugar for REQUIRED - {"edge_attrs": "x"}, - {"node_attrs": {"x": 0}}, - {"node_attrs": {"x": None}}, - {"node_attrs": {"x": cnx.convert.REQUIRED}}, - {"node_attrs": {"x": ...}}, # sugar for REQUIRED - {"node_attrs": "x"}, - ], -) -def test_convert_empty(graph_class, kwargs): - G = graph_class() - cG = cnx.from_networkx(G, **kwargs) - H = cnx.to_networkx(cG) - assert G.number_of_nodes() == cG.number_of_nodes() == H.number_of_nodes() == 0 - assert G.number_of_edges() == cG.number_of_edges() == H.number_of_edges() == 0 - assert cG.edge_values == cG.edge_masks == cG.node_values == cG.node_masks == {} - assert G.graph == cG.graph == H.graph == {} - - -def test_convert(): - # FIXME: can we break this into smaller tests? 
- G = nx.Graph() - G.add_edge(0, 1, x=2) - G.add_node(0, foo=10) - G.add_node(1, foo=20, bar=100) - for kwargs in [ - {"preserve_edge_attrs": True}, - {"preserve_all_attrs": True}, - {"edge_attrs": {"x": 0}}, - {"edge_attrs": {"x": None}, "node_attrs": {"bar": None}}, - {"edge_attrs": "x", "edge_dtypes": int}, - { - "edge_attrs": {"x": cnx.convert.REQUIRED}, - "node_attrs": {"foo": cnx.convert.REQUIRED}, - }, - {"edge_attrs": {"x": ...}, "node_attrs": {"foo": ...}}, # sugar for REQUIRED - ]: - # All edges have "x" attribute, so all kwargs are equivalent - cG = cnx.from_networkx(G, **kwargs) - cp.testing.assert_array_equal(cG.row_indices, [0, 1]) - cp.testing.assert_array_equal(cG.col_indices, [1, 0]) - cp.testing.assert_array_equal(cG.edge_values["x"], [2, 2]) - assert len(cG.edge_values) == 1 - assert cG.edge_masks == {} - H = cnx.to_networkx(cG) - assert G.number_of_nodes() == cG.number_of_nodes() == H.number_of_nodes() == 2 - assert G.number_of_edges() == cG.number_of_edges() == H.number_of_edges() == 1 - assert G.adj == H.adj - - with pytest.raises(KeyError, match="bar"): - cnx.from_networkx(G, node_attrs={"bar": ...}) - - # Structure-only graph (no edge attributes) - cG = cnx.from_networkx(G, preserve_node_attrs=True) - cp.testing.assert_array_equal(cG.row_indices, [0, 1]) - cp.testing.assert_array_equal(cG.col_indices, [1, 0]) - cp.testing.assert_array_equal(cG.node_values["foo"], [10, 20]) - assert cG.edge_values == cG.edge_masks == {} - H = cnx.to_networkx(cG) - assert set(G.edges) == set(H.edges) == {(0, 1)} - assert G.nodes == H.nodes - - # Fill completely missing attribute with default value - cG = cnx.from_networkx(G, edge_attrs={"y": 0}) - cp.testing.assert_array_equal(cG.row_indices, [0, 1]) - cp.testing.assert_array_equal(cG.col_indices, [1, 0]) - cp.testing.assert_array_equal(cG.edge_values["y"], [0, 0]) - assert len(cG.edge_values) == 1 - assert cG.edge_masks == cG.node_values == cG.node_masks == {} - H = cnx.to_networkx(cG) - assert list(H.edges(data=True)) == [(0, 1, {"y": 0})] - - # If attribute is completely missing (and no default), then just ignore it - cG = cnx.from_networkx(G, edge_attrs={"y": None}) - cp.testing.assert_array_equal(cG.row_indices, [0, 1]) - cp.testing.assert_array_equal(cG.col_indices, [1, 0]) - assert sorted(cG.edge_values) == sorted(cG.edge_masks) == [] - H = cnx.to_networkx(cG) - assert list(H.edges(data=True)) == [(0, 1, {})] - - G.add_edge(0, 2) - # Some edges are missing 'x' attribute; need to use a mask - for kwargs in [{"preserve_edge_attrs": True}, {"edge_attrs": {"x": None}}]: - cG = cnx.from_networkx(G, **kwargs) - cp.testing.assert_array_equal(cG.row_indices, [0, 0, 1, 2]) - cp.testing.assert_array_equal(cG.col_indices, [1, 2, 0, 0]) - assert sorted(cG.edge_values) == sorted(cG.edge_masks) == ["x"] - cp.testing.assert_array_equal(cG.edge_masks["x"], [True, False, True, False]) - cp.testing.assert_array_equal(cG.edge_values["x"][cG.edge_masks["x"]], [2, 2]) - H = cnx.to_networkx(cG) - assert list(H.edges(data=True)) == [(0, 1, {"x": 2}), (0, 2, {})] - - with pytest.raises(KeyError, match="x"): - cnx.from_networkx(G, edge_attrs={"x": cnx.convert.REQUIRED}) - with pytest.raises(KeyError, match="x"): - cnx.from_networkx(G, edge_attrs={"x": ...}) - with pytest.raises(KeyError, match="bar"): - cnx.from_networkx(G, node_attrs={"bar": cnx.convert.REQUIRED}) - with pytest.raises(KeyError, match="bar"): - cnx.from_networkx(G, node_attrs={"bar": ...}) - - # Now for something more complicated... 
- G = nx.Graph() - G.add_edge(10, 20, x=1) - G.add_edge(10, 30, x=2, y=1.5) - G.add_node(10, foo=100) - G.add_node(20, foo=200, bar=1000) - G.add_node(30, foo=300) - # Some edges have masks, some don't - for kwargs in [ - {"preserve_edge_attrs": True}, - {"preserve_all_attrs": True}, - {"edge_attrs": {"x": None, "y": None}}, - {"edge_attrs": {"x": 0, "y": None}}, - {"edge_attrs": {"x": 0, "y": None}}, - {"edge_attrs": {"x": 0, "y": None}, "edge_dtypes": {"x": int, "y": float}}, - ]: - cG = cnx.from_networkx(G, **kwargs) - assert cG.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... - cp.testing.assert_array_equal(cG.row_indices, [0, 0, 1, 2]) - cp.testing.assert_array_equal(cG.col_indices, [1, 2, 0, 0]) - cp.testing.assert_array_equal(cG.edge_values["x"], [1, 2, 1, 2]) - assert sorted(cG.edge_masks) == ["y"] - cp.testing.assert_array_equal(cG.edge_masks["y"], [False, True, False, True]) - cp.testing.assert_array_equal( - cG.edge_values["y"][cG.edge_masks["y"]], [1.5, 1.5] - ) - H = cnx.to_networkx(cG) - assert G.adj == H.adj - - # Some nodes have masks, some don't - for kwargs in [ - {"preserve_node_attrs": True}, - {"preserve_all_attrs": True}, - {"node_attrs": {"foo": None, "bar": None}}, - {"node_attrs": {"foo": None, "bar": None}}, - {"node_attrs": {"foo": 0, "bar": None, "missing": None}}, - ]: - cG = cnx.from_networkx(G, **kwargs) - assert cG.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... - cp.testing.assert_array_equal(cG.row_indices, [0, 0, 1, 2]) - cp.testing.assert_array_equal(cG.col_indices, [1, 2, 0, 0]) - cp.testing.assert_array_equal(cG.node_values["foo"], [100, 200, 300]) - assert sorted(cG.node_masks) == ["bar"] - cp.testing.assert_array_equal(cG.node_masks["bar"], [False, True, False]) - cp.testing.assert_array_equal( - cG.node_values["bar"][cG.node_masks["bar"]], [1000] - ) - H = cnx.to_networkx(cG) - assert G.nodes == H.nodes - - # Check default values for nodes - for kwargs in [ - {"node_attrs": {"foo": None, "bar": 0}}, - {"node_attrs": {"foo": None, "bar": 0, "missing": None}}, - {"node_attrs": {"bar": 0}}, - {"node_attrs": {"bar": 0}, "node_dtypes": {"bar": int}}, - {"node_attrs": {"bar": 0, "foo": None}, "node_dtypes": int}, - ]: - cG = cnx.from_networkx(G, **kwargs) - assert cG.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... 
- cp.testing.assert_array_equal(cG.row_indices, [0, 0, 1, 2]) - cp.testing.assert_array_equal(cG.col_indices, [1, 2, 0, 0]) - cp.testing.assert_array_equal(cG.node_values["bar"], [0, 1000, 0]) - assert cG.node_masks == {} - - with pytest.raises( - TypeError, match="edge_attrs and weight arguments should not both be given" - ): - interface.BackendInterface.convert_from_nx(G, edge_attrs={"x": 1}, weight="x") - with pytest.raises(TypeError, match="Expected networkx.Graph"): - cnx.from_networkx({}) diff --git a/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py b/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py index 9d9572b88b2..a8d3e2fc7ec 100644 --- a/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py +++ b/python/cugraph/cugraph/tests/link_analysis/test_pagerank.py @@ -65,7 +65,7 @@ def cugraph_call(G, max_iter, tol, alpha, personalization, nstart, pre_vtx_o_wgt # need a different function since the Nx version returns a dictionary -def cugraph_nx_call(G, max_iter, tol, alpha, personalization, nstart): +def nx_cugraph_call(G, max_iter, tol, alpha, personalization, nstart): # cugraph Pagerank Call t1 = time.time() pr = cugraph.pagerank( @@ -238,7 +238,7 @@ def test_pagerank_nx(graph_file, max_iter, tol, alpha, personalization_perc, has cu_prsn = cudify(networkx_prsn) # cuGraph PageRank with Nx Graph - cugraph_pr = cugraph_nx_call(Gnx, max_iter, tol, alpha, cu_prsn, cu_nstart) + cugraph_pr = nx_cugraph_call(Gnx, max_iter, tol, alpha, cu_prsn, cu_nstart) # Calculating mismatch networkx_pr = sorted(networkx_pr.items(), key=lambda x: x[0]) diff --git a/python/cugraph-nx/.flake8 b/python/nx-cugraph/.flake8 similarity index 88% rename from python/cugraph-nx/.flake8 rename to python/nx-cugraph/.flake8 index f66815e8507..3a2e3fb8617 100644 --- a/python/cugraph-nx/.flake8 +++ b/python/nx-cugraph/.flake8 @@ -9,5 +9,5 @@ extend-ignore = SIM401, # E203 whitespace before ':' (to be compatible with black) per-file-ignores = - cugraph_nx/tests/*.py:T201, + nx_cugraph/tests/*.py:T201, __init__.py:F401,F403, diff --git a/python/cugraph-nx/LICENSE b/python/nx-cugraph/LICENSE similarity index 100% rename from python/cugraph-nx/LICENSE rename to python/nx-cugraph/LICENSE diff --git a/python/cugraph-nx/Makefile b/python/nx-cugraph/Makefile similarity index 100% rename from python/cugraph-nx/Makefile rename to python/nx-cugraph/Makefile diff --git a/python/cugraph-nx/README.md b/python/nx-cugraph/README.md similarity index 72% rename from python/cugraph-nx/README.md rename to python/nx-cugraph/README.md index 2137fdb6472..e7cd26218e6 100644 --- a/python/cugraph-nx/README.md +++ b/python/nx-cugraph/README.md @@ -1,24 +1,24 @@ -# cugraph-nx +# nx-cugraph ## Description -[RAPIDS](https://rapids.ai) cugraph-nx is a [backend to NetworkX](https://networkx.org/documentation/stable/reference/classes/index.html#backends) +[RAPIDS](https://rapids.ai) nx-cugraph is a [backend to NetworkX](https://networkx.org/documentation/stable/reference/classes/index.html#backends) with minimal dependencies (`networkx`, `cupy`, and `pylibcugraph`) to run graph algorithms on the GPU. 
### Contribute Follow instructions for [contributing to cugraph](https://github.com/rapidsai/cugraph/blob/branch-23.10/readme_pages/CONTRIBUTING.md) -and [building from source](https://docs.rapids.ai/api/cugraph/stable/installation/source_build/), then build cugraph-nx in develop (i.e., editable) mode: +and [building from source](https://docs.rapids.ai/api/cugraph/stable/installation/source_build/), then build nx-cugraph in develop (i.e., editable) mode: ``` -$ ./build.sh cugraph-nx --pydevelop +$ ./build.sh nx-cugraph --pydevelop ``` ### Run tests -Run cugraph-nx tests from `cugraph/python/cugraph-nx` directory: +Run nx-cugraph tests from `cugraph/python/nx-cugraph` directory: ``` $ pytest ``` -Run cugraph-nx benchmarks: +Run nx-cugraph benchmarks: ``` $ pytest --bench ``` diff --git a/python/cugraph-nx/conftest.py b/python/nx-cugraph/conftest.py similarity index 100% rename from python/cugraph-nx/conftest.py rename to python/nx-cugraph/conftest.py diff --git a/python/cugraph-nx/lint.yaml b/python/nx-cugraph/lint.yaml similarity index 98% rename from python/cugraph-nx/lint.yaml rename to python/nx-cugraph/lint.yaml index 42c1b9657c7..dba061bd6b5 100644 --- a/python/cugraph-nx/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -74,7 +74,7 @@ repos: - id: codespell types_or: [python, rst, markdown] additional_dependencies: [tomli] - files: ^(cugraph_nx|docs)/ + files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.0.286 hooks: diff --git a/python/cugraph-nx/cugraph_nx/__init__.py b/python/nx-cugraph/nx_cugraph/__init__.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/__init__.py rename to python/nx-cugraph/nx_cugraph/__init__.py diff --git a/python/cugraph-nx/cugraph_nx/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/algorithms/__init__.py rename to python/nx-cugraph/nx_cugraph/algorithms/__init__.py diff --git a/python/cugraph-nx/cugraph_nx/algorithms/centrality/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/__init__.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/algorithms/centrality/__init__.py rename to python/nx-cugraph/nx_cugraph/algorithms/centrality/__init__.py diff --git a/python/cugraph-nx/cugraph_nx/algorithms/centrality/betweenness.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py similarity index 94% rename from python/cugraph-nx/cugraph_nx/algorithms/centrality/betweenness.py rename to python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py index a5962a52865..b777919f86f 100644 --- a/python/cugraph-nx/cugraph_nx/algorithms/centrality/betweenness.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py @@ -12,8 +12,8 @@ # limitations under the License. 
import pylibcugraph as plc -from cugraph_nx.convert import _to_graph -from cugraph_nx.utils import networkx_algorithm +from nx_cugraph.convert import _to_graph +from nx_cugraph.utils import _handle_seed, networkx_algorithm __all__ = ["betweenness_centrality", "edge_betweenness_centrality"] @@ -26,6 +26,7 @@ def betweenness_centrality( raise NotImplementedError( "Weighted implementation of betweenness centrality not currently supported" ) + seed = _handle_seed(seed) G = _to_graph(G, weight) node_ids, values = plc.betweenness_centrality( resource_handle=plc.ResourceHandle(), diff --git a/python/cugraph-nx/cugraph_nx/algorithms/community/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/community/__init__.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/algorithms/community/__init__.py rename to python/nx-cugraph/nx_cugraph/algorithms/community/__init__.py diff --git a/python/cugraph-nx/cugraph_nx/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py similarity index 89% rename from python/cugraph-nx/cugraph_nx/algorithms/community/louvain.py rename to python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py index 476f7428aab..ca5f05c2014 100644 --- a/python/cugraph-nx/cugraph_nx/algorithms/community/louvain.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py @@ -14,8 +14,13 @@ import pylibcugraph as plc -from cugraph_nx.convert import _to_undirected_graph -from cugraph_nx.utils import _groupby, networkx_algorithm, not_implemented_for +from nx_cugraph.convert import _to_undirected_graph +from nx_cugraph.utils import ( + _groupby, + _handle_seed, + networkx_algorithm, + not_implemented_for, +) __all__ = ["louvain_communities"] @@ -30,6 +35,7 @@ def louvain_communities( Extra parameter: `max_level` controls the maximum number of levels of the algorithm. """ # NetworkX allows both directed and undirected, but cugraph only allows undirected. + seed = _handle_seed(seed) # Unused, but ensure it's valid for future compatibility G = _to_undirected_graph(G, weight) if G.row_indices.size == 0: # TODO: PLC doesn't handle empty graphs gracefully! 
diff --git a/python/cugraph-nx/cugraph_nx/classes/__init__.py b/python/nx-cugraph/nx_cugraph/classes/__init__.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/classes/__init__.py rename to python/nx-cugraph/nx_cugraph/classes/__init__.py diff --git a/python/cugraph-nx/cugraph_nx/classes/digraph.py b/python/nx-cugraph/nx_cugraph/classes/digraph.py similarity index 92% rename from python/cugraph-nx/cugraph_nx/classes/digraph.py rename to python/nx-cugraph/nx_cugraph/classes/digraph.py index 0cea08f3e77..0aaf88fd793 100644 --- a/python/cugraph-nx/cugraph_nx/classes/digraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/digraph.py @@ -16,16 +16,16 @@ import networkx as nx -import cugraph_nx as cnx +import nx_cugraph as nxcg from .graph import Graph if TYPE_CHECKING: - from cugraph_nx.typing import NodeKey + from nx_cugraph.typing import NodeKey __all__ = ["DiGraph"] -networkx_api = cnx.utils.decorators.networkx_class(nx.DiGraph) +networkx_api = nxcg.utils.decorators.networkx_class(nx.DiGraph) class DiGraph(Graph): diff --git a/python/cugraph-nx/cugraph_nx/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py similarity index 97% rename from python/cugraph-nx/cugraph_nx/classes/graph.py rename to python/nx-cugraph/nx_cugraph/classes/graph.py index 5604f2457f8..1432f68c752 100644 --- a/python/cugraph-nx/cugraph_nx/classes/graph.py +++ b/python/nx-cugraph/nx_cugraph/classes/graph.py @@ -21,12 +21,12 @@ import numpy as np import pylibcugraph as plc -import cugraph_nx as cnx +import nx_cugraph as nxcg if TYPE_CHECKING: from collections.abc import Iterable, Iterator - from cugraph_nx.typing import ( + from nx_cugraph.typing import ( AttrKey, Dtype, EdgeTuple, @@ -38,11 +38,11 @@ __all__ = ["Graph"] -networkx_api = cnx.utils.decorators.networkx_class(nx.Graph) +networkx_api = nxcg.utils.decorators.networkx_class(nx.Graph) class Graph: - # Tell networkx to dispatch calls with this object to cugraph-nx + # Tell networkx to dispatch calls with this object to nx-cugraph __networkx_plugin__: ClassVar[str] = "cugraph" # networkx properties @@ -248,7 +248,7 @@ def __new__(cls, incoming_graph_data=None, **attr) -> Graph: elif incoming_graph_data.__class__ is new_graph.__class__: new_graph = incoming_graph_data.copy() elif incoming_graph_data.__class__ is new_graph.to_networkx_class(): - new_graph = cnx.from_networkx(incoming_graph_data, preserve_all_attrs=True) + new_graph = nxcg.from_networkx(incoming_graph_data, preserve_all_attrs=True) else: raise NotImplementedError new_graph.graph.update(attr) @@ -270,8 +270,8 @@ def is_multigraph(cls) -> bool: @classmethod @networkx_api - def to_directed_class(cls) -> type[cnx.DiGraph]: - return cnx.DiGraph + def to_directed_class(cls) -> type[nxcg.DiGraph]: + return nxcg.DiGraph @classmethod def to_networkx_class(cls) -> type[nx.Graph]: @@ -428,7 +428,7 @@ def size(self, weight: AttrKey | None = None) -> int: return int((self.row_indices <= self.col_indices).sum()) @networkx_api - def to_directed(self, as_view: bool = False) -> cnx.DiGraph: + def to_directed(self, as_view: bool = False) -> nxcg.DiGraph: return self._copy(as_view, self.to_directed_class()) @networkx_api diff --git a/python/cugraph-nx/cugraph_nx/convert.py b/python/nx-cugraph/nx_cugraph/convert.py similarity index 94% rename from python/cugraph-nx/cugraph_nx/convert.py rename to python/nx-cugraph/nx_cugraph/convert.py index 530dd700f35..9be8cac7877 100644 --- a/python/cugraph-nx/cugraph_nx/convert.py +++ b/python/nx-cugraph/nx_cugraph/convert.py @@ -22,10 +22,10 @@ import 
networkx as nx import numpy as np -import cugraph_nx as cnx +import nx_cugraph as nxcg if TYPE_CHECKING: - from cugraph_nx.typing import AttrKey, Dtype, EdgeValue, NodeValue + from nx_cugraph.typing import AttrKey, Dtype, EdgeValue, NodeValue __all__ = [ "from_networkx", @@ -51,8 +51,8 @@ def from_networkx( as_directed: bool = False, name: str | None = None, graph_name: str | None = None, -) -> cnx.Graph: - """Convert a networkx graph to cugraph_nx graph; can convert all attributes. +) -> nxcg.Graph: + """Convert a networkx graph to nx_cugraph graph; can convert all attributes. Parameters ---------- @@ -61,7 +61,7 @@ def from_networkx( Dict that maps edge attributes to default values if missing in ``G``. If None, then no edge attributes will be converted. If default value is None, then missing values are handled with a mask. - A default value of ``cnx.convert.REQUIRED`` or ``...`` indicates that + A default value of ``nxcg.convert.REQUIRED`` or ``...`` indicates that all edges have data for this attribute, and raise `KeyError` if not. For convenience, `edge_attrs` may be a single attribute with default 1; for example ``edge_attrs="weight"``. @@ -70,7 +70,7 @@ def from_networkx( Dict that maps node attributes to default values if missing in ``G``. If None, then no node attributes will be converted. If default value is None, then missing values are handled with a mask. - A default value of ``cnx.convert.REQUIRED`` or ``...`` indicates that + A default value of ``nxcg.convert.REQUIRED`` or ``...`` indicates that all edges have data for this attribute, and raise `KeyError` if not. For convenience, `node_attrs` may be a single attribute with no default; for example ``node_attrs="weight"``. @@ -94,7 +94,7 @@ def from_networkx( Returns ------- - cugraph_nx.Graph + nx_cugraph.Graph Notes ----- @@ -111,7 +111,7 @@ def from_networkx( See Also -------- - to_networkx : The opposite; convert cugraph_nx graph to networkx graph + to_networkx : The opposite; convert nx_cugraph graph to networkx graph """ # This uses `graph._adj` and `graph._node`, which are private attributes in NetworkX if not isinstance(graph, nx.Graph): @@ -352,9 +352,9 @@ def from_networkx( # if vals.ndim > 1: ... if graph.is_directed() or as_directed: - klass = cnx.DiGraph + klass = nxcg.DiGraph else: - klass = cnx.Graph + klass = nxcg.Graph rv = klass.from_coo( N, row_indices, @@ -398,14 +398,14 @@ def _iter_attr_dicts( return full_dicts -def to_networkx(G: cnx.Graph) -> nx.Graph: - """Convert a cugraph_nx graph to networkx graph. +def to_networkx(G: nxcg.Graph) -> nx.Graph: + """Convert a nx_cugraph graph to networkx graph. All edge and node attributes and ``G.graph`` properties are converted. Parameters ---------- - G : cugraph_nx.Graph + G : nx_cugraph.Graph Returns ------- @@ -413,7 +413,7 @@ def to_networkx(G: cnx.Graph) -> nx.Graph: See Also -------- - from_networkx : The opposite; convert networkx graph to cugraph_nx graph + from_networkx : The opposite; convert networkx graph to nx_cugraph graph """ rv = G.to_networkx_class()() id_to_key = G.id_to_key @@ -463,13 +463,13 @@ def _to_graph( edge_attr: AttrKey | None = None, edge_default: EdgeValue | None = 1, edge_dtype: Dtype | None = None, -) -> cnx.Graph | cnx.DiGraph: - """Ensure that input type is a cugraph_nx graph, and convert if necessary. +) -> nxcg.Graph | nxcg.DiGraph: + """Ensure that input type is a nx_cugraph graph, and convert if necessary. Directed and undirected graphs are both allowed. This is an internal utility function and may change or be removed. 
""" - if isinstance(G, cnx.Graph): + if isinstance(G, nxcg.Graph): return G if isinstance(G, nx.Graph): return from_networkx( @@ -484,15 +484,15 @@ def _to_directed_graph( edge_attr: AttrKey | None = None, edge_default: EdgeValue | None = 1, edge_dtype: Dtype | None = None, -) -> cnx.DiGraph: - """Ensure that input type is a cugraph_nx DiGraph, and convert if necessary. +) -> nxcg.DiGraph: + """Ensure that input type is a nx_cugraph DiGraph, and convert if necessary. Undirected graphs will be converted to directed. This is an internal utility function and may change or be removed. """ - if isinstance(G, cnx.DiGraph): + if isinstance(G, nxcg.DiGraph): return G - if isinstance(G, cnx.Graph): + if isinstance(G, nxcg.Graph): return G.to_directed() if isinstance(G, nx.Graph): return from_networkx( @@ -510,13 +510,13 @@ def _to_undirected_graph( edge_attr: AttrKey | None = None, edge_default: EdgeValue | None = 1, edge_dtype: Dtype | None = None, -) -> cnx.Graph: - """Ensure that input type is a cugraph_nx Graph, and convert if necessary. +) -> nxcg.Graph: + """Ensure that input type is a nx_cugraph Graph, and convert if necessary. Only undirected graphs are allowed. Directed graphs will raise ValueError. This is an internal utility function and may change or be removed. """ - if isinstance(G, cnx.Graph): + if isinstance(G, nxcg.Graph): if G.is_directed(): raise ValueError("Only undirected graphs supported; got a directed graph") return G diff --git a/python/cugraph-nx/cugraph_nx/interface.py b/python/nx-cugraph/nx_cugraph/interface.py similarity index 97% rename from python/cugraph-nx/cugraph_nx/interface.py rename to python/nx-cugraph/nx_cugraph/interface.py index 198fdd09cfc..cc750cd2d5b 100644 --- a/python/cugraph-nx/cugraph_nx/interface.py +++ b/python/nx-cugraph/nx_cugraph/interface.py @@ -14,7 +14,7 @@ import networkx as nx -import cugraph_nx as cnx +import nx_cugraph as nxcg class BackendInterface: @@ -29,12 +29,12 @@ def convert_from_nx(graph, *args, edge_attrs=None, weight=None, **kwargs): "edge_attrs and weight arguments should not both be given" ) edge_attrs = {weight: 1} - return cnx.from_networkx(graph, *args, edge_attrs=edge_attrs, **kwargs) + return nxcg.from_networkx(graph, *args, edge_attrs=edge_attrs, **kwargs) @staticmethod def convert_to_nx(obj, *, name: str | None = None): - if isinstance(obj, cnx.Graph): - return cnx.to_networkx(obj) + if isinstance(obj, nxcg.Graph): + return nxcg.to_networkx(obj) return obj @staticmethod diff --git a/python/cugraph-nx/cugraph_nx/tests/__init__.py b/python/nx-cugraph/nx_cugraph/tests/__init__.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/tests/__init__.py rename to python/nx-cugraph/nx_cugraph/tests/__init__.py diff --git a/python/cugraph-nx/cugraph_nx/tests/bench_convert.py b/python/nx-cugraph/nx_cugraph/tests/bench_convert.py similarity index 92% rename from python/cugraph-nx/cugraph_nx/tests/bench_convert.py rename to python/nx-cugraph/nx_cugraph/tests/bench_convert.py index 85ef66ac918..7e6278661c2 100644 --- a/python/cugraph-nx/cugraph_nx/tests/bench_convert.py +++ b/python/nx-cugraph/nx_cugraph/tests/bench_convert.py @@ -16,7 +16,7 @@ import numpy as np import pytest -import cugraph_nx as cnx +import nx_cugraph as nxcg try: import cugraph @@ -50,19 +50,22 @@ def _bench_helper(gpubenchmark, N, attr_kind, create_using, method): continue edgedict["x"] = random.randint(0, 100000) if attr_kind == "preserve": - gpubenchmark(cnx.from_networkx, G, preserve_edge_attrs=True) + gpubenchmark(nxcg.from_networkx, G, 
preserve_edge_attrs=True) elif attr_kind == "half_missing": - gpubenchmark(cnx.from_networkx, G, edge_attrs={"x": None}) + gpubenchmark(nxcg.from_networkx, G, edge_attrs={"x": None}) elif attr_kind == "required": - gpubenchmark(cnx.from_networkx, G, edge_attrs={"x": ...}) + gpubenchmark(nxcg.from_networkx, G, edge_attrs={"x": ...}) elif attr_kind == "required_dtype": gpubenchmark( - cnx.from_networkx, G, edge_attrs={"x": ...}, edge_dtypes={"x": np.int32} + nxcg.from_networkx, + G, + edge_attrs={"x": ...}, + edge_dtypes={"x": np.int32}, ) else: # full, half_default - gpubenchmark(cnx.from_networkx, G, edge_attrs={"x": 0}) + gpubenchmark(nxcg.from_networkx, G, edge_attrs={"x": 0}) else: - gpubenchmark(cnx.from_networkx, G) + gpubenchmark(nxcg.from_networkx, G) def _bench_helper_cugraph( diff --git a/python/cugraph-nx/cugraph_nx/tests/conftest.py b/python/nx-cugraph/nx_cugraph/tests/conftest.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/tests/conftest.py rename to python/nx-cugraph/nx_cugraph/tests/conftest.py diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert.py b/python/nx-cugraph/nx_cugraph/tests/test_convert.py new file mode 100644 index 00000000000..ba3cd7aaee1 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_convert.py @@ -0,0 +1,203 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import cupy as cp +import networkx as nx +import pytest + +import nx_cugraph as nxcg +from nx_cugraph import interface + + +@pytest.mark.parametrize("graph_class", [nx.Graph, nx.DiGraph]) +@pytest.mark.parametrize( + "kwargs", + [ + {}, + {"preserve_edge_attrs": True}, + {"preserve_node_attrs": True}, + {"preserve_all_attrs": True}, + {"edge_attrs": {"x": 0}}, + {"edge_attrs": {"x": None}}, + {"edge_attrs": {"x": nxcg.convert.REQUIRED}}, + {"edge_attrs": {"x": ...}}, # sugar for REQUIRED + {"edge_attrs": "x"}, + {"node_attrs": {"x": 0}}, + {"node_attrs": {"x": None}}, + {"node_attrs": {"x": nxcg.convert.REQUIRED}}, + {"node_attrs": {"x": ...}}, # sugar for REQUIRED + {"node_attrs": "x"}, + ], +) +def test_convert_empty(graph_class, kwargs): + G = graph_class() + Gcg = nxcg.from_networkx(G, **kwargs) + H = nxcg.to_networkx(Gcg) + assert G.number_of_nodes() == Gcg.number_of_nodes() == H.number_of_nodes() == 0 + assert G.number_of_edges() == Gcg.number_of_edges() == H.number_of_edges() == 0 + assert Gcg.edge_values == Gcg.edge_masks == Gcg.node_values == Gcg.node_masks == {} + assert G.graph == Gcg.graph == H.graph == {} + + +def test_convert(): + # FIXME: can we break this into smaller tests? 
+ G = nx.Graph() + G.add_edge(0, 1, x=2) + G.add_node(0, foo=10) + G.add_node(1, foo=20, bar=100) + for kwargs in [ + {"preserve_edge_attrs": True}, + {"preserve_all_attrs": True}, + {"edge_attrs": {"x": 0}}, + {"edge_attrs": {"x": None}, "node_attrs": {"bar": None}}, + {"edge_attrs": "x", "edge_dtypes": int}, + { + "edge_attrs": {"x": nxcg.convert.REQUIRED}, + "node_attrs": {"foo": nxcg.convert.REQUIRED}, + }, + {"edge_attrs": {"x": ...}, "node_attrs": {"foo": ...}}, # sugar for REQUIRED + ]: + # All edges have "x" attribute, so all kwargs are equivalent + Gcg = nxcg.from_networkx(G, **kwargs) + cp.testing.assert_array_equal(Gcg.row_indices, [0, 1]) + cp.testing.assert_array_equal(Gcg.col_indices, [1, 0]) + cp.testing.assert_array_equal(Gcg.edge_values["x"], [2, 2]) + assert len(Gcg.edge_values) == 1 + assert Gcg.edge_masks == {} + H = nxcg.to_networkx(Gcg) + assert G.number_of_nodes() == Gcg.number_of_nodes() == H.number_of_nodes() == 2 + assert G.number_of_edges() == Gcg.number_of_edges() == H.number_of_edges() == 1 + assert G.adj == H.adj + + with pytest.raises(KeyError, match="bar"): + nxcg.from_networkx(G, node_attrs={"bar": ...}) + + # Structure-only graph (no edge attributes) + Gcg = nxcg.from_networkx(G, preserve_node_attrs=True) + cp.testing.assert_array_equal(Gcg.row_indices, [0, 1]) + cp.testing.assert_array_equal(Gcg.col_indices, [1, 0]) + cp.testing.assert_array_equal(Gcg.node_values["foo"], [10, 20]) + assert Gcg.edge_values == Gcg.edge_masks == {} + H = nxcg.to_networkx(Gcg) + assert set(G.edges) == set(H.edges) == {(0, 1)} + assert G.nodes == H.nodes + + # Fill completely missing attribute with default value + Gcg = nxcg.from_networkx(G, edge_attrs={"y": 0}) + cp.testing.assert_array_equal(Gcg.row_indices, [0, 1]) + cp.testing.assert_array_equal(Gcg.col_indices, [1, 0]) + cp.testing.assert_array_equal(Gcg.edge_values["y"], [0, 0]) + assert len(Gcg.edge_values) == 1 + assert Gcg.edge_masks == Gcg.node_values == Gcg.node_masks == {} + H = nxcg.to_networkx(Gcg) + assert list(H.edges(data=True)) == [(0, 1, {"y": 0})] + + # If attribute is completely missing (and no default), then just ignore it + Gcg = nxcg.from_networkx(G, edge_attrs={"y": None}) + cp.testing.assert_array_equal(Gcg.row_indices, [0, 1]) + cp.testing.assert_array_equal(Gcg.col_indices, [1, 0]) + assert sorted(Gcg.edge_values) == sorted(Gcg.edge_masks) == [] + H = nxcg.to_networkx(Gcg) + assert list(H.edges(data=True)) == [(0, 1, {})] + + G.add_edge(0, 2) + # Some edges are missing 'x' attribute; need to use a mask + for kwargs in [{"preserve_edge_attrs": True}, {"edge_attrs": {"x": None}}]: + Gcg = nxcg.from_networkx(G, **kwargs) + cp.testing.assert_array_equal(Gcg.row_indices, [0, 0, 1, 2]) + cp.testing.assert_array_equal(Gcg.col_indices, [1, 2, 0, 0]) + assert sorted(Gcg.edge_values) == sorted(Gcg.edge_masks) == ["x"] + cp.testing.assert_array_equal(Gcg.edge_masks["x"], [True, False, True, False]) + cp.testing.assert_array_equal(Gcg.edge_values["x"][Gcg.edge_masks["x"]], [2, 2]) + H = nxcg.to_networkx(Gcg) + assert list(H.edges(data=True)) == [(0, 1, {"x": 2}), (0, 2, {})] + + with pytest.raises(KeyError, match="x"): + nxcg.from_networkx(G, edge_attrs={"x": nxcg.convert.REQUIRED}) + with pytest.raises(KeyError, match="x"): + nxcg.from_networkx(G, edge_attrs={"x": ...}) + with pytest.raises(KeyError, match="bar"): + nxcg.from_networkx(G, node_attrs={"bar": nxcg.convert.REQUIRED}) + with pytest.raises(KeyError, match="bar"): + nxcg.from_networkx(G, node_attrs={"bar": ...}) + + # Now for something more 
complicated... + G = nx.Graph() + G.add_edge(10, 20, x=1) + G.add_edge(10, 30, x=2, y=1.5) + G.add_node(10, foo=100) + G.add_node(20, foo=200, bar=1000) + G.add_node(30, foo=300) + # Some edges have masks, some don't + for kwargs in [ + {"preserve_edge_attrs": True}, + {"preserve_all_attrs": True}, + {"edge_attrs": {"x": None, "y": None}}, + {"edge_attrs": {"x": 0, "y": None}}, + {"edge_attrs": {"x": 0, "y": None}}, + {"edge_attrs": {"x": 0, "y": None}, "edge_dtypes": {"x": int, "y": float}}, + ]: + Gcg = nxcg.from_networkx(G, **kwargs) + assert Gcg.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... + cp.testing.assert_array_equal(Gcg.row_indices, [0, 0, 1, 2]) + cp.testing.assert_array_equal(Gcg.col_indices, [1, 2, 0, 0]) + cp.testing.assert_array_equal(Gcg.edge_values["x"], [1, 2, 1, 2]) + assert sorted(Gcg.edge_masks) == ["y"] + cp.testing.assert_array_equal(Gcg.edge_masks["y"], [False, True, False, True]) + cp.testing.assert_array_equal( + Gcg.edge_values["y"][Gcg.edge_masks["y"]], [1.5, 1.5] + ) + H = nxcg.to_networkx(Gcg) + assert G.adj == H.adj + + # Some nodes have masks, some don't + for kwargs in [ + {"preserve_node_attrs": True}, + {"preserve_all_attrs": True}, + {"node_attrs": {"foo": None, "bar": None}}, + {"node_attrs": {"foo": None, "bar": None}}, + {"node_attrs": {"foo": 0, "bar": None, "missing": None}}, + ]: + Gcg = nxcg.from_networkx(G, **kwargs) + assert Gcg.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... + cp.testing.assert_array_equal(Gcg.row_indices, [0, 0, 1, 2]) + cp.testing.assert_array_equal(Gcg.col_indices, [1, 2, 0, 0]) + cp.testing.assert_array_equal(Gcg.node_values["foo"], [100, 200, 300]) + assert sorted(Gcg.node_masks) == ["bar"] + cp.testing.assert_array_equal(Gcg.node_masks["bar"], [False, True, False]) + cp.testing.assert_array_equal( + Gcg.node_values["bar"][Gcg.node_masks["bar"]], [1000] + ) + H = nxcg.to_networkx(Gcg) + assert G.nodes == H.nodes + + # Check default values for nodes + for kwargs in [ + {"node_attrs": {"foo": None, "bar": 0}}, + {"node_attrs": {"foo": None, "bar": 0, "missing": None}}, + {"node_attrs": {"bar": 0}}, + {"node_attrs": {"bar": 0}, "node_dtypes": {"bar": int}}, + {"node_attrs": {"bar": 0, "foo": None}, "node_dtypes": int}, + ]: + Gcg = nxcg.from_networkx(G, **kwargs) + assert Gcg.id_to_key == {0: 10, 1: 20, 2: 30} # Remap node IDs to 0, 1, ... 
+ cp.testing.assert_array_equal(Gcg.row_indices, [0, 0, 1, 2]) + cp.testing.assert_array_equal(Gcg.col_indices, [1, 2, 0, 0]) + cp.testing.assert_array_equal(Gcg.node_values["bar"], [0, 1000, 0]) + assert Gcg.node_masks == {} + + with pytest.raises( + TypeError, match="edge_attrs and weight arguments should not both be given" + ): + interface.BackendInterface.convert_from_nx(G, edge_attrs={"x": 1}, weight="x") + with pytest.raises(TypeError, match="Expected networkx.Graph"): + nxcg.from_networkx({}) diff --git a/python/cugraph-nx/cugraph_nx/tests/test_match_api.py b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py similarity index 75% rename from python/cugraph-nx/cugraph_nx/tests/test_match_api.py rename to python/nx-cugraph/nx_cugraph/tests/test_match_api.py index 918c18b4ce3..64d3704dd65 100644 --- a/python/cugraph-nx/cugraph_nx/tests/test_match_api.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py @@ -15,13 +15,13 @@ import networkx as nx -import cugraph_nx as cnx -from cugraph_nx.utils import networkx_algorithm +import nx_cugraph as nxcg +from nx_cugraph.utils import networkx_algorithm def test_match_signature_and_names(): """Simple test to ensure our signatures and basic module layout match networkx.""" - for name, func in vars(cnx.interface.BackendInterface).items(): + for name, func in vars(nxcg.interface.BackendInterface).items(): if not isinstance(func, networkx_algorithm): continue @@ -44,7 +44,7 @@ def test_match_signature_and_names(): if not func.extra_params: assert orig_sig == func_sig else: - # Ignore extra parameters added to cugraph-nx algorithm + # Ignore extra parameters added to nx-cugraph algorithm assert orig_sig == func_sig.replace( parameters=[ p @@ -52,7 +52,7 @@ def test_match_signature_and_names(): if name not in func.extra_params ] ) - if func.can_run is not cnx.utils.decorators._default_can_run: + if func.can_run is not nxcg.utils.decorators._default_can_run: assert func_sig == inspect.signature(func.can_run) # Matching function names? @@ -74,33 +74,33 @@ def test_match_signature_and_names(): ) # Matching package layout (i.e., which modules have the function)? - cnx_path = func.__module__ + nxcg_path = func.__module__ name = func.__name__ - while "." in cnx_path: + while "." in nxcg_path: # This only walks up the module tree and does not check sibling modules - cnx_path, mod_name = cnx_path.rsplit(".", 1) - nx_path = cnx_path.replace("cugraph_nx", "networkx") - cnx_mod = importlib.import_module(cnx_path) + nxcg_path, mod_name = nxcg_path.rsplit(".", 1) + nx_path = nxcg_path.replace("nx_cugraph", "networkx") + nxcg_mod = importlib.import_module(nxcg_path) nx_mod = importlib.import_module(nx_path) # Is the function present in the current module? - present_in_cnx = hasattr(cnx_mod, name) + present_in_nxcg = hasattr(nxcg_mod, name) present_in_nx = hasattr(nx_mod, name) - if present_in_cnx is not present_in_nx: # pragma: no cover (debug) - if present_in_cnx: + if present_in_nxcg is not present_in_nx: # pragma: no cover (debug) + if present_in_nxcg: raise AssertionError( - f"{name} exists in {cnx_path}, but not in {nx_path}" + f"{name} exists in {nxcg_path}, but not in {nx_path}" ) raise AssertionError( - f"{name} exists in {nx_path}, but not in {cnx_path}" + f"{name} exists in {nx_path}, but not in {nxcg_path}" ) # Is the nested module present in the current module? 
- present_in_cnx = hasattr(cnx_mod, mod_name) + present_in_nxcg = hasattr(nxcg_mod, mod_name) present_in_nx = hasattr(nx_mod, mod_name) - if present_in_cnx is not present_in_nx: # pragma: no cover (debug) - if present_in_cnx: + if present_in_nxcg is not present_in_nx: # pragma: no cover (debug) + if present_in_nxcg: raise AssertionError( - f"{mod_name} exists in {cnx_path}, but not in {nx_path}" + f"{mod_name} exists in {nxcg_path}, but not in {nx_path}" ) raise AssertionError( - f"{mod_name} exists in {nx_path}, but not in {cnx_path}" + f"{mod_name} exists in {nx_path}, but not in {nxcg_path}" ) diff --git a/python/cugraph-nx/cugraph_nx/typing.py b/python/nx-cugraph/nx_cugraph/typing.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/typing.py rename to python/nx-cugraph/nx_cugraph/typing.py diff --git a/python/cugraph-nx/cugraph_nx/utils/__init__.py b/python/nx-cugraph/nx_cugraph/utils/__init__.py similarity index 100% rename from python/cugraph-nx/cugraph_nx/utils/__init__.py rename to python/nx-cugraph/nx_cugraph/utils/__init__.py diff --git a/python/cugraph-nx/cugraph_nx/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py similarity index 97% rename from python/cugraph-nx/cugraph_nx/utils/decorators.py rename to python/nx-cugraph/nx_cugraph/utils/decorators.py index 619c9610c5d..3dbdb07e87f 100644 --- a/python/cugraph-nx/cugraph_nx/utils/decorators.py +++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py @@ -14,7 +14,7 @@ from networkx.utils.decorators import not_implemented_for -from cugraph_nx.interface import BackendInterface +from nx_cugraph.interface import BackendInterface __all__ = ["not_implemented_for", "networkx_algorithm"] diff --git a/python/cugraph-nx/cugraph_nx/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py similarity index 78% rename from python/cugraph-nx/cugraph_nx/utils/misc.py rename to python/nx-cugraph/nx_cugraph/utils/misc.py index 18487a05996..64c0be066f2 100644 --- a/python/cugraph-nx/cugraph_nx/utils/misc.py +++ b/python/nx-cugraph/nx_cugraph/utils/misc.py @@ -10,9 +10,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations + +import operator as op +import sys +from random import Random + import cupy as cp -__all__ = ["_groupby"] +__all__ = ["_groupby", "_handle_seed"] def _groupby(groups: cp.ndarray, values: cp.ndarray) -> dict[int, cp.ndarray]: @@ -43,3 +49,12 @@ def _groupby(groups: cp.ndarray, values: cp.ndarray) -> dict[int, cp.ndarray]: rv[i] = sorted_values[start:end] start = end return rv + + +def _handle_seed(seed: int | Random | None) -> int: + """Handle seed argument and ensure it is what pylibcugraph needs: an int.""" + if seed is None: + return + if isinstance(seed, Random): + return seed.randint(0, sys.maxsize) + return op.index(seed) # Ensure seed is integral diff --git a/python/cugraph-nx/pyproject.toml b/python/nx-cugraph/pyproject.toml similarity index 96% rename from python/cugraph-nx/pyproject.toml rename to python/nx-cugraph/pyproject.toml index 1882bed251f..95e9c256e5d 100644 --- a/python/cugraph-nx/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -9,7 +9,7 @@ requires = [ build-backend = "setuptools.build_meta" [project] -name = "cugraph-nx" +name = "nx-cugraph" version = "23.10.00" description = "cugraph backend for NetworkX" readme = { file = "README.md", content-type = "text/markdown" } @@ -52,15 +52,15 @@ Homepage = "https://github.com/rapidsai/cugraph" Documentation = "https://docs.rapids.ai/api/cugraph/stable/" [project.entry-points."networkx.plugins"] -cugraph = "cugraph_nx.interface:BackendInterface" +cugraph = "nx_cugraph.interface:BackendInterface" [tool.setuptools] license-files = ["LICENSE"] [tool.setuptools.packages.find] include = [ - "cugraph_nx*", - "cugraph_nx.*", + "nx_cugraph*", + "nx_cugraph.*", ] [tool.black] @@ -73,12 +73,12 @@ profile = "black" skip_gitignore = true float_to_top = true default_section = "THIRDPARTY" -known_first_party = "cugraph_nx" +known_first_party = "nx_cugraph" line_length = 88 [tool.pytest.ini_options] minversion = "6.0" -testpaths = "cugraph_nx/tests" +testpaths = "nx_cugraph/tests" xfail_strict = true markers = [ "slow: Skipped unless --runslow passed", @@ -109,7 +109,7 @@ addopts = [ [tool.coverage.run] branch = true -source = ["cugraph_nx"] +source = ["nx_cugraph"] omit = [] [tool.coverage.report] @@ -202,7 +202,7 @@ ignore = [ [tool.ruff.per-file-ignores] "__init__.py" = ["F401"] # Allow unused imports (w/o defining `__all__`) # Allow assert, print, RNG, and no docstring -"cugraph_nx/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"] +"nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"] [tool.ruff.flake8-annotations] mypy-init-return = true diff --git a/python/cugraph-nx/run_nx_tests.sh b/python/nx-cugraph/run_nx_tests.sh similarity index 79% rename from python/cugraph-nx/run_nx_tests.sh rename to python/nx-cugraph/run_nx_tests.sh index 7ea2348eaff..07c97cdf947 100755 --- a/python/cugraph-nx/run_nx_tests.sh +++ b/python/nx-cugraph/run_nx_tests.sh @@ -4,21 +4,21 @@ # # NETWORKX_GRAPH_CONVERT=cugraph # Used by networkx versions 3.0 and 3.1 -# Must be set to "cugraph" to test the cugraph-nx backend. +# Must be set to "cugraph" to test the nx-cugraph backend. # # NETWORKX_TEST_BACKEND=cugraph # Replaces NETWORKX_GRAPH_CONVERT for networkx versions >=3.2 -# Must be set to "cugraph" to test the cugraph-nx backend. +# Must be set to "cugraph" to test the nx-cugraph backend. # # NETWORKX_FALLBACK_TO_NX=True (optional) # Used by networkx versions >=3.2. 
With this set, input graphs will not be -# converted to cugraph-nx and the networkx algorithm will be called for +# converted to nx-cugraph and the networkx algorithm will be called for # algorithms that we don't implement or if we raise NotImplementedError. # This is sometimes helpful to get increased testing and coverage, but # testing takes longer. Without it, tests will xfail when encountering a # function that we don't implement. # -# Coverage of `cugraph_nx.algorithms` is reported and is a good sanity check +# Coverage of `nx_cugraph.algorithms` is reported and is a good sanity check # that algorithms run. # Warning: cugraph has a .coveragerc file in the /python directory, @@ -30,7 +30,7 @@ NETWORKX_TEST_BACKEND=cugraph \ NETWORKX_FALLBACK_TO_NX=True \ pytest \ --pyargs networkx \ - --cov=cugraph_nx.algorithms \ + --cov=nx_cugraph.algorithms \ --cov-report term-missing \ --no-cov-on-fail \ "$@" diff --git a/python/cugraph-nx/setup.py b/python/nx-cugraph/setup.py similarity index 100% rename from python/cugraph-nx/setup.py rename to python/nx-cugraph/setup.py From 6b57f56f5efde9b5b1fdef63474d13f6e1908133 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Wed, 6 Sep 2023 14:49:15 -0400 Subject: [PATCH 32/72] Migrate upstream models to `cugraph-pyg` (#3763) This PR migrates SAGEConv and RGCNConv to cugraph-pyg, in preparation for removing these models from upstream. `pylibcugraphops` now becomes a dependency of cugraph-pyg. Authors: - Tingyu Wang (https://github.com/tingyu66) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cugraph/pull/3763 --- .../all_cuda-118_arch-x86_64.yaml | 1 + .../all_cuda-120_arch-x86_64.yaml | 1 + conda/recipes/cugraph-pyg/meta.yaml | 1 + dependencies.yaml | 24 ++- .../conda/cugraph_dgl_dev_cuda-118.yaml | 7 +- .../conda/cugraph_pyg_dev_cuda-118.yaml | 25 +++ .../cugraph_pyg/nn/conv/__init__.py | 4 + .../cugraph-pyg/cugraph_pyg/nn/conv/base.py | 26 +-- .../cugraph_pyg/nn/conv/gat_conv.py | 11 +- .../cugraph_pyg/nn/conv/gatv2_conv.py | 5 +- .../cugraph_pyg/nn/conv/rgcn_conv.py | 141 +++++++++++++++++ .../cugraph_pyg/nn/conv/sage_conv.py | 149 ++++++++++++++++++ .../cugraph_pyg/nn/conv/transformer_conv.py | 5 +- .../cugraph-pyg/cugraph_pyg/tests/conftest.py | 19 +++ .../cugraph_pyg/tests/nn/test_gat_conv.py | 43 +++-- .../cugraph_pyg/tests/nn/test_gatv2_conv.py | 37 ++--- .../cugraph_pyg/tests/nn/test_rgcn_conv.py | 71 +++++++++ .../cugraph_pyg/tests/nn/test_sage_conv.py | 89 +++++++++++ .../tests/nn/test_transformer_conv.py | 45 +++--- 19 files changed, 603 insertions(+), 101 deletions(-) create mode 100644 python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml create mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py create mode 100644 python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py create mode 100644 python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 075cf231725..c66890f8ae5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -5,6 +5,7 @@ channels: - rapidsai-nightly - dask/label/dev - pytorch +- pyg - dglteam/label/cu118 - conda-forge - nvidia diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index eacafbfd6c4..3afb1415572 
100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -5,6 +5,7 @@ channels: - rapidsai-nightly - dask/label/dev - pytorch +- pyg - dglteam/label/cu118 - conda-forge - nvidia diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 4d3d7c44093..2d7ed2f4cda 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -33,6 +33,7 @@ requirements: - pytorch >=2.0 - cupy >=12.0.0 - cugraph ={{ version }} + - pylibcugraphops ={{ version }} - pyg >=2.3,<2.4 tests: diff --git a/dependencies.yaml b/dependencies.yaml index e8692cd670f..04ec1b6e957 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -199,12 +199,24 @@ files: output: conda conda_dir: python/cugraph-dgl/conda includes: + - checks - cugraph_dgl_dev + - test_python_common + cugraph_pyg_dev: + matrix: + cuda: ["11.8"] + output: conda + conda_dir: python/cugraph-pyg/conda + includes: + - checks + - cugraph_pyg_dev + - test_python_common channels: - rapidsai - rapidsai-nightly - dask/label/dev - pytorch + - pyg - dglteam/label/cu118 - conda-forge - nvidia @@ -498,6 +510,12 @@ dependencies: - pytorch>=2.0 - pytorch-cuda==11.8 - dgl>=1.1.0.cu* - - setuptools - - pre-commit - - pytest + cugraph_pyg_dev: + common: + - output_types: [conda] + packages: + - cugraph==23.10.* + - pylibcugraphops==23.10.* + - pytorch==2.0 + - pytorch-cuda==11.8 + - pyg=2.3.1=*torch_2.0.0*cu118* diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index 2bb4b0f3cd3..138d384ebcf 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -5,16 +5,21 @@ channels: - rapidsai-nightly - dask/label/dev - pytorch +- pyg - dglteam/label/cu118 - conda-forge - nvidia dependencies: - cugraph==23.10.* - dgl>=1.1.0.cu* +- pandas - pre-commit - pylibcugraphops==23.10.* - pytest +- pytest-benchmark +- pytest-cov +- pytest-xdist - pytorch-cuda==11.8 - pytorch>=2.0 -- setuptools +- scipy name: cugraph_dgl_dev_cuda-118 diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml new file mode 100644 index 00000000000..4e5159e6b45 --- /dev/null +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -0,0 +1,25 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- pytorch +- pyg +- dglteam/label/cu118 +- conda-forge +- nvidia +dependencies: +- cugraph==23.10.* +- pandas +- pre-commit +- pyg=2.3.1=*torch_2.0.0*cu118* +- pylibcugraphops==23.10.* +- pytest +- pytest-benchmark +- pytest-cov +- pytest-xdist +- pytorch-cuda==11.8 +- pytorch==2.0 +- scipy +name: cugraph_pyg_dev_cuda-118 diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py index 0c94be5e12b..9c9dcdb43bb 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py @@ -13,10 +13,14 @@ from .gat_conv import GATConv from .gatv2_conv import GATv2Conv +from .rgcn_conv import RGCNConv +from .sage_conv import SAGEConv from .transformer_conv import TransformerConv __all__ = [ "GATConv", "GATv2Conv", + "RGCNConv", + "SAGEConv", "TransformerConv", ] diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py index 2639f66f440..10431a0398d 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py @@ -15,10 +15,10 @@ from typing import Optional, Tuple, Union from cugraph.utilities.utils import import_optional +from pylibcugraphops.pytorch import CSC, HeteroCSC torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") -ops_torch = import_optional("pylibcugraphops.pytorch") class BaseConv(torch.nn.Module): # pragma: no cover @@ -74,7 +74,7 @@ def get_cugraph( csc: Tuple[torch.Tensor, torch.Tensor, int], bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> ops_torch.CSC: + ) -> CSC: r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation. Supports both bipartite and non-bipartite graphs. @@ -87,22 +87,22 @@ def get_cugraph( bipartite (bool): If set to :obj:`True`, will create the bipartite structure in cugraph-ops. (default: :obj:`False`) max_num_neighbors (int, optional): The maximum number of neighbors - of a target node. It is only effective when operating in a - bipartite graph. When not given, will be computed on-the-fly, - leading to slightly worse performance. (default: :obj:`None`) + of a destination node. When enabled, it allows models to use + the message-flow-graph primitives in cugraph-ops. + (default: :obj:`None`) """ row, colptr, num_src_nodes = csc if not row.is_cuda: raise RuntimeError( - f"'{self.__class__.__name__}' requires GPU-" - f"based processing (got CPU tensor)" + f"'{self.__class__.__name__}' requires GPU-based processing " + f"but got CPU tensor." ) if max_num_neighbors is None: max_num_neighbors = -1 - return ops_torch.CSC( + return CSC( offsets=colptr, indices=row, num_src_nodes=num_src_nodes, @@ -117,7 +117,7 @@ def get_typed_cugraph( num_edge_types: Optional[int] = None, bipartite: bool = False, max_num_neighbors: Optional[int] = None, - ) -> ops_torch.HeteroCSC: + ) -> HeteroCSC: r"""Constructs a typed :obj:`cugraph` graph object from a CSC representation where each edge corresponds to a given edge type. Supports both bipartite and non-bipartite graphs. @@ -135,9 +135,9 @@ def get_typed_cugraph( bipartite (bool): If set to :obj:`True`, will create the bipartite structure in cugraph-ops. (default: :obj:`False`) max_num_neighbors (int, optional): The maximum number of neighbors - of a target node. It is only effective when operating in a - bipartite graph. 
When not given, will be computed on-the-fly, - leading to slightly worse performance. (default: :obj:`None`) + of a destination node. When enabled, it allows models to use + the message-flow-graph primitives in cugraph-ops. + (default: :obj:`None`) """ if num_edge_types is None: num_edge_types = int(edge_type.max()) + 1 @@ -148,7 +148,7 @@ def get_typed_cugraph( row, colptr, num_src_nodes = csc edge_type = edge_type.int() - return ops_torch.HeteroCSC( + return HeteroCSC( offsets=colptr, indices=row, edge_types=edge_type, diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py index f0040015b4a..309bee4e228 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py @@ -10,16 +10,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Optional, Tuple, Union from cugraph.utilities.utils import import_optional +from pylibcugraphops.pytorch.operators import mha_gat_n2n from .base import BaseConv torch = import_optional("torch") nn = import_optional("torch.nn") torch_geometric = import_optional("torch_geometric") -ops_torch = import_optional("pylibcugraphops.pytorch") class GATConv(BaseConv): @@ -174,9 +175,9 @@ def forward( representation to the desired format. edge_attr: (torch.Tensor, optional) The edge features. max_num_neighbors (int, optional): The maximum number of neighbors - of a target node. It is only effective when operating in a - bipartite graph. When not given, will be computed on-the-fly, - leading to slightly worse performance. (default: :obj:`None`) + of a destination node. When enabled, it allows models to use + the message-flow-graph primitives in cugraph-ops. + (default: :obj:`None`) """ bipartite = not isinstance(x, torch.Tensor) graph = self.get_cugraph( @@ -210,7 +211,7 @@ def forward( ) x = self.lin(x) - out = ops_torch.operators.mha_gat_n2n( + out = mha_gat_n2n( (x_src, x_dst) if bipartite else x, self.att, graph, diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py index d74ca6b00d0..32956dcb400 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py @@ -10,16 +10,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Optional, Tuple, Union from cugraph.utilities.utils import import_optional +from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n from .base import BaseConv torch = import_optional("torch") nn = import_optional("torch.nn") torch_geometric = import_optional("torch_geometric") -ops_torch = import_optional("pylibcugraphops.pytorch") class GATv2Conv(BaseConv): @@ -207,7 +208,7 @@ def forward( else: x = self.lin_src(x) - out = ops_torch.operators.mha_gat_v2_n2n( + out = mha_gat_v2_n2n( (x_src, x_dst) if bipartite else x, self.att, graph, diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py new file mode 100644 index 00000000000..683780b66eb --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py @@ -0,0 +1,141 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +from cugraph.utilities.utils import import_optional +from pylibcugraphops.pytorch.operators import agg_hg_basis_n2n_post + +from .base import BaseConv + +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") + + +class RGCNConv(BaseConv): # pragma: no cover + r"""The relational graph convolutional operator from the `"Modeling + Relational Data with Graph Convolutional Networks" + `_ paper. + + .. math:: + \mathbf{x}^{\prime}_i = \mathbf{\Theta}_{\textrm{root}} \cdot + \mathbf{x}_i + \sum_{r \in \mathcal{R}} \sum_{j \in \mathcal{N}_r(i)} + \frac{1}{|\mathcal{N}_r(i)|} \mathbf{\Theta}_r \cdot \mathbf{x}_j, + + where :math:`\mathcal{R}` denotes the set of relations, *i.e.* edge types. + Edge type needs to be a one-dimensional :obj:`torch.long` tensor which + stores a relation identifier + :math:`\in \{ 0, \ldots, |\mathcal{R}| - 1\}` for each edge. + + Args: + in_channels (int): Size of each input sample. + out_channels (int): Size of each output sample. + num_relations (int): Number of relations. + num_bases (int, optional): If set, this layer will use the + basis-decomposition regularization scheme where :obj:`num_bases` + denotes the number of bases to use. (default: :obj:`None`) + aggr (str, optional): The aggregation scheme to use + (:obj:`"add"`, :obj:`"mean"`, :obj:`"sum"`). + (default: :obj:`"mean"`) + root_weight (bool, optional): If set to :obj:`False`, the layer will + not add transformed root node features to the output. + (default: :obj:`True`) + bias (bool, optional): If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + num_relations: int, + num_bases: Optional[int] = None, + aggr: str = "mean", + root_weight: bool = True, + bias: bool = True, + ): + super().__init__() + + if aggr not in ["mean", "sum", "add"]: + raise ValueError( + f"Aggregation function must be chosen from 'mean', 'sum' or " + f"'add', but got '{aggr}'." 
+ ) + + self.in_channels = in_channels + self.out_channels = out_channels + self.num_relations = num_relations + self.num_bases = num_bases + self.aggr = aggr + self.root_weight = root_weight + + dim_root_weight = 1 if root_weight else 0 + + if num_bases is not None: + self.weight = torch.nn.Parameter( + torch.empty(num_bases + dim_root_weight, in_channels, out_channels) + ) + self.comp = torch.nn.Parameter(torch.empty(num_relations, num_bases)) + else: + self.weight = torch.nn.Parameter( + torch.empty(num_relations + dim_root_weight, in_channels, out_channels) + ) + self.register_parameter("comp", None) + + if bias: + self.bias = torch.nn.Parameter(torch.empty(out_channels)) + else: + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self): + end = -1 if self.root_weight else None + torch_geometric.nn.inits.glorot(self.weight[:end]) + torch_geometric.nn.inits.glorot(self.comp) + if self.root_weight: + torch_geometric.nn.inits.glorot(self.weight[-1]) + torch_geometric.nn.inits.zeros(self.bias) + + def forward( + self, + x: torch.Tensor, + csc: Tuple[torch.Tensor, torch.Tensor, int], + edge_type: torch.Tensor, + max_num_neighbors: Optional[int] = None, + ) -> torch.Tensor: + + graph = self.get_typed_cugraph( + csc, edge_type, self.num_relations, max_num_neighbors=max_num_neighbors + ) + + out = agg_hg_basis_n2n_post( + x, + self.comp, + graph, + concat_own=self.root_weight, + norm_by_out_degree=bool(self.aggr == "mean"), + ) + + out = out @ self.weight.view(-1, self.out_channels) + + if self.bias is not None: + out = out + self.bias + + return out + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}({self.in_channels}, " + f"{self.out_channels}, num_relations={self.num_relations})" + ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py new file mode 100644 index 00000000000..8e0c1027416 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py @@ -0,0 +1,149 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple, Union + +from cugraph.utilities.utils import import_optional +from pylibcugraphops.pytorch.operators import agg_concat_n2n + +from .base import BaseConv + +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") + + +class SAGEConv(BaseConv): + r"""The GraphSAGE operator from the `"Inductive Representation Learning on + Large Graphs" `_ paper. + + .. math:: + \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + \mathbf{W}_2 \cdot + \mathrm{mean}_{j \in \mathcal{N(i)}} \mathbf{x}_j + + If :obj:`project = True`, then :math:`\mathbf{x}_j` will first get + projected via + + .. math:: + \mathbf{x}_j \leftarrow \sigma ( \mathbf{W}_3 \mathbf{x}_j + + \mathbf{b}) + + as described in Eq. (3) of the paper. + + Args: + in_channels (int or tuple): Size of each input sample. A tuple + corresponds to the sizes of source and target dimensionalities. 
+ out_channels (int): Size of each output sample. + aggr (str or Aggregation, optional): The aggregation scheme to use. + Choose from :obj:`"mean"`, :obj:`"sum"`, :obj:`"min"` or + :obj:`"max"`. (default: :obj:`"mean"`) + normalize (bool, optional): If set to :obj:`True`, output features + will be :math:`\ell_2`-normalized, *i.e.*, + :math:`\frac{\mathbf{h}_i^{k+1}} + {\| \mathbf{h}_i^{k+1} \|_2}`. + (default: :obj:`False`) + root_weight (bool, optional): If set to :obj:`False`, the layer will + not add transformed root node features to the output. + (default: :obj:`True`) + project (bool, optional): If set to :obj:`True`, the layer will apply a + linear transformation followed by an activation function before + aggregation (as described in Eq. (3) of the paper). + (default: :obj:`False`) + bias (bool, optional): If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + """ + + def __init__( + self, + in_channels: Union[int, Tuple[int, int]], + out_channels: int, + aggr: str = "mean", + normalize: bool = False, + root_weight: bool = True, + project: bool = False, + bias: bool = True, + ): + super().__init__() + + if aggr not in ["mean", "sum", "min", "max"]: + raise ValueError( + f"Aggregation function must be chosen from 'mean'," + f" 'sum', 'min' or 'max', but got '{aggr}'." + ) + + self.in_channels = in_channels + self.out_channels = out_channels + self.aggr = aggr + self.normalize = normalize + self.root_weight = root_weight + self.project = project + + if isinstance(in_channels, int): + self.in_channels_src = self.in_channels_dst = in_channels + else: + self.in_channels_src, self.in_channels_dst = in_channels + + if self.project: + self.pre_lin = torch_geometric.nn.Linear( + self.in_channels_src, self.in_channels_src, bias=True + ) + + if self.root_weight: + self.lin = torch_geometric.nn.Linear( + self.in_channels_src + self.in_channels_dst, out_channels, bias=bias + ) + else: + self.lin = torch_geometric.nn.Linear( + self.in_channels_src, out_channels, bias=bias + ) + + self.reset_parameters() + + def reset_parameters(self): + if self.project: + self.pre_lin.reset_parameters() + self.lin.reset_parameters() + + def forward( + self, + x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + csc: Tuple[torch.Tensor, torch.Tensor, int], + max_num_neighbors: Optional[int] = None, + ) -> torch.Tensor: + bipartite = isinstance(x, Tuple) + graph = self.get_cugraph( + csc, bipartite=bipartite, max_num_neighbors=max_num_neighbors + ) + + if self.project: + if bipartite: + x = (self.pre_lin(x[0]).relu(), x[1]) + else: + x = self.pre_lin(x).relu() + + out = agg_concat_n2n(x, graph, self.aggr) + + if self.root_weight: + out = self.lin(out) + else: + out = self.lin(out[:, : self.in_channels_src]) + + if self.normalize: + out = torch.nn.functional.normalize(out, p=2.0, dim=-1) + + return out + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}({self.in_channels}, " + f"{self.out_channels}, aggr={self.aggr})" + ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py index 1b8b1aa0ffa..41c0b4b4090 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py @@ -10,16 +10,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from typing import Optional, Tuple, Union from cugraph.utilities.utils import import_optional +from pylibcugraphops.pytorch.operators import mha_simple_n2n from .base import BaseConv torch = import_optional("torch") nn = import_optional("torch.nn") torch_geometric = import_optional("torch_geometric") -ops_torch = import_optional("pylibcugraphops.pytorch") class TransformerConv(BaseConv): @@ -185,7 +186,7 @@ def forward( ) edge_attr = self.lin_edge(edge_attr) - out = ops_torch.operators.mha_simple_n2n( + out = mha_simple_n2n( key, query, value, diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py index 3270dd0bf93..083c4a2b37b 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py @@ -265,3 +265,22 @@ def abc_graph(): ) return F, G, N + + +@pytest.fixture +def basic_pyg_graph_1(): + edge_index = torch.tensor([[0, 1, 2, 3], [0, 0, 1, 1]]) + size = (4, 4) + return edge_index, size + + +@pytest.fixture +def basic_pyg_graph_2(): + edge_index = torch.tensor( + [ + [0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9], + [1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0], + ] + ) + size = (10, 10) + return edge_index, size diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py index ae5fd73c438..21c43bad38c 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py @@ -13,15 +13,9 @@ import pytest -try: - from torch_geometric.nn import GATConv -except ModuleNotFoundError: - pytest.skip("PyG not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional from cugraph_pyg.nn import GATConv as CuGraphGATConv -torch = import_optional("torch") +ATOL = 1e-6 @pytest.mark.parametrize("bias", [True, False]) @@ -30,17 +24,16 @@ @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("max_num_neighbors", [8, None]) @pytest.mark.parametrize("use_edge_attr", [True, False]) +@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) def test_gat_conv_equality( - bias, bipartite, concat, heads, max_num_neighbors, use_edge_attr + bias, bipartite, concat, heads, max_num_neighbors, use_edge_attr, graph, request ): - atol = 1e-6 - edge_index = torch.tensor( - [ - [7, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 8, 9], - [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7], - ], - ).cuda() - size = (10, 10) + pytest.importorskip("torch_geometric", reason="PyG not available") + import torch + from torch_geometric.nn import GATConv + + edge_index, size = request.getfixturevalue(graph) + edge_index = edge_index.cuda() if bipartite: in_channels = (5, 3) @@ -87,7 +80,7 @@ def test_gat_conv_equality( out1 = conv1(x, edge_index, edge_attr=edge_attr) out2 = conv2(x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors) - assert torch.allclose(out1, out2, atol=atol) + assert torch.allclose(out1, out2, atol=ATOL) grad_output = torch.rand_like(out1) out1.backward(grad_output) @@ -95,30 +88,30 @@ def test_gat_conv_equality( if bipartite: assert torch.allclose( - conv1.lin_src.weight.grad, conv2.lin_src.weight.grad, atol=atol + conv1.lin_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL ) assert torch.allclose( - conv1.lin_dst.weight.grad, conv2.lin_dst.weight.grad, atol=atol + conv1.lin_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) else: assert torch.allclose( - 
conv1.lin_src.weight.grad, conv2.lin.weight.grad, atol=atol + conv1.lin_src.weight.grad, conv2.lin.weight.grad, atol=ATOL ) assert torch.allclose( - conv1.att_src.grad.flatten(), conv2.att.grad[:out_dim], atol=atol + conv1.att_src.grad.flatten(), conv2.att.grad[:out_dim], atol=ATOL ) assert torch.allclose( - conv1.att_dst.grad.flatten(), conv2.att.grad[out_dim : 2 * out_dim], atol=atol + conv1.att_dst.grad.flatten(), conv2.att.grad[out_dim : 2 * out_dim], atol=ATOL ) if use_edge_attr: assert torch.allclose( - conv1.att_edge.grad.flatten(), conv2.att.grad[2 * out_dim :], atol=atol + conv1.att_edge.grad.flatten(), conv2.att.grad[2 * out_dim :], atol=ATOL ) assert torch.allclose( - conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=atol + conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL ) if bias: - assert torch.allclose(conv1.bias.grad, conv2.bias.grad, atol=atol) + assert torch.allclose(conv1.bias.grad, conv2.bias.grad, atol=ATOL) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py index 1c4f241304e..6b11e87154a 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py @@ -13,30 +13,23 @@ import pytest -try: - from torch_geometric.nn import GATv2Conv -except ModuleNotFoundError: - pytest.skip("PyG not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional from cugraph_pyg.nn import GATv2Conv as CuGraphGATv2Conv -torch = import_optional("torch") +ATOL = 1e-6 @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) @pytest.mark.parametrize("use_edge_attr", [True, False]) -def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr): - atol = 1e-6 - edge_index = torch.tensor( - [ - [7, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 8, 9], - [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7], - ], - ).cuda() - size = (10, 10) +@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, request): + pytest.importorskip("torch_geometric", reason="PyG not available") + import torch + from torch_geometric.nn import GATv2Conv + + edge_index, size = request.getfixturevalue(graph) + edge_index = edge_index.cuda() if bipartite: in_channels = (5, 3) @@ -70,26 +63,24 @@ def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr): with torch.no_grad(): conv2.lin_src.weight.data = conv1.lin_l.weight.data.detach().clone() conv2.lin_dst.weight.data = conv1.lin_r.weight.data.detach().clone() - conv2.att.data = conv1.att.data.flatten().detach().clone() - if use_edge_attr: conv2.lin_edge.weight.data = conv1.lin_edge.weight.data.detach().clone() out1 = conv1(x, edge_index, edge_attr=edge_attr) out2 = conv2(x, csc, edge_attr=edge_attr_perm) - assert torch.allclose(out1, out2, atol=atol) + assert torch.allclose(out1, out2, atol=ATOL) grad_output = torch.rand_like(out1) out1.backward(grad_output) out2.backward(grad_output) - assert torch.allclose(conv1.lin_l.weight.grad, conv2.lin_src.weight.grad, atol=atol) - assert torch.allclose(conv1.lin_r.weight.grad, conv2.lin_dst.weight.grad, atol=atol) + assert torch.allclose(conv1.lin_l.weight.grad, conv2.lin_src.weight.grad, atol=ATOL) + assert torch.allclose(conv1.lin_r.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL) - assert 
torch.allclose(conv1.att.grad.flatten(), conv2.att.grad, atol=atol) + assert torch.allclose(conv1.att.grad.flatten(), conv2.att.grad, atol=ATOL) if use_edge_attr: assert torch.allclose( - conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=atol + conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py new file mode 100644 index 00000000000..233c6aa2836 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py @@ -0,0 +1,71 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from cugraph_pyg.nn import RGCNConv as CuGraphRGCNConv + +ATOL = 1e-6 + + +@pytest.mark.parametrize("aggr", ["add", "sum", "mean"]) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("max_num_neighbors", [8, None]) +@pytest.mark.parametrize("num_bases", [1, 2, None]) +@pytest.mark.parametrize("root_weight", [True, False]) +@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +def test_rgcn_conv_equality( + aggr, bias, max_num_neighbors, num_bases, root_weight, graph, request +): + pytest.importorskip("torch_geometric", reason="PyG not available") + import torch + from torch_geometric.nn import FastRGCNConv as RGCNConv + + in_channels, out_channels, num_relations = (4, 2, 3) + kwargs = dict(aggr=aggr, bias=bias, num_bases=num_bases, root_weight=root_weight) + + edge_index, size = request.getfixturevalue(graph) + edge_index = edge_index.cuda() + edge_type = torch.randint(num_relations, (edge_index.size(1),)).cuda() + + x = torch.rand(size[0], in_channels, device="cuda") + csc, edge_type_perm = CuGraphRGCNConv.to_csc(edge_index, size, edge_type) + + conv1 = RGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda() + conv2 = CuGraphRGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda() + + with torch.no_grad(): + if root_weight: + conv2.weight.data[:-1] = conv1.weight.data + conv2.weight.data[-1] = conv1.root.data + else: + conv2.weight.data = conv1.weight.data.detach().clone() + if num_bases is not None: + conv2.comp.data = conv1.comp.data.detach().clone() + + out1 = conv1(x, edge_index, edge_type) + out2 = conv2(x, csc, edge_type_perm, max_num_neighbors=max_num_neighbors) + assert torch.allclose(out1, out2, atol=ATOL) + + grad_out = torch.rand_like(out1) + out1.backward(grad_out) + out2.backward(grad_out) + + if root_weight: + assert torch.allclose(conv1.weight.grad, conv2.weight.grad[:-1], atol=ATOL) + assert torch.allclose(conv1.root.grad, conv2.weight.grad[-1], atol=ATOL) + else: + assert torch.allclose(conv1.weight.grad, conv2.weight.grad, atol=ATOL) + + if num_bases is not None: + assert torch.allclose(conv1.comp.grad, conv2.comp.grad, atol=ATOL) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py new file mode 100644 index 
00000000000..7f73cddbdbb --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py @@ -0,0 +1,89 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv + +ATOL = 1e-6 + + +@pytest.mark.parametrize("aggr", ["sum", "mean", "min", "max"]) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("bipartite", [True, False]) +@pytest.mark.parametrize("max_num_neighbors", [8, None]) +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("root_weight", [True, False]) +@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +def test_sage_conv_equality( + aggr, bias, bipartite, max_num_neighbors, normalize, root_weight, graph, request +): + pytest.importorskip("torch_geometric", reason="PyG not available") + import torch + from torch_geometric.nn import SAGEConv + + edge_index, size = request.getfixturevalue(graph) + edge_index = edge_index.cuda() + csc = CuGraphSAGEConv.to_csc(edge_index, size) + + if bipartite: + in_channels = (7, 3) + x = ( + torch.rand(size[0], in_channels[0]).cuda(), + torch.rand(size[1], in_channels[1]).cuda(), + ) + else: + in_channels = 5 + x = torch.rand(size[0], in_channels).cuda() + out_channels = 4 + + kwargs = dict(aggr=aggr, bias=bias, normalize=normalize, root_weight=root_weight) + + conv1 = SAGEConv(in_channels, out_channels, **kwargs).cuda() + conv2 = CuGraphSAGEConv(in_channels, out_channels, **kwargs).cuda() + + in_channels_src = conv2.in_channels_src + with torch.no_grad(): + conv2.lin.weight.data[:, :in_channels_src] = conv1.lin_l.weight.data + if root_weight: + conv2.lin.weight.data[:, in_channels_src:] = conv1.lin_r.weight.data + if bias: + conv2.lin.bias.data[:] = conv1.lin_l.bias.data + + out1 = conv1(x, edge_index) + out2 = conv2(x, csc, max_num_neighbors=max_num_neighbors) + assert torch.allclose(out1, out2, atol=ATOL) + + grad_out = torch.rand_like(out1) + out1.backward(grad_out) + out2.backward(grad_out) + + assert torch.allclose( + conv1.lin_l.weight.grad, + conv2.lin.weight.grad[:, :in_channels_src], + atol=ATOL, + ) + + if root_weight: + assert torch.allclose( + conv1.lin_r.weight.grad, + conv2.lin.weight.grad[:, in_channels_src:], + atol=ATOL, + ) + + if bias: + assert torch.allclose( + conv1.lin_l.bias.grad, + conv2.lin.bias.grad, + atol=ATOL, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py index a2153ee7891..7dba1a6d515 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py @@ -13,23 +13,25 @@ import pytest -try: - from torch_geometric.nn import TransformerConv -except ModuleNotFoundError: - pytest.skip("PyG not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional from cugraph_pyg.nn import TransformerConv as 
CuGraphTransformerConv -torch = import_optional("torch") +ATOL = 1e-6 @pytest.mark.parametrize("bipartite", [True, False]) @pytest.mark.parametrize("concat", [True, False]) @pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) -def test_transformer_conv_equality(bipartite, concat, heads): +@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) +def test_transformer_conv_equality(bipartite, concat, heads, graph, request): + pytest.importorskip("torch_geometric", reason="PyG not available") + import torch + from torch_geometric.nn import TransformerConv + + edge_index, size = request.getfixturevalue(graph) + edge_index = edge_index.cuda() + csc = CuGraphTransformerConv.to_csc(edge_index, size) + out_channels = 2 - size = (10, 10) kwargs = dict(concat=concat, bias=False, root_weight=False) if bipartite: @@ -42,14 +44,6 @@ def test_transformer_conv_equality(bipartite, concat, heads): in_channels = 5 x = torch.rand(size[0], in_channels, device="cuda") - edge_index = torch.tensor( - [ - [7, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 8, 9, 3, 4, 5], - [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 6], - ], - device="cuda", - ) - conv1 = TransformerConv(in_channels, out_channels, heads, **kwargs).cuda() conv2 = CuGraphTransformerConv(in_channels, out_channels, heads, **kwargs).cuda() @@ -62,30 +56,27 @@ def test_transformer_conv_equality(bipartite, concat, heads): conv2.lin_value.bias.data = conv1.lin_value.bias.data.detach().clone() out1 = conv1(x, edge_index) - csc = CuGraphTransformerConv.to_csc(edge_index, size) out2 = conv2(x, csc) - atol = 1e-6 - - assert torch.allclose(out1, out2, atol=atol) + assert torch.allclose(out1, out2, atol=ATOL) grad_output = torch.rand_like(out1) out1.backward(grad_output) out2.backward(grad_output) assert torch.allclose( - conv1.lin_query.weight.grad, conv2.lin_query.weight.grad, atol=atol + conv1.lin_query.weight.grad, conv2.lin_query.weight.grad, atol=ATOL ) assert torch.allclose( - conv1.lin_key.weight.grad, conv2.lin_key.weight.grad, atol=atol + conv1.lin_key.weight.grad, conv2.lin_key.weight.grad, atol=ATOL ) assert torch.allclose( - conv1.lin_value.weight.grad, conv2.lin_value.weight.grad, atol=atol + conv1.lin_value.weight.grad, conv2.lin_value.weight.grad, atol=ATOL ) assert torch.allclose( - conv1.lin_query.bias.grad, conv2.lin_query.bias.grad, atol=atol + conv1.lin_query.bias.grad, conv2.lin_query.bias.grad, atol=ATOL ) - assert torch.allclose(conv1.lin_key.bias.grad, conv2.lin_key.bias.grad, atol=atol) + assert torch.allclose(conv1.lin_key.bias.grad, conv2.lin_key.bias.grad, atol=ATOL) assert torch.allclose( - conv1.lin_value.bias.grad, conv2.lin_value.bias.grad, atol=atol + conv1.lin_value.bias.grad, conv2.lin_value.bias.grad, atol=ATOL ) From 98324acc83a55618f1fa41313b4bd2ec32079215 Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:48:18 -0400 Subject: [PATCH 33/72] Expose threshold in louvain (#3792) The threshold parameter (referred to as `epsilon` in most of the centrality measures) is used to define when to stop the iterative steps of Louvain. Once the modularity increase for an iteration of Louvain is smaller than the threshold we will stop that iteration and start coarsening the graph. This parameter was hard-coded in the initial C++ implementation of Louvain. This PR exposes this parameter through the C++, C API, PLC and Python layers. The PR also renames the python parameter `max_iter` to be `max_level`, which is more appropriate semantically. 
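A minimal usage sketch of the new Python-level surface (the weighted `cugraph.Graph` named `G` is an assumption here; the defaults shown mirror the C++ defaults added in this change):

    import cugraph

    # G is assumed to be an existing, weighted cugraph.Graph
    parts, modularity = cugraph.louvain(
        G,
        max_level=100,    # formerly `max_iter`
        threshold=1e-7,   # newly exposed convergence threshold
        resolution=1.0,   # gamma in the modularity formula
    )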
Closes #3791 Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Naim (https://github.com/naimnv) - Joseph Nke (https://github.com/jnke2016) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3792 --- cpp/include/cugraph/algorithms.hpp | 4 ++ cpp/include/cugraph_c/community_algorithms.h | 2 + cpp/src/c_api/louvain.cpp | 7 ++- cpp/src/community/louvain_impl.cuh | 9 ++- cpp/src/community/louvain_mg.cu | 14 ++++- cpp/src/community/louvain_sg.cu | 14 ++++- cpp/tests/c_api/louvain_test.c | 7 ++- cpp/tests/c_api/mg_louvain_test.c | 6 +- cpp/tests/community/louvain_test.cpp | 52 ++++++++++++++-- cpp/tests/community/mg_louvain_test.cpp | 35 ++++++----- python/cugraph/cugraph/community/louvain.py | 45 ++++++++++++-- .../cugraph/cugraph/dask/community/louvain.py | 60 ++++++++++++++++--- .../_cugraph_c/community_algorithms.pxd | 1 + python/pylibcugraph/pylibcugraph/louvain.pyx | 15 +++-- .../pylibcugraph/tests/test_louvain.py | 8 ++- 15 files changed, 228 insertions(+), 51 deletions(-) diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 29a488e7505..b624ec5c0e0 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -589,6 +589,8 @@ weight_t hungarian(raft::handle_t const& handle, * @param[in] graph input graph object * @param[out] clustering Pointer to device array where the clustering should be stored * @param[in] max_level (optional) maximum number of levels to run (default 100) + * @param[in] threshold (optional) threshold for convergence at each level (default + * 1e-7) * @param[in] resolution (optional) The value of the resolution parameter to use. * Called gamma in the modularity formula, this changes the size * of the communities. Higher resolutions lead to more smaller @@ -607,6 +609,7 @@ std::pair louvain( std::optional> edge_weight_view, vertex_t* clustering, size_t max_level = 100, + weight_t threshold = weight_t{1e-7}, weight_t resolution = weight_t{1}); template @@ -652,6 +655,7 @@ std::pair>, weight_t> louvain( graph_view_t const& graph_view, std::optional> edge_weight_view, size_t max_level = 100, + weight_t threshold = weight_t{1e-7}, weight_t resolution = weight_t{1}); /** diff --git a/cpp/include/cugraph_c/community_algorithms.h b/cpp/include/cugraph_c/community_algorithms.h index fd0e1de9cb4..e938c77cccd 100644 --- a/cpp/include/cugraph_c/community_algorithms.h +++ b/cpp/include/cugraph_c/community_algorithms.h @@ -93,6 +93,7 @@ typedef struct { * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage * needs to be transposed * @param [in] max_level Maximum level in hierarchy + * @param [in] threshold Threshold parameter, defines convergence at each level of hierarchy * @param [in] resolution Resolution parameter (gamma) in modularity formula. * This changes the size of the communities. 
Higher resolutions * lead to more smaller communities, lower resolutions lead to @@ -107,6 +108,7 @@ typedef struct { cugraph_error_code_t cugraph_louvain(const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, size_t max_level, + double threshold, double resolution, bool_t do_expensive_check, cugraph_hierarchical_clustering_result_t** result, diff --git a/cpp/src/c_api/louvain.cpp b/cpp/src/c_api/louvain.cpp index ff75cafa031..0e48b29388a 100644 --- a/cpp/src/c_api/louvain.cpp +++ b/cpp/src/c_api/louvain.cpp @@ -36,6 +36,7 @@ struct louvain_functor : public cugraph::c_api::abstract_functor { raft::handle_t const& handle_; cugraph::c_api::cugraph_graph_t* graph_; size_t max_level_; + double threshold_; double resolution_; bool do_expensive_check_; cugraph::c_api::cugraph_hierarchical_clustering_result_t* result_{}; @@ -43,12 +44,14 @@ struct louvain_functor : public cugraph::c_api::abstract_functor { louvain_functor(::cugraph_resource_handle_t const* handle, ::cugraph_graph_t* graph, size_t max_level, + double threshold, double resolution, bool do_expensive_check) : abstract_functor(), handle_(*reinterpret_cast(handle)->handle_), graph_(reinterpret_cast(graph)), max_level_(max_level), + threshold_(threshold), resolution_(resolution), do_expensive_check_(do_expensive_check) { @@ -102,6 +105,7 @@ struct louvain_functor : public cugraph::c_api::abstract_functor { .view()), clusters.data(), max_level_, + static_cast(threshold_), static_cast(resolution_)); rmm::device_uvector vertices(graph_view.local_vertex_partition_range_size(), @@ -121,12 +125,13 @@ struct louvain_functor : public cugraph::c_api::abstract_functor { extern "C" cugraph_error_code_t cugraph_louvain(const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, size_t max_level, + double threshold, double resolution, bool_t do_expensive_check, cugraph_hierarchical_clustering_result_t** result, cugraph_error_t** error) { - louvain_functor functor(handle, graph, max_level, resolution, do_expensive_check); + louvain_functor functor(handle, graph, max_level, threshold, resolution, do_expensive_check); return cugraph::c_api::run_algorithm(graph, functor, result, error); } diff --git a/cpp/src/community/louvain_impl.cuh b/cpp/src/community/louvain_impl.cuh index 167de36dd13..7777921a091 100644 --- a/cpp/src/community/louvain_impl.cuh +++ b/cpp/src/community/louvain_impl.cuh @@ -47,6 +47,7 @@ std::pair>, weight_t> louvain( graph_view_t const& graph_view, std::optional> edge_weight_view, size_t max_level, + weight_t threshold, weight_t resolution) { using graph_t = cugraph::graph_t; @@ -169,7 +170,7 @@ std::pair>, weight_t> louvain( // during each iteration of the loop bool up_down = true; - while (new_Q > (cur_Q + 0.0001)) { + while (new_Q > (cur_Q + threshold)) { cur_Q = new_Q; next_clusters_v = detail::update_clustering_by_delta_modularity(handle, @@ -291,12 +292,13 @@ std::pair>, weight_t> louvain( graph_view_t const& graph_view, std::optional> edge_weight_view, size_t max_level, + weight_t threshold, weight_t resolution) { CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); CUGRAPH_EXPECTS(edge_weight_view.has_value(), "Graph must be weighted"); - return detail::louvain(handle, graph_view, edge_weight_view, max_level, resolution); + return detail::louvain(handle, graph_view, edge_weight_view, max_level, threshold, resolution); } template @@ -317,6 +319,7 @@ std::pair louvain( std::optional> edge_weight_view, vertex_t* clustering, size_t max_level, + weight_t threshold, weight_t resolution) { 
CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); @@ -328,7 +331,7 @@ std::pair louvain( weight_t modularity; std::tie(dendrogram, modularity) = - detail::louvain(handle, graph_view, edge_weight_view, max_level, resolution); + detail::louvain(handle, graph_view, edge_weight_view, max_level, threshold, resolution); detail::flatten_dendrogram(handle, graph_view, *dendrogram, clustering); diff --git a/cpp/src/community/louvain_mg.cu b/cpp/src/community/louvain_mg.cu index d6d266df273..0be32ed049f 100644 --- a/cpp/src/community/louvain_mg.cu +++ b/cpp/src/community/louvain_mg.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,36 +25,42 @@ template std::pair>, float> louvain( graph_view_t const&, std::optional>, size_t, + float, float); template std::pair>, float> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + float, float); template std::pair>, float> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + float, float); template std::pair>, double> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + double, double); template std::pair>, double> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + double, double); template std::pair>, double> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + double, double); template std::pair louvain( @@ -63,6 +69,7 @@ template std::pair louvain( std::optional>, int32_t*, size_t, + float, float); template std::pair louvain( raft::handle_t const&, @@ -70,6 +77,7 @@ template std::pair louvain( std::optional>, int32_t*, size_t, + double, double); template std::pair louvain( raft::handle_t const&, @@ -77,6 +85,7 @@ template std::pair louvain( std::optional>, int32_t*, size_t, + float, float); template std::pair louvain( raft::handle_t const&, @@ -84,6 +93,7 @@ template std::pair louvain( std::optional>, int32_t*, size_t, + double, double); template std::pair louvain( raft::handle_t const&, @@ -91,6 +101,7 @@ template std::pair louvain( std::optional>, int64_t*, size_t, + float, float); template std::pair louvain( raft::handle_t const&, @@ -98,6 +109,7 @@ template std::pair louvain( std::optional>, int64_t*, size_t, + double, double); } // namespace cugraph diff --git a/cpp/src/community/louvain_sg.cu b/cpp/src/community/louvain_sg.cu index 4e26aa1cf18..3fc0ffab928 100644 --- a/cpp/src/community/louvain_sg.cu +++ b/cpp/src/community/louvain_sg.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,36 +25,42 @@ template std::pair>, float> louvain( graph_view_t const&, std::optional>, size_t, + float, float); template std::pair>, float> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + float, float); template std::pair>, float> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + float, float); template std::pair>, double> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + double, double); template std::pair>, double> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + double, double); template std::pair>, double> louvain( raft::handle_t const&, graph_view_t const&, std::optional>, size_t, + double, double); template std::pair louvain( @@ -63,6 +69,7 @@ template std::pair louvain( std::optional>, int32_t*, size_t, + float, float); template std::pair louvain( raft::handle_t const&, @@ -70,6 +77,7 @@ template std::pair louvain( std::optional>, int32_t*, size_t, + double, double); template std::pair louvain( raft::handle_t const&, @@ -77,6 +85,7 @@ template std::pair louvain( std::optional>, int32_t*, size_t, + float, float); template std::pair louvain( raft::handle_t const&, @@ -84,6 +93,7 @@ template std::pair louvain( std::optional>, int32_t*, size_t, + double, double); template std::pair louvain( raft::handle_t const&, @@ -91,6 +101,7 @@ template std::pair louvain( std::optional>, int64_t*, size_t, + float, float); template std::pair louvain( raft::handle_t const&, @@ -98,6 +109,7 @@ template std::pair louvain( std::optional>, int64_t*, size_t, + double, double); } // namespace cugraph diff --git a/cpp/tests/c_api/louvain_test.c b/cpp/tests/c_api/louvain_test.c index f3813b5a1ac..e9ac5c9ff06 100644 --- a/cpp/tests/c_api/louvain_test.c +++ b/cpp/tests/c_api/louvain_test.c @@ -33,6 +33,7 @@ int generic_louvain_test(vertex_t* h_src, size_t num_vertices, size_t num_edges, size_t max_level, + double threshold, double resolution, bool_t store_transposed) { @@ -60,7 +61,7 @@ int generic_louvain_test(vertex_t* h_src, TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); ret_code = - cugraph_louvain(p_handle, p_graph, max_level, resolution, FALSE, &p_result, &ret_error); + cugraph_louvain(p_handle, p_graph, max_level, threshold, resolution, FALSE, &p_result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "cugraph_louvain failed."); @@ -108,6 +109,7 @@ int test_louvain() size_t num_edges = 16; size_t num_vertices = 6; size_t max_level = 10; + weight_t threshold = 1e-7; weight_t resolution = 1.0; vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; @@ -126,6 +128,7 @@ int test_louvain() num_vertices, num_edges, max_level, + threshold, resolution, FALSE); } @@ -135,6 +138,7 @@ int test_louvain_no_weight() size_t num_edges = 16; size_t num_vertices = 6; size_t max_level = 10; + weight_t threshold = 1e-7; weight_t resolution = 1.0; vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; @@ -151,6 +155,7 @@ int test_louvain_no_weight() num_vertices, num_edges, max_level, + threshold, resolution, FALSE); } diff --git a/cpp/tests/c_api/mg_louvain_test.c b/cpp/tests/c_api/mg_louvain_test.c index d4c10d49891..2465709c03c 100644 --- a/cpp/tests/c_api/mg_louvain_test.c +++ b/cpp/tests/c_api/mg_louvain_test.c @@ -33,6 +33,7 @@ int generic_louvain_test(const cugraph_resource_handle_t* p_handle, size_t num_vertices, size_t 
num_edges, size_t max_level, + double threshold, double resolution, bool_t store_transposed) { @@ -51,7 +52,7 @@ int generic_louvain_test(const cugraph_resource_handle_t* p_handle, TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); ret_code = - cugraph_louvain(p_handle, p_graph, max_level, resolution, FALSE, &p_result, &ret_error); + cugraph_louvain(p_handle, p_graph, max_level, threshold, resolution, FALSE, &p_result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "cugraph_louvain failed."); @@ -106,6 +107,7 @@ int test_louvain(const cugraph_resource_handle_t* handle) size_t num_edges = 8; size_t num_vertices = 6; size_t max_level = 10; + weight_t threshold = 1e-7; weight_t resolution = 1.0; vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; @@ -116,7 +118,7 @@ int test_louvain(const cugraph_resource_handle_t* handle) // Louvain wants store_transposed = FALSE return generic_louvain_test( - handle, h_src, h_dst, h_wgt, h_result, num_vertices, num_edges, max_level, resolution, FALSE); + handle, h_src, h_dst, h_wgt, h_result, num_vertices, num_edges, max_level, threshold, resolution, FALSE); } /******************************************************************************/ diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index 4792042365b..1e1fb6d4c33 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -30,8 +30,9 @@ #include struct Louvain_Usecase { - size_t max_level_{100}; - double resolution_{1}; + std::optional max_level_{std::nullopt}; + std::optional threshold_{std::nullopt}; + std::optional resolution_{std::nullopt}; bool check_correctness_{false}; int expected_level_{0}; float expected_modularity_{0}; @@ -54,6 +55,12 @@ class Tests_Louvain { auto [louvain_usecase, input_usecase] = param; + // Legacy implementation does not support resolution parameter, + // defaulting it to 1. If the test case is not resolution + // 1 then skip it. 
+ if (louvain_usecase.resolution_) + if (louvain_usecase.resolution_ != double{1}) return; + raft::handle_t handle{}; bool directed{false}; @@ -134,6 +141,9 @@ class Tests_Louvain EXPECT_THROW(louvain(graph_view, edge_weight_view, graph_view.local_vertex_partition_range_size(), + louvain_usecase.max_level_, + louvain_usecase.threshold_, + louvain_usecase.resolution_, louvain_usecase.check_correctness_, louvain_usecase.expected_level_, louvain_usecase.expected_modularity_), @@ -142,6 +152,9 @@ class Tests_Louvain louvain(graph_view, edge_weight_view, graph_view.local_vertex_partition_range_size(), + louvain_usecase.max_level_, + louvain_usecase.threshold_, + louvain_usecase.resolution_, louvain_usecase.check_correctness_, louvain_usecase.expected_level_, louvain_usecase.expected_modularity_); @@ -185,6 +198,9 @@ class Tests_Louvain cugraph::graph_view_t const& graph_view, std::optional> edge_weight_view, vertex_t num_vertices, + std::optional max_level, + std::optional threshold, + std::optional resolution, bool check_correctness, int expected_level, float expected_modularity) @@ -195,8 +211,29 @@ class Tests_Louvain size_t level; weight_t modularity; - std::tie(level, modularity) = cugraph::louvain( - handle, graph_view, edge_weight_view, clustering_v.data(), size_t{100}, weight_t{1}); + if (resolution) { + std::tie(level, modularity) = + cugraph::louvain(handle, + graph_view, + edge_weight_view, + clustering_v.data(), + max_level ? *max_level : size_t{100}, + threshold ? static_cast(*threshold) : weight_t{1e-7}, + static_cast(*resolution)); + } else if (threshold) { + std::tie(level, modularity) = cugraph::louvain(handle, + graph_view, + edge_weight_view, + clustering_v.data(), + max_level ? *max_level : size_t{100}, + static_cast(*threshold)); + } else if (max_level) { + std::tie(level, modularity) = + cugraph::louvain(handle, graph_view, edge_weight_view, clustering_v.data(), *max_level); + } else { + std::tie(level, modularity) = + cugraph::louvain(handle, graph_view, edge_weight_view, clustering_v.data()); + } RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -421,8 +458,11 @@ TEST_P(Tests_Louvain_Rmat64, CheckInt64Int64FloatFloat) INSTANTIATE_TEST_SUITE_P( simple_test, Tests_Louvain_File, - ::testing::Combine(::testing::Values(Louvain_Usecase{100, 1, true, 3, 0.408695}), - ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); + ::testing::Combine( + ::testing::Values(Louvain_Usecase{std::nullopt, std::nullopt, std::nullopt, true, 3, 0.408695}, + Louvain_Usecase{20, double{1e-4}, std::nullopt, true, 3, 0.408695}, + Louvain_Usecase{100, double{1e-4}, double{0.8}, true, 3, 0.48336622}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index f89301c485b..41339e32d77 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -41,6 +41,7 @@ // struct Louvain_Usecase { size_t max_level_{100}; + double threshold_{1e-7}; double resolution_{1}; bool check_correctness_{true}; }; @@ -72,6 +73,7 @@ class Tests_MGLouvain cugraph::graph_view_t const& mg_graph_view, std::optional> mg_edge_weight_view, cugraph::Dendrogram const& mg_dendrogram, + weight_t threshold, weight_t resolution, weight_t mg_modularity) { @@ -100,6 +102,7 @@ class Tests_MGLouvain 
&sg_edge_weights, &sg_modularity, &handle, + threshold, resolution, comm_rank](size_t i) { rmm::device_uvector d_mg_aggregate_cluster_v(0, handle.get_stream()); @@ -128,6 +131,7 @@ class Tests_MGLouvain sg_edge_weight_view, d_sg_cluster_v.data(), size_t{1}, + threshold, resolution); EXPECT_TRUE(cugraph::test::check_invertible( @@ -185,12 +189,13 @@ class Tests_MGLouvain hr_timer.start("MG Louvain"); } - auto [dendrogram, mg_modularity] = - cugraph::louvain(*handle_, - mg_graph_view, - mg_edge_weight_view, - louvain_usecase.max_level_, - louvain_usecase.resolution_); + auto [dendrogram, mg_modularity] = cugraph::louvain( + *handle_, + mg_graph_view, + mg_edge_weight_view, + louvain_usecase.max_level_, + static_cast(louvain_usecase.threshold_), + static_cast(louvain_usecase.resolution_)); if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -206,6 +211,7 @@ class Tests_MGLouvain mg_graph_view, mg_edge_weight_view, *dendrogram, + louvain_usecase.threshold_, louvain_usecase.resolution_, mg_modularity); } @@ -257,15 +263,16 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGLouvain_File, ::testing::Combine( // enable correctness checks for small graphs - ::testing::Values(Louvain_Usecase{100, 1}), + ::testing::Values(Louvain_Usecase{100, double{1e-7}, 1}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); -INSTANTIATE_TEST_SUITE_P(rmat_small_tests, - Tests_MGLouvain_Rmat, - ::testing::Combine(::testing::Values(Louvain_Usecase{100, 1}), - ::testing::Values(cugraph::test::Rmat_Usecase( - 10, 16, 0.57, 0.19, 0.19, 0, true, false)))); +INSTANTIATE_TEST_SUITE_P( + rmat_small_tests, + Tests_MGLouvain_Rmat, + ::testing::Combine( + ::testing::Values(Louvain_Usecase{100, double{1e-7}, 1}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); INSTANTIATE_TEST_SUITE_P( file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with @@ -276,7 +283,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGLouvain_File, ::testing::Combine( // disable correctness checks for large graphs - ::testing::Values(Louvain_Usecase{100, 1, false}), + ::testing::Values(Louvain_Usecase{100, double{1e-7}, 1, false}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( @@ -288,7 +295,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGLouvain_Rmat, ::testing::Combine( // disable correctness checks for large graphs - ::testing::Values(Louvain_Usecase{100, 1, false}), + ::testing::Values(Louvain_Usecase{100, double{1e-7}, 1, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/python/cugraph/cugraph/community/louvain.py b/python/cugraph/cugraph/community/louvain.py index 35ca864824f..7f9742c8f09 100644 --- a/python/cugraph/cugraph/community/louvain.py +++ b/python/cugraph/cugraph/community/louvain.py @@ -17,11 +17,13 @@ ) import cudf +import warnings from pylibcugraph import louvain as pylibcugraph_louvain from pylibcugraph import ResourceHandle -def louvain(G, max_iter=100, resolution=1.0): +# FIXME: max_level should default to 100 once max_iter is removed +def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): """ Compute the modularity optimizing partition of the input graph using the Louvain method @@ -40,18 +42,30 @@ def louvain(G, max_iter=100, resolution=1.0): present. 
The current implementation only supports undirected graphs. - max_iter : integer, optional (default=100) - This controls the maximum number of levels/iterations of the Louvain + max_level : integer, optional (default=100) + This controls the maximum number of levels of the Louvain algorithm. When specified the algorithm will terminate after no more - than the specified number of iterations. No error occurs when the + than the specified number of levels. No error occurs when the algorithm terminates early in this manner. - resolution: float/double, optional (default=1.0) + max_iter : integer, optional (default=None) + This parameter is deprecated in favor of max_level. Previously + it was used to control the maximum number of levels of the Louvain + algorithm. + + resolution: float, optional (default=1.0) Called gamma in the modularity formula, this changes the size of the communities. Higher resolutions lead to more smaller communities, lower resolutions lead to fewer larger communities. Defaults to 1. + threshold: float + Modularity gain threshold for each level. If the gain of + modularity between 2 levels of the algorithm is less than the + given threshold then the algorithm stops and returns the + resulting communities. + Defaults to 1e-7. + Returns ------- parts : cudf.DataFrame @@ -80,10 +94,29 @@ def louvain(G, max_iter=100, resolution=1.0): if G.is_directed(): raise ValueError("input graph must be undirected") + # FIXME: This max_iter logic and the max_level defaulting can be deleted + # in favor of defaulting max_level in call once max_iter is deleted + if max_iter: + if max_level: + raise ValueError( + "max_iter is deprecated. Cannot specify both max_iter and max_level" + ) + + warning_msg = ( + "max_iter has been renamed max_level. Use of max_iter is " + "deprecated and will no longer be supported in the next releases." 
+ ) + warnings.warn(warning_msg, FutureWarning) + max_level = max_iter + + if max_level is None: + max_level = 100 + vertex, partition, mod_score = pylibcugraph_louvain( resource_handle=ResourceHandle(), graph=G._plc_graph, - max_level=max_iter, + max_level=max_level, + threshold=threshold, resolution=resolution, do_expensive_check=False, ) diff --git a/python/cugraph/cugraph/dask/community/louvain.py b/python/cugraph/cugraph/dask/community/louvain.py index c003939f5eb..8efbbafaf7b 100644 --- a/python/cugraph/cugraph/dask/community/louvain.py +++ b/python/cugraph/cugraph/dask/community/louvain.py @@ -28,6 +28,8 @@ from pylibcugraph import louvain as pylibcugraph_louvain from typing import Tuple, TYPE_CHECKING +import warnings + if TYPE_CHECKING: from cugraph import Graph @@ -45,19 +47,30 @@ def convert_to_cudf(result: cp.ndarray) -> Tuple[cudf.DataFrame, float]: def _call_plc_louvain( - sID: bytes, mg_graph_x, max_iter: int, resolution: int, do_expensive_check: bool + sID: bytes, + mg_graph_x, + max_level: int, + threshold: float, + resolution: float, + do_expensive_check: bool, ) -> Tuple[cp.ndarray, cp.ndarray, float]: return pylibcugraph_louvain( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), graph=mg_graph_x, - max_level=max_iter, + max_level=max_level, + threshold=threshold, resolution=resolution, do_expensive_check=do_expensive_check, ) +# FIXME: max_level should default to 100 once max_iter is removed def louvain( - input_graph: Graph, max_iter: int = 100, resolution: int = 1.0 + input_graph: Graph, + max_level: int = None, + max_iter: int = None, + resolution: float = 1.0, + threshold: float = 1e-7, ) -> Tuple[dask_cudf.DataFrame, float]: """ Compute the modularity optimizing partition of the input graph using the @@ -77,17 +90,27 @@ def louvain( present. The current implementation only supports undirected graphs. - max_iter : integer, optional (default=100) - This controls the maximum number of levels/iterations of the Louvain + max_level : integer, optional (default=100) + This controls the maximum number of levels of the Louvain algorithm. When specified the algorithm will terminate after no more - than the specified number of iterations. No error occurs when the + than the specified number of levels. No error occurs when the algorithm terminates early in this manner. - resolution: float/double, optional (default=1.0) + max_iter : integer, optional (default=None) + This parameter is deprecated in favor of max_level. Previously + it was used to control the maximum number of levels of the Louvain + algorithm. + + resolution: float, optional (default=1.0) Called gamma in the modularity formula, this changes the size of the communities. Higher resolutions lead to more smaller communities, lower resolutions lead to fewer larger communities. - Defaults to 1. + + threshold: float, optional (default=1e-7) + Modularity gain threshold for each level. If the gain of + modularity between 2 levels of the algorithm is less than the + given threshold then the algorithm stops and returns the + resulting communities. Returns ------- @@ -115,6 +138,24 @@ def louvain( if input_graph.is_directed(): raise ValueError("input graph must be undirected") + # FIXME: This max_iter logic and the max_level defaulting can be deleted + # in favor of defaulting max_level in call once max_iter is deleted + if max_iter: + if max_level: + raise ValueError( + "max_iter is deprecated. Cannot specify both max_iter and max_level" + ) + + warning_msg = ( + "max_iter has been renamed max_level. 
Use of max_iter is " + "deprecated and will no longer be supported in the next releases. " + ) + warnings.warn(warning_msg, FutureWarning) + max_level = max_iter + + if max_level is None: + max_level = 100 + # Initialize dask client client = default_client() @@ -125,7 +166,8 @@ def louvain( _call_plc_louvain, Comms.get_session_id(), input_graph._plc_graph[w], - max_iter, + max_level, + threshold, resolution, do_expensive_check, workers=[w], diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd index 67ba43bf611..64944e8773f 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd @@ -98,6 +98,7 @@ cdef extern from "cugraph_c/community_algorithms.h": const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, size_t max_level, + double threshold, double resolution, bool_t do_expensive_check, cugraph_hierarchical_clustering_result_t** result, diff --git a/python/pylibcugraph/pylibcugraph/louvain.pyx b/python/pylibcugraph/pylibcugraph/louvain.pyx index ecae7e700b4..eca569d7da1 100644 --- a/python/pylibcugraph/pylibcugraph/louvain.pyx +++ b/python/pylibcugraph/pylibcugraph/louvain.pyx @@ -51,7 +51,8 @@ from pylibcugraph.utils cimport ( def louvain(ResourceHandle resource_handle, _GPUGraph graph, size_t max_level, - double resolution, + float threshold, + float resolution, bool_t do_expensive_check): """ Compute the modularity optimizing partition of the input graph using the @@ -72,11 +73,16 @@ def louvain(ResourceHandle resource_handle, than the specified number of iterations. No error occurs when the algorithm terminates early in this manner. - resolution: double + threshold: float + Modularity gain threshold for each level. If the gain of + modularity between 2 levels of the algorithm is less than the + given threshold then the algorithm stops and returns the + resulting communities. + + resolution: float Called gamma in the modularity formula, this changes the size of the communities. Higher resolutions lead to more smaller communities, lower resolutions lead to fewer larger communities. - Defaults to 1. do_expensive_check : bool_t If True, performs more extensive tests on the inputs to ensure @@ -100,7 +106,7 @@ def louvain(ResourceHandle resource_handle, ... resource_handle, graph_props, srcs, dsts, weights, ... store_transposed=True, renumber=False, do_expensive_check=False) >>> (vertices, clusters, modularity) = pylibcugraph.louvain( - resource_handle, G, 100, 1., False) + resource_handle, G, 100, 1e-7, 1., False) >>> vertices [0, 1, 2] >>> clusters @@ -119,6 +125,7 @@ def louvain(ResourceHandle resource_handle, error_code = cugraph_louvain(c_resource_handle_ptr, c_graph_ptr, max_level, + threshold, resolution, do_expensive_check, &result_ptr, diff --git a/python/pylibcugraph/pylibcugraph/tests/test_louvain.py b/python/pylibcugraph/pylibcugraph/tests/test_louvain.py index d2027a46d9a..adea5e01f15 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_louvain.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -77,6 +77,7 @@ def test_sg_louvain_cupy(): ) max_level = 100 + threshold = 0.0001 resolution = 1.0 sg = SGGraph( @@ -91,7 +92,7 @@ ) vertices, clusters, modularity = louvain( - resource_handle, sg, max_level, resolution, do_expensive_check=False + resource_handle, sg, max_level, threshold, resolution, do_expensive_check=False ) check_results(vertices, clusters, modularity) @@ -130,6 +131,7 @@ def test_sg_louvain_cudf(): ) max_level = 100 + threshold = 0.0001 resolution = 1.0 sg = SGGraph( @@ -144,7 +146,7 @@ ) vertices, clusters, modularity = louvain( - resource_handle, sg, max_level, resolution, do_expensive_check=False + resource_handle, sg, max_level, threshold, resolution, do_expensive_check=False ) check_results(vertices, clusters, modularity) From 4ee227c7c84ef487828ecadd5fe86934f1fce4eb Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Thu, 7 Sep 2023 16:39:22 -0500 Subject: [PATCH 34/72] Remove the assumption made on the client data's keys (#3835) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `client.has_what()` returns the keys of the data held in each worker's memory. Those keys used to be returned as strings, but a recent change in `dask` changed their type to tuples: from `{worker_ip_address: ("('from-delayed-190587f1b2318dc54d5f92a79e59b71a', 0)", "('from-delayed-190587f1b2318dc54d5f92a79e59b71a', 1)")}` to `{worker_ip_address: (('from-delayed-c3d92b2cc9948634e82a0b2b62453a6c', 0), ('from-delayed-c3d92b2cc9948634e82a0b2b62453a6c', 1))}`. When mapping workers to persisted data in the function `get_persisted_df_worker_map`, an assumption was made about the type of those keys, which broke our MG tests. This PR removes that assumption. Closes #3834 Authors: - Joseph Nke (https://github.com/jnke2016) - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3835 --- .../cugraph/cugraph/dask/common/part_utils.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cugraph/cugraph/dask/common/part_utils.py b/python/cugraph/cugraph/dask/common/part_utils.py index fda7e257367..7c0aad6c3ee 100644 --- a/python/cugraph/cugraph/dask/common/part_utils.py +++ b/python/cugraph/cugraph/dask/common/part_utils.py @@ -73,7 +73,7 @@ def persist_distributed_data(dask_df, client): _keys = dask_df.__dask_keys__() worker_dict = {} for i, key in enumerate(_keys): - worker_dict[str(key)] = tuple([worker_addresses[i]]) + worker_dict[key] = tuple([worker_addresses[i]]) persisted = client.persist(dask_df, workers=worker_dict) parts = futures_of(persisted) return parts @@ -89,7 +89,7 @@ def get_persisted_df_worker_map(dask_df, client): ddf_keys = futures_of(dask_df) output_map = {} for w, w_keys in client.has_what().items(): - output_map[w] = [ddf_k for ddf_k in ddf_keys if str(ddf_k.key) in w_keys] + output_map[w] = [ddf_k for ddf_k in ddf_keys if ddf_k.key in w_keys] if len(output_map[w]) == 0: output_map[w] = _create_empty_dask_df_future(dask_df._meta, client, w) return output_map @@ -157,7 +157,7 @@ async def _extract_partitions( # NOTE: We colocate (X, y) here by zipping delayed # n partitions of them as (X1, y1), (X2, y2)... # and asking client to compute a single future for - # each tuple in the list.
dela = [np.asarray(d.to_delayed()) for d in dask_obj] # TODO: ravel() is causing strange behavior w/ delayed Arrays which are @@ -167,7 +167,7 @@ async def _extract_partitions( parts = client.compute([p for p in zip(*raveled)]) await wait(parts) - key_to_part = [(str(part.key), part) for part in parts] + key_to_part = [(part.key, part) for part in parts] who_has = await client.who_has(parts) return [(first(who_has[key]), part) for key, part in key_to_part] @@ -229,7 +229,7 @@ def load_balance_func(ddf_, by, client=None): wait(parts) who_has = client.who_has(parts) - key_to_part = [(str(part.key), part) for part in parts] + key_to_part = [(part.key, part) for part in parts] gpu_fututres = [ (first(who_has[key]), part.key[1], part) for key, part in key_to_part ] @@ -245,7 +245,7 @@ def load_balance_func(ddf_, by, client=None): for cumsum in cumsum_parts: num_rows.append(cumsum.iloc[-1]) - # Calculate current partition divisions + # Calculate current partition divisions. divisions = [sum(num_rows[0:x:1]) for x in range(0, len(num_rows) + 1)] divisions[-1] = divisions[-1] - 1 divisions = tuple(divisions) @@ -271,7 +271,7 @@ def load_balance_func(ddf_, by, client=None): def concat_dfs(df_list): """ - Concat a list of cudf dataframes + Concat a list of cudf dataframes. """ return cudf.concat(df_list) @@ -279,17 +279,17 @@ def concat_dfs(df_list): def get_delayed_dict(ddf): """ Returns a dicitionary with the dataframe tasks as keys and - the dataframe delayed objects as values + the dataframe delayed objects as values. """ df_delayed = {} for delayed_obj in ddf.to_delayed(): - df_delayed[str(delayed_obj.key)] = delayed_obj + df_delayed[delayed_obj.key] = delayed_obj return df_delayed def concat_within_workers(client, ddf): """ - Concats all partitions within workers without transfers + Concats all partitions within workers without transfers. """ df_delayed = get_delayed_dict(ddf) From 6779e896edf310f5bcaad5acb8673995041c2801 Mon Sep 17 00:00:00 2001 From: ralph <137829296+nv-rliu@users.noreply.github.com> Date: Fri, 8 Sep 2023 09:58:44 -0400 Subject: [PATCH 35/72] Adding metadata getter methods to datasets API (#3821) Closes #3820 This PR adds simple getter methods to the `dataset` class, which allows users to easily get information about datasets without need to access the `metadata` dict or look in the directory. ```python from cugraph.datasets import karate # users now call karate.number_of_nodes() # instead of karate.metadata['number_of_nodes'] ``` Authors: - ralph (https://github.com/nv-rliu) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3821 --- python/cugraph/cugraph/datasets/dataset.py | 36 +++++++++++++++++++ .../cugraph/tests/utils/test_dataset.py | 10 ++++++ 2 files changed, 46 insertions(+) diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index b276a87b88e..877eade7708 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -266,6 +266,42 @@ def get_path(self): return self._path.absolute() + def is_directed(self): + """ + Returns True if the graph is a directed graph. + """ + return self.metadata["is_directed"] + + def is_multigraph(self): + """ + Returns True if the graph is a multigraph. + """ + return self.metadata["is_multigraph"] + + def is_symmetric(self): + """ + Returns True if the graph is symmetric. 
+ """ + return self.metadata["is_symmetric"] + + def number_of_nodes(self): + """ + An alias of number_of_vertices() + """ + return self.number_of_vertices() + + def number_of_vertices(self): + """ + Get the number of vertices in the graph. + """ + return self.metadata["number_of_nodes"] + + def number_of_edges(self): + """ + Get the number of edges in the graph. + """ + return self.metadata["number_of_edges"] + def download_all(force=False): """ diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 643d0468d46..c2a4f7c6072 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -328,6 +328,16 @@ def test_is_multigraph(dataset): assert G.is_multigraph() == dataset.metadata["is_multigraph"] +@pytest.mark.parametrize("dataset", ALL_DATASETS) +def test_object_getters(dataset): + assert dataset.is_directed() == dataset.metadata["is_directed"] + assert dataset.is_multigraph() == dataset.metadata["is_multigraph"] + assert dataset.is_symmetric() == dataset.metadata["is_symmetric"] + assert dataset.number_of_nodes() == dataset.metadata["number_of_nodes"] + assert dataset.number_of_vertices() == dataset.metadata["number_of_nodes"] + assert dataset.number_of_edges() == dataset.metadata["number_of_edges"] + + # # Test experimental for DeprecationWarnings # From 17b34479094e42e1401d0e5354d8da98672ba291 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Fri, 8 Sep 2023 13:38:22 -0500 Subject: [PATCH 36/72] Uses `conda mambabuild` rather than `mamba mambabuild` (#3853) Applies same changes for the same reasons as cuDF PR https://github.com/rapidsai/cudf/pull/14067 to cuGraph. Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/3853 --- ci/build_cpp.sh | 2 +- ci/build_python.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3fd57f24c40..3fb72cac08b 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -11,6 +11,6 @@ rapids-print-env rapids-logger "Begin cpp build" -rapids-mamba-retry mambabuild conda/recipes/libcugraph +rapids-conda-retry mambabuild conda/recipes/libcugraph rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 429ba649d1d..62eb6c2ccec 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -15,12 +15,12 @@ rapids-logger "Begin py build" # TODO: Remove `--no-test` flags once importing on a CPU # node works correctly -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcugraph -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -30,7 +30,7 @@ rapids-mamba-retry mambabuild \ # platform to ensure it is included in each set of artifacts, since test # scripts only install from one set of artifacts based on the CUDA version used # for the test run. 
-rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -40,7 +40,7 @@ rapids-mamba-retry mambabuild \ # built on each CUDA platform to ensure they are included in each set of # artifacts, since test scripts only install from one set of artifacts based on # the CUDA version used for the test run. -rapids-mamba-retry mambabuild \ +rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -50,7 +50,7 @@ RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then # Only CUDA 11 is supported right now due to PyTorch requirement. - rapids-mamba-retry mambabuild \ + rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -60,7 +60,7 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then conda/recipes/cugraph-pyg # Only CUDA 11 is supported right now due to PyTorch requirement. - rapids-mamba-retry mambabuild \ + rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ From e55c131e35081e368db9b315a5f9706e048709f8 Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Wed, 13 Sep 2023 13:01:24 -0400 Subject: [PATCH 37/72] Fix subtle memory leak in nbr_intersection primitive (#3858) Closes https://github.com/rapidsai/graph_dl/issues/259 A customer found a subtle memory leak in Jaccard similarity. Tracked it down to this subtle error. `major_nbr_indices` is an `std::optional` that is initialized to `std::nullopt`. Overwriting the dereferenced entry replaces the value but does not mark the optional as containing a value. So the resulting value is never destroyed. Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/3858 --- cpp/src/prims/detail/nbr_intersection.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh index f4c4745b14c..2f30faebb3e 100644 --- a/cpp/src/prims/detail/nbr_intersection.cuh +++ b/cpp/src/prims/detail/nbr_intersection.cuh @@ -1023,7 +1023,7 @@ nbr_intersection(raft::handle_t const& handle, (*major_nbr_offsets).begin() + 1); } - std::tie(*major_nbr_indices, std::ignore) = shuffle_values( + std::tie(major_nbr_indices, std::ignore) = shuffle_values( major_comm, local_nbrs_for_rx_majors.begin(), local_nbr_counts, handle.get_stream()); if constexpr (!std::is_same_v) { From 5f7616173069cee5d856348f6084684962c670d6 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Mon, 18 Sep 2023 21:21:14 -0700 Subject: [PATCH 38/72] Sampling post processing functions to accelerate MFG creation. (#3815) Closes #3787 Closes #3788 Added C++ functions to accelerate the MFG (message flow graph) creation step after sampling in the end-to-end GNN workflow. Three C++ public functions are added to accelerate the GNN workflow. 
1) renumbering + compression (CSR/DCSR/CSC/DCSC) 2) renumbering + sorting (COO) 3) sorting only (COO) Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Joseph Nke (https://github.com/jnke2016) - Vibhu Jawa (https://github.com/VibhuJawa) - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3815 --- cpp/CMakeLists.txt | 1 + .../cugraph/detail/utility_wrappers.hpp | 4 +- cpp/include/cugraph/graph_functions.hpp | 4 + cpp/include/cugraph/sampling_functions.hpp | 296 +++ cpp/src/c_api/uniform_neighbor_sampling.cpp | 2 +- cpp/src/prims/kv_store.cuh | 1 + ...r_v_random_select_transform_outgoing_e.cuh | 4 +- .../renumber_sampled_edgelist_impl.cuh | 2 + .../sampling/renumber_sampled_edgelist_sg.cu | 3 +- .../sampling_post_processing_impl.cuh | 1800 +++++++++++++++++ .../sampling/sampling_post_processing_sg.cu | 389 ++++ cpp/tests/CMakeLists.txt | 6 +- .../renumber_sampled_edgelist_test.cu | 512 ----- .../sampling/sampling_post_processing_test.cu | 1457 +++++++++++++ 14 files changed, 3960 insertions(+), 521 deletions(-) create mode 100644 cpp/include/cugraph/sampling_functions.hpp create mode 100644 cpp/src/sampling/sampling_post_processing_impl.cuh create mode 100644 cpp/src/sampling/sampling_post_processing_sg.cu delete mode 100644 cpp/tests/sampling/renumber_sampled_edgelist_test.cu create mode 100644 cpp/tests/sampling/sampling_post_processing_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 370e665106d..69a488de0b8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -228,6 +228,7 @@ set(CUGRAPH_SOURCES src/sampling/uniform_neighbor_sampling_mg.cpp src/sampling/uniform_neighbor_sampling_sg.cpp src/sampling/renumber_sampled_edgelist_sg.cu + src/sampling/sampling_post_processing_sg.cu src/cores/core_number_sg.cu src/cores/core_number_mg.cu src/cores/k_core_sg.cu diff --git a/cpp/include/cugraph/detail/utility_wrappers.hpp b/cpp/include/cugraph/detail/utility_wrappers.hpp index a15dbf34cf9..faa0fbb841b 100644 --- a/cpp/include/cugraph/detail/utility_wrappers.hpp +++ b/cpp/include/cugraph/detail/utility_wrappers.hpp @@ -37,8 +37,8 @@ namespace detail { * @param[in] stream_view stream view * @param[out] d_value device array to fill * @param[in] size number of elements in array - * @param[in] min_value minimum value - * @param[in] max_value maximum value + * @param[in] min_value minimum value (inclusive) + * @param[in] max_value maximum value (exclusive) * @param[in] rng_state The RngState instance holding pseudo-random number generator state. * */ diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 200ee725b7a..5c1e9d5311f 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -919,6 +919,10 @@ rmm::device_uvector select_random_vertices( /** * @brief renumber sampling output * + * @deprecated This API will be deprecated and will be replaced by the + * renumber_and_compress_sampled_edgelist and renumber_and_sort_sampled_edgelist functions in + * sampling_functions.hpp. + * * This function renumbers sampling function (e.g. uniform_neighbor_sample) outputs satisfying the * following requirements. 
* diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp new file mode 100644 index 00000000000..e42ef9bfcf3 --- /dev/null +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include + +namespace cugraph { + +/* + * @brief renumber sampled edge list and compress to the (D)CSR|(D)CSC format. + * + * This function renumbers sampling function (e.g. uniform_neighbor_sample) output edges fulfilling + * the following requirements. Assume major = source if @p src_is_major is true, major = destination + * if @p src_is_major is false. + * + * 1. If @p edgelist_hops is valid, we can consider (vertex ID, hop, flag=major) triplets for each + * vertex ID in edge majors (@p edgelist_srcs if @p src_is_major is true, @p edgelist_dsts if false) + * and (vertex ID, hop, flag=minor) triplets for each vertex ID in edge minors. From these triplets, + * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and + * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are + * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs + * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. + * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that + * appear only in edge minors. + * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be + * renumbered separately. + * + * The renumbered edges are compressed based on the following requirements. + * + * 1. If @p compress_per_hop is true, edges are compressed separately for each hop. If @p + * compress_per_hop is false, edges with different hop numbers are compressed altogether. + * 2. Edges are compressed independently for different labels. + * 3. If @p doubly_compress is false, edges are compressed to CSR (if @p src_is_major is true) or + * CSC (if @p src_is_major is false). If @p doubly_compress is true, edges are compressed to DCSR + * (if @p src_is_major is true) or DCSC (if @p src_is_major is false). If @p doubly_compress is + * false, the CSR/CSC offset array size is the number of vertices (which is the maximum vertex ID + + * 1) + 1. Here, the maximum vertex ID is the maximum major vertex ID in the edges to compress if @p + * compress_per_hop is false or for hop 0. If @p compress_per_hop is true and hop number is 1 or + * larger, the maximum vertex ID is the larger of the maximum major vertex ID for this hop and the + * maximum vertex ID for the edges in the previous hops. + * + * If both @p compress_per_hop is false and @p edgelist_hops.has_value() is true, majors should be + * non-decreasing within each label after renumbering and sorting by (hop, major, minor). 
Also, + * majors in hop N should not appear in any of the previous hops. This condition is satisfied if + * majors in hop N + 1 does not have any vertices from the previous hops excluding the minors from + * hop N. + * + * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weight. Needs to be floating point type + * @tparam edge_id_t Type of edge id. Needs to be an integral type + * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is + * supported + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param edgelist_srcs A vector storing edgelist source vertices. + * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p + * edgelist_srcs.size()). + * @param edgelist_weights An optional vector storing edgelist weights (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p + * edgelist_srcs.size() if valid). + * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p + * edgelist_srcs.size() if valid) and the number of hops. + * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label + * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of + * labels. + * @param src_is_major A flag to determine whether to use the source or destination as the + * major key in renumbering and compression. + * @param compress_per_hop A flag to determine whether to compress edges with different hop numbers + * separately (if true) or altogether (if false). If @p compress_per_hop is true, @p + * edgelist_hops.has_value() should be true and @p doubly_compress should be false. + * @param doubly_compress A flag to determine whether to compress to the CSR/CSC format (if false) + * or the DCSR/DCSC format (if true). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return Tuple of vectors storing optional DCSR/DCSC major vertex IDs with one or more neighbors, + * (D)CSR|(D)CSC offset values, edge minor vertex IDs, optional edge weights (valid only if @p + * edgelist_weights.has_value() is true), optional edge IDs (valid only if @p + * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p + * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the + * (D)CSR|(D)CSC offset array (size = # labels * # hops + 1, where # labels = + * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 + * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 + * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p + * edgelist_hops.has_value() is rue), renumber_map to query original vertices (size = # unique + * vertices or aggregate # unique vertices for every label), and label offsets to the renumber_map + * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p + * edgelist_label_offsets.has_value() is true). 
+ */ +template +std::tuple>, // dcsr/dcsc major vertices + rmm::device_uvector, // (d)csr/(d)csc offset values + rmm::device_uvector, // minor vertices + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the (d)csr/(d)csc + // offset array + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major = true, + bool compress_per_hop = false, + bool doubly_compress = false, + bool do_expensive_check = false); + +/* + * @brief renumber sampled edge list and sort the renumbered edges. + * + * This function renumbers sampling function (e.g. uniform_neighbor_sample) output edges fulfilling + * the following requirements. Assume major = source if @p src_is_major is true, major = destination + * if @p src_is_major is false. + * + * 1. If @p edgelist_hops is valid, we can consider (vertex ID, hop, flag=major) triplets for each + * vertex ID in edge majors (@p edgelist_srcs if @p src_is_major is true, @p edgelist_dsts if false) + * and (vertex ID, hop, flag=minor) triplets for each vertex ID in edge minors. From these triplets, + * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and + * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are + * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs + * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs. + * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that + * appear only in edge minors. + * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be + * renumbered separately. + * + * The renumbered edges are sorted based on the following rules. + * + * 1. If @p src_is_major is true, use ((hop), src, dst) as the key in sorting. If @p src_is_major is + * false, use ((hop), dst, src) instead. hop is used only if @p edgelist_hops.has_value() is true. + * 2. Edges in each label are sorted independently if @p edgelist_label_offsets.has_value() is true. + * + * This function is single-GPU only (we are not aware of any practical multi-GPU use cases). + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weight. Needs to be floating point type + * @tparam edge_id_t Type of edge id. Needs to be an integral type + * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is + * supported + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param edgelist_srcs A vector storing edgelist source vertices. + * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p + * edgelist_srcs.size()). + * @param edgelist_weights An optional vector storing edgelist weights (size = @p + * edgelist_srcs.size() if valid). 
+
+/*
+ * @brief renumber sampled edge list and sort the renumbered edges.
+ *
+ * This function renumbers sampling function (e.g. uniform_neighbor_sample) output edges fulfilling
+ * the following requirements. Assume major = source if @p src_is_major is true, major = destination
+ * if @p src_is_major is false.
+ *
+ * 1. If @p edgelist_hops is valid, we can consider (vertex ID, hop, flag=major) triplets for each
+ * vertex ID in edge majors (@p edgelist_srcs if @p src_is_major is true, @p edgelist_dsts if false)
+ * and (vertex ID, hop, flag=minor) triplets for each vertex ID in edge minors. From these triplets,
+ * we can find the minimum (hop, flag) pairs for every unique vertex ID (hop is the primary key and
+ * flag is the secondary key, flag=major is considered smaller than flag=minor if hop numbers are
+ * same). Vertex IDs with smaller (hop, flag) pairs precede vertex IDs with larger (hop, flag) pairs
+ * in renumbering. Ordering can be arbitrary among the vertices with the same (hop, flag) pairs.
+ * 2. If @p edgelist_hops is invalid, unique vertex IDs in edge majors precede vertex IDs that
+ * appear only in edge minors.
+ * 3. If edgelist_label_offsets.has_value() is true, edge lists for different labels will be
+ * renumbered separately.
+ *
+ * The renumbered edges are sorted based on the following rules.
+ *
+ * 1. If @p src_is_major is true, use ((hop), src, dst) as the key in sorting. If @p src_is_major is
+ * false, use ((hop), dst, src) instead. hop is used only if @p edgelist_hops.has_value() is true.
+ * 2. Edges in each label are sorted independently if @p edgelist_label_offsets.has_value() is true.
+ *
+ * This function is single-GPU only (we are not aware of any practical multi-GPU use cases).
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weight. Needs to be floating point type
+ * @tparam edge_id_t Type of edge id. Needs to be an integral type
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is
+ * supported
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edgelist_srcs A vector storing edgelist source vertices.
+ * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p
+ * edgelist_srcs.size()).
+ * @param edgelist_weights An optional vector storing edgelist weights (size = @p
+ * edgelist_srcs.size() if valid).
+ * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p
+ * edgelist_srcs.size() if valid).
+ * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p
+ * edgelist_srcs.size() if valid).
+ * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p
+ * edgelist_srcs.size() if valid) and the number of hops. The hop vector values should be
+ * non-decreasing within each label.
+ * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label
+ * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of
+ * labels.
+ * @param src_is_major A flag to determine whether to use the source or destination as the
+ * major key in renumbering and sorting.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return Tuple of vectors storing edge sources, edge destinations, optional edge weights (valid
+ * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p
+ * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p
+ * edgelist_edge_types.has_value() is true), optional (label, hop) offset values to the renumbered
+ * and sorted edges (size = # labels * # hops + 1, where # labels =
+ * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1
+ * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1
+ * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p
+ * edgelist_hops.has_value() is true), renumber_map to query original vertices (size = # unique
+ * vertices or aggregate # unique vertices for every label), and label offsets to the renumber_map
+ * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p
+ * edgelist_label_offsets.has_value() is true).
+ */
+template <typename vertex_t,
+          typename weight_t,
+          typename edge_id_t,
+          typename edge_type_t>
+std::tuple<rmm::device_uvector<vertex_t>,                    // srcs
+           rmm::device_uvector<vertex_t>,                    // dsts
+           std::optional<rmm::device_uvector<weight_t>>,     // weights
+           std::optional<rmm::device_uvector<edge_id_t>>,    // edge IDs
+           std::optional<rmm::device_uvector<edge_type_t>>,  // edge types
+           std::optional<rmm::device_uvector<size_t>>,       // (label, hop) offsets to the edges
+           rmm::device_uvector<vertex_t>,                    // renumber map
+           std::optional<rmm::device_uvector<size_t>>>       // label offsets to the renumber map
+renumber_and_sort_sampled_edgelist(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& edgelist_srcs,
+  rmm::device_uvector<vertex_t>&& edgelist_dsts,
+  std::optional<rmm::device_uvector<weight_t>>&& edgelist_weights,
+  std::optional<rmm::device_uvector<edge_id_t>>&& edgelist_edge_ids,
+  std::optional<rmm::device_uvector<edge_type_t>>&& edgelist_edge_types,
+  std::optional<std::tuple<rmm::device_uvector<int32_t>, size_t>>&& edgelist_hops,
+  std::optional<std::tuple<raft::device_span<size_t const>, size_t>> edgelist_label_offsets,
+  bool src_is_major       = true,
+  bool do_expensive_check = false);
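+
+// Illustrative (non-normative) usage sketch for the declaration above, assuming two sampling
+// labels (batches) and placeholder buffers `srcs`, `dsts`, `hops`, and `label_offsets`, where
+// `label_offsets` holds num_labels + 1 offsets marking one consecutive edge range per label and
+// `num_hops` is the number of sampling hops:
+//
+//   size_t num_labels = 2;
+//   auto [srcs_out, dsts_out, weights, edge_ids, edge_types, label_hop_offsets,
+//         renumber_map, renumber_map_label_offsets] =
+//     cugraph::renumber_and_sort_sampled_edgelist<int32_t, float, int64_t, int32_t>(
+//       handle,
+//       std::move(srcs),
+//       std::move(dsts),
+//       std::nullopt /* edgelist_weights */,
+//       std::nullopt /* edgelist_edge_ids */,
+//       std::nullopt /* edgelist_edge_types */,
+//       std::make_optional(std::make_tuple(std::move(hops), num_hops)),
+//       std::make_optional(std::make_tuple(
+//         raft::device_span<size_t const>(label_offsets.data(), label_offsets.size()), num_labels)),
+//       true /* src_is_major */);
+//
+// label_hop_offsets then has num_labels * num_hops + 1 entries, so the edges for (label l, hop h)
+// occupy the range [label_hop_offsets[l * num_hops + h], label_hop_offsets[l * num_hops + h + 1]).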
+
+/*
+ * @brief sort sampled edge list.
+ *
+ * Sampled edges are sorted based on the following rules.
+ *
+ * 1. If @p src_is_major is true, use ((hop), src, dst) as the key in sorting. If @p src_is_major is
+ * false, use ((hop), dst, src) instead. hop is used only if @p edgelist_hops.has_value() is true.
+ * 2. Edges in each label are sorted independently if @p edgelist_label_offsets.has_value() is true.
+ *
+ * This function is single-GPU only (we are not aware of any practical multi-GPU use cases).
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weight. Needs to be floating point type
+ * @tparam edge_id_t Type of edge id. Needs to be an integral type
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is
+ * supported
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edgelist_srcs A vector storing edgelist source vertices.
+ * @param edgelist_dsts A vector storing edgelist destination vertices (size = @p
+ * edgelist_srcs.size()).
+ * @param edgelist_weights An optional vector storing edgelist weights (size = @p
+ * edgelist_srcs.size() if valid).
+ * @param edgelist_edge_ids An optional vector storing edgelist edge IDs (size = @p
+ * edgelist_srcs.size() if valid).
+ * @param edgelist_edge_types An optional vector storing edgelist edge types (size = @p
+ * edgelist_srcs.size() if valid).
+ * @param edgelist_hops An optional tuple having a vector storing edge list hop numbers (size = @p
+ * edgelist_srcs.size() if valid) and the number of hops. The hop vector values should be
+ * non-decreasing within each label.
+ * @param edgelist_label_offsets An optional tuple storing a pointer to the array storing label
+ * offsets to the input edges (size = std::get<1>(*edgelist_label_offsets) + 1) and the number of
+ * labels.
+ * @param src_is_major A flag to determine whether to use the source or destination as the
+ * major key in sorting.
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return Tuple of vectors storing edge sources, edge destinations, optional edge weights (valid
+ * only if @p edgelist_weights.has_value() is true), optional edge IDs (valid only if @p
+ * edgelist_edge_ids.has_value() is true), optional edge types (valid only if @p
+ * edgelist_edge_types.has_value() is true), and optional (label, hop) offset values to the
+ * sorted edges (size = # labels * # hops + 1, where # labels =
+ * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1
+ * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1
+ * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p
+ * edgelist_hops.has_value() is true).
+ */
+template <typename vertex_t,
+          typename weight_t,
+          typename edge_id_t,
+          typename edge_type_t>
+std::tuple<rmm::device_uvector<vertex_t>,                    // srcs
+           rmm::device_uvector<vertex_t>,                    // dsts
+           std::optional<rmm::device_uvector<weight_t>>,     // weights
+           std::optional<rmm::device_uvector<edge_id_t>>,    // edge IDs
+           std::optional<rmm::device_uvector<edge_type_t>>,  // edge types
+           std::optional<rmm::device_uvector<size_t>>>       // (label, hop) offsets to the edges
+sort_sampled_edgelist(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t>&& edgelist_srcs,
+  rmm::device_uvector<vertex_t>&& edgelist_dsts,
+  std::optional<rmm::device_uvector<weight_t>>&& edgelist_weights,
+  std::optional<rmm::device_uvector<edge_id_t>>&& edgelist_edge_ids,
+  std::optional<rmm::device_uvector<edge_type_t>>&& edgelist_edge_types,
+  std::optional<std::tuple<rmm::device_uvector<int32_t>, size_t>>&& edgelist_hops,
+  std::optional<std::tuple<raft::device_span<size_t const>, size_t>> edgelist_label_offsets,
+  bool src_is_major       = true,
+  bool do_expensive_check = false);
+
+} // namespace cugraph
diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp
index caaba8e9c8d..f146c331d8c 100644
--- a/cpp/src/c_api/uniform_neighbor_sampling.cpp
+++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp
@@ -25,7 +25,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include 
diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh
index 8490bacfd9c..c46e83aa5da 100644
--- a/cpp/src/prims/kv_store.cuh
+++ b/cpp/src/prims/kv_store.cuh
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index b238b964ede..3375a651982 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -108,7 +108,7 @@ struct convert_pair_to_quadruplet_t { thrust::seq, displacement_first, displacement_first + minor_comm_size, nbr_idx))) - 1; local_nbr_idx -= *(displacement_first + minor_comm_rank); - cuda::std::atomic_ref counter(tx_counts[minor_comm_rank]); + cuda::atomic_ref counter(tx_counts[minor_comm_rank]); intra_partition_offset = counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); } return thrust::make_tuple(minor_comm_rank, intra_partition_offset, local_nbr_idx, key_idx); @@ -252,7 +252,7 @@ struct count_t { __device__ size_t operator()(size_t key_idx) const { - cuda::std::atomic_ref counter(sample_counts[key_idx]); + cuda::atomic_ref counter(sample_counts[key_idx]); return counter.fetch_add(int32_t{1}, cuda::std::memory_order_relaxed); } }; diff --git a/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh b/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh index 6fdb1c887f2..50f42851a1f 100644 --- a/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh +++ b/cpp/src/sampling/renumber_sampled_edgelist_impl.cuh @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -41,6 +42,7 @@ #include +// FIXME: deprecated, to be deleted namespace cugraph { namespace { diff --git a/cpp/src/sampling/renumber_sampled_edgelist_sg.cu b/cpp/src/sampling/renumber_sampled_edgelist_sg.cu index 46e2264a0c1..9a5f0d357b2 100644 --- a/cpp/src/sampling/renumber_sampled_edgelist_sg.cu +++ b/cpp/src/sampling/renumber_sampled_edgelist_sg.cu @@ -14,10 +14,11 @@ * limitations under the License. */ -#include +#include #include "renumber_sampled_edgelist_impl.cuh" +// FIXME: deprecated, to be deleted namespace cugraph { template std::tuple, diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh new file mode 100644 index 00000000000..ff8da72ff35 --- /dev/null +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -0,0 +1,1800 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cugraph { + +namespace { + +template +struct edge_order_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + thrust::optional> edgelist_hops{thrust::nullopt}; + raft::device_span edgelist_majors{}; + raft::device_span edgelist_minors{}; + + __device__ bool operator()(size_t l_idx, size_t r_idx) const + { + if (edgelist_label_offsets) { + auto l_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + (*edgelist_label_offsets)[0] + l_idx)); + auto r_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + (*edgelist_label_offsets)[0] + r_idx)); + if (l_label != r_label) { return l_label < r_label; } + } + + if (edgelist_hops) { + auto l_hop = (*edgelist_hops)[l_idx]; + auto r_hop = (*edgelist_hops)[r_idx]; + if (l_hop != r_hop) { return l_hop < r_hop; } + } + + auto l_major = edgelist_majors[l_idx]; + auto r_major = edgelist_majors[r_idx]; + if (l_major != r_major) { return l_major < r_major; } + + auto l_minor = edgelist_minors[l_idx]; + auto r_minor = edgelist_minors[r_idx]; + if (l_minor != r_minor) { return l_minor < r_minor; } + + return l_idx < r_idx; + } +}; + +template +struct is_first_in_run_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + thrust::optional> edgelist_hops{thrust::nullopt}; + raft::device_span edgelist_majors{}; + + __device__ bool operator()(size_t i) const + { + if (i == 0) return true; + if (edgelist_label_offsets) { + auto prev_label = thrust::distance((*edgelist_label_offsets).begin() + 1, + thrust::upper_bound(thrust::seq, + (*edgelist_label_offsets).begin() + 1, + (*edgelist_label_offsets).end(), + i - 1)); + auto this_label = thrust::distance( + (*edgelist_label_offsets).begin() + 1, + thrust::upper_bound( + thrust::seq, (*edgelist_label_offsets).begin() + 1, (*edgelist_label_offsets).end(), i)); + if (this_label != prev_label) { return true; } + } + if (edgelist_hops) { + auto prev_hop = (*edgelist_hops)[i - 1]; + auto this_hop = (*edgelist_hops)[i]; + if (this_hop != prev_hop) { return true; } + } + return edgelist_majors[i] != edgelist_majors[i - 1]; + } +}; + +template +struct compute_label_index_t { + raft::device_span edgelist_label_offsets{}; + + __device__ label_index_t operator()(size_t i) const + { + return static_cast(thrust::distance( + edgelist_label_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, edgelist_label_offsets.begin() + 1, edgelist_label_offsets.end(), i))); + } +}; + +template +struct optionally_compute_label_index_t { + thrust::optional> edgelist_label_offsets{thrust::nullopt}; + + __device__ label_index_t operator()(size_t i) const + { + return edgelist_label_offsets ? 
static_cast<label_index_t>(thrust::distance(
+                                      (*edgelist_label_offsets).begin() + 1,
+                                      thrust::upper_bound(thrust::seq,
+                                                          (*edgelist_label_offsets).begin() + 1,
+                                                          (*edgelist_label_offsets).end(),
+                                                          i)))
+                                  : label_index_t{0};
+  }
+};
+
+template <typename label_index_t,
+          typename vertex_t,
+          typename weight_t,
+          typename edge_id_t,
+          typename edge_type_t>
+void check_input_edges(
+  raft::handle_t const& handle,
+  rmm::device_uvector<vertex_t> const& edgelist_srcs,
+  rmm::device_uvector<vertex_t> const& edgelist_dsts,
+  std::optional<rmm::device_uvector<weight_t>> const& edgelist_weights,
+  std::optional<rmm::device_uvector<edge_id_t>> const& edgelist_edge_ids,
+  std::optional<rmm::device_uvector<edge_type_t>> const& edgelist_edge_types,
+  std::optional<std::tuple<rmm::device_uvector<int32_t>, size_t>> const& edgelist_hops,
+  std::optional<std::tuple<raft::device_span<size_t const>, size_t>> edgelist_label_offsets,
+  bool do_expensive_check)
+{
+  CUGRAPH_EXPECTS(!edgelist_label_offsets || (std::get<1>(*edgelist_label_offsets) <=
+                                              std::numeric_limits<label_index_t>::max()),
+                  "Invalid input arguments: current implementation assumes that the number of "
+                  "unique labels is no larger than std::numeric_limits<label_index_t>::max().");
+  CUGRAPH_EXPECTS(!edgelist_label_offsets || std::get<1>(*edgelist_label_offsets) > 0,
+                  "Invalid input arguments: there should be 1 or more labels if "
+                  "edgelist_label_offsets.has_value() is true.");
+  CUGRAPH_EXPECTS(
+    !edgelist_label_offsets.has_value() ||
+      (std::get<0>(*edgelist_label_offsets).size() == std::get<1>(*edgelist_label_offsets) + 1),
+    "Invalid input arguments: if edgelist_label_offsets is valid, "
+    "std::get<0>(*edgelist_label_offsets).size() (size of the offset array) should be "
+    "std::get<1>(*edgelist_label_offsets) (number of unique labels) + 1.");
+
+  CUGRAPH_EXPECTS(
+    !edgelist_hops || (std::get<1>(*edgelist_hops) <= std::numeric_limits<int32_t>::max()),
+    "Invalid input arguments: current implementation assumes that the number of "
+    "hops is no larger than std::numeric_limits<int32_t>::max().");
+  CUGRAPH_EXPECTS(!edgelist_hops || std::get<1>(*edgelist_hops) > 0,
+                  "Invalid input arguments: number of hops should be larger than 0 if "
+                  "edgelist_hops.has_value() is true.");
+
+  CUGRAPH_EXPECTS(
+    edgelist_srcs.size() == edgelist_dsts.size(),
+    "Invalid input arguments: edgelist_srcs.size() and edgelist_dsts.size() should coincide.");
+  CUGRAPH_EXPECTS(
+    !edgelist_weights.has_value() || (edgelist_srcs.size() == (*edgelist_weights).size()),
+    "Invalid input arguments: if edgelist_weights is valid, std::get<0>(*edgelist_weights).size() "
+    "and edgelist_srcs.size() should coincide.");
+  CUGRAPH_EXPECTS(
+    !edgelist_edge_ids.has_value() || (edgelist_srcs.size() == (*edgelist_edge_ids).size()),
+    "Invalid input arguments: if edgelist_edge_ids is valid, "
+    "std::get<0>(*edgelist_edge_ids).size() and edgelist_srcs.size() should coincide.");
+  CUGRAPH_EXPECTS(
+    !edgelist_edge_types.has_value() || (edgelist_srcs.size() == (*edgelist_edge_types).size()),
+    "Invalid input arguments: if edgelist_edge_types is valid, "
+    "std::get<0>(*edgelist_edge_types).size() and edgelist_srcs.size() should coincide.");
+  CUGRAPH_EXPECTS(
+    !edgelist_hops.has_value() || (edgelist_srcs.size() == std::get<0>(*edgelist_hops).size()),
+    "Invalid input arguments: if edgelist_hops is valid, std::get<0>(*edgelist_hops).size() and "
+    "edgelist_srcs.size() should coincide.");
+
+  if (do_expensive_check) {
+    if (edgelist_label_offsets) {
+      CUGRAPH_EXPECTS(thrust::is_sorted(handle.get_thrust_policy(),
+                                        std::get<0>(*edgelist_label_offsets).begin(),
+                                        std::get<0>(*edgelist_label_offsets).end()),
+                      "Invalid input arguments: if edgelist_label_offsets is valid, "
+                      "std::get<0>(*edgelist_label_offsets) should be sorted.");
+      size_t back_element{};
+      raft::update_host(
+        &back_element,
+        std::get<0>(*edgelist_label_offsets).data() +
std::get<1>(*edgelist_label_offsets), + size_t{1}, + handle.get_stream()); + handle.get_stream(); + CUGRAPH_EXPECTS( + back_element == edgelist_srcs.size(), + "Invalid input arguments: if edgelist_label_offsets is valid, the last element of " + "std::get<0>(*edgelist_label_offsets) and edgelist_srcs.size() should coincide."); + } + } +} + +// output sorted by (primary key:label_index, secondary key:vertex) +template +std::tuple> /* label indices */, + rmm::device_uvector /* vertices */, + std::optional> /* minimum hops for the vertices */, + std::optional> /* label offsets for the output */> +compute_min_hop_for_unique_label_vertex_pairs( + raft::handle_t const& handle, + raft::device_span vertices, + std::optional> hops, + std::optional> label_indices, + std::optional> label_offsets) +{ + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + if (label_indices) { + auto num_labels = (*label_offsets).size() - 1; + + rmm::device_uvector tmp_label_indices((*label_indices).size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + (*label_indices).begin(), + (*label_indices).end(), + tmp_label_indices.begin()); + + rmm::device_uvector tmp_vertices(0, handle.get_stream()); + std::optional> tmp_hops{std::nullopt}; + + if (hops) { + tmp_vertices.resize(vertices.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); + tmp_hops = rmm::device_uvector((*hops).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), (*tmp_hops).begin()); + + auto triplet_first = thrust::make_zip_iterator( + tmp_label_indices.begin(), tmp_vertices.begin(), (*tmp_hops).begin()); + thrust::sort( + handle.get_thrust_policy(), triplet_first, triplet_first + tmp_label_indices.size()); + auto key_first = thrust::make_zip_iterator(tmp_label_indices.begin(), tmp_vertices.begin()); + auto num_uniques = static_cast( + thrust::distance(key_first, + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + key_first, + key_first + tmp_label_indices.size(), + (*tmp_hops).begin())))); + tmp_label_indices.resize(num_uniques, handle.get_stream()); + tmp_vertices.resize(num_uniques, handle.get_stream()); + (*tmp_hops).resize(num_uniques, handle.get_stream()); + tmp_label_indices.shrink_to_fit(handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + (*tmp_hops).shrink_to_fit(handle.get_stream()); + } else { + rmm::device_uvector segment_sorted_vertices(vertices.size(), handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto [h_label_offsets, h_edge_offsets] = + detail::compute_offset_aligned_edge_chunks(handle, + (*label_offsets).data(), + num_labels, + vertices.size(), + approx_edges_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + auto offset_first = + thrust::make_transform_iterator((*label_offsets).data() + h_label_offsets[i], + detail::shift_left_t{h_edge_offsets[i]}); + cub::DeviceSegmentedSort::SortKeys(static_cast(nullptr), + tmp_storage_bytes, + vertices.begin() + h_edge_offsets[i], + segment_sorted_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + if 
(tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortKeys(d_tmp_storage.data(), + tmp_storage_bytes, + vertices.begin() + h_edge_offsets[i], + segment_sorted_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + } + d_tmp_storage.resize(0, handle.get_stream()); + d_tmp_storage.shrink_to_fit(handle.get_stream()); + + auto pair_first = + thrust::make_zip_iterator(tmp_label_indices.begin(), segment_sorted_vertices.begin()); + auto num_uniques = static_cast(thrust::distance( + pair_first, + thrust::unique( + handle.get_thrust_policy(), pair_first, pair_first + tmp_label_indices.size()))); + tmp_label_indices.resize(num_uniques, handle.get_stream()); + segment_sorted_vertices.resize(num_uniques, handle.get_stream()); + tmp_label_indices.shrink_to_fit(handle.get_stream()); + segment_sorted_vertices.shrink_to_fit(handle.get_stream()); + + tmp_vertices = std::move(segment_sorted_vertices); + } + + rmm::device_uvector tmp_label_offsets(num_labels + 1, handle.get_stream()); + tmp_label_offsets.set_element_to_zero_async(0, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + tmp_label_indices.begin(), + tmp_label_indices.end(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels), + tmp_label_offsets.begin() + 1); + + return std::make_tuple(std::move(tmp_label_indices), + std::move(tmp_vertices), + std::move(tmp_hops), + std::move(tmp_label_offsets)); + } else { + rmm::device_uvector tmp_vertices(vertices.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), vertices.begin(), vertices.end(), tmp_vertices.begin()); + + if (hops) { + rmm::device_uvector tmp_hops((*hops).size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), (*hops).begin(), (*hops).end(), tmp_hops.begin()); + + auto pair_first = thrust::make_zip_iterator( + tmp_vertices.begin(), tmp_hops.begin()); // vertex is a primary key, hop is a secondary key + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + tmp_vertices.size()); + tmp_vertices.resize( + thrust::distance(tmp_vertices.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + tmp_vertices.begin(), + tmp_vertices.end(), + tmp_hops.begin()))), + handle.get_stream()); + tmp_hops.resize(tmp_vertices.size(), handle.get_stream()); + + return std::make_tuple( + std::nullopt, std::move(tmp_vertices), std::move(tmp_hops), std::nullopt); + } else { + thrust::sort(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end()); + tmp_vertices.resize( + thrust::distance( + tmp_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end())), + handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::nullopt, std::move(tmp_vertices), std::nullopt, std::nullopt); + } + } +} + +template +std::tuple, std::optional>> +compute_renumber_map(raft::handle_t const& handle, + raft::device_span edgelist_majors, + raft::device_span edgelist_minors, + std::optional> edgelist_hops, + std::optional> edgelist_label_offsets) +{ + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + std::optional> 
edgelist_label_indices{std::nullopt}; + if (edgelist_label_offsets) { + edgelist_label_indices = + detail::expand_sparse_offsets(*edgelist_label_offsets, label_index_t{0}, handle.get_stream()); + } + + auto [unique_label_major_pair_label_indices, + unique_label_major_pair_vertices, + unique_label_major_pair_hops, + unique_label_major_pair_label_offsets] = + compute_min_hop_for_unique_label_vertex_pairs( + handle, + edgelist_majors, + edgelist_hops, + edgelist_label_indices ? std::make_optional>( + (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) + : std::nullopt, + edgelist_label_offsets); + + auto [unique_label_minor_pair_label_indices, + unique_label_minor_pair_vertices, + unique_label_minor_pair_hops, + unique_label_minor_pair_label_offsets] = + compute_min_hop_for_unique_label_vertex_pairs( + handle, + edgelist_minors, + edgelist_hops, + edgelist_label_indices ? std::make_optional>( + (*edgelist_label_indices).data(), (*edgelist_label_indices).size()) + : std::nullopt, + edgelist_label_offsets); + + edgelist_label_indices = std::nullopt; + + if (edgelist_label_offsets) { + auto num_labels = (*edgelist_label_offsets).size() - 1; + + rmm::device_uvector renumber_map(0, handle.get_stream()); + rmm::device_uvector renumber_map_label_indices(0, handle.get_stream()); + + renumber_map.reserve((*unique_label_major_pair_label_indices).size() + + (*unique_label_minor_pair_label_indices).size(), + handle.get_stream()); + renumber_map_label_indices.reserve(renumber_map.capacity(), handle.get_stream()); + + auto num_chunks = (edgelist_majors.size() + (approx_edges_to_sort_per_iteration - 1)) / + approx_edges_to_sort_per_iteration; + auto chunk_size = (num_chunks > 0) ? ((num_labels + (num_chunks - 1)) / num_chunks) : 0; + + size_t copy_offset{0}; + for (size_t i = 0; i < num_chunks; ++i) { + auto major_start_offset = + (*unique_label_major_pair_label_offsets).element(chunk_size * i, handle.get_stream()); + auto major_end_offset = + (*unique_label_major_pair_label_offsets) + .element(std::min(chunk_size * (i + 1), num_labels), handle.get_stream()); + auto minor_start_offset = + (*unique_label_minor_pair_label_offsets).element(chunk_size * i, handle.get_stream()); + auto minor_end_offset = + (*unique_label_minor_pair_label_offsets) + .element(std::min(chunk_size * (i + 1), num_labels), handle.get_stream()); + + rmm::device_uvector merged_label_indices( + (major_end_offset - major_start_offset) + (minor_end_offset - minor_start_offset), + handle.get_stream()); + rmm::device_uvector merged_vertices(merged_label_indices.size(), + handle.get_stream()); + rmm::device_uvector merged_flags(merged_label_indices.size(), handle.get_stream()); + + if (edgelist_hops) { + rmm::device_uvector merged_hops(merged_label_indices.size(), handle.get_stream()); + auto major_quad_first = + thrust::make_zip_iterator((*unique_label_major_pair_label_indices).begin(), + unique_label_major_pair_vertices.begin(), + (*unique_label_major_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_quad_first = + thrust::make_zip_iterator((*unique_label_minor_pair_label_indices).begin(), + unique_label_minor_pair_vertices.begin(), + (*unique_label_minor_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_quad_first + major_start_offset, + major_quad_first + major_end_offset, + minor_quad_first + minor_start_offset, + minor_quad_first + minor_end_offset, + thrust::make_zip_iterator(merged_label_indices.begin(), + 
merged_vertices.begin(), + merged_hops.begin(), + merged_flags.begin())); + + auto unique_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_vertices.begin()); + merged_label_indices.resize( + thrust::distance( + unique_key_first, + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + unique_key_first, + unique_key_first + merged_label_indices.size(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); + merged_hops.resize(merged_label_indices.size(), handle.get_stream()); + merged_flags.resize(merged_label_indices.size(), handle.get_stream()); + auto sort_key_first = thrust::make_zip_iterator( + merged_label_indices.begin(), merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_label_indices.size(), + merged_vertices.begin()); + } else { + auto major_triplet_first = + thrust::make_zip_iterator((*unique_label_major_pair_label_indices).begin(), + unique_label_major_pair_vertices.begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = + thrust::make_zip_iterator((*unique_label_minor_pair_label_indices).begin(), + unique_label_minor_pair_vertices.begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge( + handle.get_thrust_policy(), + major_triplet_first + major_start_offset, + major_triplet_first + major_end_offset, + minor_triplet_first + minor_start_offset, + minor_triplet_first + minor_end_offset, + thrust::make_zip_iterator( + merged_label_indices.begin(), merged_vertices.begin(), merged_flags.begin())); + + auto unique_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_vertices.begin()); + merged_label_indices.resize( + thrust::distance( + unique_key_first, + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_key_first, + unique_key_first + merged_label_indices.size(), + merged_flags.begin()))), + handle.get_stream()); + merged_vertices.resize(merged_label_indices.size(), handle.get_stream()); + merged_flags.resize(merged_label_indices.size(), handle.get_stream()); + auto sort_key_first = + thrust::make_zip_iterator(merged_label_indices.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_label_indices.size(), + merged_vertices.begin()); + } + + renumber_map.resize(copy_offset + merged_vertices.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + renumber_map.begin() + copy_offset); + renumber_map_label_indices.resize(copy_offset + merged_label_indices.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + merged_label_indices.begin(), + merged_label_indices.end(), + renumber_map_label_indices.begin() + copy_offset); + + copy_offset += merged_vertices.size(); + } + + renumber_map.shrink_to_fit(handle.get_stream()); + renumber_map_label_indices.shrink_to_fit(handle.get_stream()); + + return std::make_tuple(std::move(renumber_map), std::move(renumber_map_label_indices)); + } else { + if (edgelist_hops) { + rmm::device_uvector merged_vertices( + unique_label_major_pair_vertices.size() + unique_label_minor_pair_vertices.size(), + handle.get_stream()); + rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); + rmm::device_uvector merged_flags(merged_vertices.size(), 
handle.get_stream()); + auto major_triplet_first = + thrust::make_zip_iterator(unique_label_major_pair_vertices.begin(), + (*unique_label_major_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = + thrust::make_zip_iterator(unique_label_minor_pair_vertices.begin(), + (*unique_label_minor_pair_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_triplet_first, + major_triplet_first + unique_label_major_pair_vertices.size(), + minor_triplet_first, + minor_triplet_first + unique_label_minor_pair_vertices.size(), + thrust::make_zip_iterator( + merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); + + unique_label_major_pair_vertices.resize(0, handle.get_stream()); + unique_label_major_pair_vertices.shrink_to_fit(handle.get_stream()); + unique_label_major_pair_hops = std::nullopt; + unique_label_minor_pair_vertices.resize(0, handle.get_stream()); + unique_label_minor_pair_vertices.shrink_to_fit(handle.get_stream()); + unique_label_minor_pair_hops = std::nullopt; + + merged_vertices.resize( + thrust::distance(merged_vertices.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_hops.resize(merged_vertices.size(), handle.get_stream()); + merged_flags.resize(merged_vertices.size(), handle.get_stream()); + + auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + merged_vertices.begin()); + + return std::make_tuple(std::move(merged_vertices), std::nullopt); + } else { + rmm::device_uvector output_vertices(unique_label_minor_pair_vertices.size(), + handle.get_stream()); + auto output_last = thrust::set_difference(handle.get_thrust_policy(), + unique_label_minor_pair_vertices.begin(), + unique_label_minor_pair_vertices.end(), + unique_label_major_pair_vertices.begin(), + unique_label_major_pair_vertices.end(), + output_vertices.begin()); + + auto num_unique_majors = unique_label_major_pair_vertices.size(); + auto renumber_map = std::move(unique_label_major_pair_vertices); + renumber_map.resize( + renumber_map.size() + thrust::distance(output_vertices.begin(), output_last), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + output_vertices.begin(), + output_last, + renumber_map.begin() + num_unique_majors); + + return std::make_tuple(std::move(renumber_map), std::nullopt); + } + } +} + +// this function does not reorder edges (the i'th returned edge is the renumbered output of the i'th +// input edge) +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +renumber_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool do_expensive_check) +{ + // 1. compute renumber_map + + auto [renumber_map, renumber_map_label_indices] = compute_renumber_map( + handle, + raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + raft::device_span(edgelist_minors.data(), edgelist_minors.size()), + edgelist_hops ? 
std::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : std::nullopt, + edgelist_label_offsets + ? std::make_optional>(std::get<0>(*edgelist_label_offsets)) + : std::nullopt); + + // 2. compute renumber map offsets for each label + + std::optional> renumber_map_label_offsets{}; + if (edgelist_label_offsets) { + auto num_unique_labels = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator((*renumber_map_label_indices).size()), + detail::is_first_in_run_t{(*renumber_map_label_indices).data()}); + rmm::device_uvector unique_label_indices(num_unique_labels, handle.get_stream()); + rmm::device_uvector vertex_counts(num_unique_labels, handle.get_stream()); + thrust::reduce_by_key(handle.get_thrust_policy(), + (*renumber_map_label_indices).begin(), + (*renumber_map_label_indices).end(), + thrust::make_constant_iterator(size_t{1}), + unique_label_indices.begin(), + vertex_counts.begin()); + + renumber_map_label_offsets = + rmm::device_uvector(std::get<1>(*edgelist_label_offsets) + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*renumber_map_label_offsets).begin(), + (*renumber_map_label_offsets).end(), + size_t{0}); + thrust::scatter(handle.get_thrust_policy(), + vertex_counts.begin(), + vertex_counts.end(), + unique_label_indices.begin(), + (*renumber_map_label_offsets).begin() + 1); + + thrust::inclusive_scan(handle.get_thrust_policy(), + (*renumber_map_label_offsets).begin(), + (*renumber_map_label_offsets).end(), + (*renumber_map_label_offsets).begin()); + } + + // 3. renumber input edges + + if (edgelist_label_offsets) { + rmm::device_uvector new_vertices(renumber_map.size(), handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + new_vertices.begin(), + new_vertices.end(), + [label_indices = raft::device_span( + (*renumber_map_label_indices).data(), (*renumber_map_label_indices).size()), + renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), + (*renumber_map_label_offsets).size())] __device__(size_t i) { + auto label_index = label_indices[i]; + auto label_start_offset = renumber_map_label_offsets[label_index]; + return static_cast(i - label_start_offset); + }); + + (*renumber_map_label_indices).resize(0, handle.get_stream()); + (*renumber_map_label_indices).shrink_to_fit(handle.get_stream()); + + auto num_labels = std::get<0>(*edgelist_label_offsets).size(); + + rmm::device_uvector segment_sorted_renumber_map(renumber_map.size(), + handle.get_stream()); + rmm::device_uvector segment_sorted_new_vertices(new_vertices.size(), + handle.get_stream()); + + rmm::device_uvector d_tmp_storage(0, handle.get_stream()); + + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for segmented sort + + auto [h_label_offsets, h_edge_offsets] = detail::compute_offset_aligned_edge_chunks( + handle, + (*renumber_map_label_offsets).data(), + static_cast((*renumber_map_label_offsets).size() - 1), + renumber_map.size(), + approx_edges_to_sort_per_iteration); + auto num_chunks = h_label_offsets.size() - 1; + + for (size_t i = 0; i < num_chunks; ++i) { + size_t tmp_storage_bytes{0}; + + auto offset_first = + thrust::make_transform_iterator((*renumber_map_label_offsets).data() + h_label_offsets[i], + detail::shift_left_t{h_edge_offsets[i]}); + cub::DeviceSegmentedSort::SortPairs(static_cast(nullptr), + tmp_storage_bytes, + 
renumber_map.begin() + h_edge_offsets[i], + segment_sorted_renumber_map.begin() + h_edge_offsets[i], + new_vertices.begin() + h_edge_offsets[i], + segment_sorted_new_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + + if (tmp_storage_bytes > d_tmp_storage.size()) { + d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); + } + + cub::DeviceSegmentedSort::SortPairs(d_tmp_storage.data(), + tmp_storage_bytes, + renumber_map.begin() + h_edge_offsets[i], + segment_sorted_renumber_map.begin() + h_edge_offsets[i], + new_vertices.begin() + h_edge_offsets[i], + segment_sorted_new_vertices.begin() + h_edge_offsets[i], + h_edge_offsets[i + 1] - h_edge_offsets[i], + h_label_offsets[i + 1] - h_label_offsets[i], + offset_first, + offset_first + 1, + handle.get_stream()); + } + new_vertices.resize(0, handle.get_stream()); + d_tmp_storage.resize(0, handle.get_stream()); + new_vertices.shrink_to_fit(handle.get_stream()); + d_tmp_storage.shrink_to_fit(handle.get_stream()); + + auto edgelist_label_indices = detail::expand_sparse_offsets( + std::get<0>(*edgelist_label_offsets), label_index_t{0}, handle.get_stream()); + + auto pair_first = + thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_label_indices.begin()); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_majors.size(), + edgelist_majors.begin(), + [renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), + old_vertices = raft::device_span(segment_sorted_renumber_map.data(), + segment_sorted_renumber_map.size()), + new_vertices = raft::device_span( + segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size())] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + auto label_index = thrust::get<1>(pair); + auto label_start_offset = renumber_map_label_offsets[label_index]; + auto label_end_offset = renumber_map_label_offsets[label_index + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + label_start_offset, + old_vertices.begin() + label_end_offset, + old_vertex); + assert(*it == old_vertex); + return *(new_vertices.begin() + thrust::distance(old_vertices.begin(), it)); + }); + + pair_first = thrust::make_zip_iterator(edgelist_minors.begin(), edgelist_label_indices.begin()); + thrust::transform( + handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_minors.size(), + edgelist_minors.begin(), + [renumber_map_label_offsets = raft::device_span( + (*renumber_map_label_offsets).data(), (*renumber_map_label_offsets).size()), + old_vertices = raft::device_span(segment_sorted_renumber_map.data(), + segment_sorted_renumber_map.size()), + new_vertices = raft::device_span( + segment_sorted_new_vertices.data(), + segment_sorted_new_vertices.size())] __device__(auto pair) { + auto old_vertex = thrust::get<0>(pair); + auto label_index = thrust::get<1>(pair); + auto label_start_offset = renumber_map_label_offsets[label_index]; + auto label_end_offset = renumber_map_label_offsets[label_index + 1]; + auto it = thrust::lower_bound(thrust::seq, + old_vertices.begin() + label_start_offset, + old_vertices.begin() + label_end_offset, + old_vertex); + assert(*it == old_vertex); + return new_vertices[thrust::distance(old_vertices.begin(), it)]; + }); + } else { + kv_store_t kv_store(renumber_map.begin(), + renumber_map.end(), + 
thrust::make_counting_iterator(vertex_t{0}), + std::numeric_limits::max(), + std::numeric_limits::max(), + handle.get_stream()); + auto kv_store_view = kv_store.view(); + + kv_store_view.find( + edgelist_majors.begin(), edgelist_majors.end(), edgelist_majors.begin(), handle.get_stream()); + kv_store_view.find( + edgelist_minors.begin(), edgelist_minors.end(), edgelist_minors.begin(), handle.get_stream()); + } + + return std::make_tuple(std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(renumber_map), + std::move(renumber_map_label_offsets)); +} + +template +void permute_array(raft::handle_t const& handle, + IndexIterator index_first, + IndexIterator index_last, + ValueIterator value_first /* [INOUT] */) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto tmp_buffer = allocate_dataframe_buffer(thrust::distance(index_first, index_last), + handle.get_stream()); + thrust::gather(handle.get_thrust_policy(), + index_first, + index_last, + value_first, + get_dataframe_buffer_begin(tmp_buffer)); + thrust::copy(handle.get_thrust_policy(), + get_dataframe_buffer_begin(tmp_buffer), + get_dataframe_buffer_end(tmp_buffer), + value_first); +} + +// key: ((label), (hop), major, minor) +template +std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional, size_t>>> +sort_sampled_edge_tuples( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_majors, + rmm::device_uvector&& edgelist_minors, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets) +{ + std::vector h_label_offsets{}; + std::vector h_edge_offsets{}; + + if (edgelist_label_offsets) { + auto approx_edges_to_sort_per_iteration = + static_cast(handle.get_device_properties().multiProcessorCount) * + (1 << 20) /* tuning parameter */; // for sorts in chunks + + std::tie(h_label_offsets, h_edge_offsets) = + detail::compute_offset_aligned_edge_chunks(handle, + std::get<0>(*edgelist_label_offsets).data(), + std::get<1>(*edgelist_label_offsets), + edgelist_majors.size(), + approx_edges_to_sort_per_iteration); + } else { + h_label_offsets = {0, 1}; + h_edge_offsets = {0, edgelist_majors.size()}; + } + + auto num_chunks = h_label_offsets.size() - 1; + for (size_t i = 0; i < num_chunks; ++i) { + rmm::device_uvector indices(h_edge_offsets[i + 1] - h_edge_offsets[i], + handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), indices.begin(), indices.end(), size_t{0}); + edge_order_t edge_order_comp{ + edgelist_label_offsets ? thrust::make_optional>( + std::get<0>(*edgelist_label_offsets).data() + h_label_offsets[i], + (h_label_offsets[i + 1] - h_label_offsets[i]) + 1) + : thrust::nullopt, + edgelist_hops ? 
thrust::make_optional>( + std::get<0>(*edgelist_hops).data() + h_edge_offsets[i], indices.size()) + : thrust::nullopt, + raft::device_span(edgelist_majors.data() + h_edge_offsets[i], indices.size()), + raft::device_span(edgelist_minors.data() + h_edge_offsets[i], + indices.size())}; + thrust::sort(handle.get_thrust_policy(), indices.begin(), indices.end(), edge_order_comp); + + permute_array(handle, + indices.begin(), + indices.end(), + thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()) + + h_edge_offsets[i]); + + if (edgelist_weights) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_weights).begin() + h_edge_offsets[i]); + } + + if (edgelist_edge_ids) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_edge_ids).begin() + h_edge_offsets[i]); + } + + if (edgelist_edge_types) { + permute_array( + handle, indices.begin(), indices.end(), (*edgelist_edge_types).begin() + h_edge_offsets[i]); + } + + if (edgelist_hops) { + permute_array(handle, + indices.begin(), + indices.end(), + std::get<0>(*edgelist_hops).begin() + h_edge_offsets[i]); + } + } + + return std::make_tuple(std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops)); +} + +} // namespace + +template +std::tuple>, // dcsr/dcsc major vertices + rmm::device_uvector, // (d)csr/(d)csc offset values + rmm::device_uvector, // minor vertices + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the (d)csr/(d)csc + // offset array + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + CUGRAPH_EXPECTS( + !doubly_compress || !compress_per_hop, + "Invalid input arguments: compress_per_hop should be false if doubly_compress is true."); + CUGRAPH_EXPECTS(!compress_per_hop || edgelist_hops, + "Invalid input arguments: edgelist_hops.has_value() should be true if " + "compress_per_hop is true."); + + // 2. renumber + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + rmm::device_uvector renumber_map(0, handle.get_stream()); + std::optional> renumber_map_label_offsets{std::nullopt}; + std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = + renumber_sampled_edgelist( + handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + edgelist_hops ? 
std::make_optional(std::make_tuple( + raft::device_span(std::get<0>(*edgelist_hops).data(), + std::get<0>(*edgelist_hops).size()), + num_hops)) + : std::nullopt, + edgelist_label_offsets, + do_expensive_check); + + // 3. sort by ((l), (h), major, minor) + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + if (do_expensive_check) { + if (!compress_per_hop && edgelist_hops) { + rmm::device_uvector min_vertices(num_labels * num_hops, handle.get_stream()); + rmm::device_uvector max_vertices(min_vertices.size(), handle.get_stream()); + + auto label_index_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + optionally_compute_label_index_t{ + edgelist_label_offsets ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt}); + auto input_key_first = + thrust::make_zip_iterator(label_index_first, std::get<0>(*edgelist_hops).begin()); + rmm::device_uvector unique_key_label_indices(min_vertices.size(), + handle.get_stream()); + rmm::device_uvector unique_key_hops(min_vertices.size(), handle.get_stream()); + auto output_key_first = + thrust::make_zip_iterator(unique_key_label_indices.begin(), unique_key_hops.begin()); + + auto output_it = + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + edgelist_majors.begin(), + output_key_first, + min_vertices.begin(), + thrust::equal_to>{}, + thrust::minimum{}); + auto num_unique_keys = + static_cast(thrust::distance(output_key_first, thrust::get<0>(output_it))); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + edgelist_majors.begin(), + output_key_first, + max_vertices.begin(), + thrust::equal_to>{}, + thrust::maximum{}); + if (num_unique_keys > 1) { + auto num_invalids = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{1}), + thrust::make_counting_iterator(num_unique_keys), + [output_key_first, + min_vertices = raft::device_span(min_vertices.data(), num_unique_keys), + max_vertices = raft::device_span(max_vertices.data(), + num_unique_keys)] __device__(size_t i) { + auto prev_key = *(output_key_first + (i - 1)); + auto this_key = *(output_key_first + i); + if (thrust::get<0>(prev_key) == thrust::get<0>(this_key)) { + auto this_min = min_vertices[i]; + auto prev_max = max_vertices[i - 1]; + return prev_max >= this_min; + } else { + return false; + } + }); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input arguments: if @p compress_per_hop is false and @p " + "edgelist_hops.has_value() is true, the minimum majors with hop N + 1 " + "should be larger than the maximum majors with hop N after renumbering."); + } + } + } + + // 4. compute offsets for ((l), (h), major) triplets with non zero neighbors (update + // compressed_label_indices, compressed_hops, compressed_nzd_vertices, and compressed_offsets) + + auto num_uniques = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(edgelist_majors.size()), + is_first_in_run_t{ + edgelist_label_offsets ? 
thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + raft::device_span( + edgelist_majors.data(), + edgelist_majors.size())}); // number of unique ((label), (hop), major) triplets + + auto compressed_label_indices = + edgelist_label_offsets + ? std::make_optional>(num_uniques, handle.get_stream()) + : std::nullopt; + auto compressed_hops = edgelist_hops ? std::make_optional>( + num_uniques, handle.get_stream()) + : std::nullopt; + rmm::device_uvector compressed_nzd_vertices(num_uniques, handle.get_stream()); + rmm::device_uvector compressed_offsets(num_uniques + 1, handle.get_stream()); + compressed_offsets.set_element_to_zero_async(num_uniques, handle.get_stream()); + + if (edgelist_label_offsets) { + auto label_index_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + compute_label_index_t{std::get<0>(*edgelist_label_offsets)}); + + if (edgelist_hops) { + auto input_key_first = thrust::make_zip_iterator( + label_index_first, std::get<0>(*edgelist_hops).begin(), edgelist_majors.begin()); + auto output_key_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } else { + auto input_key_first = thrust::make_zip_iterator(label_index_first, edgelist_majors.begin()); + auto output_key_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } + } else { + if (edgelist_hops) { + auto input_key_first = + thrust::make_zip_iterator(std::get<0>(*edgelist_hops).begin(), edgelist_majors.begin()); + auto output_key_first = + thrust::make_zip_iterator((*compressed_hops).begin(), compressed_nzd_vertices.begin()); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } else { + auto input_key_first = edgelist_majors.begin(); + auto output_key_first = compressed_nzd_vertices.begin(); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + edgelist_majors.size(), + thrust::make_constant_iterator(size_t{1}), + output_key_first, + compressed_offsets.begin()); + } + } + thrust::exclusive_scan(handle.get_thrust_policy(), + compressed_offsets.begin(), + compressed_offsets.end(), + compressed_offsets.begin()); + + // 5. 
update compressed_offsets to include zero degree vertices (if doubly_compress is false) and + // compressed_offset_label_hop_offsets (if edgelist_label_offsets.has_value() or + // edgelist_hops.has_value() is true) + + std::optional> compressed_offset_label_hop_offsets{std::nullopt}; + if (doubly_compress) { + if (edgelist_label_offsets || edgelist_hops) { + rmm::device_uvector offset_array_offsets(num_labels * num_hops + 1, + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.end(), + size_t{0}); + + if (edgelist_label_offsets) { + if (edgelist_hops) { + auto pair_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin()); + auto value_pair_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), [num_hops] __device__(size_t i) { + return thrust::make_tuple(static_cast(i / num_hops), + static_cast(i % num_hops)); + }); + thrust::upper_bound(handle.get_thrust_policy(), + pair_first, + pair_first + (*compressed_label_indices).size(), + value_pair_first, + value_pair_first + (num_labels * num_hops), + offset_array_offsets.begin() + 1); + } else { + thrust::upper_bound( + handle.get_thrust_policy(), + (*compressed_label_indices).begin(), + (*compressed_label_indices).end(), + thrust::make_counting_iterator(label_index_t{0}), + thrust::make_counting_iterator(static_cast(num_labels)), + offset_array_offsets.begin() + 1); + } + } else { + thrust::upper_bound(handle.get_thrust_policy(), + (*compressed_hops).begin(), + (*compressed_hops).end(), + thrust::make_counting_iterator(int32_t{0}), + thrust::make_counting_iterator(static_cast(num_hops)), + offset_array_offsets.begin() + 1); + } + + compressed_offset_label_hop_offsets = std::move(offset_array_offsets); + } + } else { // !doubly_compress + rmm::device_uvector major_vertex_counts(num_labels * num_hops, handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + major_vertex_counts.begin(), + major_vertex_counts.end(), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + edgelist_majors = + raft::device_span(edgelist_majors.data(), edgelist_majors.size()), + num_hops, + compress_per_hop] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = edgelist_majors.size(); + auto label_start_offset = start_offset; + auto label_end_offset = end_offset; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + label_start_offset = start_offset; + label_end_offset = end_offset; + } + + if (num_hops > 1) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + if (compress_per_hop) { + return (start_offset < end_offset) ? 
(edgelist_majors[end_offset - 1] + 1) : vertex_t{0}; + } else { + if (end_offset != label_end_offset) { + return edgelist_majors[end_offset]; + } else if (label_start_offset < label_end_offset) { + return edgelist_majors[end_offset - 1] + 1; + } else { + return vertex_t{0}; + } + } + }); + + std::optional> minor_vertex_counts{std::nullopt}; + if (compress_per_hop) { + minor_vertex_counts = + rmm::device_uvector(major_vertex_counts.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*minor_vertex_counts).begin(), + (*minor_vertex_counts).end(), + vertex_t{0}); + if (edgelist_label_offsets) { + auto triplet_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each(handle.get_thrust_policy(), + triplet_first, + triplet_first + compressed_nzd_vertices.size(), + [edgelist_minors = raft::device_span( + edgelist_minors.data(), edgelist_minors.size()), + compressed_offsets = raft::device_span( + compressed_offsets.data(), compressed_offsets.size()), + minor_vertex_counts = raft::device_span( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()), + num_hops] __device__(auto triplet) { + auto nzd_v_idx = thrust::get<2>(triplet); + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(triplet); + auto h = thrust::get<1>(triplet); + cuda::atomic_ref minor_vertex_count( + minor_vertex_counts[l_idx * num_hops + h]); + minor_vertex_count.fetch_max(edgelist_minors[end_offset - 1] + 1, + cuda::std::memory_order_relaxed); + }); + } else { + auto pair_first = thrust::make_zip_iterator((*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each(handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [edgelist_minors = raft::device_span( + edgelist_minors.data(), edgelist_minors.size()), + compressed_offsets = raft::device_span( + compressed_offsets.data(), compressed_offsets.size()), + minor_vertex_counts = raft::device_span( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()), + num_hops] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto h = thrust::get<0>(pair); + cuda::atomic_ref minor_vertex_count( + minor_vertex_counts[h]); + minor_vertex_count.fetch_max(edgelist_minors[end_offset - 1] + 1, + cuda::std::memory_order_relaxed); + }); + } + } + + rmm::device_uvector offset_array_offsets(num_labels * num_hops + 1, + handle.get_stream()); + offset_array_offsets.set_element_to_zero_async(num_labels * num_hops, handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.begin() + (num_labels * num_hops), + [major_vertex_counts = + raft::device_span(major_vertex_counts.data(), major_vertex_counts.size()), + minor_vertex_counts = minor_vertex_counts + ? 
thrust::make_optional>( + (*minor_vertex_counts).data(), (*minor_vertex_counts).size()) + : thrust::nullopt, + num_hops, + compress_per_hop] __device__(size_t i) { + auto vertex_count = major_vertex_counts[i]; + if (num_hops > 1) { + if (compress_per_hop) { + for (size_t j = (i - (i % num_hops)); j < i; ++j) { + vertex_count = cuda::std::max(vertex_count, major_vertex_counts[j]); + vertex_count = cuda::std::max(vertex_count, (*minor_vertex_counts)[j]); + } + } else { + if (i % num_hops != 0) { vertex_count -= major_vertex_counts[i - 1]; } + } + } + return vertex_count; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + offset_array_offsets.begin(), + offset_array_offsets.end(), + offset_array_offsets.begin()); + + auto tmp_compressed_offsets = rmm::device_uvector( + offset_array_offsets.back_element(handle.get_stream()) + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + tmp_compressed_offsets.begin(), + tmp_compressed_offsets.end(), + size_t{0}); + + if (edgelist_label_offsets) { + if (edgelist_hops) { + auto triplet_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + (*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + triplet_first, + triplet_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size()), + compress_per_hop, + num_hops] __device__(auto triplet) { + auto nzd_v_idx = thrust::get<2>(triplet); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(triplet); + auto h = thrust::get<1>(triplet); + tmp_compressed_offsets[offset_array_offsets[l_idx * num_hops + + (compress_per_hop ? 
h : int32_t{0})] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } else { + auto pair_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span( + tmp_compressed_offsets.data(), tmp_compressed_offsets.size())] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto l_idx = thrust::get<0>(pair); + tmp_compressed_offsets[offset_array_offsets[l_idx] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } + } else { + if (edgelist_hops) { + auto pair_first = thrust::make_zip_iterator((*compressed_hops).begin(), + thrust::make_counting_iterator(size_t{0})); + thrust::for_each( + handle.get_thrust_policy(), + pair_first, + pair_first + compressed_nzd_vertices.size(), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + offset_array_offsets = raft::device_span(offset_array_offsets.data(), + offset_array_offsets.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size()), + compress_per_hop] __device__(auto pair) { + auto nzd_v_idx = thrust::get<1>(pair); + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + auto h = thrust::get<0>(pair); + tmp_compressed_offsets[offset_array_offsets[compress_per_hop ? h : int32_t{0}] + + compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } else { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(compressed_nzd_vertices.size()), + [compressed_nzd_vertices = raft::device_span( + compressed_nzd_vertices.data(), compressed_nzd_vertices.size()), + compressed_offsets = + raft::device_span(compressed_offsets.data(), compressed_offsets.size()), + tmp_compressed_offsets = + raft::device_span(tmp_compressed_offsets.data(), + tmp_compressed_offsets.size())] __device__(auto nzd_v_idx) { + size_t start_offset = compressed_offsets[nzd_v_idx]; + size_t end_offset = compressed_offsets[nzd_v_idx + 1]; + tmp_compressed_offsets[compressed_nzd_vertices[nzd_v_idx]] = end_offset - start_offset; + }); + } + } + + thrust::exclusive_scan(handle.get_thrust_policy(), + tmp_compressed_offsets.begin(), + tmp_compressed_offsets.end(), + tmp_compressed_offsets.begin()); + + compressed_offsets = std::move(tmp_compressed_offsets); + + if (edgelist_label_offsets || edgelist_hops) { + compressed_offset_label_hop_offsets = std::move(offset_array_offsets); + } + } + + edgelist_hops = std::nullopt; + + return std::make_tuple( + doubly_compress ? 
std::make_optional(std::move(compressed_nzd_vertices)) : std::nullopt, + std::move(compressed_offsets), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(compressed_offset_label_hop_offsets), + std::move(renumber_map), + std::move(renumber_map_label_offsets)); +} + +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>, // (label, hop) offsets to the edges + rmm::device_uvector, // renumber map + std::optional>> // label offsets to the renumber map +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + // 2. renumber + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + rmm::device_uvector renumber_map(0, handle.get_stream()); + std::optional> renumber_map_label_offsets{std::nullopt}; + std::tie(edgelist_majors, edgelist_minors, renumber_map, renumber_map_label_offsets) = + renumber_sampled_edgelist( + handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + edgelist_hops ? std::make_optional(std::make_tuple( + raft::device_span(std::get<0>(*edgelist_hops).data(), + std::get<0>(*edgelist_hops).size()), + num_hops)) + : std::nullopt, + edgelist_label_offsets, + do_expensive_check); + + // 3. sort by ((l), (h), major, minor) + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + // 4. compute edgelist_label_hop_offsets + + std::optional> edgelist_label_hop_offsets{std::nullopt}; + if (edgelist_label_offsets || edgelist_hops) { + edgelist_label_hop_offsets = + rmm::device_uvector(num_labels * num_hops + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + size_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels * num_hops), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? 
thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } + + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + + return end_offset - start_offset; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + (*edgelist_label_hop_offsets).begin()); + } + + edgelist_hops = std::nullopt; + + return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors), + std::move(src_is_major ? edgelist_minors : edgelist_majors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_label_hop_offsets), + std::move(renumber_map), + std::move(renumber_map_label_offsets)); +} + +template +std::tuple, // srcs + rmm::device_uvector, // dsts + std::optional>, // weights + std::optional>, // edge IDs + std::optional>, // edge types + std::optional>> // (label, hop) offsets to the edges +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check) +{ + using label_index_t = uint32_t; + + auto num_labels = edgelist_label_offsets ? std::get<1>(*edgelist_label_offsets) : size_t{1}; + auto num_hops = edgelist_hops ? std::get<1>(*edgelist_hops) : size_t{1}; + + // 1. check input arguments + + check_input_edges(handle, + edgelist_srcs, + edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops, + edgelist_label_offsets, + do_expensive_check); + + // 2. sort by ((l), (h), major, minor) + + auto edgelist_majors = src_is_major ? std::move(edgelist_srcs) : std::move(edgelist_dsts); + auto edgelist_minors = src_is_major ? std::move(edgelist_dsts) : std::move(edgelist_srcs); + + std::tie(edgelist_majors, + edgelist_minors, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + edgelist_hops) = sort_sampled_edge_tuples(handle, + std::move(edgelist_majors), + std::move(edgelist_minors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_hops), + edgelist_label_offsets); + + // 3. 
compute edgelist_label_hop_offsets + + std::optional> edgelist_label_hop_offsets{std::nullopt}; + if (edgelist_label_offsets || edgelist_hops) { + edgelist_label_hop_offsets = + rmm::device_uvector(num_labels * num_hops + 1, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + size_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(num_labels * num_hops), + [edgelist_label_offsets = edgelist_label_offsets + ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) + : thrust::nullopt, + edgelist_hops = edgelist_hops + ? thrust::make_optional>( + std::get<0>(*edgelist_hops).data(), std::get<0>(*edgelist_hops).size()) + : thrust::nullopt, + num_hops, + num_edges = edgelist_majors.size()] __device__(size_t i) { + size_t start_offset{0}; + auto end_offset = num_edges; + + if (edgelist_label_offsets) { + auto l_idx = static_cast(i / num_hops); + start_offset = (*edgelist_label_offsets)[l_idx]; + end_offset = (*edgelist_label_offsets)[l_idx + 1]; + } + + if (edgelist_hops) { + auto h = static_cast(i % num_hops); + auto lower_it = thrust::lower_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + auto upper_it = thrust::upper_bound(thrust::seq, + (*edgelist_hops).begin() + start_offset, + (*edgelist_hops).begin() + end_offset, + h); + start_offset = static_cast(thrust::distance((*edgelist_hops).begin(), lower_it)); + end_offset = static_cast(thrust::distance((*edgelist_hops).begin(), upper_it)); + } + + return end_offset - start_offset; + }); + thrust::exclusive_scan(handle.get_thrust_policy(), + (*edgelist_label_hop_offsets).begin(), + (*edgelist_label_hop_offsets).end(), + (*edgelist_label_hop_offsets).begin()); + } + + edgelist_hops = std::nullopt; + + return std::make_tuple(std::move(src_is_major ? edgelist_majors : edgelist_minors), + std::move(src_is_major ? edgelist_minors : edgelist_majors), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + std::move(edgelist_label_hop_offsets)); +} + +} // namespace cugraph diff --git a/cpp/src/sampling/sampling_post_processing_sg.cu b/cpp/src/sampling/sampling_post_processing_sg.cu new file mode 100644 index 00000000000..75e3c5f005a --- /dev/null +++ b/cpp/src/sampling/sampling_post_processing_sg.cu @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "sampling_post_processing_impl.cuh" + +namespace cugraph { + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple>, + rmm::device_uvector, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_compress_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + 
rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> label_offsets, + bool src_is_major, + bool compress_per_hop, + bool doubly_compress, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>, + rmm::device_uvector, + std::optional>> +renumber_and_sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& 
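// Illustrative sketch (not part of the patch; names are hypothetical). This translation
// unit only includes the *_impl.cuh definitions and lists explicit instantiations for the
// supported vertex/edge-id type combinations, so the heavy templates are compiled once
// here rather than in every including file. The pattern in miniature:

#include <cstdint>

// my_algo_impl.cuh: the template definition
template <typename vertex_t>
vertex_t my_algo(vertex_t v)
{
  return v + vertex_t{1};
}

// my_algo_sg.cu: explicit instantiations for the supported vertex types
template int32_t my_algo<int32_t>(int32_t);
template int64_t my_algo<int64_t>(int64_t);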
edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +template std::tuple, + rmm::device_uvector, + std::optional>, + std::optional>, + std::optional>, + std::optional>> +sort_sampled_edgelist( + raft::handle_t const& handle, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + std::optional, size_t>>&& edgelist_hops, + std::optional, size_t>> edgelist_label_offsets, + bool src_is_major, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eebd31a0030..5e1e1d6ace3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -390,9 +390,9 @@ ConfigureTest(UNIFORM_NEIGHBOR_SAMPLING_TEST sampling/sg_uniform_neighbor_sampli 
target_link_libraries(UNIFORM_NEIGHBOR_SAMPLING_TEST PRIVATE cuco::cuco) ################################################################################################### -# - RENUMBER SAMPLED EDGE LIST tests -------------------------------------------------------------- -ConfigureTest(RENUMBER_SAMPLED_EDGELIST_TEST sampling/renumber_sampled_edgelist_test.cu) -target_link_libraries(RENUMBER_SAMPLED_EDGELIST_TEST PRIVATE cuco::cuco) +# - SAMPLING_POST_PROCESSING tests ---------------------------------------------------------------- +ConfigureTest(SAMPLING_POST_PROCESSING_TEST sampling/sampling_post_processing_test.cu) +target_link_libraries(SAMPLING_POST_PROCESSING_TEST PRIVATE cuco::cuco) ################################################################################################### # - Renumber tests -------------------------------------------------------------------------------- diff --git a/cpp/tests/sampling/renumber_sampled_edgelist_test.cu b/cpp/tests/sampling/renumber_sampled_edgelist_test.cu deleted file mode 100644 index 96c8d6173e7..00000000000 --- a/cpp/tests/sampling/renumber_sampled_edgelist_test.cu +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -struct RenumberSampledEdgelist_Usecase { - size_t num_vertices{}; - size_t num_sampled_edges{}; - size_t num_hops{1}; // enabled if larger than 1 - size_t num_labels{1}; // enabled if larger than 1 - bool check_correctness{true}; -}; - -class Tests_RenumberSampledEdgelist - : public ::testing::TestWithParam { - public: - Tests_RenumberSampledEdgelist() {} - - static void SetUpTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - template - void run_current_test(RenumberSampledEdgelist_Usecase const& usecase) - { - using label_t = int32_t; - - raft::handle_t handle{}; - HighResTimer hr_timer{}; - - raft::random::RngState rng_state(0); - - rmm::device_uvector org_edgelist_srcs(usecase.num_sampled_edges, handle.get_stream()); - rmm::device_uvector org_edgelist_dsts(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - org_edgelist_srcs.data(), - org_edgelist_srcs.size(), - vertex_t{0}, - static_cast(usecase.num_vertices), - rng_state); - cugraph::detail::uniform_random_fill(handle.get_stream(), - org_edgelist_dsts.data(), - org_edgelist_dsts.size(), - vertex_t{0}, - static_cast(usecase.num_vertices), - rng_state); - - std::optional> edgelist_hops{std::nullopt}; - if (usecase.num_hops > 1) { - edgelist_hops = rmm::device_uvector(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - (*edgelist_hops).data(), - (*edgelist_hops).size(), - int32_t{0}, - static_cast(usecase.num_hops), - rng_state); - } - - std::optional, rmm::device_uvector>> - label_offsets{std::nullopt}; - if (usecase.num_labels > 1) { - rmm::device_uvector labels(usecase.num_labels, handle.get_stream()); - thrust::sequence(handle.get_thrust_policy(), labels.begin(), labels.end(), label_t{0}); - - rmm::device_uvector edgelist_labels(usecase.num_sampled_edges, handle.get_stream()); - cugraph::detail::uniform_random_fill(handle.get_stream(), - edgelist_labels.data(), - edgelist_labels.size(), - label_t{0}, - static_cast(usecase.num_labels), - rng_state); - - rmm::device_uvector offsets(usecase.num_labels + 1, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), offsets.begin(), offsets.end(), size_t{0}); - - thrust::for_each( - handle.get_thrust_policy(), - edgelist_labels.begin(), - edgelist_labels.end(), - [offsets = - raft::device_span(offsets.data(), offsets.size())] __device__(label_t label) { - cuda::atomic_ref atomic_counter(offsets[label]); - atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - }); - - thrust::exclusive_scan( - handle.get_thrust_policy(), offsets.begin(), offsets.end(), offsets.begin()); - - label_offsets = std::make_tuple(std::move(labels), std::move(offsets)); - } - - rmm::device_uvector renumbered_edgelist_srcs(org_edgelist_srcs.size(), - handle.get_stream()); - rmm::device_uvector renumbered_edgelist_dsts(org_edgelist_dsts.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_srcs.begin(), - org_edgelist_srcs.end(), - renumbered_edgelist_srcs.begin()); - thrust::copy(handle.get_thrust_policy(), - org_edgelist_dsts.begin(), - org_edgelist_dsts.end(), - renumbered_edgelist_dsts.begin()); - - rmm::device_uvector renumber_map(0, handle.get_stream()); - std::optional> 
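// Illustrative sketch (not part of the patch; data is hypothetical). The label-offset
// construction in the (now removed) test is a histogram followed by an exclusive scan:
// count the edges per label, then prefix-sum the counts into offsets over a label-grouped
// edge list. A host-side equivalent of the atomic fetch_add + exclusive_scan:

#include <cstddef>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> edgelist_labels{0, 2, 1, 0, 2, 2};  // one label per sampled edge
  std::size_t const num_labels = 3;

  std::vector<std::size_t> offsets(num_labels + 1, 0);
  for (auto label : edgelist_labels) { ++offsets[label]; }  // per-label counts
  std::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin(), std::size_t{0});
  // offsets = {0, 2, 3, 6}: label 0 owns edges [0, 2), label 1 [2, 3), label 2 [3, 6)
  return 0;
}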
renumber_map_label_offsets{std::nullopt}; - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - hr_timer.start("Renumber sampled edgelist"); - } - - std::tie(renumbered_edgelist_srcs, - renumbered_edgelist_dsts, - renumber_map, - renumber_map_label_offsets) = - cugraph::renumber_sampled_edgelist( - handle, - std::move(renumbered_edgelist_srcs), - std::move(renumbered_edgelist_dsts), - edgelist_hops ? std::make_optional>( - (*edgelist_hops).data(), (*edgelist_hops).size()) - : std::nullopt, - label_offsets - ? std::make_optional< - std::tuple, raft::device_span>>( - std::make_tuple(raft::device_span(std::get<0>(*label_offsets).data(), - std::get<0>(*label_offsets).size()), - raft::device_span(std::get<1>(*label_offsets).data(), - std::get<1>(*label_offsets).size()))) - : std::nullopt); - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); - } - - if (usecase.check_correctness) { - for (size_t i = 0; i < usecase.num_labels; ++i) { - size_t edgelist_start_offset = - label_offsets ? std::get<1>(*label_offsets).element(i, handle.get_stream()) : size_t{0}; - size_t edgelist_end_offset = - label_offsets ? std::get<1>(*label_offsets).element(i + 1, handle.get_stream()) - : usecase.num_sampled_edges; - if (edgelist_start_offset == edgelist_end_offset) continue; - - auto this_label_org_edgelist_srcs = - raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_org_edgelist_dsts = - raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_edgelist_hops = edgelist_hops - ? std::make_optional>( - (*edgelist_hops).data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset) - : std::nullopt; - auto this_label_renumbered_edgelist_srcs = - raft::device_span(renumbered_edgelist_srcs.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - auto this_label_renumbered_edgelist_dsts = - raft::device_span(renumbered_edgelist_dsts.data() + edgelist_start_offset, - edgelist_end_offset - edgelist_start_offset); - - size_t renumber_map_start_offset = - renumber_map_label_offsets ? (*renumber_map_label_offsets).element(i, handle.get_stream()) - : size_t{0}; - size_t renumber_map_end_offset = - renumber_map_label_offsets - ? 
(*renumber_map_label_offsets).element(i + 1, handle.get_stream()) - : renumber_map.size(); - auto this_label_renumber_map = - raft::device_span(renumber_map.data() + renumber_map_start_offset, - renumber_map_end_offset - renumber_map_start_offset); - - // check un-renumbering recovers the original edge list - - auto pair_first = thrust::make_zip_iterator(this_label_org_edgelist_srcs.begin(), - this_label_renumbered_edgelist_srcs.begin()); - auto num_renumber_errors = - thrust::count_if(handle.get_thrust_policy(), - pair_first, - pair_first + this_label_org_edgelist_srcs.size(), - [this_label_renumber_map] __device__(auto pair) { - auto org = thrust::get<0>(pair); - auto renumbered = thrust::get<1>(pair); - return this_label_renumber_map[renumbered] != org; - }); - ASSERT_TRUE(num_renumber_errors == 0) << "Renumber error in edge list sources."; - - pair_first = thrust::make_zip_iterator(this_label_org_edgelist_dsts.begin(), - this_label_renumbered_edgelist_dsts.begin()); - num_renumber_errors = thrust::count_if(handle.get_thrust_policy(), - pair_first, - pair_first + this_label_org_edgelist_dsts.size(), - [this_label_renumber_map] __device__(auto pair) { - auto org = thrust::get<0>(pair); - auto renumbered = thrust::get<1>(pair); - return this_label_renumber_map[renumbered] != org; - }); - ASSERT_TRUE(num_renumber_errors == 0) << "Renumber error in edge list destinations."; - - // Check the invariants in renumber_map - // Say we found the minimum (primary key:hop, secondary key:flag) pairs for every unique - // vertices, where flag is 0 for sources and 1 for destinations. Then, vertices with smaller - // (hop, flag) pairs should be renumbered to smaller numbers than vertices with larger (hop, - // flag) pairs. - - rmm::device_uvector unique_srcs(this_label_org_edgelist_srcs.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_org_edgelist_srcs.begin(), - this_label_org_edgelist_srcs.end(), - unique_srcs.begin()); - std::optional> unique_src_hops = - this_label_edgelist_hops ? std::make_optional>( - (*this_label_edgelist_hops).size(), handle.get_stream()) - : std::nullopt; - if (this_label_edgelist_hops) { - thrust::copy(handle.get_thrust_policy(), - (*this_label_edgelist_hops).begin(), - (*this_label_edgelist_hops).end(), - (*unique_src_hops).begin()); - - auto pair_first = - thrust::make_zip_iterator(unique_srcs.begin(), (*unique_src_hops).begin()); - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_srcs.size()); - unique_srcs.resize( - thrust::distance(unique_srcs.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_srcs.begin(), - unique_srcs.end(), - (*unique_src_hops).begin()))), - handle.get_stream()); - (*unique_src_hops).resize(unique_srcs.size(), handle.get_stream()); - } else { - thrust::sort(handle.get_thrust_policy(), unique_srcs.begin(), unique_srcs.end()); - unique_srcs.resize( - thrust::distance( - unique_srcs.begin(), - thrust::unique(handle.get_thrust_policy(), unique_srcs.begin(), unique_srcs.end())), - handle.get_stream()); - } - - rmm::device_uvector unique_dsts(this_label_org_edgelist_dsts.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_org_edgelist_dsts.begin(), - this_label_org_edgelist_dsts.end(), - unique_dsts.begin()); - std::optional> unique_dst_hops = - this_label_edgelist_hops ? 
std::make_optional>( - (*this_label_edgelist_hops).size(), handle.get_stream()) - : std::nullopt; - if (this_label_edgelist_hops) { - thrust::copy(handle.get_thrust_policy(), - (*this_label_edgelist_hops).begin(), - (*this_label_edgelist_hops).end(), - (*unique_dst_hops).begin()); - - auto pair_first = - thrust::make_zip_iterator(unique_dsts.begin(), (*unique_dst_hops).begin()); - thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_dsts.size()); - unique_dsts.resize( - thrust::distance(unique_dsts.begin(), - thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - (*unique_dst_hops).begin()))), - handle.get_stream()); - (*unique_dst_hops).resize(unique_dsts.size(), handle.get_stream()); - } else { - thrust::sort(handle.get_thrust_policy(), unique_dsts.begin(), unique_dsts.end()); - unique_dsts.resize( - thrust::distance( - unique_dsts.begin(), - thrust::unique(handle.get_thrust_policy(), unique_dsts.begin(), unique_dsts.end())), - handle.get_stream()); - } - - rmm::device_uvector sorted_org_vertices(this_label_renumber_map.size(), - handle.get_stream()); - rmm::device_uvector matching_renumbered_vertices(sorted_org_vertices.size(), - handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - this_label_renumber_map.begin(), - this_label_renumber_map.end(), - sorted_org_vertices.begin()); - thrust::sequence(handle.get_thrust_policy(), - matching_renumbered_vertices.begin(), - matching_renumbered_vertices.end(), - vertex_t{0}); - thrust::sort_by_key(handle.get_thrust_policy(), - sorted_org_vertices.begin(), - sorted_org_vertices.end(), - matching_renumbered_vertices.begin()); - - if (this_label_edgelist_hops) { - rmm::device_uvector merged_vertices(unique_srcs.size() + unique_dsts.size(), - handle.get_stream()); - rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); - rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); - - auto src_triplet_first = - thrust::make_zip_iterator(unique_srcs.begin(), - (*unique_src_hops).begin(), - thrust::make_constant_iterator(int8_t{0})); - auto dst_triplet_first = - thrust::make_zip_iterator(unique_dsts.begin(), - (*unique_dst_hops).begin(), - thrust::make_constant_iterator(int8_t{1})); - thrust::merge(handle.get_thrust_policy(), - src_triplet_first, - src_triplet_first + unique_srcs.size(), - dst_triplet_first, - dst_triplet_first + unique_dsts.size(), - thrust::make_zip_iterator( - merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); - merged_vertices.resize( - thrust::distance( - merged_vertices.begin(), - thrust::get<0>(thrust::unique_by_key( - handle.get_thrust_policy(), - merged_vertices.begin(), - merged_vertices.end(), - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), - handle.get_stream()); - merged_hops.resize(merged_vertices.size(), handle.get_stream()); - merged_flags.resize(merged_vertices.size(), handle.get_stream()); - - auto sort_key_first = - thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); - thrust::sort_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - merged_vertices.begin()); - - auto num_unique_keys = thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(merged_hops.size()), - cugraph::detail::is_first_in_run_t{sort_key_first}); - rmm::device_uvector min_vertices(num_unique_keys, handle.get_stream()); - rmm::device_uvector 
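// Illustrative sketch (not part of the patch; data is hypothetical). The correctness check
// builds an inverse of the renumber map: sort the original vertex ids while carrying their
// positions (the renumbered ids) along, then look an original id up with lower_bound. A
// host-side analogue of the sort_by_key over sorted_org_vertices /
// matching_renumbered_vertices:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
  std::vector<int> renumber_map{7, 3, 9, 1};  // renumbered id i maps back to renumber_map[i]

  // (original id, renumbered id) pairs sorted by original id
  std::vector<std::pair<int, int>> inverse;
  for (int i = 0; i < static_cast<int>(renumber_map.size()); ++i) {
    inverse.emplace_back(renumber_map[i], i);
  }
  std::sort(inverse.begin(), inverse.end());

  int const org = 9;  // look up the renumbered id of original vertex 9
  auto it       = std::lower_bound(
    inverse.begin(), inverse.end(), org, [](auto const& p, int v) { return p.first < v; });
  std::printf("original %d -> renumbered %d\n", org, it->second);  // prints 2
  return 0;
}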
max_vertices(num_unique_keys, handle.get_stream()); - - auto renumbered_merged_vertex_first = thrust::make_transform_iterator( - merged_vertices.begin(), - [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t src) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), src); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }); - - thrust::reduce_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - renumbered_merged_vertex_first, - thrust::make_discard_iterator(), - min_vertices.begin(), - thrust::equal_to>{}, - thrust::minimum{}); - thrust::reduce_by_key(handle.get_thrust_policy(), - sort_key_first, - sort_key_first + merged_hops.size(), - renumbered_merged_vertex_first, - thrust::make_discard_iterator(), - max_vertices.begin(), - thrust::equal_to>{}, - thrust::maximum{}); - - auto num_violations = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{1}), - thrust::make_counting_iterator(min_vertices.size()), - [min_vertices = raft::device_span(min_vertices.data(), - min_vertices.size()), - max_vertices = raft::device_span( - max_vertices.data(), max_vertices.size())] __device__(size_t i) { - return min_vertices[i] <= max_vertices[i - 1]; - }); - - ASSERT_TRUE(num_violations == 0) - << "Invariant violated, a vertex with a smaller (hop,flag) pair is renumbered to a " - "larger value than a vertex with a larger (hop, flag) pair."; - } else { - unique_dsts.resize( - thrust::distance( - unique_dsts.begin(), - thrust::remove_if(handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - [sorted_unique_srcs = raft::device_span( - unique_srcs.data(), unique_srcs.size())] __device__(auto dst) { - return thrust::binary_search(thrust::seq, - sorted_unique_srcs.begin(), - sorted_unique_srcs.end(), - dst); - })), - handle.get_stream()); - - auto max_src_renumbered_vertex = thrust::transform_reduce( - handle.get_thrust_policy(), - unique_srcs.begin(), - unique_srcs.end(), - [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t src) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), src); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }, - std::numeric_limits::lowest(), - thrust::maximum{}); - - auto min_dst_renumbered_vertex = thrust::transform_reduce( - handle.get_thrust_policy(), - unique_dsts.begin(), - unique_dsts.end(), - [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), - sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t dst) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), dst); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), - it)]; - }, - std::numeric_limits::max(), - thrust::minimum{}); - - ASSERT_TRUE(max_src_renumbered_vertex < min_dst_renumbered_vertex) - << "Invariants violated, a 
source vertex is renumbered to a non-smaller value than a " - "vertex that appear only in the edge list destinations."; - } - } - } - } -}; - -TEST_P(Tests_RenumberSampledEdgelist, CheckInt32) -{ - auto param = GetParam(); - run_current_test(param); -} - -TEST_P(Tests_RenumberSampledEdgelist, CheckInt64) -{ - auto param = GetParam(); - run_current_test(param); -} - -INSTANTIATE_TEST_SUITE_P( - small_test, - Tests_RenumberSampledEdgelist, - ::testing::Values(RenumberSampledEdgelist_Usecase{1024, 4096, 1, 1, true}, - RenumberSampledEdgelist_Usecase{1024, 4096, 3, 1, true}, - RenumberSampledEdgelist_Usecase{1024, 32768, 1, 256, true}, - RenumberSampledEdgelist_Usecase{1024, 32768, 3, 256, true})); - -INSTANTIATE_TEST_SUITE_P( - benchmark_test, - Tests_RenumberSampledEdgelist, - ::testing::Values(RenumberSampledEdgelist_Usecase{1 << 20, 1 << 20, 1, 1, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 20, 5, 1, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 24, 1, 1 << 20, false}, - RenumberSampledEdgelist_Usecase{1 << 20, 1 << 24, 5, 1 << 20, false})); - -CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu new file mode 100644 index 00000000000..422fe953b20 --- /dev/null +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -0,0 +1,1457 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct SamplingPostProcessing_Usecase { + size_t num_labels{}; + size_t num_seeds_per_label{}; + std::vector fanouts{{-1}}; + bool sample_with_replacement{false}; + + bool src_is_major{true}; + bool compress_per_hop{false}; + bool doubly_compress{false}; + bool check_correctness{true}; +}; + +template +bool compare_edgelist(raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_weights, + raft::device_span renumbered_edgelist_srcs, + raft::device_span renumbered_edgelist_dsts, + std::optional> renumbered_edgelist_weights, + std::optional> renumber_map) +{ + if (org_edgelist_srcs.size() != renumbered_edgelist_srcs.size()) { return false; } + + rmm::device_uvector sorted_org_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_srcs.begin(), + org_edgelist_srcs.end(), + sorted_org_edgelist_srcs.begin()); + rmm::device_uvector sorted_org_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_dsts.begin(), + org_edgelist_dsts.end(), + sorted_org_edgelist_dsts.begin()); + auto sorted_org_edgelist_weights = org_edgelist_weights + ? 
std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + if (sorted_org_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_weights).begin(), + (*org_edgelist_weights).end(), + (*sorted_org_edgelist_weights).begin()); + } + + if (sorted_org_edgelist_weights) { + auto sorted_org_edge_first = thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), + sorted_org_edgelist_dsts.begin(), + (*sorted_org_edgelist_weights).begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size()); + } else { + auto sorted_org_edge_first = + thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), sorted_org_edgelist_dsts.begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size()); + } + + rmm::device_uvector sorted_unrenumbered_edgelist_srcs(renumbered_edgelist_srcs.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumbered_edgelist_srcs.begin(), + renumbered_edgelist_srcs.end(), + sorted_unrenumbered_edgelist_srcs.begin()); + rmm::device_uvector sorted_unrenumbered_edgelist_dsts(renumbered_edgelist_dsts.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumbered_edgelist_dsts.begin(), + renumbered_edgelist_dsts.end(), + sorted_unrenumbered_edgelist_dsts.begin()); + auto sorted_unrenumbered_edgelist_weights = + renumbered_edgelist_weights ? std::make_optional>( + (*renumbered_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + if (sorted_unrenumbered_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*renumbered_edgelist_weights).begin(), + (*renumbered_edgelist_weights).end(), + (*sorted_unrenumbered_edgelist_weights).begin()); + } + + if (renumber_map) { + cugraph::unrenumber_int_vertices( + handle, + sorted_unrenumbered_edgelist_srcs.data(), + sorted_unrenumbered_edgelist_srcs.size(), + (*renumber_map).data(), + std::vector{static_cast((*renumber_map).size())}); + cugraph::unrenumber_int_vertices( + handle, + sorted_unrenumbered_edgelist_dsts.data(), + sorted_unrenumbered_edgelist_dsts.size(), + (*renumber_map).data(), + std::vector{static_cast((*renumber_map).size())}); + } + + if (sorted_unrenumbered_edgelist_weights) { + auto sorted_unrenumbered_edge_first = + thrust::make_zip_iterator(sorted_unrenumbered_edgelist_srcs.begin(), + sorted_unrenumbered_edgelist_dsts.begin(), + (*sorted_unrenumbered_edgelist_weights).begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_unrenumbered_edge_first, + sorted_unrenumbered_edge_first + sorted_unrenumbered_edgelist_srcs.size()); + + auto sorted_org_edge_first = thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), + sorted_org_edgelist_dsts.begin(), + (*sorted_org_edgelist_weights).begin()); + return thrust::equal(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size(), + sorted_unrenumbered_edge_first); + } else { + auto sorted_unrenumbered_edge_first = thrust::make_zip_iterator( + sorted_unrenumbered_edgelist_srcs.begin(), sorted_unrenumbered_edgelist_dsts.begin()); + thrust::sort(handle.get_thrust_policy(), + sorted_unrenumbered_edge_first, + sorted_unrenumbered_edge_first + sorted_unrenumbered_edgelist_srcs.size()); + + auto sorted_org_edge_first = + thrust::make_zip_iterator(sorted_org_edgelist_srcs.begin(), sorted_org_edgelist_dsts.begin()); + return 
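// Illustrative sketch (not part of the patch; data is hypothetical). compare_edgelist
// verifies that post-processing only reordered and renumbered edges: after mapping the
// renumbered ids back through the renumber map, both edge lists are brought into a
// canonical order (sorted by (src, dst[, weight]) tuples) and compared element-wise.
// The same idea on plain host data:

#include <algorithm>
#include <utility>
#include <vector>

int main()
{
  std::vector<std::pair<int, int>> org{{0, 1}, {2, 3}, {0, 4}};
  std::vector<std::pair<int, int>> unrenumbered{{0, 4}, {0, 1}, {2, 3}};  // same edges, new order

  std::sort(org.begin(), org.end());
  std::sort(unrenumbered.begin(), unrenumbered.end());
  bool const same = (org == unrenumbered);  // true: the two edge sets match
  return same ? 0 : 1;
}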
thrust::equal(handle.get_thrust_policy(), + sorted_org_edge_first, + sorted_org_edge_first + sorted_org_edgelist_srcs.size(), + sorted_unrenumbered_edge_first); + } +} + +template +bool check_renumber_map_invariants( + raft::handle_t const& handle, + raft::device_span org_edgelist_srcs, + raft::device_span org_edgelist_dsts, + std::optional> org_edgelist_hops, + raft::device_span renumber_map, + bool src_is_major) +{ + // Check the invariants in renumber_map + // Say we found the minimum (primary key:hop, secondary key:flag) pairs for every unique vertices, + // where flag is 0 for sources and 1 for destinations. Then, vertices with smaller (hop, flag) + // pairs should be renumbered to smaller numbers than vertices with larger (hop, flag) pairs. + auto org_edgelist_majors = src_is_major ? org_edgelist_srcs : org_edgelist_dsts; + auto org_edgelist_minors = src_is_major ? org_edgelist_dsts : org_edgelist_srcs; + + rmm::device_uvector unique_majors(org_edgelist_majors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_majors.begin(), + org_edgelist_majors.end(), + unique_majors.begin()); + std::optional> unique_major_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).size(), handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin(), + (*org_edgelist_hops).end(), + (*unique_major_hops).begin()); + + auto pair_first = + thrust::make_zip_iterator(unique_majors.begin(), (*unique_major_hops).begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_majors.size()); + unique_majors.resize( + thrust::distance(unique_majors.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_majors.begin(), + unique_majors.end(), + (*unique_major_hops).begin()))), + handle.get_stream()); + (*unique_major_hops).resize(unique_majors.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), unique_majors.begin(), unique_majors.end()); + unique_majors.resize( + thrust::distance( + unique_majors.begin(), + thrust::unique(handle.get_thrust_policy(), unique_majors.begin(), unique_majors.end())), + handle.get_stream()); + } + + rmm::device_uvector unique_minors(org_edgelist_minors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + org_edgelist_minors.begin(), + org_edgelist_minors.end(), + unique_minors.begin()); + std::optional> unique_minor_hops = + org_edgelist_hops ? 
std::make_optional>( + (*org_edgelist_hops).size(), handle.get_stream()) + : std::nullopt; + if (org_edgelist_hops) { + thrust::copy(handle.get_thrust_policy(), + (*org_edgelist_hops).begin(), + (*org_edgelist_hops).end(), + (*unique_minor_hops).begin()); + + auto pair_first = + thrust::make_zip_iterator(unique_minors.begin(), (*unique_minor_hops).begin()); + thrust::sort(handle.get_thrust_policy(), pair_first, pair_first + unique_minors.size()); + unique_minors.resize( + thrust::distance(unique_minors.begin(), + thrust::get<0>(thrust::unique_by_key(handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + (*unique_minor_hops).begin()))), + handle.get_stream()); + (*unique_minor_hops).resize(unique_minors.size(), handle.get_stream()); + } else { + thrust::sort(handle.get_thrust_policy(), unique_minors.begin(), unique_minors.end()); + unique_minors.resize( + thrust::distance( + unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), unique_minors.begin(), unique_minors.end())), + handle.get_stream()); + } + + rmm::device_uvector sorted_org_vertices(renumber_map.size(), handle.get_stream()); + rmm::device_uvector matching_renumbered_vertices(sorted_org_vertices.size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + renumber_map.begin(), + renumber_map.end(), + sorted_org_vertices.begin()); + thrust::sequence(handle.get_thrust_policy(), + matching_renumbered_vertices.begin(), + matching_renumbered_vertices.end(), + vertex_t{0}); + thrust::sort_by_key(handle.get_thrust_policy(), + sorted_org_vertices.begin(), + sorted_org_vertices.end(), + matching_renumbered_vertices.begin()); + + if (org_edgelist_hops) { + rmm::device_uvector merged_vertices(unique_majors.size() + unique_minors.size(), + handle.get_stream()); + rmm::device_uvector merged_hops(merged_vertices.size(), handle.get_stream()); + rmm::device_uvector merged_flags(merged_vertices.size(), handle.get_stream()); + + auto major_triplet_first = thrust::make_zip_iterator(unique_majors.begin(), + (*unique_major_hops).begin(), + thrust::make_constant_iterator(int8_t{0})); + auto minor_triplet_first = thrust::make_zip_iterator(unique_minors.begin(), + (*unique_minor_hops).begin(), + thrust::make_constant_iterator(int8_t{1})); + thrust::merge(handle.get_thrust_policy(), + major_triplet_first, + major_triplet_first + unique_majors.size(), + minor_triplet_first, + minor_triplet_first + unique_minors.size(), + thrust::make_zip_iterator( + merged_vertices.begin(), merged_hops.begin(), merged_flags.begin())); + merged_vertices.resize( + thrust::distance(merged_vertices.begin(), + thrust::get<0>(thrust::unique_by_key( + handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end(), + thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin())))), + handle.get_stream()); + merged_hops.resize(merged_vertices.size(), handle.get_stream()); + merged_flags.resize(merged_vertices.size(), handle.get_stream()); + + auto sort_key_first = thrust::make_zip_iterator(merged_hops.begin(), merged_flags.begin()); + thrust::sort_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + merged_vertices.begin()); + + auto num_unique_keys = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(merged_hops.size()), + cugraph::detail::is_first_in_run_t{sort_key_first}); + rmm::device_uvector min_vertices(num_unique_keys, handle.get_stream()); + rmm::device_uvector 
max_vertices(num_unique_keys, handle.get_stream()); + + auto renumbered_merged_vertex_first = thrust::make_transform_iterator( + merged_vertices.begin(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }); + + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + renumbered_merged_vertex_first, + thrust::make_discard_iterator(), + min_vertices.begin(), + thrust::equal_to>{}, + thrust::minimum{}); + thrust::reduce_by_key(handle.get_thrust_policy(), + sort_key_first, + sort_key_first + merged_hops.size(), + renumbered_merged_vertex_first, + thrust::make_discard_iterator(), + max_vertices.begin(), + thrust::equal_to>{}, + thrust::maximum{}); + + auto num_violations = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{1}), + thrust::make_counting_iterator(min_vertices.size()), + [min_vertices = raft::device_span(min_vertices.data(), min_vertices.size()), + max_vertices = raft::device_span(max_vertices.data(), + max_vertices.size())] __device__(size_t i) { + return min_vertices[i] <= max_vertices[i - 1]; + }); + + return (num_violations == 0); + } else { + unique_minors.resize( + thrust::distance( + unique_minors.begin(), + thrust::remove_if(handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + [sorted_unique_majors = raft::device_span( + unique_majors.data(), unique_majors.size())] __device__(auto minor) { + return thrust::binary_search(thrust::seq, + sorted_unique_majors.begin(), + sorted_unique_majors.end(), + minor); + })), + handle.get_stream()); + + auto max_major_renumbered_vertex = thrust::transform_reduce( + handle.get_thrust_policy(), + unique_majors.begin(), + unique_majors.end(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }, + std::numeric_limits::lowest(), + thrust::maximum{}); + + auto min_minor_renumbered_vertex = thrust::transform_reduce( + handle.get_thrust_policy(), + unique_minors.begin(), + unique_minors.end(), + [sorted_org_vertices = + raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t minor) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), minor); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + }, + std::numeric_limits::max(), + thrust::minimum{}); + + return (max_major_renumbered_vertex < min_minor_renumbered_vertex); + } +} + +template +class Tests_SamplingPostProcessing + : public ::testing::TestWithParam> { + public: + Tests_SamplingPostProcessing() {} + + 
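+  // run_current_test() below samples an edge list with uniform_neighbor_sample() and then
+  // exercises three post-processing paths on copies of that edge list:
+  //   1. renumber_and_sort_sampled_edgelist(),
+  //   2. renumber_and_compress_sampled_edgelist(), and
+  //   3. sort_sampled_edgelist(),
+  // checking the returned offset arrays, the per-(label, hop) ordering of the output edges,
+  // that un-renumbering (where a renumber map is produced) recovers the original edge list,
+  // and the renumber map invariants.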
static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test( + std::tuple const& param) + { + using label_t = int32_t; + using weight_t = float; + using edge_id_t = vertex_t; + using edge_type_t = int32_t; + + bool constexpr store_transposed = false; + bool constexpr renumber = true; + bool constexpr test_weighted = true; + + auto [sampling_post_processing_usecase, input_usecase] = param; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + auto [graph, edge_weights, d_renumber_map_labels] = + cugraph::test::construct_graph( + handle, input_usecase, test_weighted, renumber); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto graph_view = graph.view(); + auto edge_weight_view = + edge_weights ? std::make_optional((*edge_weights).view()) : std::nullopt; + + raft::random::RngState rng_state(0); + + rmm::device_uvector starting_vertices( + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.num_seeds_per_label, + handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + starting_vertices.data(), + starting_vertices.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + auto starting_vertex_labels = (sampling_post_processing_usecase.num_labels > 1) + ? std::make_optional>( + starting_vertices.size(), handle.get_stream()) + : std::nullopt; + if (starting_vertex_labels) { + thrust::tabulate( + handle.get_thrust_policy(), + (*starting_vertex_labels).begin(), + (*starting_vertex_labels).end(), + [num_seeds_per_label = sampling_post_processing_usecase.num_seeds_per_label] __device__( + size_t i) { return static_cast(i / num_seeds_per_label); }); + } + + rmm::device_uvector org_edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector org_edgelist_dsts(0, handle.get_stream()); + std::optional> org_edgelist_weights{std::nullopt}; + std::optional> org_edgelist_hops{std::nullopt}; + std::optional> org_labels{std::nullopt}; + std::optional> org_edgelist_label_offsets{std::nullopt}; + std::tie(org_edgelist_srcs, + org_edgelist_dsts, + org_edgelist_weights, + std::ignore, + std::ignore, + org_edgelist_hops, + org_labels, + org_edgelist_label_offsets) = cugraph::uniform_neighbor_sample( + handle, + graph_view, + edge_weight_view, + std::nullopt, + std::nullopt, + raft::device_span(starting_vertices.data(), starting_vertices.size()), + starting_vertex_labels ? std::make_optional>( + (*starting_vertex_labels).data(), (*starting_vertex_labels).size()) + : std::nullopt, + std::nullopt, + raft::host_span(sampling_post_processing_usecase.fanouts.data(), + sampling_post_processing_usecase.fanouts.size()), + rng_state, + sampling_post_processing_usecase.fanouts.size() > 1, + sampling_post_processing_usecase.sample_with_replacement, + (!sampling_post_processing_usecase.compress_per_hop && + (sampling_post_processing_usecase.fanouts.size() > 1)) + ? 
cugraph::prior_sources_behavior_t::EXCLUDE + : cugraph::prior_sources_behavior_t::DEFAULT, + false); + + if (!sampling_post_processing_usecase.src_is_major) { + std::swap(org_edgelist_srcs, org_edgelist_dsts); + } + + starting_vertices.resize(0, handle.get_stream()); + starting_vertices.shrink_to_fit(handle.get_stream()); + starting_vertex_labels = std::nullopt; + + { + rmm::device_uvector renumbered_and_sorted_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + rmm::device_uvector renumbered_and_sorted_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + auto renumbered_and_sorted_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> renumbered_and_sorted_edgelist_edge_ids{ + std::nullopt}; + std::optional> renumbered_and_sorted_edgelist_edge_types{ + std::nullopt}; + auto renumbered_and_sorted_edgelist_hops = + org_edgelist_hops + ? std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(renumbered_and_sorted_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(renumbered_and_sorted_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (renumbered_and_sorted_edgelist_weights) { + raft::copy((*renumbered_and_sorted_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (renumbered_and_sorted_edgelist_hops) { + raft::copy(std::get<0>(*renumbered_and_sorted_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> renumbered_and_sorted_edgelist_label_hop_offsets{ + std::nullopt}; + rmm::device_uvector renumbered_and_sorted_renumber_map(0, handle.get_stream()); + std::optional> renumbered_and_sorted_renumber_map_label_offsets{ + std::nullopt}; + + { + size_t free_size{}; + size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + std::cout << "free_size=" << free_size / (1024.0 * 1024.0 * 1024.0) + << "GB total_size=" << total_size / (1024.0 * 1024.0 * 1024.0) << "GB." + << std::endl; + } + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Renumber and sort sampled edgelist"); + } + + std::tie(renumbered_and_sorted_edgelist_srcs, + renumbered_and_sorted_edgelist_dsts, + renumbered_and_sorted_edgelist_weights, + renumbered_and_sorted_edgelist_edge_ids, + renumbered_and_sorted_edgelist_edge_types, + renumbered_and_sorted_edgelist_label_hop_offsets, + renumbered_and_sorted_renumber_map, + renumbered_and_sorted_renumber_map_label_offsets) = + cugraph::renumber_and_sort_sampled_edgelist( + handle, + std::move(renumbered_and_sorted_edgelist_srcs), + std::move(renumbered_and_sorted_edgelist_dsts), + std::move(renumbered_and_sorted_edgelist_weights), + std::move(renumbered_and_sorted_edgelist_edge_ids), + std::move(renumbered_and_sorted_edgelist_edge_types), + std::move(renumbered_and_sorted_edgelist_hops), + org_edgelist_label_offsets + ? 
std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (renumbered_and_sorted_edgelist_label_hop_offsets) { + ASSERT_TRUE((*renumbered_and_sorted_edgelist_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Renumbered and sorted edge list (label,hop) offset array size should coincide with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_sorted_edgelist_label_hop_offsets).begin(), + (*renumbered_and_sorted_edgelist_label_hop_offsets).end())) + << "Renumbered and sorted edge list (label,hop) offset array values should be " + "non-decreasing."; + } + + if (renumbered_and_sorted_renumber_map_label_offsets) { + ASSERT_TRUE((*renumbered_and_sorted_renumber_map_label_offsets).size() == + sampling_post_processing_usecase.num_labels + 1) + << "Renumbered and sorted offset (label, hop) offset array size should coincide with " + "the number of labels + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_sorted_renumber_map_label_offsets).begin(), + (*renumbered_and_sorted_renumber_map_label_offsets).end())) + << "Renumbered and sorted renumber map label offset array values should be " + "non-decreasing."; + + ASSERT_TRUE( + (*renumbered_and_sorted_renumber_map_label_offsets).back_element(handle.get_stream()) == + renumbered_and_sorted_renumber_map.size()) + << "Renumbered and sorted renumber map label offset array's last value should coincide " + "with the renumber map size."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? 
std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + auto this_label_output_edgelist_srcs = raft::device_span( + renumbered_and_sorted_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_dsts = raft::device_span( + renumbered_and_sorted_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_weights = + renumbered_and_sorted_edgelist_weights + ? std::make_optional>( + (*renumbered_and_sorted_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + size_t renumber_map_start_offset = + renumbered_and_sorted_renumber_map_label_offsets + ? (*renumbered_and_sorted_renumber_map_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t renumber_map_end_offset = renumbered_and_sorted_renumber_map_label_offsets + ? (*renumbered_and_sorted_renumber_map_label_offsets) + .element(i + 1, handle.get_stream()) + : renumbered_and_sorted_renumber_map.size(); + auto this_label_output_renumber_map = raft::device_span( + renumbered_and_sorted_renumber_map.data() + renumber_map_start_offset, + renumber_map_end_offset - renumber_map_start_offset); + + // check whether the edges are properly sorted + + auto this_label_output_edgelist_majors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_srcs + : this_label_output_edgelist_dsts; + auto this_label_output_edgelist_minors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_dsts + : this_label_output_edgelist_srcs; + + if (this_label_org_edgelist_hops) { + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + for (size_t j = 0; j < num_hops; ++j) { + auto hop_start_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) - + (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops, handle.get_stream()); + auto hop_end_offset = (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) - + (*renumbered_and_sorted_edgelist_label_hop_offsets) + .element(i * num_hops, handle.get_stream()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first + hop_start_offset, + edge_first + hop_end_offset)) + << "Renumbered and sorted output edges are not properly sorted."; + } + } else { + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first, + edge_first + this_label_output_edgelist_majors.size())) + << "Renumbered and sorted output edges are not properly sorted."; + } + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE(compare_edgelist(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + this_label_output_edgelist_srcs, + this_label_output_edgelist_dsts, + this_label_output_edgelist_weights, + std::make_optional(this_label_output_renumber_map))) + << "Unrenumbering the renumbered and sorted edge list does not recover the original " + "edgelist."; + + // Check the invariants in renumber_map + + 
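+          // For every unique vertex, take the smallest (hop, flag) pair it appears with (flag is
+          // 0 for sources/majors and 1 for destinations/minors); a vertex with a smaller
+          // (hop, flag) pair must receive a smaller renumbered ID. For example, a vertex first
+          // seen as a hop-0 source must precede one first seen only as a hop-0 destination,
+          // which in turn precedes any vertex first seen in hop 1. When hops are not tracked,
+          // this reduces to: every source (major) vertex is renumbered before any vertex that
+          // appears only as a destination (minor).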
ASSERT_TRUE(check_renumber_map_invariants(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_hops, + this_label_output_renumber_map, + sampling_post_processing_usecase.src_is_major)) + << "Renumbered and sorted output renumber map violates invariants."; + } + } + } + + { + rmm::device_uvector renumbered_and_compressed_edgelist_srcs( + org_edgelist_srcs.size(), handle.get_stream()); + rmm::device_uvector renumbered_and_compressed_edgelist_dsts( + org_edgelist_dsts.size(), handle.get_stream()); + auto renumbered_and_compressed_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> renumbered_and_compressed_edgelist_edge_ids{ + std::nullopt}; + std::optional> renumbered_and_compressed_edgelist_edge_types{ + std::nullopt}; + auto renumbered_and_compressed_edgelist_hops = + org_edgelist_hops + ? std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(renumbered_and_compressed_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(renumbered_and_compressed_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (renumbered_and_compressed_edgelist_weights) { + raft::copy((*renumbered_and_compressed_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (renumbered_and_compressed_edgelist_hops) { + raft::copy(std::get<0>(*renumbered_and_compressed_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> renumbered_and_compressed_nzd_vertices{ + std::nullopt}; + rmm::device_uvector renumbered_and_compressed_offsets(0, handle.get_stream()); + rmm::device_uvector renumbered_and_compressed_edgelist_minors(0, + handle.get_stream()); + std::optional> renumbered_and_compressed_offset_label_hop_offsets{ + std::nullopt}; + rmm::device_uvector renumbered_and_compressed_renumber_map(0, handle.get_stream()); + std::optional> + renumbered_and_compressed_renumber_map_label_offsets{std::nullopt}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Renumber and compressed sampled edgelist"); + } + + std::tie(renumbered_and_compressed_nzd_vertices, + renumbered_and_compressed_offsets, + renumbered_and_compressed_edgelist_minors, + renumbered_and_compressed_edgelist_weights, + renumbered_and_compressed_edgelist_edge_ids, + renumbered_and_compressed_edgelist_edge_types, + renumbered_and_compressed_offset_label_hop_offsets, + renumbered_and_compressed_renumber_map, + renumbered_and_compressed_renumber_map_label_offsets) = + cugraph::renumber_and_compress_sampled_edgelist( + handle, + std::move(renumbered_and_compressed_edgelist_srcs), + std::move(renumbered_and_compressed_edgelist_dsts), + std::move(renumbered_and_compressed_edgelist_weights), + std::move(renumbered_and_compressed_edgelist_edge_ids), + std::move(renumbered_and_compressed_edgelist_edge_types), + std::move(renumbered_and_compressed_edgelist_hops), + org_edgelist_label_offsets + ? 
std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major, + sampling_post_processing_usecase.compress_per_hop, + sampling_post_processing_usecase.doubly_compress); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (renumbered_and_compressed_nzd_vertices) { + ASSERT_TRUE(renumbered_and_compressed_offsets.size() == + (*renumbered_and_compressed_nzd_vertices).size() + 1) + << "Renumbered and compressed offset array size should coincide with the number of " + "non-zero-degree vertices + 1."; + } + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + renumbered_and_compressed_offsets.begin(), + renumbered_and_compressed_offsets.end())) + << "Renumbered and compressed offset array values should be non-decreasing."; + + ASSERT_TRUE(renumbered_and_compressed_offsets.back_element(handle.get_stream()) == + renumbered_and_compressed_edgelist_minors.size()) + << "Renumbered and compressed offset array's last value should coincide with the number " + "of " + "edges."; + + if (renumbered_and_compressed_offset_label_hop_offsets) { + ASSERT_TRUE((*renumbered_and_compressed_offset_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Renumbered and compressed offset (label,hop) offset array size should coincide " + "with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE( + thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_compressed_offset_label_hop_offsets).begin(), + (*renumbered_and_compressed_offset_label_hop_offsets).end())) + << "Renumbered and compressed offset (label,hop) offset array values should be " + "non-decreasing."; + + ASSERT_TRUE((*renumbered_and_compressed_offset_label_hop_offsets) + .back_element(handle.get_stream()) == + renumbered_and_compressed_offsets.size() - 1) + << "Renumbered and compressed offset (label,hop) offset array's last value should " + "coincide with the offset array size - 1."; + } + + if (renumbered_and_compressed_renumber_map_label_offsets) { + ASSERT_TRUE((*renumbered_and_compressed_renumber_map_label_offsets).size() == + sampling_post_processing_usecase.num_labels + 1) + << "Renumbered and compressed offset (label, hop) offset array size should coincide " + "with " + "the number of labels + 1."; + + ASSERT_TRUE( + thrust::is_sorted(handle.get_thrust_policy(), + (*renumbered_and_compressed_renumber_map_label_offsets).begin(), + (*renumbered_and_compressed_renumber_map_label_offsets).end())) + << "Renumbered and compressed renumber map label offset array values should be " + "non-decreasing."; + + ASSERT_TRUE((*renumbered_and_compressed_renumber_map_label_offsets) + .back_element(handle.get_stream()) == + renumbered_and_compressed_renumber_map.size()) + << "Renumbered and compressed renumber map label offset array's last value should " + "coincide with the renumber map size."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ? 
(*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + rmm::device_uvector this_label_output_edgelist_srcs(0, handle.get_stream()); + rmm::device_uvector this_label_output_edgelist_dsts(0, handle.get_stream()); + auto this_label_output_edgelist_weights = + renumbered_and_compressed_edgelist_weights + ? std::make_optional>(0, handle.get_stream()) + : std::nullopt; + this_label_output_edgelist_srcs.reserve(edgelist_end_offset - edgelist_start_offset, + handle.get_stream()); + this_label_output_edgelist_dsts.reserve(edgelist_end_offset - edgelist_start_offset, + handle.get_stream()); + if (this_label_output_edgelist_weights) { + (*this_label_output_edgelist_weights) + .reserve(edgelist_end_offset - edgelist_start_offset, handle.get_stream()); + } + + // decompress + + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + for (size_t j = 0; j < num_hops; ++j) { + auto offset_start_offset = renumbered_and_compressed_offset_label_hop_offsets + ? (*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) + : size_t{0}; + auto offset_end_offset = renumbered_and_compressed_offset_label_hop_offsets + ? ((*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) + + 1) + : renumbered_and_compressed_offsets.size(); + + auto base_v = + (!sampling_post_processing_usecase.doubly_compress && + !sampling_post_processing_usecase.compress_per_hop && (j > 0)) + ? static_cast(offset_start_offset - + (*renumbered_and_compressed_offset_label_hop_offsets) + .element(i * num_hops, handle.get_stream())) + : vertex_t{0}; + + raft::device_span d_offsets( + renumbered_and_compressed_offsets.data() + offset_start_offset, + offset_end_offset - offset_start_offset); + std::vector h_offsets(d_offsets.size()); + raft::update_host( + h_offsets.data(), d_offsets.data(), h_offsets.size(), handle.get_stream()); + handle.sync_stream(); + + auto old_size = this_label_output_edgelist_srcs.size(); + this_label_output_edgelist_srcs.resize(old_size + (h_offsets.back() - h_offsets[0]), + handle.get_stream()); + this_label_output_edgelist_dsts.resize(this_label_output_edgelist_srcs.size(), + handle.get_stream()); + if (this_label_output_edgelist_weights) { + (*this_label_output_edgelist_weights) + .resize(this_label_output_edgelist_srcs.size(), handle.get_stream()); + } + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(h_offsets[0]), + thrust::make_counting_iterator(h_offsets.back()), + (sampling_post_processing_usecase.src_is_major + ? 
this_label_output_edgelist_srcs.begin() + : this_label_output_edgelist_dsts.begin()) + + old_size, + [offsets = raft::device_span(d_offsets.data(), d_offsets.size()), + nzd_vertices = + renumbered_and_compressed_nzd_vertices + ? thrust::make_optional>( + (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, + (offset_end_offset - offset_start_offset) - 1) + : thrust::nullopt, + base_v] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); + if (nzd_vertices) { + return (*nzd_vertices)[idx]; + } else { + return base_v + static_cast(idx); + } + }); + thrust::copy(handle.get_thrust_policy(), + renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0], + renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(), + (sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_dsts.begin() + : this_label_output_edgelist_srcs.begin()) + + old_size); + if (this_label_output_edgelist_weights) { + thrust::copy(handle.get_thrust_policy(), + (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets[0], + (*renumbered_and_compressed_edgelist_weights).begin() + h_offsets.back(), + (*this_label_output_edgelist_weights).begin() + old_size); + } + } + + size_t renumber_map_start_offset = + renumbered_and_compressed_renumber_map_label_offsets + ? (*renumbered_and_compressed_renumber_map_label_offsets) + .element(i, handle.get_stream()) + : size_t{0}; + size_t renumber_map_end_offset = + renumbered_and_compressed_renumber_map_label_offsets + ? (*renumbered_and_compressed_renumber_map_label_offsets) + .element(i + 1, handle.get_stream()) + : renumbered_and_compressed_renumber_map.size(); + auto this_label_output_renumber_map = raft::device_span( + renumbered_and_compressed_renumber_map.data() + renumber_map_start_offset, + renumber_map_end_offset - renumber_map_start_offset); + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE(compare_edgelist( + handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + raft::device_span(this_label_output_edgelist_srcs.data(), + this_label_output_edgelist_srcs.size()), + raft::device_span(this_label_output_edgelist_dsts.data(), + this_label_output_edgelist_dsts.size()), + this_label_output_edgelist_weights + ? std::make_optional>( + (*this_label_output_edgelist_weights).data(), + (*this_label_output_edgelist_weights).size()) + : std::nullopt, + std::make_optional(this_label_output_renumber_map))) + << "Unrenumbering the renumbered and sorted edge list does not recover the original " + "edgelist."; + + // Check the invariants in renumber_map + + ASSERT_TRUE(check_renumber_map_invariants(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_hops, + this_label_output_renumber_map, + sampling_post_processing_usecase.src_is_major)) + << "Renumbered and sorted output renumber map violates invariants."; + } + } + } + + { + rmm::device_uvector sorted_edgelist_srcs(org_edgelist_srcs.size(), + handle.get_stream()); + rmm::device_uvector sorted_edgelist_dsts(org_edgelist_dsts.size(), + handle.get_stream()); + auto sorted_edgelist_weights = org_edgelist_weights + ? 
std::make_optional>( + (*org_edgelist_weights).size(), handle.get_stream()) + : std::nullopt; + std::optional> sorted_edgelist_edge_ids{std::nullopt}; + std::optional> sorted_edgelist_edge_types{std::nullopt}; + auto sorted_edgelist_hops = + org_edgelist_hops + ? std::make_optional(std::make_tuple( + rmm::device_uvector((*org_edgelist_hops).size(), handle.get_stream()), + sampling_post_processing_usecase.fanouts.size())) + : std::nullopt; + + raft::copy(sorted_edgelist_srcs.data(), + org_edgelist_srcs.data(), + org_edgelist_srcs.size(), + handle.get_stream()); + raft::copy(sorted_edgelist_dsts.data(), + org_edgelist_dsts.data(), + org_edgelist_dsts.size(), + handle.get_stream()); + if (sorted_edgelist_weights) { + raft::copy((*sorted_edgelist_weights).data(), + (*org_edgelist_weights).data(), + (*org_edgelist_weights).size(), + handle.get_stream()); + } + if (sorted_edgelist_hops) { + raft::copy(std::get<0>(*sorted_edgelist_hops).data(), + (*org_edgelist_hops).data(), + (*org_edgelist_hops).size(), + handle.get_stream()); + } + + std::optional> sorted_edgelist_label_hop_offsets{std::nullopt}; + + { + size_t free_size{}; + size_t total_size{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free_size, &total_size)); + std::cout << "free_size=" << free_size / (1024.0 * 1024.0 * 1024.0) + << "GB total_size=" << total_size / (1024.0 * 1024.0 * 1024.0) << "GB." + << std::endl; + } + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Sort sampled edgelist"); + } + + std::tie(sorted_edgelist_srcs, + sorted_edgelist_dsts, + sorted_edgelist_weights, + sorted_edgelist_edge_ids, + sorted_edgelist_edge_types, + sorted_edgelist_label_hop_offsets) = + cugraph::sort_sampled_edgelist( + handle, + std::move(sorted_edgelist_srcs), + std::move(sorted_edgelist_dsts), + std::move(sorted_edgelist_weights), + std::move(sorted_edgelist_edge_ids), + std::move(sorted_edgelist_edge_types), + std::move(sorted_edgelist_hops), + org_edgelist_label_offsets + ? std::make_optional(std::make_tuple( + raft::device_span((*org_edgelist_label_offsets).data(), + (*org_edgelist_label_offsets).size()), + sampling_post_processing_usecase.num_labels)) + : std::nullopt, + sampling_post_processing_usecase.src_is_major); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (sampling_post_processing_usecase.check_correctness) { + if (sorted_edgelist_label_hop_offsets) { + ASSERT_TRUE((*sorted_edgelist_label_hop_offsets).size() == + sampling_post_processing_usecase.num_labels * + sampling_post_processing_usecase.fanouts.size() + + 1) + << "Sorted edge list (label,hop) offset array size should coincide with " + "the number of labels * the number of hops + 1."; + + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + (*sorted_edgelist_label_hop_offsets).begin(), + (*sorted_edgelist_label_hop_offsets).end())) + << "Sorted edge list (label,hop) offset array values should be " + "non-decreasing."; + } + + for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { + size_t edgelist_start_offset = + org_edgelist_label_offsets + ? (*org_edgelist_label_offsets).element(i, handle.get_stream()) + : size_t{0}; + size_t edgelist_end_offset = + org_edgelist_label_offsets + ? 
(*org_edgelist_label_offsets).element(i + 1, handle.get_stream()) + : org_edgelist_srcs.size(); + if (edgelist_start_offset == edgelist_end_offset) continue; + + auto this_label_org_edgelist_srcs = + raft::device_span(org_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_dsts = + raft::device_span(org_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_org_edgelist_hops = + org_edgelist_hops ? std::make_optional>( + (*org_edgelist_hops).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + auto this_label_org_edgelist_weights = + org_edgelist_weights ? std::make_optional>( + (*org_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + auto this_label_output_edgelist_srcs = + raft::device_span(sorted_edgelist_srcs.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_dsts = + raft::device_span(sorted_edgelist_dsts.data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset); + auto this_label_output_edgelist_weights = + sorted_edgelist_weights ? std::make_optional>( + (*sorted_edgelist_weights).data() + edgelist_start_offset, + edgelist_end_offset - edgelist_start_offset) + : std::nullopt; + + // check whether the edges are properly sorted + + auto this_label_output_edgelist_majors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_srcs + : this_label_output_edgelist_dsts; + auto this_label_output_edgelist_minors = sampling_post_processing_usecase.src_is_major + ? this_label_output_edgelist_dsts + : this_label_output_edgelist_srcs; + + if (this_label_org_edgelist_hops) { + auto num_hops = sampling_post_processing_usecase.fanouts.size(); + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + for (size_t j = 0; j < num_hops; ++j) { + auto hop_start_offset = + (*sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j, handle.get_stream()) - + (*sorted_edgelist_label_hop_offsets).element(i * num_hops, handle.get_stream()); + auto hop_end_offset = + (*sorted_edgelist_label_hop_offsets) + .element(i * num_hops + j + 1, handle.get_stream()) - + (*sorted_edgelist_label_hop_offsets).element(i * num_hops, handle.get_stream()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first + hop_start_offset, + edge_first + hop_end_offset)) + << "Renumbered and sorted output edges are not properly sorted."; + } + } else { + auto edge_first = thrust::make_zip_iterator(this_label_output_edgelist_majors.begin(), + this_label_output_edgelist_minors.begin()); + ASSERT_TRUE(thrust::is_sorted(handle.get_thrust_policy(), + edge_first, + edge_first + this_label_output_edgelist_majors.size())) + << "Renumbered and sorted output edges are not properly sorted."; + } + + // check whether renumbering recovers the original edge list + + ASSERT_TRUE( + compare_edgelist(handle, + this_label_org_edgelist_srcs, + this_label_org_edgelist_dsts, + this_label_org_edgelist_weights, + this_label_output_edgelist_srcs, + this_label_output_edgelist_dsts, + this_label_output_edgelist_weights, + std::optional>{std::nullopt})) + << "Sorted edge list does not coincide with the original edgelist."; + } + } + } + } +}; + +using Tests_SamplingPostProcessing_File = Tests_SamplingPostProcessing; 
+using Tests_SamplingPostProcessing_Rmat = Tests_SamplingPostProcessing; + +TEST_P(Tests_SamplingPostProcessing_File, CheckInt32Int32) +{ + run_current_test(override_File_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt32Int32) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt32Int64) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +TEST_P(Tests_SamplingPostProcessing_Rmat, CheckInt64Int64) +{ + run_current_test(override_Rmat_Usecase_with_cmd_line_arguments(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_SamplingPostProcessing_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 4, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 25}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, 
false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 25}, true, true, true, false, true}), + ::testing::Values(cugraph::test::File_Usecase("karate.mtx"), + cugraph::test::File_Usecase("dolphins.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_SamplingPostProcessing_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{1, 16, {5, 10, 15}, true, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {10}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, false, true, 
false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, false, true, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, false, true, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, false, false, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, false, true, true}, + SamplingPostProcessing_Usecase{32, 16, {5, 10, 15}, true, true, true, false, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, + Tests_SamplingPostProcessing_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values( + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {10}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{1, 64, {5, 10, 15}, true, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {10}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, 
false, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, false, true, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, false, true, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, false, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, false, true, false}, + SamplingPostProcessing_Usecase{256, 64, {5, 10, 15}, true, true, true, false, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() From b2e85bff39a411d02f5a167f7bfab376ae9ccb67 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Tue, 19 Sep 2023 13:33:01 -0400 Subject: [PATCH 39/72] Update `cugraph-dgl` conv layers to use improved graph class (#3849) This PR: - Removes the usage of the deprecated `StaticCSC` and `SampledCSC` - Support creating CSR and storing edge information in SparseGraph - clean up unit tests - Adds GATv2Conv layer - Adds `pylibcugraphops` as a dependency of `cugraph-dgl` conda package Authors: - Tingyu Wang (https://github.com/tingyu66) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3849 --- conda/recipes/cugraph-dgl/meta.yaml | 1 + .../cugraph_dgl/nn/conv/__init__.py | 2 + .../cugraph-dgl/cugraph_dgl/nn/conv/base.py | 262 +++++++++++++----- .../cugraph_dgl/nn/conv/gatconv.py | 140 +++++++--- .../cugraph_dgl/nn/conv/gatv2conv.py | 249 +++++++++++++++++ .../cugraph_dgl/nn/conv/relgraphconv.py | 70 ++--- .../cugraph_dgl/nn/conv/sageconv.py | 122 ++++---- .../cugraph_dgl/nn/conv/transformerconv.py | 20 +- python/cugraph-dgl/tests/conftest.py | 3 + python/cugraph-dgl/tests/nn/test_gatconv.py | 100 ++++--- python/cugraph-dgl/tests/nn/test_gatv2conv.py | 147 ++++++++++ .../cugraph-dgl/tests/nn/test_relgraphconv.py | 71 +++-- python/cugraph-dgl/tests/nn/test_sageconv.py | 65 +++-- .../cugraph-dgl/tests/nn/test_sparsegraph.py | 28 +- .../tests/nn/test_transformerconv.py | 41 ++- python/cugraph-dgl/tests/test_dataset.py | 2 +- ...ograph.py => test_from_dgl_heterograph.py} | 0 17 files changed, 978 insertions(+), 345 deletions(-) create mode 100644 python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py create mode 100644 python/cugraph-dgl/tests/nn/test_gatv2conv.py rename python/cugraph-dgl/tests/{test_from_dgl_hetrograph.py => test_from_dgl_heterograph.py} (100%) diff --git a/conda/recipes/cugraph-dgl/meta.yaml b/conda/recipes/cugraph-dgl/meta.yaml index 2fbc6360c04..9e9fcd2faf1 100644 --- a/conda/recipes/cugraph-dgl/meta.yaml +++ b/conda/recipes/cugraph-dgl/meta.yaml @@ -26,6 +26,7 @@ requirements: - dgl >=1.1.0.cu* - numba >=0.57 - numpy >=1.21 + - pylibcugraphops ={{ version }} - python - pytorch diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py index e5acbf34478..3e7f2f076f0 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py +++ 
b/python/cugraph-dgl/cugraph_dgl/nn/conv/__init__.py @@ -13,6 +13,7 @@ from .base import SparseGraph from .gatconv import GATConv +from .gatv2conv import GATv2Conv from .relgraphconv import RelGraphConv from .sageconv import SAGEConv from .transformerconv import TransformerConv @@ -20,6 +21,7 @@ __all__ = [ "SparseGraph", "GATConv", + "GATv2Conv", "RelGraphConv", "SAGEConv", "TransformerConv", diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index 0eeaed29d86..307eb33078e 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -17,38 +17,7 @@ torch = import_optional("torch") ops_torch = import_optional("pylibcugraphops.pytorch") - - -class BaseConv(torch.nn.Module): - r"""An abstract base class for cugraph-ops nn module.""" - - def __init__(self): - super().__init__() - self._cached_offsets_fg = None - - def reset_parameters(self): - r"""Resets all learnable parameters of the module.""" - raise NotImplementedError - - def forward(self, *args): - r"""Runs the forward pass of the module.""" - raise NotImplementedError - - def pad_offsets(self, offsets: torch.Tensor, size: int) -> torch.Tensor: - r"""Pad zero-in-degree nodes to the end of offsets to reach size. This - is used to augment offset tensors from DGL blocks (MFGs) to be - compatible with cugraph-ops full-graph primitives.""" - if self._cached_offsets_fg is None: - self._cached_offsets_fg = torch.empty( - size, dtype=offsets.dtype, device=offsets.device - ) - elif self._cached_offsets_fg.numel() < size: - self._cached_offsets_fg.resize_(size) - - self._cached_offsets_fg[: offsets.numel()] = offsets - self._cached_offsets_fg[offsets.numel() : size] = offsets[-1] - - return self._cached_offsets_fg[:size] +dgl = import_optional("dgl") def compress_ids(ids: torch.Tensor, size: int) -> torch.Tensor: @@ -63,8 +32,9 @@ def decompress_ids(c_ids: torch.Tensor) -> torch.Tensor: class SparseGraph(object): - r"""A god-class to store different sparse formats needed by cugraph-ops - and facilitate sparse format conversions. + r"""A class to create and store different sparse formats needed by + cugraph-ops. It always creates a CSC representation and can provide COO- or + CSR-format if needed. Parameters ---------- @@ -89,25 +59,43 @@ class SparseGraph(object): consists of the sources between `src_indices[cdst_indices[k]]` and `src_indices[cdst_indices[k+1]]`. - dst_ids_is_sorted: bool - Whether `dst_ids` has been sorted in an ascending order. When sorted, - creating CSC layout is much faster. + values: torch.Tensor, optional + Values on the edges. + + is_sorted: bool + Whether the COO inputs (src_ids, dst_ids, values) have been sorted by + `dst_ids` in an ascending order. CSC layout creation is much faster + when sorted. formats: str or tuple of str, optional - The desired sparse formats to create for the graph. + The desired sparse formats to create for the graph. The formats tuple + must include "csc". Default: "csc". reduce_memory: bool, optional When set, the tensors are not required by the desired formats will be - set to `None`. + set to `None`. Default: True. Notes ----- For MFGs (sampled graphs), the node ids must have been renumbered. 
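+
+    Examples
+    --------
+    A minimal sketch of creating a CSC- and CSR-format graph from (unsorted)
+    COO inputs; the tensors below are illustrative only, and edge data (e.g.,
+    edge types) could additionally be passed via ``values``:
+
+    >>> import torch
+    >>> from cugraph_dgl.nn.conv import SparseGraph
+    >>> src_ids = torch.tensor([0, 1, 2, 1], device="cuda")
+    >>> dst_ids = torch.tensor([1, 0, 1, 2], device="cuda")
+    >>> sg = SparseGraph(
+    ...     size=(3, 3), src_ids=src_ids, dst_ids=dst_ids, formats=("csc", "csr")
+    ... )
+    >>> offsets, indices, _ = sg.csc()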
""" - supported_formats = {"coo": ("src_ids", "dst_ids"), "csc": ("cdst_ids", "src_ids")} - - all_tensors = set(["src_ids", "dst_ids", "csrc_ids", "cdst_ids"]) + supported_formats = { + "coo": ("_src_ids", "_dst_ids"), + "csc": ("_cdst_ids", "_src_ids"), + "csr": ("_csrc_ids", "_dst_ids", "_perm_csc2csr"), + } + + all_tensors = set( + [ + "_src_ids", + "_dst_ids", + "_csrc_ids", + "_cdst_ids", + "_perm_coo2csc", + "_perm_csc2csr", + ] + ) def __init__( self, @@ -116,15 +104,19 @@ def __init__( dst_ids: Optional[torch.Tensor] = None, csrc_ids: Optional[torch.Tensor] = None, cdst_ids: Optional[torch.Tensor] = None, - dst_ids_is_sorted: bool = False, - formats: Optional[Union[str, Tuple[str]]] = None, + values: Optional[torch.Tensor] = None, + is_sorted: bool = False, + formats: Union[str, Tuple[str]] = "csc", reduce_memory: bool = True, ): self._num_src_nodes, self._num_dst_nodes = size - self._dst_ids_is_sorted = dst_ids_is_sorted + self._is_sorted = is_sorted if dst_ids is None and cdst_ids is None: - raise ValueError("One of 'dst_ids' and 'cdst_ids' must be given.") + raise ValueError( + "One of 'dst_ids' and 'cdst_ids' must be given " + "to create a SparseGraph." + ) if src_ids is not None: src_ids = src_ids.contiguous() @@ -148,21 +140,40 @@ def __init__( ) cdst_ids = cdst_ids.contiguous() + if values is not None: + values = values.contiguous() + self._src_ids = src_ids self._dst_ids = dst_ids self._csrc_ids = csrc_ids self._cdst_ids = cdst_ids - self._perm = None + self._values = values + self._perm_coo2csc = None + self._perm_csc2csr = None if isinstance(formats, str): formats = (formats,) - - if formats is not None: - for format_ in formats: - assert format_ in SparseGraph.supported_formats - self.__getattribute__(f"_create_{format_}")() self._formats = formats + if "csc" not in formats: + raise ValueError( + f"{self.__class__.__name__}.formats must contain " + f"'csc', but got {formats}." 
+ ) + + # always create csc first + if self._cdst_ids is None: + if not self._is_sorted: + self._dst_ids, self._perm_coo2csc = torch.sort(self._dst_ids) + self._src_ids = self._src_ids[self._perm_coo2csc] + if self._values is not None: + self._values = self._values[self._perm_coo2csc] + self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes) + + for format_ in formats: + assert format_ in SparseGraph.supported_formats + self.__getattribute__(f"{format_}")() + self._reduce_memory = reduce_memory if reduce_memory: self.reduce_memory() @@ -170,8 +181,6 @@ def __init__( def reduce_memory(self): """Remove the tensors that are not necessary to create the desired sparse formats to reduce memory footprint.""" - - self._perm = None if self._formats is None: return @@ -181,16 +190,22 @@ def reduce_memory(self): for t in SparseGraph.all_tensors.difference(set(tensors_needed)): self.__dict__[t] = None - def _create_coo(self): + def src_ids(self) -> torch.Tensor: + return self._src_ids + + def cdst_ids(self) -> torch.Tensor: + return self._cdst_ids + + def dst_ids(self) -> torch.Tensor: if self._dst_ids is None: self._dst_ids = decompress_ids(self._cdst_ids) + return self._dst_ids - def _create_csc(self): - if self._cdst_ids is None: - if not self._dst_ids_is_sorted: - self._dst_ids, self._perm = torch.sort(self._dst_ids) - self._src_ids = self._src_ids[self._perm] - self._cdst_ids = compress_ids(self._dst_ids, self._num_dst_nodes) + def csrc_ids(self) -> torch.Tensor: + if self._csrc_ids is None: + src_ids, self._perm_csc2csr = torch.sort(self._src_ids) + self._csrc_ids = compress_ids(src_ids, self._num_src_nodes) + return self._csrc_ids def num_src_nodes(self): return self._num_src_nodes @@ -198,21 +213,134 @@ def num_src_nodes(self): def num_dst_nodes(self): return self._num_dst_nodes + def values(self): + return self._values + def formats(self): return self._formats - def coo(self) -> Tuple[torch.Tensor, torch.Tensor]: + def coo(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if "coo" not in self.formats(): raise RuntimeError( "The SparseGraph did not create a COO layout. " - "Set 'formats' to include 'coo' when creating the graph." + "Set 'formats' list to include 'coo' when creating the graph." ) - return (self._src_ids, self._dst_ids) + return self.src_ids(), self.dst_ids(), self._values - def csc(self) -> Tuple[torch.Tensor, torch.Tensor]: + def csc(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: if "csc" not in self.formats(): raise RuntimeError( "The SparseGraph did not create a CSC layout. " - "Set 'formats' to include 'csc' when creating the graph." + "Set 'formats' list to include 'csc' when creating the graph." + ) + return self.cdst_ids(), self.src_ids(), self._values + + def csr(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + if "csr" not in self.formats(): + raise RuntimeError( + "The SparseGraph did not create a CSR layout. " + "Set 'formats' list to include 'csr' when creating the graph." 
+ ) + csrc_ids = self.csrc_ids() + dst_ids = self.dst_ids()[self._perm_csc2csr] + value = self._values + if value is not None: + value = value[self._perm_csc2csr] + return csrc_ids, dst_ids, value + + +class BaseConv(torch.nn.Module): + r"""An abstract base class for cugraph-ops nn module.""" + + def __init__(self): + super().__init__() + + def reset_parameters(self): + r"""Resets all learnable parameters of the module.""" + raise NotImplementedError + + def forward(self, *args): + r"""Runs the forward pass of the module.""" + raise NotImplementedError + + def get_cugraph_ops_CSC( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + is_bipartite: bool = False, + max_in_degree: Optional[int] = None, + ) -> ops_torch.CSC: + """Create CSC structure needed by cugraph-ops.""" + + if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): + raise TypeError( + f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " + f"'dgl.DGLHeteroGraph', but got '{type(g)}'." ) - return (self._cdst_ids, self._src_ids) + + # TODO: max_in_degree should default to None in pylibcugraphops + if max_in_degree is None: + max_in_degree = -1 + + if isinstance(g, SparseGraph): + offsets, indices, _ = g.csc() + else: + offsets, indices, _ = g.adj_tensors("csc") + + graph = ops_torch.CSC( + offsets=offsets, + indices=indices, + num_src_nodes=g.num_src_nodes(), + dst_max_in_degree=max_in_degree, + is_bipartite=is_bipartite, + ) + + return graph + + def get_cugraph_ops_HeteroCSC( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + num_edge_types: int, + etypes: Optional[torch.Tensor] = None, + is_bipartite: bool = False, + max_in_degree: Optional[int] = None, + ) -> ops_torch.HeteroCSC: + """Create HeteroCSC structure needed by cugraph-ops.""" + + if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): + raise TypeError( + f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " + f"'dgl.DGLHeteroGraph', but got '{type(g)}'." + ) + + # TODO: max_in_degree should default to None in pylibcugraphops + if max_in_degree is None: + max_in_degree = -1 + + if isinstance(g, SparseGraph): + offsets, indices, etypes = g.csc() + if etypes is None: + raise ValueError( + "SparseGraph must have 'values' to create HeteroCSC. " + "Pass in edge types as 'values' when creating the SparseGraph." + ) + etypes = etypes.int() + else: + if etypes is None: + raise ValueError( + "'etypes' is required when creating HeteroCSC " + "from dgl.DGLHeteroGraph." + ) + offsets, indices, perm = g.adj_tensors("csc") + etypes = etypes[perm].int() + + graph = ops_torch.HeteroCSC( + offsets=offsets, + indices=indices, + edge_types=etypes, + num_src_nodes=g.num_src_nodes(), + num_edge_types=num_edge_types, + dst_max_in_degree=max_in_degree, + is_bipartite=is_bipartite, + ) + + return graph diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py index 239def5b677..8843e61ad89 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py @@ -10,13 +10,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
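A minimal sketch of how the SparseGraph class added to base.py above can be driven, assuming a CUDA device and an installed cugraph_dgl; the ids and values below are illustrative only and are not taken from the patch:

    import torch
    from cugraph_dgl.nn.conv.base import SparseGraph

    # six edges over 6 source and 5 destination nodes, given in COO form
    src_ids = torch.tensor([0, 1, 2, 3, 2, 5], dtype=torch.int32, device="cuda")
    dst_ids = torch.tensor([1, 2, 3, 4, 0, 3], dtype=torch.int32, device="cuda")
    values = torch.arange(6, device="cuda")  # e.g. edge weights or edge types

    sg = SparseGraph(
        size=(6, 5),
        src_ids=src_ids,
        dst_ids=dst_ids,
        values=values,
        formats=["coo", "csc", "csr"],  # "csc" must always be included
    )

    # each accessor now returns a 3-tuple that includes the (permuted) values
    offsets, indices, vals_csc = sg.csc()
    row_offsets, cols, vals_csr = sg.csr()
    srcs, dsts, vals_coo = sg.coo()  # COO comes back sorted by destination id
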
-"""Torch Module for graph attention network layer using the aggregation -primitives in cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations + from typing import Optional, Tuple, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -32,13 +29,15 @@ class GATConv(BaseConv): Parameters ---------- - in_feats : int, pair of ints + in_feats : int or tuple Input feature size. A pair denotes feature sizes of source and destination nodes. out_feats : int Output feature size. num_heads : int - Number of heads in Multi-Head Attention. + Number of heads in multi-head attention. + feat_drop : float, optional + Dropout rate on feature. Defaults: ``0``. concat : bool, optional If False, the multi-head attentions are averaged instead of concatenated. Default: ``True``. @@ -46,6 +45,15 @@ class GATConv(BaseConv): Edge feature size. Default: ``None``. negative_slope : float, optional LeakyReLU angle of negative slope. Defaults: ``0.2``. + residual : bool, optional + If True, use residual connection. Defaults: ``False``. + allow_zero_in_degree : bool, optional + If there are 0-in-degree nodes in the graph, output for those nodes will + be invalid since no message will be passed to those nodes. This is + harmful for some applications causing silent performance regression. + This module will raise a DGLError if it detects 0-in-degree nodes in + input graph. By setting ``True``, it will suppress the check and let the + users handle it by themselves. Defaults: ``False``. bias : bool, optional If True, learns a bias term. Defaults: ``True``. @@ -81,37 +89,46 @@ class GATConv(BaseConv): [ 1.6477, -1.9986], [ 1.1138, -1.9302]]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 200 def __init__( self, in_feats: Union[int, Tuple[int, int]], out_feats: int, num_heads: int, + feat_drop: float = 0.0, concat: bool = True, edge_feats: Optional[int] = None, negative_slope: float = 0.2, + residual: bool = False, + allow_zero_in_degree: bool = False, bias: bool = True, ): super().__init__() self.in_feats = in_feats self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads + self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.allow_zero_in_degree = allow_zero_in_degree if isinstance(in_feats, int): - self.fc = nn.Linear(in_feats, num_heads * out_feats, bias=False) + self.lin = nn.Linear(in_feats, num_heads * out_feats, bias=False) else: - self.fc_src = nn.Linear(in_feats[0], num_heads * out_feats, bias=False) - self.fc_dst = nn.Linear(in_feats[1], num_heads * out_feats, bias=False) + self.lin_src = nn.Linear( + self.in_feats_src, num_heads * out_feats, bias=False + ) + self.lin_dst = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=False + ) if edge_feats is not None: - self.fc_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) + self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) self.attn_weights = nn.Parameter(torch.Tensor(3 * num_heads * out_feats)) else: - self.register_parameter("fc_edge", None) + self.register_parameter("lin_edge", None) self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats)) if bias and concat: @@ -121,28 +138,40 @@ def __init__( else: self.register_buffer("bias", 
None) + self.residual = residual and self.in_feats_dst != out_feats * num_heads + if self.residual: + self.lin_res = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + else: + self.register_buffer("lin_res", None) + self.reset_parameters() def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") - if hasattr(self, "fc"): - nn.init.xavier_normal_(self.fc.weight, gain=gain) + if hasattr(self, "lin"): + nn.init.xavier_normal_(self.lin.weight, gain=gain) else: - nn.init.xavier_normal_(self.fc_src.weight, gain=gain) - nn.init.xavier_normal_(self.fc_dst.weight, gain=gain) + nn.init.xavier_normal_(self.lin_src.weight, gain=gain) + nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) nn.init.xavier_normal_( self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain ) - if self.fc_edge is not None: - self.fc_edge.reset_parameters() + if self.lin_edge is not None: + self.lin_edge.reset_parameters() + + if self.lin_res is not None: + self.lin_res.reset_parameters() + if self.bias is not None: nn.init.zeros_(self.bias) def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, @@ -151,18 +180,17 @@ def forward( Parameters ---------- - graph : DGLGraph + graph : DGLGraph or SparseGraph The graph. nfeat : torch.Tensor Input features of shape :math:`(N, D_{in})`. efeat: torch.Tensor, optional Optional edge features. max_in_degree : int - Maximum in-degree of destination nodes. It is only effective when - :attr:`g` is a :class:`DGLBlock`, i.e., bipartite graph. When - :attr:`g` is generated from a neighbor sampler, the value should be - set to the corresponding :attr:`fanout`. If not given, - :attr:`max_in_degree` will be calculated on-the-fly. + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. Returns ------- @@ -171,49 +199,63 @@ def forward( :math:`H` is the number of heads, and :math:`D_{out}` is size of output feature. """ - if max_in_degree is None: - max_in_degree = -1 - - bipartite = not isinstance(nfeat, torch.Tensor) - offsets, indices, _ = g.adj_tensors("csc") - - graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - is_bipartite=bipartite, + if isinstance(g, dgl.DGLHeteroGraph): + if not self.allow_zero_in_degree: + if (g.in_degrees() == 0).any(): + raise dgl.base.DGLError( + "There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run." 
+ ) + + bipartite = isinstance(nfeat, (list, tuple)) + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=bipartite, max_in_degree=max_in_degree ) + if bipartite: + nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) + nfeat_dst_orig = nfeat[1] + else: + nfeat = self.feat_drop(nfeat) + nfeat_dst_orig = nfeat[: g.num_dst_nodes()] + if efeat is not None: - if self.fc_edge is None: + if self.lin_edge is None: raise RuntimeError( f"{self.__class__.__name__}.edge_feats must be set to " f"accept edge features." ) - efeat = self.fc_edge(efeat) + efeat = self.lin_edge(efeat) if bipartite: - if not hasattr(self, "fc_src"): + if not hasattr(self, "lin_src"): raise RuntimeError( f"{self.__class__.__name__}.in_feats must be a pair of " f"integers to allow bipartite node features, but got " f"{self.in_feats}." ) - nfeat_src = self.fc_src(nfeat[0]) - nfeat_dst = self.fc_dst(nfeat[1]) + nfeat_src = self.lin_src(nfeat[0]) + nfeat_dst = self.lin_dst(nfeat[1]) else: - if not hasattr(self, "fc"): + if not hasattr(self, "lin"): raise RuntimeError( f"{self.__class__.__name__}.in_feats is expected to be an " f"integer, but got {self.in_feats}." ) - nfeat = self.fc(nfeat) + nfeat = self.lin(nfeat) out = ops_torch.operators.mha_gat_n2n( (nfeat_src, nfeat_dst) if bipartite else nfeat, self.attn_weights, - graph, + _graph, num_heads=self.num_heads, activation="LeakyReLU", negative_slope=self.negative_slope, @@ -224,6 +266,12 @@ def forward( if self.concat: out = out.view(-1, self.num_heads, self.out_feats) + if self.residual: + res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) + if not self.concat: + res = res.mean(dim=1) + out = out + res + if self.bias is not None: out = out + self.bias diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py new file mode 100644 index 00000000000..209a5fe1a8d --- /dev/null +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py @@ -0,0 +1,249 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple, Union + +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph +from cugraph.utilities.utils import import_optional + +dgl = import_optional("dgl") +torch = import_optional("torch") +nn = import_optional("torch.nn") +ops_torch = import_optional("pylibcugraphops.pytorch") + + +class GATv2Conv(BaseConv): + r"""GATv2 from `How Attentive are Graph Attention Networks? + `__, with the sparse aggregation + accelerated by cugraph-ops. + + Parameters + ---------- + in_feats : int, or pair of ints + Input feature size; i.e, the number of dimensions of :math:`h_i^{(l)}`. + If the layer is to be applied to a unidirectional bipartite graph, `in_feats` + specifies the input feature size on both the source and destination nodes. + If a scalar is given, the source and destination node feature size + would take the same value. + out_feats : int + Output feature size; i.e, the number of dimensions of :math:`h_i^{(l+1)}`. 
+ num_heads : int + Number of heads in Multi-Head Attention. + feat_drop : float, optional + Dropout rate on feature. Defaults: ``0``. + concat : bool, optional + If False, the multi-head attentions are averaged instead of concatenated. + Default: ``True``. + edge_feats : int, optional + Edge feature size. Default: ``None``. + negative_slope : float, optional + LeakyReLU angle of negative slope. Defaults: ``0.2``. + residual : bool, optional + If True, use residual connection. Defaults: ``False``. + allow_zero_in_degree : bool, optional + If there are 0-in-degree nodes in the graph, output for those nodes will + be invalid since no message will be passed to those nodes. This is + harmful for some applications causing silent performance regression. + This module will raise a DGLError if it detects 0-in-degree nodes in + input graph. By setting ``True``, it will suppress the check and let the + users handle it by themselves. Defaults: ``False``. + bias : bool, optional + If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + share_weights : bool, optional + If set to :obj:`True`, the same matrix for :math:`W_{left}` and + :math:`W_{right}` in the above equations, will be applied to the source + and the target node of every edge. (default: :obj:`False`) + """ + + def __init__( + self, + in_feats: Union[int, Tuple[int, int]], + out_feats: int, + num_heads: int, + feat_drop: float = 0.0, + concat: bool = True, + edge_feats: Optional[int] = None, + negative_slope: float = 0.2, + residual: bool = False, + allow_zero_in_degree: bool = False, + bias: bool = True, + share_weights: bool = False, + ): + super().__init__() + self.in_feats = in_feats + self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) + self.num_heads = num_heads + self.feat_drop = nn.Dropout(feat_drop) + self.concat = concat + self.edge_feats = edge_feats + self.negative_slope = negative_slope + self.allow_zero_in_degree = allow_zero_in_degree + self.share_weights = share_weights + + self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) + if share_weights: + if self.in_feats_src != self.in_feats_dst: + raise ValueError( + f"Input feature size of source and destination " + f"nodes must be identical when share_weights is enabled, " + f"but got {self.in_feats_src} and {self.in_feats_dst}." 
+ ) + self.lin_dst = self.lin_src + else: + self.lin_dst = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + + self.attn = nn.Parameter(torch.Tensor(num_heads * out_feats)) + + if edge_feats is not None: + self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) + else: + self.register_parameter("lin_edge", None) + + if bias and concat: + self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) + elif bias and not concat: + self.bias = nn.Parameter(torch.Tensor(out_feats)) + else: + self.register_buffer("bias", None) + + self.residual = residual and self.in_feats_dst != out_feats * num_heads + if self.residual: + self.lin_res = nn.Linear( + self.in_feats_dst, num_heads * out_feats, bias=bias + ) + else: + self.register_buffer("lin_res", None) + + self.reset_parameters() + + def reset_parameters(self): + r"""Reinitialize learnable parameters.""" + gain = nn.init.calculate_gain("relu") + nn.init.xavier_normal_(self.lin_src.weight, gain=gain) + nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) + + nn.init.xavier_normal_( + self.attn.view(-1, self.num_heads, self.out_feats), gain=gain + ) + if self.lin_edge is not None: + self.lin_edge.reset_parameters() + + if self.lin_res is not None: + self.lin_res.reset_parameters() + + if self.bias is not None: + nn.init.zeros_(self.bias) + + def forward( + self, + g: Union[SparseGraph, dgl.DGLHeteroGraph], + nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + efeat: Optional[torch.Tensor] = None, + max_in_degree: Optional[int] = None, + ) -> torch.Tensor: + r"""Forward computation. + + Parameters + ---------- + graph : DGLGraph or SparseGraph + The graph. + nfeat : torch.Tensor + Input features of shape :math:`(N, D_{in})`. + efeat: torch.Tensor, optional + Optional edge features. + max_in_degree : int + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. + + Returns + ------- + torch.Tensor + The output feature of shape :math:`(N, H, D_{out})` where + :math:`H` is the number of heads, and :math:`D_{out}` is size of + output feature. + """ + + if isinstance(g, dgl.DGLHeteroGraph): + if not self.allow_zero_in_degree: + if (g.in_degrees() == 0).any(): + raise dgl.base.DGLError( + "There are 0-in-degree nodes in the graph, " + "output for those nodes will be invalid. " + "This is harmful for some applications, " + "causing silent performance regression. " + "Adding self-loop on the input graph by " + "calling `g = dgl.add_self_loop(g)` will resolve " + "the issue. Setting ``allow_zero_in_degree`` " + "to be `True` when constructing this module will " + "suppress the check and let the code run." + ) + + nfeat_bipartite = isinstance(nfeat, (list, tuple)) + graph_bipartite = nfeat_bipartite or self.share_weights is False + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree + ) + + if nfeat_bipartite: + nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) + nfeat_dst_orig = nfeat[1] + else: + nfeat = self.feat_drop(nfeat) + nfeat_dst_orig = nfeat[: g.num_dst_nodes()] + + if efeat is not None: + if self.lin_edge is None: + raise RuntimeError( + f"{self.__class__.__name__}.edge_feats must be set to " + f"accept edge features." 
+ ) + efeat = self.lin_edge(efeat) + + if nfeat_bipartite: + nfeat = (self.lin_src(nfeat[0]), self.lin_dst(nfeat[1])) + elif graph_bipartite: + nfeat = (self.lin_src(nfeat), self.lin_dst(nfeat[: g.num_dst_nodes()])) + else: + nfeat = self.lin_src(nfeat) + + out = ops_torch.operators.mha_gat_v2_n2n( + nfeat, + self.attn, + _graph, + num_heads=self.num_heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat, + edge_feat=efeat, + )[: g.num_dst_nodes()] + + if self.concat: + out = out.view(-1, self.num_heads, self.out_feats) + + if self.residual: + res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) + if not self.concat: + res = res.mean(dim=1) + out = out + res + + if self.bias is not None: + out = out + self.bias + + return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py index 89e49011cf7..54916674210 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py @@ -10,14 +10,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Torch Module for Relational graph convolution layer using the aggregation -primitives in cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations + import math -from typing import Optional +from typing import Optional, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -29,13 +26,8 @@ class RelGraphConv(BaseConv): r"""An accelerated relational graph convolution layer from `Modeling Relational Data with Graph Convolutional Networks - `__ that leverages the highly-optimized - aggregation primitives in cugraph-ops. - - See :class:`dgl.nn.pytorch.conv.RelGraphConv` for mathematical model. - - This module depends on :code:`pylibcugraphops` package, which can be - installed via :code:`conda install -c nvidia pylibcugraphops>=23.02`. + `__, with the sparse aggregation + accelerated by cugraph-ops. Parameters ---------- @@ -84,7 +76,6 @@ class RelGraphConv(BaseConv): [-1.4335, -2.3758], [-1.4331, -2.3295]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 500 def __init__( self, @@ -148,7 +139,7 @@ def reset_parameters(self): def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], feat: torch.Tensor, etypes: torch.Tensor, max_in_degree: Optional[int] = None, @@ -167,49 +158,24 @@ def forward( so any input of other integer types will be casted into int32, thus introducing some overhead. Pass in int32 tensors directly for best performance. - max_in_degree : int, optional - Maximum in-degree of destination nodes. It is only effective when - :attr:`g` is a :class:`DGLBlock`, i.e., bipartite graph. When - :attr:`g` is generated from a neighbor sampler, the value should be - set to the corresponding :attr:`fanout`. If not given, - :attr:`max_in_degree` will be calculated on-the-fly. + max_in_degree : int + Maximum in-degree of destination nodes. When :attr:`g` is generated + from a neighbor sampler, the value should be set to the corresponding + :attr:`fanout`. This option is used to invoke the MFG-variant of + cugraph-ops kernel. Returns ------- torch.Tensor New node features. 
Shape: :math:`(|V|, D_{out})`. """ - offsets, indices, edge_ids = g.adj_tensors("csc") - edge_types_perm = etypes[edge_ids.long()].int() - - if g.is_block: - if max_in_degree is None: - max_in_degree = g.in_degrees().max().item() - - if max_in_degree < self.MAX_IN_DEGREE_MFG: - _graph = ops_torch.SampledHeteroCSC( - offsets, - indices, - edge_types_perm, - max_in_degree, - g.num_src_nodes(), - self.num_rels, - ) - else: - offsets_fg = self.pad_offsets(offsets, g.num_src_nodes() + 1) - _graph = ops_torch.StaticHeteroCSC( - offsets_fg, - indices, - edge_types_perm, - self.num_rels, - ) - else: - _graph = ops_torch.StaticHeteroCSC( - offsets, - indices, - edge_types_perm, - self.num_rels, - ) + _graph = self.get_cugraph_ops_HeteroCSC( + g, + num_edge_types=self.num_rels, + etypes=etypes, + is_bipartite=False, + max_in_degree=max_in_degree, + ) h = ops_torch.operators.agg_hg_basis_n2n_post( feat, diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py index 60f4c505e19..a3f946d7cb4 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py @@ -10,11 +10,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Torch Module for GraphSAGE layer using the aggregation primitives in -cugraph-ops""" -# pylint: disable=no-member, arguments-differ, invalid-name, too-many-arguments -from __future__ import annotations -from typing import Optional, Union + +from typing import Optional, Tuple, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -27,22 +24,18 @@ class SAGEConv(BaseConv): r"""An accelerated GraphSAGE layer from `Inductive Representation Learning - on Large Graphs `__ that leverages the - highly-optimized aggregation primitives in cugraph-ops. - - See :class:`dgl.nn.pytorch.conv.SAGEConv` for mathematical model. - - This module depends on :code:`pylibcugraphops` package, which can be - installed via :code:`conda install -c nvidia pylibcugraphops>=23.02`. + on Large Graphs `, with the sparse + aggregation accelerated by cugraph-ops. Parameters ---------- - in_feats : int - Input feature size. + in_feats : int or tuple + Input feature size. If a scalar is given, the source and destination + nodes are required to be the same. out_feats : int Output feature size. aggregator_type : str - Aggregator type to use (``mean``, ``sum``, ``min``, ``max``). + Aggregator type to use ("mean", "sum", "min", "max", "pool", "gcn"). feat_drop : float Dropout rate on features, default: ``0``. bias : bool @@ -68,38 +61,57 @@ class SAGEConv(BaseConv): [-1.1690, 0.1952], [-1.1690, 0.1952]], device='cuda:0', grad_fn=) """ - MAX_IN_DEGREE_MFG = 500 + valid_aggr_types = {"mean", "sum", "min", "max", "pool", "gcn"} def __init__( self, - in_feats: int, + in_feats: Union[int, Tuple[int, int]], out_feats: int, aggregator_type: str = "mean", feat_drop: float = 0.0, bias: bool = True, ): super().__init__() - self.in_feats = in_feats - self.out_feats = out_feats - valid_aggr_types = {"max", "min", "mean", "sum"} - if aggregator_type not in valid_aggr_types: + + if aggregator_type not in self.valid_aggr_types: raise ValueError( - f"Invalid aggregator_type. Must be one of {valid_aggr_types}. " + f"Invalid aggregator_type. Must be one of {self.valid_aggr_types}. " f"But got '{aggregator_type}' instead." 
) - self.aggr = aggregator_type + + self.aggregator_type = aggregator_type + self._aggr = aggregator_type + self.in_feats = in_feats + self.out_feats = out_feats + self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.feat_drop = nn.Dropout(feat_drop) - self.linear = nn.Linear(2 * in_feats, out_feats, bias=bias) + if self.aggregator_type == "gcn": + self._aggr = "mean" + self.lin = nn.Linear(self.in_feats_src, out_feats, bias=bias) + else: + self.lin = nn.Linear( + self.in_feats_src + self.in_feats_dst, out_feats, bias=bias + ) + + if self.aggregator_type == "pool": + self._aggr = "max" + self.pre_lin = nn.Linear(self.in_feats_src, self.in_feats_src) + else: + self.register_parameter("pre_lin", None) + + self.reset_parameters() def reset_parameters(self): r"""Reinitialize learnable parameters.""" - self.linear.reset_parameters() + self.lin.reset_parameters() + if self.pre_lin is not None: + self.pre_lin.reset_parameters() def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: torch.Tensor, + feat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], max_in_degree: Optional[int] = None, ) -> torch.Tensor: r"""Forward computation. @@ -108,7 +120,7 @@ def forward( ---------- g : DGLGraph or SparseGraph The graph. - feat : torch.Tensor + feat : torch.Tensor or tuple Node features. Shape: :math:`(|V|, D_{in})`. max_in_degree : int Maximum in-degree of destination nodes. When :attr:`g` is generated @@ -121,36 +133,34 @@ def forward( torch.Tensor Output node features. Shape: :math:`(|V|, D_{out})`. """ - if max_in_degree is None: - max_in_degree = -1 - - if isinstance(g, SparseGraph): - assert "csc" in g.formats() - offsets, indices = g.csc() - _graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - ) - elif isinstance(g, dgl.DGLHeteroGraph): - offsets, indices, _ = g.adj_tensors("csc") - _graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - ) - else: - raise TypeError( - f"The graph has to be either a 'SparseGraph' or " - f"'dgl.DGLHeteroGraph', but got '{type(g)}'." 
- ) + feat_bipartite = isinstance(feat, (list, tuple)) + graph_bipartite = feat_bipartite or self.aggregator_type == "pool" + + _graph = self.get_cugraph_ops_CSC( + g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree + ) - feat = self.feat_drop(feat) - h = ops_torch.operators.agg_concat_n2n(feat, _graph, self.aggr)[ + if feat_bipartite: + feat = (self.feat_drop(feat[0]), self.feat_drop(feat[1])) + else: + feat = self.feat_drop(feat) + + if self.aggregator_type == "pool": + if feat_bipartite: + feat = (self.pre_lin(feat[0]).relu(), feat[1]) + else: + feat = (self.pre_lin(feat).relu(), feat[: g.num_dst_nodes()]) + # force ctx.needs_input_grad=True in cugraph-ops autograd function + feat[0].requires_grad_() + feat[1].requires_grad_() + + out = ops_torch.operators.agg_concat_n2n(feat, _graph, self._aggr)[ : g.num_dst_nodes() ] - h = self.linear(h) - return h + if self.aggregator_type == "gcn": + out = out[:, : self.in_feats_src] + + out = self.lin(out) + + return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py index 5cd5fbbaebe..8481b9ee265 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py @@ -10,9 +10,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Optional, Tuple, Union -from cugraph_dgl.nn.conv.base import BaseConv +from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional dgl = import_optional("dgl") @@ -114,7 +115,7 @@ def reset_parameters(self): def forward( self, - g: dgl.DGLHeteroGraph, + g: Union[SparseGraph, dgl.DGLHeteroGraph], nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -130,17 +131,12 @@ def forward( efeat: torch.Tensor, optional Edge feature tensor. Default: ``None``. 
""" - offsets, indices, _ = g.adj_tensors("csc") - graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - is_bipartite=True, - ) - - if isinstance(nfeat, torch.Tensor): + feat_bipartite = isinstance(nfeat, (list, tuple)) + if not feat_bipartite: nfeat = (nfeat, nfeat) + _graph = self.get_cugraph_ops_CSC(g, is_bipartite=True) + query = self.lin_query(nfeat[1][: g.num_dst_nodes()]) key = self.lin_key(nfeat[0]) value = self.lin_value(nfeat[0]) @@ -157,7 +153,7 @@ def forward( key_emb=key, query_emb=query, value_emb=value, - graph=graph, + graph=_graph, num_heads=self.num_heads, concat_heads=self.concat, edge_emb=efeat, diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py index 6f8690d1140..a3863ed81fa 100644 --- a/python/cugraph-dgl/tests/conftest.py +++ b/python/cugraph-dgl/tests/conftest.py @@ -40,16 +40,19 @@ class SparseGraphData1: nnz = 6 src_ids = torch.IntTensor([0, 1, 2, 3, 2, 5]).cuda() dst_ids = torch.IntTensor([1, 2, 3, 4, 0, 3]).cuda() + values = torch.IntTensor([10, 20, 30, 40, 50, 60]).cuda() # CSR src_ids_sorted_by_src = torch.IntTensor([0, 1, 2, 2, 3, 5]).cuda() dst_ids_sorted_by_src = torch.IntTensor([1, 2, 0, 3, 4, 3]).cuda() csrc_ids = torch.IntTensor([0, 1, 2, 4, 5, 5, 6]).cuda() + values_csr = torch.IntTensor([10, 20, 50, 30, 40, 60]).cuda() # CSC src_ids_sorted_by_dst = torch.IntTensor([2, 0, 1, 5, 2, 3]).cuda() dst_ids_sorted_by_dst = torch.IntTensor([0, 1, 2, 3, 3, 4]).cuda() cdst_ids = torch.IntTensor([0, 1, 2, 3, 5, 6]).cuda() + values_csc = torch.IntTensor([50, 10, 20, 60, 30, 40]).cuda() @pytest.fixture diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index 7ed65645a28..ef3047dc2cd 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -10,69 +10,84 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# pylint: disable=too-many-arguments, too-many-locals import pytest -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import GATConv as CuGraphGATConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("idtype_int", [False, True]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) +@pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) -def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_block): - GATConv = dgl.nn.GATConv - CuGraphGATConv = cugraph_dgl.nn.GATConv - device = "cuda" - g = create_graph1().to(device) +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_gatconv_equality( + bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format +): + from dgl.nn.pytorch import GATConv + + g = create_graph1().to("cuda") if idtype_int: g = g.int() - if to_block: g = dgl.to_block(g) + size = (g.num_src_nodes(), g.num_dst_nodes()) + if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0], device=device), - torch.rand(g.num_dst_nodes(), in_feats[1], device=device), + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats, device=device) + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() out_feats = 2 + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False} + kwargs = {"bias": False, "allow_zero_in_degree": True} - conv1 = GATConv(*args, **kwargs, allow_zero_in_degree=True).to(device) + conv1 = GATConv(*args, **kwargs).cuda() out1 = conv1(g, nfeat) - conv2 = CuGraphGATConv(*args, **kwargs).to(device) + conv2 = CuGraphGATConv(*args, **kwargs).cuda() dim = num_heads * out_feats with torch.no_grad(): conv2.attn_weights.data[:dim] = conv1.attn_l.data.flatten() conv2.attn_weights.data[dim:] = conv1.attn_r.data.flatten() if bipartite: - conv2.fc_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.fc_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() + conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() else: - conv2.fc.weight.data = conv1.fc.weight.data.detach().clone() - out2 = conv2(g, nfeat, max_in_degree=max_in_degree) + conv2.lin.weight.data = conv1.fc.weight.data.detach().clone() + if residual and conv2.residual: + conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() - assert torch.allclose(out1, out2, atol=1e-6) + if sparse_format is not None: + out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) + else: + out2 = conv2(g, nfeat, max_in_degree=max_in_degree) + + assert torch.allclose(out1, 
out2, atol=ATOL) grad_out1 = torch.rand_like(out1) grad_out2 = grad_out1.clone().detach() @@ -81,18 +96,18 @@ def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_bl if bipartite: assert torch.allclose( - conv1.fc_src.weight.grad, conv2.fc_src.weight.grad, atol=1e-6 + conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL ) assert torch.allclose( - conv1.fc_dst.weight.grad, conv2.fc_dst.weight.grad, atol=1e-6 + conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) else: - assert torch.allclose(conv1.fc.weight.grad, conv2.fc.weight.grad, atol=1e-6) + assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) assert torch.allclose( torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=1e-6, + atol=ATOL, ) @@ -106,10 +121,7 @@ def test_gatconv_equality(bipartite, idtype_int, max_in_degree, num_heads, to_bl def test_gatconv_edge_feats( bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats ): - from cugraph_dgl.nn import GATConv - - device = "cuda" - g = create_graph1().to(device) + g = create_graph1().to("cuda") if to_block: g = dgl.to_block(g) @@ -117,24 +129,30 @@ def test_gatconv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0], device=device), - torch.rand(g.num_dst_nodes(), in_feats[1], device=device), + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats, device=device) + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats, device=device) + efeat = torch.rand(g.num_edges(), edge_feats).cuda() else: edge_feats = None efeat = None - conv = GATConv( - in_feats, out_feats, num_heads, concat=concat, edge_feats=edge_feats, bias=bias - ).to(device) + conv = CuGraphGATConv( + in_feats, + out_feats, + num_heads, + concat=concat, + edge_feats=edge_feats, + bias=bias, + allow_zero_in_degree=True, + ).cuda() out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) grad_out = torch.rand_like(out) diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py new file mode 100644 index 00000000000..cc46a6e4b39 --- /dev/null +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -0,0 +1,147 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
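For orientation, a minimal usage sketch of the GATv2Conv layer introduced above, assuming a CUDA device with DGL and pylibcugraphops available; the graph and feature sizes are illustrative only:

    import dgl
    import torch
    from cugraph_dgl.nn import GATv2Conv

    # every node in this toy ring has in-degree 1, so the zero-in-degree check passes
    g = dgl.graph(([0, 1, 2, 3], [1, 2, 3, 0])).to("cuda")
    feat = torch.rand(4, 10).cuda()

    conv = GATv2Conv(in_feats=10, out_feats=2, num_heads=3).cuda()
    out = conv(g, feat)  # shape (4, 3, 2), since concat=True by default
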
+ +import pytest + +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv +from .common import create_graph1 + +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 + + +@pytest.mark.parametrize("bipartite", [False, True]) +@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) +@pytest.mark.parametrize("num_heads", [1, 2, 7]) +@pytest.mark.parametrize("residual", [False, True]) +@pytest.mark.parametrize("to_block", [False, True]) +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_gatv2conv_equality( + bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format +): + from dgl.nn.pytorch import GATv2Conv + + g = create_graph1().to("cuda") + + if idtype_int: + g = g.int() + if to_block: + g = dgl.to_block(g) + + size = (g.num_src_nodes(), g.num_dst_nodes()) + + if bipartite: + in_feats = (10, 3) + nfeat = ( + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + ) + else: + in_feats = 10 + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + out_feats = 2 + + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + + args = (in_feats, out_feats, num_heads) + kwargs = {"bias": False, "allow_zero_in_degree": True} + + conv1 = GATv2Conv(*args, **kwargs).cuda() + out1 = conv1(g, nfeat) + + conv2 = CuGraphGATv2Conv(*args, **kwargs).cuda() + with torch.no_grad(): + conv2.attn.data = conv1.attn.data.flatten() + conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() + conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + if residual and conv2.residual: + conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + + if sparse_format is not None: + out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) + else: + out2 = conv2(g, nfeat, max_in_degree=max_in_degree) + + assert torch.allclose(out1, out2, atol=ATOL) + + grad_out1 = torch.rand_like(out1) + grad_out2 = grad_out1.clone().detach() + out1.backward(grad_out1) + out2.backward(grad_out2) + + assert torch.allclose( + conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL + ) + assert torch.allclose( + conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL + ) + + assert torch.allclose(conv1.attn.grad, conv1.attn.grad, atol=ATOL) + + +@pytest.mark.parametrize("bias", [False, True]) +@pytest.mark.parametrize("bipartite", [False, True]) +@pytest.mark.parametrize("concat", [False, True]) +@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("num_heads", [1, 2, 7]) +@pytest.mark.parametrize("to_block", [False, True]) +@pytest.mark.parametrize("use_edge_feats", [False, True]) +def test_gatv2conv_edge_feats( + bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats +): + g = create_graph1().to("cuda") + + if to_block: + g = dgl.to_block(g) + + if bipartite: + in_feats = (10, 3) + nfeat = ( + torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), + torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + ) + else: + in_feats = 10 + nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + out_feats = 2 + + if use_edge_feats: + edge_feats = 3 
+ efeat = torch.rand(g.num_edges(), edge_feats).cuda() + else: + edge_feats = None + efeat = None + + conv = CuGraphGATv2Conv( + in_feats, + out_feats, + num_heads, + concat=concat, + edge_feats=edge_feats, + bias=bias, + allow_zero_in_degree=True, + ).cuda() + out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) + + grad_out = torch.rand_like(out) + out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index d2ae6a23978..901f9ba1433 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -10,20 +10,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# pylint: disable=too-many-arguments, too-many-locals import pytest -try: - import cugraph_dgl -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("idtype_int", [False, True]) @@ -32,12 +29,17 @@ @pytest.mark.parametrize("regularizer", [None, "basis"]) @pytest.mark.parametrize("self_loop", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_relgraphconv_equality( - idtype_int, max_in_degree, num_bases, regularizer, self_loop, to_block + idtype_int, + max_in_degree, + num_bases, + regularizer, + self_loop, + to_block, + sparse_format, ): - RelGraphConv = dgl.nn.RelGraphConv - CuGraphRelGraphConv = cugraph_dgl.nn.RelGraphConv - device = "cuda" + from dgl.nn.pytorch import RelGraphConv in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) @@ -47,34 +49,57 @@ def test_relgraphconv_equality( "bias": False, "self_loop": self_loop, } - g = create_graph1().to(device) - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) + g = create_graph1().to("cuda") + g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).cuda() + if idtype_int: g = g.int() if to_block: g = dgl.to_block(g) - feat = torch.rand(g.num_src_nodes(), in_feat).to(device) + + size = (g.num_src_nodes(), g.num_dst_nodes()) + feat = torch.rand(g.num_src_nodes(), in_feat).cuda() + + if sparse_format == "coo": + sg = SparseGraph( + size=size, + src_ids=g.edges()[0], + dst_ids=g.edges()[1], + values=g.edata[dgl.ETYPE], + formats="csc", + ) + elif sparse_format == "csc": + offsets, indices, perm = g.adj_tensors("csc") + etypes = g.edata[dgl.ETYPE][perm] + sg = SparseGraph( + size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" + ) torch.manual_seed(0) - conv1 = RelGraphConv(*args, **kwargs).to(device) + conv1 = RelGraphConv(*args, **kwargs).cuda() torch.manual_seed(0) kwargs["apply_norm"] = False - conv2 = CuGraphRelGraphConv(*args, **kwargs).to(device) + conv2 = CuGraphRelGraphConv(*args, **kwargs).cuda() out1 = conv1(g, feat, g.edata[dgl.ETYPE]) - out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree) - assert torch.allclose(out1, out2, atol=1e-06) + + if 
sparse_format is not None: + out2 = conv2(sg, feat, sg.values(), max_in_degree=max_in_degree) + else: + out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree) + + assert torch.allclose(out1, out2, atol=ATOL) grad_out = torch.rand_like(out1) out1.backward(grad_out) out2.backward(grad_out) end = -1 if self_loop else None - assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=1e-6) + assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=ATOL) if self_loop: - assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=1e-6) + assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=ATOL) if regularizer is not None: - assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, atol=1e-6) + assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, atol=ATOL) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index 447bbe49460..e2acf9e6596 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -10,31 +10,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# pylint: disable=too-many-arguments, too-many-locals import pytest -from cugraph.utilities.utils import import_optional from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") +ATOL = 1e-6 + +@pytest.mark.parametrize("aggr", ["mean", "pool"]) @pytest.mark.parametrize("bias", [False, True]) +@pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("idtype_int", [False, True]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_format): - SAGEConv = dgl.nn.SAGEConv - device = "cuda" +def test_sageconv_equality( + aggr, bias, bipartite, idtype_int, max_in_degree, to_block, sparse_format +): + from dgl.nn.pytorch import SAGEConv - in_feat, out_feat = 5, 2 - kwargs = {"aggregator_type": "mean", "bias": bias} - g = create_graph1().to(device) + kwargs = {"aggregator_type": aggr, "bias": bias} + g = create_graph1().to("cuda") if idtype_int: g = g.int() @@ -42,7 +44,17 @@ def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_for g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = torch.rand(g.num_src_nodes(), in_feat).to(device) + + if bipartite: + in_feats = (5, 3) + feat = ( + torch.rand(size[0], in_feats[0], requires_grad=True).cuda(), + torch.rand(size[1], in_feats[1], requires_grad=True).cuda(), + ) + else: + in_feats = 5 + feat = torch.rand(size[0], in_feats).cuda() + out_feats = 2 if sparse_format == "coo": sg = SparseGraph( @@ -52,39 +64,38 @@ def test_SAGEConv_equality(bias, idtype_int, max_in_degree, to_block, sparse_for offsets, indices, _ = g.adj_tensors("csc") sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - torch.manual_seed(0) - conv1 = SAGEConv(in_feat, out_feat, **kwargs).to(device) - - torch.manual_seed(0) - conv2 = CuGraphSAGEConv(in_feat, out_feat, 
**kwargs).to(device) + conv1 = SAGEConv(in_feats, out_feats, **kwargs).cuda() + conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).cuda() + in_feats_src = conv2.in_feats_src with torch.no_grad(): - conv2.linear.weight.data[:, :in_feat] = conv1.fc_neigh.weight.data - conv2.linear.weight.data[:, in_feat:] = conv1.fc_self.weight.data + conv2.lin.weight.data[:, :in_feats_src] = conv1.fc_neigh.weight.data + conv2.lin.weight.data[:, in_feats_src:] = conv1.fc_self.weight.data if bias: - conv2.linear.bias.data[:] = conv1.fc_self.bias.data + conv2.lin.bias.data[:] = conv1.fc_self.bias.data + if aggr == "pool": + conv2.pre_lin.weight.data[:] = conv1.fc_pool.weight.data + conv2.pre_lin.bias.data[:] = conv1.fc_pool.bias.data out1 = conv1(g, feat) if sparse_format is not None: out2 = conv2(sg, feat, max_in_degree=max_in_degree) else: out2 = conv2(g, feat, max_in_degree=max_in_degree) - assert torch.allclose(out1, out2, atol=1e-06) + assert torch.allclose(out1, out2, atol=ATOL) grad_out = torch.rand_like(out1) out1.backward(grad_out) out2.backward(grad_out) assert torch.allclose( conv1.fc_neigh.weight.grad, - conv2.linear.weight.grad[:, :in_feat], - atol=1e-6, + conv2.lin.weight.grad[:, :in_feats_src], + atol=ATOL, ) assert torch.allclose( conv1.fc_self.weight.grad, - conv2.linear.weight.grad[:, in_feat:], - atol=1e-6, + conv2.lin.weight.grad[:, in_feats_src:], + atol=ATOL, ) if bias: - assert torch.allclose( - conv1.fc_self.bias.grad, conv2.linear.bias.grad, atol=1e-6 - ) + assert torch.allclose(conv1.fc_self.bias.grad, conv2.lin.bias.grad, atol=ATOL) diff --git a/python/cugraph-dgl/tests/nn/test_sparsegraph.py b/python/cugraph-dgl/tests/nn/test_sparsegraph.py index 3fb01575d66..09c0df202ff 100644 --- a/python/cugraph-dgl/tests/nn/test_sparsegraph.py +++ b/python/cugraph-dgl/tests/nn/test_sparsegraph.py @@ -19,32 +19,42 @@ def test_coo2csc(sparse_graph_1): data = sparse_graph_1 - values = torch.ones(data.nnz).cuda() + g = SparseGraph( - size=data.size, src_ids=data.src_ids, dst_ids=data.dst_ids, formats="csc" + size=data.size, + src_ids=data.src_ids, + dst_ids=data.dst_ids, + values=data.values, + formats=["csc"], ) - cdst_ids, src_ids = g.csc() + cdst_ids, src_ids, values = g.csc() new = torch.sparse_csc_tensor(cdst_ids, src_ids, values).cuda() old = torch.sparse_coo_tensor( - torch.vstack((data.src_ids, data.dst_ids)), values + torch.vstack((data.src_ids, data.dst_ids)), data.values ).cuda() torch.allclose(new.to_dense(), old.to_dense()) -def test_csc2coo(sparse_graph_1): +def test_csc_input(sparse_graph_1): data = sparse_graph_1 - values = torch.ones(data.nnz).cuda() + g = SparseGraph( size=data.size, src_ids=data.src_ids_sorted_by_dst, cdst_ids=data.cdst_ids, - formats="coo", + values=data.values_csc, + formats=["coo", "csc", "csr"], ) - src_ids, dst_ids = g.coo() + src_ids, dst_ids, values = g.coo() new = torch.sparse_coo_tensor(torch.vstack((src_ids, dst_ids)), values).cuda() old = torch.sparse_csc_tensor( - data.cdst_ids, data.src_ids_sorted_by_dst, values + data.cdst_ids, data.src_ids_sorted_by_dst, data.values_csc ).cuda() torch.allclose(new.to_dense(), old.to_dense()) + + csrc_ids, dst_ids, values = g.csr() + + new = torch.sparse_csr_tensor(csrc_ids, dst_ids, values).cuda() + torch.allclose(new.to_dense(), old.to_dense()) diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index 00476b9f0bb..b2b69cb35ab 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ 
b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -13,16 +13,14 @@ import pytest -try: - from cugraph_dgl.nn import TransformerConv -except ModuleNotFoundError: - pytest.skip("cugraph_dgl not available", allow_module_level=True) - -from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn.conv.base import SparseGraph +from cugraph_dgl.nn import TransformerConv from .common import create_graph1 -torch = import_optional("torch") -dgl = import_optional("dgl") +dgl = pytest.importorskip("dgl", reason="DGL not available") +torch = pytest.importorskip("torch", reason="PyTorch not available") + +ATOL = 1e-6 @pytest.mark.parametrize("beta", [False, True]) @@ -32,8 +30,16 @@ @pytest.mark.parametrize("num_heads", [1, 2, 3, 4]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) -def test_TransformerConv( - beta, bipartite_node_feats, concat, idtype_int, num_heads, to_block, use_edge_feats +@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) +def test_transformerconv( + beta, + bipartite_node_feats, + concat, + idtype_int, + num_heads, + to_block, + use_edge_feats, + sparse_format, ): device = "cuda" g = create_graph1().to(device) @@ -44,6 +50,15 @@ def test_TransformerConv( if to_block: g = dgl.to_block(g) + size = (g.num_src_nodes(), g.num_dst_nodes()) + if sparse_format == "coo": + sg = SparseGraph( + size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" + ) + elif sparse_format == "csc": + offsets, indices, _ = g.adj_tensors("csc") + sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") + if bipartite_node_feats: in_node_feats = (5, 3) nfeat = ( @@ -71,6 +86,10 @@ def test_TransformerConv( edge_feats=edge_feats, ).to(device) - out = conv(g, nfeat, efeat) + if sparse_format is not None: + out = conv(sg, nfeat, efeat) + else: + out = conv(g, nfeat, efeat) + grad_out = torch.rand_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/test_dataset.py b/python/cugraph-dgl/tests/test_dataset.py index 69d50261e55..5db443dc0d8 100644 --- a/python/cugraph-dgl/tests/test_dataset.py +++ b/python/cugraph-dgl/tests/test_dataset.py @@ -123,6 +123,6 @@ def test_homogeneous_sampled_graphs_from_dataframe(return_type, seed_node): assert dgl_block.num_src_nodes() == cugraph_dgl_graph.num_src_nodes() assert dgl_block.num_dst_nodes() == cugraph_dgl_graph.num_dst_nodes() dgl_offsets, dgl_indices, _ = dgl_block.adj_tensors("csc") - cugraph_offsets, cugraph_indices = cugraph_dgl_graph.csc() + cugraph_offsets, cugraph_indices, _ = cugraph_dgl_graph.csc() assert torch.equal(dgl_offsets.to("cpu"), cugraph_offsets.to("cpu")) assert torch.equal(dgl_indices.to("cpu"), cugraph_indices.to("cpu")) diff --git a/python/cugraph-dgl/tests/test_from_dgl_hetrograph.py b/python/cugraph-dgl/tests/test_from_dgl_heterograph.py similarity index 100% rename from python/cugraph-dgl/tests/test_from_dgl_hetrograph.py rename to python/cugraph-dgl/tests/test_from_dgl_heterograph.py From ed7b1a41fe502c9097c4ac9688f08c1d1e5fd33f Mon Sep 17 00:00:00 2001 From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com> Date: Tue, 19 Sep 2023 13:48:52 -0400 Subject: [PATCH 40/72] New mtmg API for integration (#3521) Creating a new API for integrating multi-threaded multi-GPU programs into the cugraph library. 
This API will extend our OPG (one [process] per GPU) model to support a single process handling multiple GPUs, and will also ultimately support a multi-node configuration where some compute nodes might not have GPUs. closes https://github.com/rapidsai/graph_dl/issues/241 Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/3521 --- cpp/CMakeLists.txt | 2 + cpp/cmake/thirdparty/get_ucp.cmake | 35 ++ .../mtmg/detail/device_shared_device_span.hpp | 39 ++ .../detail/device_shared_device_vector.hpp | 58 +++ .../mtmg/detail/device_shared_wrapper.hpp | 123 +++++ .../mtmg/detail/per_device_edgelist.hpp | 275 +++++++++++ cpp/include/cugraph/mtmg/edge_property.hpp | 60 +++ .../cugraph/mtmg/edge_property_view.hpp | 33 ++ cpp/include/cugraph/mtmg/edgelist.hpp | 65 +++ cpp/include/cugraph/mtmg/graph.hpp | 136 ++++++ cpp/include/cugraph/mtmg/graph_view.hpp | 34 ++ cpp/include/cugraph/mtmg/handle.hpp | 111 +++++ cpp/include/cugraph/mtmg/instance_manager.hpp | 98 ++++ .../cugraph/mtmg/per_thread_edgelist.hpp | 174 +++++++ cpp/include/cugraph/mtmg/renumber_map.hpp | 40 ++ .../cugraph/mtmg/renumber_map_view.hpp | 32 ++ cpp/include/cugraph/mtmg/resource_manager.hpp | 225 +++++++++ cpp/include/cugraph/mtmg/vertex_result.hpp | 40 ++ .../cugraph/mtmg/vertex_result_view.hpp | 49 ++ cpp/src/link_analysis/pagerank_impl.cuh | 8 +- cpp/src/mtmg/vertex_result.cu | 167 +++++++ cpp/tests/CMakeLists.txt | 8 + cpp/tests/mtmg/threaded_test.cu | 459 ++++++++++++++++++ 23 files changed, 2268 insertions(+), 3 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_ucp.cmake create mode 100644 cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp create mode 100644 cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp create mode 100644 cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp create mode 100644 cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp create mode 100644 cpp/include/cugraph/mtmg/edge_property.hpp create mode 100644 cpp/include/cugraph/mtmg/edge_property_view.hpp create mode 100644 cpp/include/cugraph/mtmg/edgelist.hpp create mode 100644 cpp/include/cugraph/mtmg/graph.hpp create mode 100644 cpp/include/cugraph/mtmg/graph_view.hpp create mode 100644 cpp/include/cugraph/mtmg/handle.hpp create mode 100644 cpp/include/cugraph/mtmg/instance_manager.hpp create mode 100644 cpp/include/cugraph/mtmg/per_thread_edgelist.hpp create mode 100644 cpp/include/cugraph/mtmg/renumber_map.hpp create mode 100644 cpp/include/cugraph/mtmg/renumber_map_view.hpp create mode 100644 cpp/include/cugraph/mtmg/resource_manager.hpp create mode 100644 cpp/include/cugraph/mtmg/vertex_result.hpp create mode 100644 cpp/include/cugraph/mtmg/vertex_result_view.hpp create mode 100644 cpp/src/mtmg/vertex_result.cu create mode 100644 cpp/tests/mtmg/threaded_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 69a488de0b8..a6c26ee3b91 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -166,6 +166,7 @@ endif() include(cmake/thirdparty/get_nccl.cmake) include(cmake/thirdparty/get_cuhornet.cmake) +include(cmake/thirdparty/get_ucp.cmake) if(BUILD_TESTS) include(cmake/thirdparty/get_gtest.cmake) @@ -292,6 +293,7 @@ set(CUGRAPH_SOURCES src/community/triangle_count_mg.cu src/traversal/k_hop_nbrs_sg.cu src/traversal/k_hop_nbrs_mg.cu + src/mtmg/vertex_result.cu ) if(USE_CUGRAPH_OPS) diff --git a/cpp/cmake/thirdparty/get_ucp.cmake b/cpp/cmake/thirdparty/get_ucp.cmake new file mode 100644 index 
00000000000..dcc4956a34e --- /dev/null +++ b/cpp/cmake/thirdparty/get_ucp.cmake @@ -0,0 +1,35 @@ +#============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_ucp) + + if(TARGET UCP::UCP) + return() + endif() + + rapids_find_generate_module(UCP + HEADER_NAMES ucp.h + LIBRARY_NAMES ucp + INCLUDE_SUFFIXES ucp/api + ) + + # Currently UCP has no CMake build-system so we require + # it built and installed on the machine already + rapids_find_package(UCP REQUIRED) + +endfunction() + +find_and_configure_ucp() diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp new file mode 100644 index 00000000000..37398891370 --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_span.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. + */ +template +using device_shared_device_span_t = device_shared_wrapper_t>; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp new file mode 100644 index 00000000000..7f3992b73bd --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_device_vector.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. + */ +template +class device_shared_device_vector_t : public device_shared_wrapper_t> { + using parent_t = detail::device_shared_wrapper_t>; + + public: + /** + * @brief Create a device_shared_device_span (read only view) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + device_shared_device_span_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, raft::device_span{p.second.data(), p.second.size()}); + }); + + return result; + } +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp new file mode 100644 index 00000000000..c4cacb401af --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief Wrap an object to be available for each GPU + * + * In the MTMG environment we need the ability to manage a collection of objects + * that are associated with a particular GPU, and fetch the objects from an + * arbitrary GPU thread. This object will wrap any object and allow it to be + * accessed from different threads. 
+ */ +template +class device_shared_wrapper_t { + public: + using wrapped_t = T; + + device_shared_wrapper_t() = default; + device_shared_wrapper_t(device_shared_wrapper_t&& other) : objects_{std::move(other.objects_)} {} + device_shared_wrapper_t& operator=(device_shared_wrapper_t&& other) + { + objects_ = std::move(other.objects_); + return *this; + } + + /** + * @brief Move a wrapped object into the wrapper for this thread + * + * @param handle Handle is used to identify the GPU we associated this object with + * @param obj Wrapped object + */ + void set(cugraph::mtmg::handle_t const& handle, wrapped_t&& obj) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + CUGRAPH_EXPECTS(pos == objects_.end(), "Cannot overwrite wrapped object"); + + objects_.insert(std::make_pair(handle.get_local_rank(), std::move(obj))); + } + + /** + * @brief Move a wrapped object into the wrapper for this thread + * + * @param local_rank Identify which GPU to associated this object with + * @param obj Wrapped object + */ + void set(int local_rank, wrapped_t&& obj) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(local_rank); + CUGRAPH_EXPECTS(pos == objects_.end(), "Cannot overwrite wrapped object"); + + objects_.insert(std::make_pair(local_rank, std::move(obj))); + } + + public: + /** + * @brief Get reference to an object for a particular thread + * + * @param handle Handle is used to identify the GPU we associated this object with + * @return Reference to the wrapped object + */ + wrapped_t& get(cugraph::mtmg::handle_t const& handle) + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object"); + + return pos->second; + } + + /** + * @brief Get the pointer to an object for a particular thread from this wrapper + * + * @param handle Handle is used to identify the GPU we associated this object with + * @return Shared pointer the wrapped object + */ + wrapped_t const& get(cugraph::mtmg::handle_t const& handle) const + { + std::lock_guard lock(lock_); + + auto pos = objects_.find(handle.get_local_rank()); + + CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object"); + + return pos->second; + } + + protected: + mutable std::mutex lock_{}; + std::map objects_{}; +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp new file mode 100644 index 00000000000..8011146ee4f --- /dev/null +++ b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// FIXME: Could use std::span once compiler supports C++20 +#include + +#include + +namespace cugraph { +namespace mtmg { +namespace detail { + +/** + * @brief An edgelist for each GPU + * + * Manages an edge list for edges associated with a particular GPU. Multiple threads + * can call the append() method, possibly concurrently. To avoid constantly copying + * when the buffers fill up, the class will create a device buffer containing a + * number of elements specified in the constructor. When that device buffer is full + * we will create a new buffer. + * + * When we try and use the edgelist we will consolidate the buffers, since at that + * time we know the entire size required. + * + * Important note, the expectation is that this object will be used in two phases: + * 1) The append() method will be used to fill buffers with edges + * 2) The edges will be consumed to create a graph + * + * These two phases are expected to be disjoint. The calling process is expected to + * manage some barrier so that all threads are guaranteed to be completed before changing + * phases. If an append() call (part of the filling phase) overlaps with calls to + * finalize_buffer(), consolidate_and_shuffle(), get_src(), get_dst(), get_wgt(), + * get_edge_id() and get_edge_type() then the behavior is undefined (data might change + * in some non-deterministic way). + */ +template +class per_device_edgelist_t { + public: + per_device_edgelist_t() = delete; + per_device_edgelist_t(per_device_edgelist_t const&) = delete; + per_device_edgelist_t& operator=(per_device_edgelist_t const&) = delete; + per_device_edgelist_t& operator=(per_device_edgelist_t&&) = delete; + + per_device_edgelist_t(cugraph::mtmg::handle_t const& handle, + size_t device_buffer_size, + bool use_weight, + bool use_edge_id, + bool use_edge_type) + : device_buffer_size_{device_buffer_size}, + current_pos_{0}, + src_{}, + dst_{}, + wgt_{std::nullopt}, + edge_id_{std::nullopt}, + edge_type_{std::nullopt} + { + if (use_weight) { wgt_ = std::make_optional(std::vector>()); } + + if (use_edge_id) { edge_id_ = std::make_optional(std::vector>()); } + + if (use_edge_type) { + edge_type_ = std::make_optional(std::vector>()); + } + + create_new_buffers(handle); + } + + per_device_edgelist_t(per_device_edgelist_t&& other) + : device_buffer_size_{other.device_buffer_size_}, + current_pos_{other.current_pos_}, + src_{std::move(other.src_)}, + dst_{std::move(other.dst_)}, + wgt_{std::move(other.wgt_)}, + edge_id_{std::move(other.edge_id_)}, + edge_type_{std::move(other.edge_type_)} + { + } + + /** + * @brief Append a list of edges to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + raft::host_span src, + raft::host_span dst, + std::optional> wgt, + std::optional> edge_id, + std::optional> edge_type) + { + // FIXME: This lock guard could be on a smaller region, but it + // would require more careful coding. The raft::update_device + // calls could be done without the lock if we made a local + // of the values of *.back() and did an increment of current_pos_ + // while we hold the lock. 
+ std::lock_guard lock(lock_); + + size_t count = src.size(); + size_t pos = 0; + + while (count > 0) { + size_t copy_count = std::min(count, (src_.back().size() - current_pos_)); + + raft::update_device( + src_.back().begin() + current_pos_, src.begin() + pos, copy_count, handle.get_stream()); + raft::update_device( + dst_.back().begin() + current_pos_, dst.begin() + pos, copy_count, handle.get_stream()); + if (wgt) + raft::update_device( + wgt_->back().begin() + current_pos_, wgt->begin() + pos, copy_count, handle.get_stream()); + if (edge_id) + raft::update_device(edge_id_->back().begin() + current_pos_, + edge_id->begin() + pos, + copy_count, + handle.get_stream()); + if (edge_type) + raft::update_device(edge_type_->back().begin() + current_pos_, + edge_type->begin() + pos, + copy_count, + handle.get_stream()); + + count -= copy_count; + pos += copy_count; + current_pos_ += copy_count; + + if (current_pos_ == src_.back().size()) { create_new_buffers(handle); } + } + + handle.raft_handle().sync_stream(); + } + + /** + * @brief Mark the edgelist as ready for reading (all writes are complete) + * + * @param handle The resource handle + */ + void finalize_buffer(handle_t const& handle) + { + src_.back().resize(current_pos_, handle.get_stream()); + dst_.back().resize(current_pos_, handle.get_stream()); + if (wgt_) wgt_->back().resize(current_pos_, handle.get_stream()); + if (edge_id_) edge_id_->back().resize(current_pos_, handle.get_stream()); + if (edge_type_) edge_type_->back().resize(current_pos_, handle.get_stream()); + } + + bool use_weight() const { return wgt_.has_value(); } + + bool use_edge_id() const { return edge_id_.has_value(); } + + bool use_edge_type() const { return edge_type_.has_value(); } + + std::vector>& get_src() { return src_; } + std::vector>& get_dst() { return dst_; } + std::optional>>& get_wgt() { return wgt_; } + std::optional>>& get_edge_id() { return edge_id_; } + std::optional>>& get_edge_type() + { + return edge_type_; + } + + /** + * @brief Consolidate edgelists (if necessary) and shuffle to the proper GPU + * + * @param handle The resource handle + */ + void consolidate_and_shuffle(cugraph::mtmg::handle_t const& handle, bool store_transposed) + { + if (src_.size() > 1) { + size_t total_size = std::transform_reduce( + src_.begin(), src_.end(), size_t{0}, std::plus(), [](auto& d_vector) { + return d_vector.size(); + }); + + resize_and_copy_buffers(handle.get_stream(), src_, total_size); + resize_and_copy_buffers(handle.get_stream(), dst_, total_size); + if (wgt_) resize_and_copy_buffers(handle.get_stream(), *wgt_, total_size); + if (edge_id_) resize_and_copy_buffers(handle.get_stream(), *edge_id_, total_size); + if (edge_type_) resize_and_copy_buffers(handle.get_stream(), *edge_type_, total_size); + } + + auto tmp_wgt = wgt_ ? std::make_optional(std::move((*wgt_)[0])) : std::nullopt; + auto tmp_edge_id = edge_id_ ? std::make_optional(std::move((*edge_id_)[0])) : std::nullopt; + auto tmp_edge_type = + edge_type_ ? std::make_optional(std::move((*edge_type_)[0])) : std::nullopt; + + std::tie(store_transposed ? dst_[0] : src_[0], + store_transposed ? src_[0] : dst_[0], + tmp_wgt, + tmp_edge_id, + tmp_edge_type) = + cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning( + handle.raft_handle(), + store_transposed ? std::move(dst_[0]) : std::move(src_[0]), + store_transposed ? 
std::move(src_[0]) : std::move(dst_[0]), + std::move(tmp_wgt), + std::move(tmp_edge_id), + std::move(tmp_edge_type)); + + if (tmp_wgt) ((*wgt_)[0]) = std::move(*tmp_wgt); + if (tmp_edge_id) ((*edge_id_)[0]) = std::move(*tmp_edge_id); + if (tmp_edge_type) ((*edge_type_)[0]) = std::move(*tmp_edge_type); + } + + private: + template + void resize_and_copy_buffers(rmm::cuda_stream_view stream, + std::vector>& buffer, + size_t total_size) + { + size_t pos = buffer[0].size(); + buffer[0].resize(total_size, stream); + + for (size_t i = 1; i < buffer.size(); ++i) { + raft::copy(buffer[0].data() + pos, buffer[i].data(), buffer[i].size(), stream); + pos += buffer[i].size(); + buffer[i].resize(0, stream); + buffer[i].shrink_to_fit(stream); + } + + std::vector> new_buffer; + new_buffer.push_back(std::move(buffer[0])); + buffer = std::move(new_buffer); + } + + void create_new_buffers(cugraph::mtmg::handle_t const& handle) + { + src_.emplace_back(device_buffer_size_, handle.get_stream()); + dst_.emplace_back(device_buffer_size_, handle.get_stream()); + + if (wgt_) { wgt_->emplace_back(device_buffer_size_, handle.get_stream()); } + + if (edge_id_) { edge_id_->emplace_back(device_buffer_size_, handle.get_stream()); } + + if (edge_type_) { edge_type_->emplace_back(device_buffer_size_, handle.get_stream()); } + + current_pos_ = 0; + } + + mutable std::mutex lock_{}; + + size_t current_pos_{0}; + size_t device_buffer_size_{0}; + + std::vector> src_{}; + std::vector> dst_{}; + std::optional>> wgt_{}; + std::optional>> edge_id_{}; + std::optional>> edge_type_{}; +}; + +} // namespace detail +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edge_property.hpp b/cpp/include/cugraph/mtmg/edge_property.hpp new file mode 100644 index 00000000000..afa72492b9a --- /dev/null +++ b/cpp/include/cugraph/mtmg/edge_property.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edge property object for each GPU + */ +template +class edge_property_t : public detail::device_shared_wrapper_t< + cugraph::edge_property_t> { + public: + using parent_t = detail::device_shared_wrapper_t< + cugraph::edge_property_t>; + + /** + * @brief Return a edge_property_view_t (read only) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + using edge_t = typename graph_view_t::wrapped_t::edge_type; + using buffer_t = + typename cugraph::edge_property_t::buffer_type; + std::vector buffers{}; + using const_value_iterator_t = decltype(get_dataframe_buffer_cbegin(buffers[0])); + + edge_property_view_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, p.second.view()); + }); + + return result; + } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edge_property_view.hpp b/cpp/include/cugraph/mtmg/edge_property_view.hpp new file mode 100644 index 00000000000..c84a6458e1d --- /dev/null +++ b/cpp/include/cugraph/mtmg/edge_property_view.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edge property object for each GPU + */ +template +using edge_property_view_t = + detail::device_shared_wrapper_t>; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/edgelist.hpp b/cpp/include/cugraph/mtmg/edgelist.hpp new file mode 100644 index 00000000000..90c53dfbb64 --- /dev/null +++ b/cpp/include/cugraph/mtmg/edgelist.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Edgelist object for each GPU + */ +template +class edgelist_t : public detail::device_shared_wrapper_t< + detail::per_device_edgelist_t> { + public: + /** + * @brief Create a per_device_edgelist for this GPU + */ + void set(handle_t const& handle, + size_t device_buffer_size, + bool use_weight, + bool use_edge_id, + bool use_edge_type) + { + detail::per_device_edgelist_t tmp( + handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + + detail::device_shared_wrapper_t< + detail::per_device_edgelist_t>::set(handle, + std::move(tmp)); + } + + /** + * @brief Stop inserting edges into this edgelist so we can use the edges + */ + void finalize_buffer(handle_t const& handle) { this->get(handle).finalize_buffer(handle); } + + /** + * @brief Consolidate for the edgelist edges into a single edgelist and then + * shuffle across GPUs. + */ + void consolidate_and_shuffle(cugraph::mtmg::handle_t const& handle, bool store_transposed) + { + this->get(handle).consolidate_and_shuffle(handle, store_transposed); + } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/graph.hpp b/cpp/include/cugraph/mtmg/graph.hpp new file mode 100644 index 00000000000..76a2f401425 --- /dev/null +++ b/cpp/include/cugraph/mtmg/graph.hpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Graph object for each GPU + */ +template +class graph_t : public detail::device_shared_wrapper_t< + cugraph::graph_t> { + using parent_t = detail::device_shared_wrapper_t< + cugraph::graph_t>; + + public: + /** + * @brief Create an MTMG graph view (read only) + */ + auto view() + { + std::lock_guard lock(parent_t::lock_); + + cugraph::mtmg::graph_view_t result; + + std::for_each(parent_t::objects_.begin(), parent_t::objects_.end(), [&result](auto& p) { + result.set(p.first, std::move(p.second.view())); + }); + + return result; + } +}; + +/** + * @brief Create an MTMG graph from an edgelist + * + * @param[in] handle Resource handle + * @param[in] edgelist Edgelist + * @param[in] graph_properties Graph properties + * @param[in] renumber If true, renumber graph (must be true for MG) + * @param[out] graph MTMG graph is stored here + * @param[out] edge_weights MTMG edge weights is stored here + * @param[out] edge_ids MTMG edge ids is stored here + * @param[out] edge_types MTMG edge types is stored here + * @param[in] renumber_map MTMG renumber_map is stored here + * @param[in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). 
+ */ +template +void create_graph_from_edgelist( + handle_t const& handle, + cugraph::mtmg::edgelist_t& edgelist, + graph_properties_t graph_properties, + bool renumber, + cugraph::mtmg::graph_t& graph, + std::optional, + weight_t>>& edge_weights, + std::optional, + edge_id_t>>& edge_ids, + std::optional, + edge_type_t>>& edge_types, + std::optional>& renumber_map, + bool do_expensive_check = false) +{ + if (handle.get_thread_rank() > 0) return; + + CUGRAPH_EXPECTS(renumber_map.has_value() == renumber, + "Renumbering set to true, but no space for renumber map"); + + auto& my_edgelist = edgelist.get(handle); + + CUGRAPH_EXPECTS(my_edgelist.get_src().size() > 0, "Cannot create graph without an edge list"); + CUGRAPH_EXPECTS(my_edgelist.get_src().size() == 1, + "Must consolidate edges into a single list before creating graph"); + + auto [local_graph, local_edge_weights, local_edge_ids, local_edge_types, local_renumber_map] = + cugraph::create_graph_from_edgelist( + handle.raft_handle(), + std::nullopt, + std::move(my_edgelist.get_src()[0]), + std::move(my_edgelist.get_dst()[0]), + my_edgelist.get_wgt() ? std::make_optional(std::move((*my_edgelist.get_wgt())[0])) + : std::nullopt, + my_edgelist.get_edge_id() ? std::make_optional(std::move((*my_edgelist.get_edge_id())[0])) + : std::nullopt, + my_edgelist.get_edge_type() ? std::make_optional(std::move((*my_edgelist.get_edge_type())[0])) + : std::nullopt, + graph_properties, + renumber, + do_expensive_check); + + graph.set(handle, std::move(local_graph)); + if (edge_weights) edge_weights->set(handle, std::move(*local_edge_weights)); + if (edge_ids) edge_ids->set(handle, std::move(*local_edge_ids)); + if (edge_types) edge_types->set(handle, std::move(*local_edge_types)); + if (renumber) renumber_map->set(handle, std::move(*local_renumber_map)); +} + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/graph_view.hpp b/cpp/include/cugraph/mtmg/graph_view.hpp new file mode 100644 index 00000000000..94347e016ea --- /dev/null +++ b/cpp/include/cugraph/mtmg/graph_view.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Graph view for each GPU + */ +template +using graph_view_t = detail::device_shared_wrapper_t< + cugraph::graph_view_t>; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/handle.hpp b/cpp/include/cugraph/mtmg/handle.hpp new file mode 100644 index 00000000000..f23bce5aeac --- /dev/null +++ b/cpp/include/cugraph/mtmg/handle.hpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Resource handler + * + * Multi-threaded resource handler. Every GPU gets a raft::handle object that provides access to + * the GPU resources. In a multi-threaded environment multiple threads will share a particular GPU. + * Following the MPI model, each thread will be assigned to a thread rank. + * + */ +class handle_t { + public: + /** + * @brief Constructor + * + * @param raft_handle Raft handle for the resources + * @param thread_rank Rank for this thread + */ + handle_t(raft::handle_t const& raft_handle, int thread_rank, size_t device_id) + : raft_handle_(raft_handle), + thread_rank_(thread_rank), + local_rank_(raft_handle.get_comms().get_rank()), // FIXME: update for multi-node + device_id_(device_id) + { + } + + /** + * @brief Get the raft handle + * + * @return const reference to a raft handle + */ + raft::handle_t const& raft_handle() const { return raft_handle_; } + + /** + * @brief Get cuda stream + * + * @return cuda stream + */ + rmm::cuda_stream_view get_stream() const + { + return raft_handle_.is_stream_pool_initialized() + ? raft_handle_.get_stream_from_stream_pool(device_id_) + : raft_handle_.get_stream(); + } + + /** + * @brief Get thread rank + * + * @return thread rank + */ + int get_thread_rank() const { return thread_rank_; } + + /** + * @brief Get number of gpus + * + * @return number of gpus + */ + int get_size() const { return raft_handle_.get_comms().get_size(); } + + /** + * @brief Get number of local gpus + * + * @return number of local gpus + */ + // FIXME: wrong for multi-node + int get_local_size() const { return raft_handle_.get_comms().get_size(); } + + /** + * @brief Get gpu rank + * + * @return gpu rank + */ + int get_rank() const { return raft_handle_.get_comms().get_rank(); } + + /** + * @brief Get local gpu rank + * + * @return local gpu rank + */ + int get_local_rank() const { return local_rank_; } + + private: + raft::handle_t const& raft_handle_; + int thread_rank_; + int local_rank_; + size_t device_id_; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/instance_manager.hpp b/cpp/include/cugraph/mtmg/instance_manager.hpp new file mode 100644 index 00000000000..8bf62b56f4b --- /dev/null +++ b/cpp/include/cugraph/mtmg/instance_manager.hpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Manages a subset of the cluster for a set of graph computations + */ +class instance_manager_t { + public: + /** + * @brief Constructor + * + * @param handles Vector of RAFT handles, one for each device on this node + */ + instance_manager_t(std::vector>&& handles, + std::vector>&& nccl_comms, + std::vector&& device_ids, + int local_gpu_count) + : thread_counter_{0}, + raft_handle_{std::move(handles)}, + nccl_comms_{std::move(nccl_comms)}, + device_ids_{std::move(device_ids)}, + local_gpu_count_{local_gpu_count} + { + } + + /** + * @brief Get handle + * + * The instance manager will construct a handle appropriate for the thread making + * the request. Threads will be assigned to GPUs in a round-robin fashion to + * spread requesting threads around the GPU resources. + * + * This function will be CPU thread-safe. + * + * @return a handle for this thread. + */ + handle_t get_handle() + { + int local_id = thread_counter_++; + + RAFT_CUDA_TRY(cudaSetDevice(device_ids_[local_id % raft_handle_.size()].value())); + return handle_t(*raft_handle_[local_id % raft_handle_.size()], + local_id / raft_handle_.size(), + static_cast(local_id % raft_handle_.size())); + } + + /** + * @brief Reset the thread counter + * + * After a parallel activity is completed, we need to reset the thread counter so that + * future threads will round robin around the GPUs properly. + */ + void reset_threads() { thread_counter_.store(0); } + + /** + * @brief Number of local GPUs in the instance + */ + int get_local_gpu_count() { return local_gpu_count_; } + + private: + // FIXME: Should this be an std::map<> where the key is the rank? + // On a multi-node system we might have nodes with fewer + // (or no) GPUs, so mapping rank to a handle might be a challenge + // + std::vector> raft_handle_{}; + std::vector> nccl_comms_{}; + std::vector device_ids_{}; + int local_gpu_count_{}; + + std::atomic thread_counter_{0}; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp new file mode 100644 index 00000000000..b672db48719 --- /dev/null +++ b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Supports creating an edgelist from individual host threads + * + * A cugraph edgelist needs to contain all of the edges necessary to create the graph + * stored in GPU memory (distributed across multiple GPUs in a multi-GPU configuration). + * + * This class provides a mechanism for populating the edgelist object from independent CPU threads. + * + * Calls to the append() method will take edges (in CPU host memory) and append them to a local + * buffer. 
As the local buffer fills, the buffer will be sent to GPU memory using the flush() + * method. This allows the CPU to GPU transfers to be larger (and consequently more efficient). + */ +template +class per_thread_edgelist_t { + public: + per_thread_edgelist_t() = delete; + per_thread_edgelist_t(per_thread_edgelist_t const&) = delete; + + /** + * @brief Only constructor + * + * @param edgelist The edge list this thread_edgelist_t should be associated with + * @param thread_buffer_size Size of the local buffer for accumulating edges on the CPU + */ + per_thread_edgelist_t( + detail::per_device_edgelist_t& edgelist, + size_t thread_buffer_size) + : edgelist_{edgelist}, + current_pos_{0}, + src_(thread_buffer_size), + dst_(thread_buffer_size), + wgt_{std::nullopt}, + edge_id_{std::nullopt}, + edge_type_{std::nullopt} + { + if (edgelist.use_weight()) wgt_ = std::make_optional(std::vector(thread_buffer_size)); + + if (edgelist.use_edge_id()) + edge_id_ = std::make_optional(std::vector(thread_buffer_size)); + + if (edgelist.use_edge_type()) + edge_type_ = std::make_optional(std::vector(thread_buffer_size)); + } + + /** + * @brief Append an edge to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + vertex_t src, + vertex_t dst, + std::optional wgt, + std::optional edge_id, + std::optional edge_type) + { + if (current_pos_ == src_.size()) { flush(handle); } + + src_[current_pos_] = src; + dst_[current_pos_] = dst; + if (wgt) (*wgt_)[current_pos_] = *wgt; + if (edge_id) (*edge_id_)[current_pos_] = *edge_id; + if (edge_type) (*edge_type_)[current_pos_] = *edge_type; + + ++current_pos_; + } + + /** + * @brief Append a list of edges to the edge list + * + * @param handle The resource handle + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + */ + void append(handle_t const& handle, + raft::host_span src, + raft::host_span dst, + std::optional> wgt, + std::optional> edge_id, + std::optional> edge_type) + { + size_t count = src.size(); + size_t pos = 0; + + while (count > 0) { + size_t copy_count = std::min(count, (src_.size() - current_pos_)); + + std::copy(src.begin() + pos, src.begin() + pos + copy_count, src_.begin() + current_pos_); + std::copy(dst.begin() + pos, dst.begin() + pos + copy_count, dst_.begin() + current_pos_); + if (wgt) + std::copy(wgt.begin() + pos, wgt.begin() + pos + copy_count, wgt_->begin() + current_pos_); + if (edge_id) + std::copy(edge_id.begin() + pos, + edge_id.begin() + pos + copy_count, + edge_id_->begin() + current_pos_); + if (edge_type) + std::copy(edge_type.begin() + pos, + edge_type.begin() + pos + copy_count, + edge_type_->begin() + current_pos_); + + if (current_pos_ == src_.size()) { flush(handle); } + + count -= copy_count; + pos += copy_count; + } + } + + /** + * @brief Flush thread data from host to GPU memory + * + * @param handle The resource handle + */ + void flush(handle_t const& handle) + { + edgelist_.append( + handle, + raft::host_span{src_.data(), current_pos_}, + raft::host_span{dst_.data(), current_pos_}, + wgt_ ? std::make_optional(raft::host_span{wgt_->data(), current_pos_}) + : std::nullopt, + edge_id_ ? std::make_optional(raft::host_span{edge_id_->data(), current_pos_}) + : std::nullopt, + edge_type_ + ? 
std::make_optional(raft::host_span{edge_type_->data(), current_pos_}) + : std::nullopt); + + current_pos_ = 0; + } + + private: + detail::per_device_edgelist_t& edgelist_; + size_t current_pos_{0}; + std::vector src_{}; + std::vector dst_{}; + std::optional> wgt_{}; + std::optional> edge_id_{}; + std::optional> edge_type_{}; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/renumber_map.hpp b/cpp/include/cugraph/mtmg/renumber_map.hpp new file mode 100644 index 00000000000..da07d61bd96 --- /dev/null +++ b/cpp/include/cugraph/mtmg/renumber_map.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device vector for storing a renumber map + */ +template +class renumber_map_t : public detail::device_shared_device_vector_t { + using parent_t = detail::device_shared_device_vector_t; + + public: + /** + * @brief Return a view (read only) of the renumber map + */ + auto view() { return static_cast>(this->parent_t::view()); } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/renumber_map_view.hpp b/cpp/include/cugraph/mtmg/renumber_map_view.hpp new file mode 100644 index 00000000000..5ff7ff5e100 --- /dev/null +++ b/cpp/include/cugraph/mtmg/renumber_map_view.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device span for storing a renumber map + */ +template +using renumber_map_view_t = detail::device_shared_device_span_t; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/resource_manager.hpp b/cpp/include/cugraph/mtmg/resource_manager.hpp new file mode 100644 index 00000000000..b4633626e7c --- /dev/null +++ b/cpp/include/cugraph/mtmg/resource_manager.hpp @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief Class for managing local and remote GPU resources for use in + * multi-threaded multi-GPU interface. + * + * Each process in a multi-GPU configuration should have an instance of this + * class. The resource manager object should be configured by calling + * register_local_gpu (or register_remote_gpu once we support a multi-node + * configuration) to allocate resources that can be used in the mtmg space. + * + * When we want to execute some graph computations, we need to create an instance for execution. + * Based on how big a subset of the desired compute resources is desired, we can allocate some + * number of GPUs to the problem (up to the total set of managed resources). + * + * The returned instance can be used to create a graph, execute one or more algorithms, etc. Once + * we are done the caller can delete the instance. + * + * At the moment, the caller is assumed to be responsible for scheduling use of the resources. + * + * For our first release, we will only consider a single node multi-GPU configuration, so the remote + * GPU methods are currently disabled via ifdef. + */ +class resource_manager_t { + public: + /** + * @brief Default constructor + */ + resource_manager_t() {} + + /** + * @brief add a local GPU to the resource manager. + * + * @param rank The rank to assign to the local GPU + * @param device_id The device_id corresponding to this rank + */ + void register_local_gpu(int rank, rmm::cuda_device_id device_id) + { + std::lock_guard lock(lock_); + + CUGRAPH_EXPECTS(local_rank_map_.find(rank) == local_rank_map_.end(), + "cannot register same rank multiple times"); + + int num_gpus_this_node; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_this_node)); + + CUGRAPH_EXPECTS((device_id.value() >= 0) && (device_id.value() < num_gpus_this_node), + "device id out of range"); + + local_rank_map_.insert(std::pair(rank, device_id)); + + RAFT_CUDA_TRY(cudaSetDevice(device_id.value())); + + // FIXME: There is a bug in the cuda_memory_resource that results in a Hang. + // using the pool resource as a work-around. + // + // There is a deprecated environment variable: NCCL_LAUNCH_MODE=GROUP + // which should temporarily work around this problem. 
+ // + // Ultimately there should be some RMM parameters passed into this function + // (or the constructor of the object) to configure this behavior +#if 0 + auto per_device_it = per_device_rmm_resources_.insert( + std::pair{rank, std::make_shared()}); +#else + auto const [free, total] = rmm::detail::available_device_memory(); + auto const min_alloc = + rmm::detail::align_down(std::min(free, total / 6), rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + + auto per_device_it = per_device_rmm_resources_.insert( + std::pair{rank, + rmm::mr::make_owning_wrapper( + std::make_shared(), min_alloc)}); +#endif + + rmm::mr::set_per_device_resource(device_id, per_device_it.first->second.get()); + } + + /** + * @brief Create an instance using a subset of the registered resources + * + * The selected set of resources will be configured as an instance manager. + * If @ranks_to_include is a proper subset of the registered resources, + * ranks will be renumbered into the range [0, @p ranks_to_use.size()), making + * it a proper configuration. + * + * @param ranks_to_use a vector containing the ranks to include in the instance. + * Must be a subset of the entire set of available ranks. + * @param instance_manager_id a ncclUniqueId that is shared by all processes participating + * in this instance. All processes must use the same ID in this call, it is up + * to the calling code to share this ID properly before the call. + * + * @return unique pointer to instance manager + */ + std::unique_ptr create_instance_manager( + std::vector ranks_to_include, ncclUniqueId instance_manager_id) const + { + std::for_each( + ranks_to_include.begin(), ranks_to_include.end(), [local_ranks = local_rank_map_](int rank) { + CUGRAPH_EXPECTS(local_ranks.find(rank) != local_ranks.end(), + "requesting inclusion of an invalid rank"); + }); + + std::vector> nccl_comms{}; + std::vector> handles{}; + std::vector device_ids{}; + + nccl_comms.reserve(ranks_to_include.size()); + handles.reserve(ranks_to_include.size()); + device_ids.reserve(ranks_to_include.size()); + + // FIXME: not quite right for multi-node + auto gpu_row_comm_size = static_cast(sqrt(static_cast(ranks_to_include.size()))); + while (ranks_to_include.size() % gpu_row_comm_size != 0) { + --gpu_row_comm_size; + } + + // FIXME: not quite right for multi-node + for (size_t i = 0; i < ranks_to_include.size(); ++i) { + int rank = ranks_to_include[i]; + auto pos = local_rank_map_.find(rank); + RAFT_CUDA_TRY(cudaSetDevice(pos->second.value())); + + raft::handle_t tmp_handle; + + nccl_comms.push_back(std::make_unique()); + handles.push_back( + std::make_unique(tmp_handle, per_device_rmm_resources_.find(rank)->second)); + device_ids.push_back(pos->second); + } + + std::vector running_threads; + + for (size_t i = 0; i < ranks_to_include.size(); ++i) { + running_threads.emplace_back([instance_manager_id, + idx = i, + gpu_row_comm_size, + comm_size = ranks_to_include.size(), + &ranks_to_include, + &local_rank_map = local_rank_map_, + &nccl_comms, + &handles]() { + int rank = ranks_to_include[idx]; + auto pos = local_rank_map.find(rank); + RAFT_CUDA_TRY(cudaSetDevice(pos->second.value())); + + NCCL_TRY(ncclCommInitRank(nccl_comms[idx].get(), comm_size, instance_manager_id, rank)); + + raft::comms::build_comms_nccl_only(handles[idx].get(), *nccl_comms[idx], comm_size, rank); + + cugraph::partition_manager::init_subcomm(*handles[idx], gpu_row_comm_size); + }); + } + + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + + // FIXME: Update for multi-node 
+ return std::make_unique( + std::move(handles), std::move(nccl_comms), std::move(device_ids), ranks_to_include.size()); + } + + /** + * @brief Get a list of all of the currently registered ranks + * + * @return A copy of the list of ranks. + */ + std::vector registered_ranks() const + { + std::lock_guard lock(lock_); + + // + // C++20 mechanism: + // return std::vector{ std::views::keys(local_rank_map_).begin(), + // std::views::keys(local_rank_map_).end() }; + // Would need a bit more complicated to handle remote_rank_map_ also + // + std::vector registered_ranks(local_rank_map_.size()); + std::transform( + local_rank_map_.begin(), local_rank_map_.end(), registered_ranks.begin(), [](auto pair) { + return pair.first; + }); + + return registered_ranks; + } + + private: + mutable std::mutex lock_{}; + std::map local_rank_map_{}; + std::map> per_device_rmm_resources_{}; +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/vertex_result.hpp b/cpp/include/cugraph/mtmg/vertex_result.hpp new file mode 100644 index 00000000000..e8999b35aa9 --- /dev/null +++ b/cpp/include/cugraph/mtmg/vertex_result.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device vector for storing vertex results + */ +template +class vertex_result_t : public detail::device_shared_device_vector_t { + using parent_t = detail::device_shared_device_vector_t; + + public: + /** + * @brief Create a vertex result view (read only) + */ + auto view() { return static_cast>(this->parent_t::view()); } +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/include/cugraph/mtmg/vertex_result_view.hpp b/cpp/include/cugraph/mtmg/vertex_result_view.hpp new file mode 100644 index 00000000000..7a7070d6f2a --- /dev/null +++ b/cpp/include/cugraph/mtmg/vertex_result_view.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace cugraph { +namespace mtmg { + +/** + * @brief An MTMG device span for referencing a vertex result + */ +template +class vertex_result_view_t : public detail::device_shared_device_span_t { + using parent_t = detail::device_shared_device_span_t; + + public: + vertex_result_view_t(parent_t&& other) : parent_t{std::move(other)} {} + + /** + * @brief Gather results from specified vertices into a device vector + */ + template + rmm::device_uvector gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); +}; + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index 3a84cdedfda..92c70fcff20 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -388,9 +388,11 @@ void pagerank(raft::handle_t const& handle, handle, graph_view, edge_weight_view, - std::make_optional(raft::device_span{ - *precomputed_vertex_out_weight_sums, - static_cast(graph_view.local_vertex_partition_range_size())}), + precomputed_vertex_out_weight_sums + ? std::make_optional(raft::device_span{ + *precomputed_vertex_out_weight_sums, + static_cast(graph_view.local_vertex_partition_range_size())}) + : std::nullopt, personalization_vertices ? std::make_optional(std::make_tuple( raft::device_span{*personalization_vertices, diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu new file mode 100644 index 00000000000..a669a127f41 --- /dev/null +++ b/cpp/src/mtmg/vertex_result.cu @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +#include + +namespace cugraph { +namespace mtmg { + +template +template +rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view) +{ + auto this_gpu_graph_view = graph_view.get(handle); + + rmm::device_uvector local_vertices(vertices.size(), handle.get_stream()); + rmm::device_uvector vertex_gpu_ids(vertices.size(), handle.get_stream()); + rmm::device_uvector vertex_pos(vertices.size(), handle.get_stream()); + rmm::device_uvector result(vertices.size(), handle.get_stream()); + + raft::copy(local_vertices.data(), vertices.data(), vertices.size(), handle.get_stream()); + cugraph::detail::scalar_fill( + handle.get_stream(), vertex_gpu_ids.data(), vertex_gpu_ids.size(), handle.get_rank()); + cugraph::detail::sequence_fill( + handle.get_stream(), vertex_pos.data(), vertex_pos.size(), size_t{0}); + + rmm::device_uvector d_vertex_partition_range_lasts( + this_gpu_graph_view.vertex_partition_range_lasts().size(), handle.get_stream()); + raft::update_device(d_vertex_partition_range_lasts.data(), + this_gpu_graph_view.vertex_partition_range_lasts().data(), + this_gpu_graph_view.vertex_partition_range_lasts().size(), + handle.get_stream()); + + if (renumber_map_view) { + cugraph::renumber_ext_vertices( + handle.raft_handle(), + local_vertices.data(), + local_vertices.size(), + renumber_map_view->get(handle).data(), + this_gpu_graph_view.local_vertex_partition_range_first(), + this_gpu_graph_view.local_vertex_partition_range_last()); + } + + auto const major_comm_size = + handle.raft_handle().get_subcomm(cugraph::partition_manager::major_comm_name()).get_size(); + auto const minor_comm_size = + handle.raft_handle().get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size(); + + std::forward_as_tuple(local_vertices, std::tie(vertex_gpu_ids, vertex_pos), std::ignore) = + groupby_gpu_id_and_shuffle_kv_pairs( + handle.raft_handle().get_comms(), + local_vertices.begin(), + local_vertices.end(), + thrust::make_zip_iterator(vertex_gpu_ids.begin(), vertex_pos.begin()), + cugraph::detail::compute_gpu_id_from_int_vertex_t{ + raft::device_span(d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()), + major_comm_size, + minor_comm_size}, + handle.get_stream()); + + // + // Now gather + // + rmm::device_uvector tmp_result(local_vertices.size(), handle.get_stream()); + + auto& wrapped = this->get(handle); + + auto vertex_partition = vertex_partition_device_view_t( + this_gpu_graph_view.local_vertex_partition_view()); + + auto iter = + thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) { + return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + }); + + thrust::gather(handle.raft_handle().get_thrust_policy(), + iter, + iter + local_vertices.size(), + wrapped.begin(), + tmp_result.begin()); + + // + // Shuffle back + // + std::forward_as_tuple(std::ignore, std::tie(std::ignore, vertex_pos, tmp_result), std::ignore) = + groupby_gpu_id_and_shuffle_kv_pairs( + handle.raft_handle().get_comms(), + vertex_gpu_ids.begin(), + vertex_gpu_ids.end(), + thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()), + [] __device__(int gpu) { return gpu; }, + handle.get_stream()); + + // + // Finally, reorder result + // + thrust::scatter(handle.raft_handle().get_thrust_policy(), + tmp_result.begin(), + 
tmp_result.end(), + vertex_pos.begin(), + result.begin()); + + return result; +} + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + cugraph::mtmg::graph_view_t const& graph_view, + std::optional>& renumber_map_view); + +} // namespace mtmg +} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 5e1e1d6ace3..f08606df8ea 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -419,6 +419,14 @@ ConfigureTest(K_HOP_NBRS_TEST traversal/k_hop_nbrs_test.cpp) # - install tests --------------------------------------------------------------------------------- rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcugraph) +################################################################################################### +# - MTMG tests ------------------------------------------------------------------------- +ConfigureTest(MTMG_TEST mtmg/threaded_test.cu) +target_link_libraries(MTMG_TEST + PRIVATE + UCP::UCP + ) + ################################################################################################### # - MG tests -------------------------------------------------------------------------------------- diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu new file mode 100644 index 00000000000..c5dc2d3c7ce --- /dev/null +++ b/cpp/tests/mtmg/threaded_test.cu @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#include + +#include +#include + +struct Multithreaded_Usecase { + bool test_weighted{false}; + bool check_correctness{true}; +}; + +template +class Tests_Multithreaded + : public ::testing::TestWithParam> { + public: + Tests_Multithreaded() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + std::vector get_gpu_list() + { + int num_gpus_per_node{1}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + + std::vector gpu_list(num_gpus_per_node); + std::iota(gpu_list.begin(), gpu_list.end(), 0); + + return gpu_list; + } + + template + void run_current_test( + std::tuple const& param, + std::vector gpu_list) + { + using edge_type_t = int32_t; + + constexpr bool renumber = true; + constexpr bool do_expensive_check = false; + + auto [multithreaded_usecase, input_usecase] = param; + + raft::handle_t handle{}; + + result_t constexpr alpha{0.85}; + result_t constexpr epsilon{1e-6}; + + size_t device_buffer_size{64 * 1024 * 1024}; + size_t thread_buffer_size{4 * 1024 * 1024}; + + int num_gpus = gpu_list.size(); + int num_threads = num_gpus * 4; + + cugraph::mtmg::resource_manager_t resource_manager; + + std::for_each(gpu_list.begin(), gpu_list.end(), [&resource_manager](int gpu_id) { + resource_manager.register_local_gpu(gpu_id, rmm::cuda_device_id{gpu_id}); + }); + + ncclUniqueId instance_manager_id; + ncclGetUniqueId(&instance_manager_id); + + auto instance_manager = resource_manager.create_instance_manager( + resource_manager.registered_ranks(), instance_manager_id); + + cugraph::mtmg::edgelist_t edgelist; + cugraph::mtmg::graph_t graph; + cugraph::mtmg::graph_view_t graph_view; + cugraph::mtmg::vertex_result_t pageranks; + std::optional> renumber_map = + std::make_optional>(); + + auto edge_weights = multithreaded_usecase.test_weighted + ? std::make_optional, + weight_t>>() + : std::nullopt; + + // + // Simulate graph creation by spawning threads to walk through the + // local COO and add edges + // + std::vector running_threads; + + // Initialize shared edgelist object, one per GPU + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &edgelist, + device_buffer_size, + use_weight = true, + use_edge_id = false, + use_edge_type = false]() { + auto thread_handle = instance_manager->get_handle(); + + edgelist.set(thread_handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + // Load SG edge list + auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = + input_usecase.template construct_edgelist( + handle, multithreaded_usecase.test_weighted, false, false); + + auto h_src_v = cugraph::test::to_host(handle, d_src_v); + auto h_dst_v = cugraph::test::to_host(handle, d_dst_v); + auto h_weights_v = cugraph::test::to_host(handle, d_weights_v); + auto unique_vertices = cugraph::test::to_host(handle, d_vertices_v); + + // Load edgelist from different threads. 
We'll use more threads than GPUs here + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back([&instance_manager, + thread_buffer_size, + &edgelist, + &h_src_v, + &h_dst_v, + &h_weights_v, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + cugraph::mtmg::per_thread_edgelist_t + per_thread_edgelist(edgelist.get(thread_handle), thread_buffer_size); + + for (size_t j = i; j < h_src_v.size(); j += num_threads) { +#if 0 + if (h_weights_v) { + thread_edgelist.append( + thread_handle, h_src_v[j], h_dst_v[j], (*h_weights_v)[j], std::nullopt, std::nullopt); + } else { + thread_edgelist.append( + thread_handle, h_src_v[j], h_dst_v[j], std::nullopt, std::nullopt, std::nullopt); + } +#endif + per_thread_edgelist.append( + thread_handle, + h_src_v[j], + h_dst_v[j], + h_weights_v ? std::make_optional((*h_weights_v)[j]) : std::nullopt, + std::nullopt, + std::nullopt); + } + + per_thread_edgelist.flush(thread_handle); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph, + &edge_weights, + &edgelist, + &renumber_map, + &pageranks, + is_symmetric = is_symmetric, + renumber, + do_expensive_check]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + std::optional, + edge_t>> + edge_ids{std::nullopt}; + std::optional, + int32_t>> + edge_types{std::nullopt}; + + edgelist.finalize_buffer(thread_handle); + edgelist.consolidate_and_shuffle(thread_handle, true); + + cugraph::mtmg:: + create_graph_from_edgelist( + thread_handle, + edgelist, + cugraph::graph_properties_t{is_symmetric, true}, + renumber, + graph, + edge_weights, + edge_ids, + edge_types, + renumber_map, + do_expensive_check); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + graph_view = graph.view(); + + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back( + [&instance_manager, &graph_view, &edge_weights, &pageranks, alpha, epsilon]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + auto [local_pageranks, metadata] = + cugraph::pagerank( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) + : std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + alpha, + epsilon, + 500, + true); + + pageranks.set(thread_handle, std::move(local_pageranks)); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + std::vector, std::vector>> computed_pageranks_v; + std::mutex computed_pageranks_lock{}; + + auto pageranks_view = pageranks.view(); + auto renumber_map_view = renumber_map ? std::make_optional(renumber_map->view()) : std::nullopt; + + // Load computed_pageranks from different threads. 
+ for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph_view, + &renumber_map_view, + &pageranks_view, + &computed_pageranks_lock, + &computed_pageranks_v, + &h_src_v, + &h_dst_v, + &h_weights_v, + &unique_vertices, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + + auto number_of_vertices = unique_vertices->size(); + + std::vector my_vertex_list; + my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads); + + for (size_t j = i; j < number_of_vertices; j += num_threads) { + my_vertex_list.push_back((*unique_vertices)[j]); + } + + rmm::device_uvector d_my_vertex_list(my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + raft::update_device(d_my_vertex_list.data(), + my_vertex_list.data(), + my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + + auto d_my_pageranks = pageranks_view.gather( + thread_handle, + raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, + graph_view, + renumber_map_view); + + std::vector my_pageranks(d_my_pageranks.size()); + raft::update_host(my_pageranks.data(), + d_my_pageranks.data(), + d_my_pageranks.size(), + thread_handle.raft_handle().get_stream()); + + { + std::lock_guard lock(computed_pageranks_lock); + computed_pageranks_v.push_back( + std::make_tuple(std::move(my_vertex_list), std::move(my_pageranks))); + } + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + if (multithreaded_usecase.check_correctness) { + // Want to compare the results in computed_pageranks_v with SG results + cugraph::graph_t sg_graph(handle); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + std::optional> sg_renumber_map{std::nullopt}; + + std::tie(sg_graph, sg_edge_weights, std::ignore, std::ignore, sg_renumber_map) = cugraph:: + create_graph_from_edgelist( + handle, + std::nullopt, + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + std::nullopt, + std::nullopt, + cugraph::graph_properties_t{is_symmetric, true}, + true); + + auto [sg_pageranks, meta] = cugraph::pagerank( + handle, + sg_graph.view(), + sg_edge_weights ? 
std::make_optional(sg_edge_weights->view()) : std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt, + alpha, + epsilon); + + auto h_sg_pageranks = cugraph::test::to_host(handle, sg_pageranks); + auto h_sg_renumber_map = cugraph::test::to_host(handle, sg_renumber_map); + auto compare_functor = cugraph::test::nearly_equal{ + weight_t{1e-3}, + weight_t{(weight_t{1} / static_cast(h_sg_pageranks.size())) * weight_t{1e-3}}}; + + std::for_each( + computed_pageranks_v.begin(), + computed_pageranks_v.end(), + [h_sg_pageranks, compare_functor, h_sg_renumber_map](auto t1) { + std::for_each( + thrust::make_zip_iterator(std::get<0>(t1).begin(), std::get<1>(t1).begin()), + thrust::make_zip_iterator(std::get<0>(t1).end(), std::get<1>(t1).end()), + [h_sg_pageranks, compare_functor, h_sg_renumber_map](auto t2) { + vertex_t v = thrust::get<0>(t2); + weight_t pr = thrust::get<1>(t2); + + auto pos = std::find(h_sg_renumber_map->begin(), h_sg_renumber_map->end(), v); + auto offset = std::distance(h_sg_renumber_map->begin(), pos); + + ASSERT_TRUE(compare_functor(pr, h_sg_pageranks[offset])) + << "vertex " << v << ", SG result = " << h_sg_pageranks[offset] + << ", mtmg result = " << pr << ", renumber map = " << (*h_sg_renumber_map)[offset]; + }); + }); + } + } +}; + +using Tests_Multithreaded_File = Tests_Multithreaded; +using Tests_Multithreaded_Rmat = Tests_Multithreaded; + +// FIXME: add tests for type combinations +TEST_P(Tests_Multithreaded_File, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +TEST_P(Tests_Multithreaded_Rmat, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +INSTANTIATE_TEST_SUITE_P(file_test, + Tests_Multithreaded_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{false, true}, + Multithreaded_Usecase{true, true}), + ::testing::Values(cugraph::test::File_Usecase("karate.csv"), + cugraph::test::File_Usecase("dolphins.csv")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Multithreaded_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{false, true}, Multithreaded_Usecase{true, true}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with + --gtest_filter to select only the file_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one File_Usecase that differ only in filename + (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_File, + ::testing::Combine( + // disable correctness checks + ::testing::Values(Multithreaded_Usecase{false, false}, Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs 
+ ::testing::Values(Multithreaded_Usecase{false, false}, Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() From 686c3727782c6d303385d7ecdb0330d890e8184d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Sep 2023 14:20:15 -0500 Subject: [PATCH 41/72] Update to clang 16.0.6. (#3859) This PR updates cugraph to use clang 16.0.6. The previous version 16.0.1 has some minor formatting issues affecting several RAPIDS repos. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Brad Rees (https://github.com/BradReesWork) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3859 --- .pre-commit-config.yaml | 2 +- cpp/src/components/legacy/scc_matrix.cuh | 2 +- cpp/src/cores/core_number_impl.cuh | 2 +- ...er_v_random_select_transform_outgoing_e.cuh | 4 ++-- cpp/src/sampling/random_walks.cuh | 18 +++++++++--------- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- cpp/src/traversal/bfs_impl.cuh | 2 +- cpp/tests/prims/mg_extract_transform_e.cu | 4 ++-- cpp/tests/traversal/mg_sssp_test.cpp | 2 +- cpp/tests/traversal/sssp_test.cpp | 2 +- cpp/tests/utilities/test_utilities.hpp | 2 +- 11 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0f05aedf1a1..865d06b20e4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: additional_dependencies: - flake8==6.0.0 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v16.0.1 + rev: v16.0.6 hooks: - id: clang-format exclude: | diff --git a/cpp/src/components/legacy/scc_matrix.cuh b/cpp/src/components/legacy/scc_matrix.cuh index 3d56bdc5bf4..d044123bed0 100644 --- a/cpp/src/components/legacy/scc_matrix.cuh +++ b/cpp/src/components/legacy/scc_matrix.cuh @@ -68,7 +68,7 @@ struct SCC_Data { SCC_Data(size_t nrows, const IndexT* p_d_r_o, // row_offsets const IndexT* p_d_c_i) - : // column indices + : // column indices nrows_(nrows), p_d_r_o_(p_d_r_o), p_d_c_i_(p_d_c_i), diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index b63ae60f052..ea8e2a9c4ee 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -72,7 +72,7 @@ struct v_to_core_number_t { // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used template struct mult_degree_by_two_t { - __device__ edge_t operator()(edge_t d) const { return d* edge_t{2}; } + __device__ edge_t operator()(edge_t d) const { return d * edge_t{2}; } }; } // namespace diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 3375a651982..e6db21f1c7c 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -287,7 +287,7 @@ rmm::device_uvector get_sampling_index_without_replacement( #ifndef NO_CUGRAPH_OPS edge_t mid_partition_degree_range_last = static_cast(K * 10); // tuning parameter assert(mid_partition_degree_range_last > K); - size_t high_partition_over_sampling_K = K * 2; // tuning parameter + size_t high_partition_over_sampling_K = K * 2; // tuning parameter assert(high_partition_over_sampling_K > K); rmm::device_uvector sample_nbr_indices(frontier_degrees.size() * K, handle.get_stream()); @@ -883,7 +883,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, 
sample_nbr_indices); // neighbor index within an edge partition (note that each vertex's // neighbors are distributed in minor_comm_size partitions) std::optional> sample_key_indices{ - std::nullopt}; // relevant only when (minor_comm_size > 1) + std::nullopt}; // relevant only when (minor_comm_size > 1) auto local_frontier_sample_counts = std::vector{}; auto local_frontier_sample_displacements = std::vector{}; if (minor_comm_size > 1) { diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 46789c6b8bd..6a7334e9f1a 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -197,19 +197,19 @@ struct col_indx_extract_t { void operator()( original::device_vec_t const& d_coalesced_src_v, // in: coalesced vector of vertices original::device_vec_t const& - d_v_col_indx, // in: column indices, given by stepper's random engine + d_v_col_indx, // in: column indices, given by stepper's random engine original::device_vec_t& d_v_next_vertices, // out: set of destination vertices, for next step original::device_vec_t& - d_v_next_weights) // out: set of weights between src and destination vertices, for next step + d_v_next_weights) // out: set of weights between src and destination vertices, for next step const { thrust::transform_if( handle_.get_thrust_policy(), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_paths_), // input1 - d_v_col_indx.begin(), // input2 - out_degs_, // stencil + thrust::make_counting_iterator(num_paths_), // input1 + d_v_col_indx.begin(), // input2 + out_degs_, // stencil thrust::make_zip_iterator( thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())), // output [max_depth = max_depth_, @@ -575,9 +575,9 @@ struct random_walker_t { d_crt_out_degs, // |current set of vertex out degrees| = nelems, // to be used as stencil (don't scatter if 0) original::device_vec_t const& - d_sizes, // paths sizes used to provide delta in coalesced paths; - // pre-condition: assumed as updated to reflect new vertex additions; - // also, this is the number of _vertices_ in each path; + d_sizes, // paths sizes used to provide delta in coalesced paths; + // pre-condition: assumed as updated to reflect new vertex additions; + // also, this is the number of _vertices_ in each path; // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter index_t stride, // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights) @@ -762,7 +762,7 @@ random_walks_impl( // pre-allocate num_paths * max_depth; // original::device_vec_t d_coalesced_v(num_paths * max_depth, - stream); // coalesced vertex set + stream); // coalesced vertex set original::device_vec_t d_coalesced_w(num_paths * (max_depth - 1), stream); // coalesced weight set original::device_vec_t d_paths_sz(num_paths, stream); // paths sizes diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index d7381ba71af..6bc19ff4fe1 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -86,7 +86,7 @@ struct find_unused_id_t { for (size_t i = worker_id; i < sorted_local_vertices.size() + size_t{1}; i += num_workers) { auto start = (i == size_t{0}) ? 
std::numeric_limits::lowest() : sorted_local_vertices[i - size_t{1}]; - if (start != std::numeric_limits::max()) { ++start; }; // now inclusive + if (start != std::numeric_limits::max()) { ++start; }; // now inclusive auto end = (i == sorted_local_vertices.size()) ? std::numeric_limits::max() : sorted_local_vertices[i]; // exclusive for (vertex_t v = start; v < end; ++v) { diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 0402184bd93..437071569bf 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -73,7 +73,7 @@ struct e_op_t { if (*(prev_visited_flags + packed_bool_offset(dst)) & packed_bool_mask(dst)) { // check if unvisited in previous iterations push = false; - } else { // check if unvisited in this iteration as well + } else { // check if unvisited in this iteration as well auto old = visited_flags.atomic_or(dst, true); push = !old; } diff --git a/cpp/tests/prims/mg_extract_transform_e.cu b/cpp/tests/prims/mg_extract_transform_e.cu index b71fe5ddb5e..bca6471a5bb 100644 --- a/cpp/tests/prims/mg_extract_transform_e.cu +++ b/cpp/tests/prims/mg_extract_transform_e.cu @@ -157,8 +157,8 @@ class Tests_MGExtractTransformE // 1. create MG graph constexpr bool is_multi_gpu = true; - constexpr bool renumber = true; // needs to be true for multi gpu case - constexpr bool store_transposed = false; // needs to be false for using extract_transform_e + constexpr bool renumber = true; // needs to be true for multi gpu case + constexpr bool store_transposed = false; // needs to be false for using extract_transform_e if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); diff --git a/cpp/tests/traversal/mg_sssp_test.cpp b/cpp/tests/traversal/mg_sssp_test.cpp index b3e96981f96..ea0353c3743 100644 --- a/cpp/tests/traversal/mg_sssp_test.cpp +++ b/cpp/tests/traversal/mg_sssp_test.cpp @@ -214,7 +214,7 @@ class Tests_MGSSSP : public ::testing::TestWithParam> sg_renumber_map, // std::nullopt if the SG graph is not renumbered std::optional> - mg_vertices, // std::nullopt if the entire local vertex partition range is assumed + mg_vertices, // std::nullopt if the entire local vertex partition range is assumed raft::device_span mg_values); template From d93032105ff92a70e28511471444dfcb2557da90 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:19:56 -0700 Subject: [PATCH 42/72] MFG C++ code bug fix (#3865) cugraph::sort_sampled_edgelist currently returns (label, hop) offsets of all zero. This PR fixes this. 
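For illustration only, a minimal host-side Thrust sketch (not the cugraph code itself; the buffer name and values are made up) of the pattern this change addresses: thrust::for_each discards whatever the functor returns, so unless the functor writes through a captured pointer the output buffer keeps its zero fill, whereas thrust::transform stores each result through the output iterator. That is consistent with the all-zero offsets described above.

#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

#include <cstdio>
#include <vector>

int main()
{
  std::vector<size_t> offsets(4, 0);

  // Bug pattern: the computed value is returned but never stored, so `offsets`
  // keeps its fill value of zero.
  thrust::for_each(thrust::host,
                   thrust::counting_iterator<size_t>(0),
                   thrust::counting_iterator<size_t>(4),
                   [](size_t i) { return i * 10; });

  // Fixed pattern: thrust::transform writes each computed value through the
  // output iterator.
  thrust::transform(thrust::host,
                    thrust::counting_iterator<size_t>(0),
                    thrust::counting_iterator<size_t>(4),
                    offsets.begin(),
                    [](size_t i) { return i * 10; });

  for (auto v : offsets) std::printf("%zu ", v);  // prints: 0 10 20 30
  return 0;
}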
Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/cugraph/pull/3865 --- cpp/src/sampling/sampling_post_processing_impl.cuh | 10 ++++++++-- cpp/tests/sampling/sampling_post_processing_test.cu | 11 +++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index ff8da72ff35..0c397d91b20 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -1619,10 +1619,13 @@ renumber_and_sort_sampled_edgelist( (*edgelist_label_hop_offsets).begin(), (*edgelist_label_hop_offsets).end(), size_t{0}); - thrust::for_each( + // FIXME: the device lambda should be placed in cuda::proclaim_return_type() + // once we update CCCL version to 2.x + thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_labels * num_hops), + (*edgelist_label_hop_offsets).begin(), [edgelist_label_offsets = edgelist_label_offsets ? thrust::make_optional(std::get<0>(*edgelist_label_offsets)) : thrust::nullopt, @@ -1743,10 +1746,13 @@ sort_sampled_edgelist( (*edgelist_label_hop_offsets).begin(), (*edgelist_label_hop_offsets).end(), size_t{0}); - thrust::for_each( + // FIXME: the device lambda should be placed in cuda::proclaim_return_type() + // once we update CCCL version to 2.x + thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(num_labels * num_hops), + (*edgelist_label_hop_offsets).begin(), [edgelist_label_offsets = edgelist_label_offsets ? 
thrust::make_optional(std::get<0>(*edgelist_label_offsets)) : thrust::nullopt, diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu index 422fe953b20..e5267d75ac2 100644 --- a/cpp/tests/sampling/sampling_post_processing_test.cu +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -635,6 +635,12 @@ class Tests_SamplingPostProcessing (*renumbered_and_sorted_edgelist_label_hop_offsets).end())) << "Renumbered and sorted edge list (label,hop) offset array values should be " "non-decreasing."; + + ASSERT_TRUE( + (*renumbered_and_sorted_edgelist_label_hop_offsets).back_element(handle.get_stream()) == + renumbered_and_sorted_edgelist_srcs.size()) + << "Renumbered and sorted edge list (label,hop) offset array's last element should " + "coincide with the number of edges."; } if (renumbered_and_sorted_renumber_map_label_offsets) { @@ -1189,6 +1195,11 @@ class Tests_SamplingPostProcessing (*sorted_edgelist_label_hop_offsets).end())) << "Sorted edge list (label,hop) offset array values should be " "non-decreasing."; + + ASSERT_TRUE((*sorted_edgelist_label_hop_offsets).back_element(handle.get_stream()) == + sorted_edgelist_srcs.size()) + << "Sorted edge list (label,hop) offset array's last element should coincide with the " + "number of edges."; } for (size_t i = 0; i < sampling_post_processing_usecase.num_labels; ++i) { From a53ab34b804af2865d2d210b801a759d2ca29bc6 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Thu, 21 Sep 2023 19:39:18 +0200 Subject: [PATCH 43/72] Refactor python code for similarity algos to use latest CAPI (#3828) This PR - refactors python code for similarity algorithms (Jaccard, Sorensen, Overlap) to use latest CAPI - removes legacy cuda c/c++ code and python wrapper around legacy code - update CAPI tests - remove and update python tests Closes #2546 Closes #2547 Closes #2548 Closes #2549 Closes #2749 Authors: - Naim (https://github.com/naimnv) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Chuck Hastings (https://github.com/ChuckHastings) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3828 --- cpp/CMakeLists.txt | 2 - cpp/src/link_prediction/legacy/jaccard.cu | 429 ------------------ cpp/src/link_prediction/legacy/overlap.cu | 425 ----------------- cpp/tests/c_api/mg_similarity_test.c | 51 ++- cpp/tests/c_api/similarity_test.c | 57 +-- python/cugraph/CMakeLists.txt | 1 - .../cugraph/community/induced_subgraph.py | 9 +- .../cugraph/dask/link_prediction/jaccard.py | 2 +- .../cugraph/dask/link_prediction/overlap.py | 2 +- .../cugraph/dask/link_prediction/sorensen.py | 2 +- .../cugraph/cugraph/experimental/__init__.py | 32 +- .../experimental/link_prediction/__init__.py | 13 - .../experimental/link_prediction/jaccard.py | 255 ----------- .../experimental/link_prediction/overlap.py | 223 --------- .../experimental/link_prediction/sorensen.py | 221 --------- .../cugraph/link_prediction/CMakeLists.txt | 22 - .../cugraph/link_prediction/__init__.py | 23 +- .../cugraph/link_prediction/jaccard.pxd | 35 -- .../cugraph/link_prediction/jaccard.py | 208 ++++++--- .../link_prediction/jaccard_wrapper.pyx | 155 ------- .../cugraph/link_prediction/overlap.pxd | 35 -- .../cugraph/link_prediction/overlap.py | 212 +++++++-- .../link_prediction/overlap_wrapper.pyx | 142 ------ .../cugraph/link_prediction/sorensen.py | 223 ++++++--- .../cugraph/link_prediction/wjaccard.py | 100 ++-- .../cugraph/link_prediction/woverlap.py | 76 ++-- 
.../cugraph/link_prediction/wsorensen.py | 78 ++-- .../cugraph/cugraph/sampling/random_walks.py | 9 +- .../tests/link_prediction/test_jaccard.py | 315 +++++++------ .../tests/link_prediction/test_overlap.py | 152 ++++--- .../tests/link_prediction/test_sorensen.py | 252 ++++++---- .../tests/link_prediction/test_wjaccard.py | 177 -------- .../tests/link_prediction/test_woverlap.py | 171 ------- .../tests/link_prediction/test_wsorensen.py | 181 -------- python/pylibcugraph/pylibcugraph/__init__.py | 7 + .../pylibcugraph/experimental/__init__.py | 19 +- .../pylibcugraph/jaccard_coefficients.pyx | 12 +- .../pylibcugraph/overlap_coefficients.pyx | 10 +- .../pylibcugraph/sorensen_coefficients.pyx | 10 +- 39 files changed, 1129 insertions(+), 3219 deletions(-) delete mode 100644 cpp/src/link_prediction/legacy/jaccard.cu delete mode 100644 cpp/src/link_prediction/legacy/overlap.cu delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/__init__.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/jaccard.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/overlap.py delete mode 100644 python/cugraph/cugraph/experimental/link_prediction/sorensen.py delete mode 100644 python/cugraph/cugraph/link_prediction/CMakeLists.txt delete mode 100644 python/cugraph/cugraph/link_prediction/jaccard.pxd delete mode 100644 python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx delete mode 100644 python/cugraph/cugraph/link_prediction/overlap.pxd delete mode 100644 python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_woverlap.py delete mode 100644 python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a6c26ee3b91..0d7bd86075d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -196,8 +196,6 @@ set(CUGRAPH_SOURCES src/utilities/path_retrieval.cu src/structure/legacy/graph.cu src/linear_assignment/legacy/hungarian.cu - src/link_prediction/legacy/jaccard.cu - src/link_prediction/legacy/overlap.cu src/link_prediction/jaccard_sg.cu src/link_prediction/sorensen_sg.cu src/link_prediction/overlap_sg.cu diff --git a/cpp/src/link_prediction/legacy/jaccard.cu b/cpp/src/link_prediction/legacy/jaccard.cu deleted file mode 100644 index d0b240e3c77..00000000000 --- a/cpp/src/link_prediction/legacy/jaccard.cu +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -#include -#include - -#include - -namespace cugraph { -namespace detail { - -// Volume of neighboors (*weight_s) -template -__global__ void jaccard_row_sum( - vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) -{ - vertex_t row; - edge_t start, end, length; - weight_t sum; - - for (row = threadIdx.y + blockIdx.y * blockDim.y; row < n; row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - - // compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) work[row] = sum; - } else { - work[row] = static_cast(length); - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -template -__global__ void jaccard_is(vertex_t n, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, j, Ni, Nj; - vertex_t row, col; - vertex_t ref, cur, ref_col, cur_col, match; - weight_t ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? col : row; - - // compute new sum weights - weight_s[j] = work[row] + work[col]; - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[j], ref_val); } - } - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// Using list of node pairs -template -__global__ void jaccard_is_pairs(edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, idx, Ni, Nj, match; - vertex_t row, col, ref, cur, ref_col, cur_col; - weight_t ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[idx] = work[row] + work[col]; - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[idx], ref_val); } - } - } -} - -// Jaccard weights (*weight) -template -__global__ void jaccard_jw(edge_t e, - weight_t const* weight_i, - weight_t const* weight_s, - weight_t* weight_j) -{ - edge_t j; - weight_t Wi, Ws, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; j < e; j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Ws = weight_s[j]; - Wu = Ws - Wi; - weight_j[j] = (Wi / Wu); - } -} - -template -int jaccard(vertex_t n, - edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - rmm::cuda_stream_view stream_view; - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - - thrust::fill(rmm::exec_policy(stream_view), weight_i, weight_i + e, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - jaccard_is<<>>( - n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - jaccard_jw - <<>>(e, weight_i, weight_s, weight_j); - - return 0; -} - -template -int jaccard_pairs(vertex_t n, - edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - jaccard_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - - // NOTE: initilized weight_i vector with 0.0 - // fill(num_pairs, weight_i, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / 
nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - jaccard_is_pairs<<>>( - num_pairs, csrPtr, csrInd, first_pair, second_pair, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(num_pairs, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (edge_t)CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - jaccard_jw - <<>>(num_pairs, weight_i, weight_s, weight_j); - - return 0; -} -} // namespace detail - -template -void jaccard(legacy::GraphCSRView const& graph, WT const* weights, WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - - rmm::device_vector weight_i(graph.number_of_edges); - rmm::device_vector weight_s(graph.number_of_edges); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::jaccard(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::jaccard(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template -void jaccard_list(legacy::GraphCSRView const& graph, - WT const* weights, - ET num_pairs, - VT const* first, - VT const* second, - WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second in NULL"); - - rmm::device_vector weight_i(num_pairs, WT{0.0}); - rmm::device_vector weight_s(num_pairs); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::jaccard_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::jaccard_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template void jaccard(legacy::GraphCSRView const&, - float const*, - float*); -template void jaccard( - legacy::GraphCSRView const&, double const*, double*); -template void jaccard(legacy::GraphCSRView const&, - float const*, - float*); -template void jaccard( - legacy::GraphCSRView const&, double const*, double*); -template void jaccard_list( - legacy::GraphCSRView const&, - float const*, - int32_t, - int32_t const*, - int32_t const*, - float*); -template void jaccard_list( - legacy::GraphCSRView const&, - double const*, - int32_t, - int32_t const*, - int32_t const*, - double*); -template void jaccard_list( - legacy::GraphCSRView const&, - float const*, - int64_t, - int64_t const*, - int64_t const*, - float*); -template void jaccard_list( - legacy::GraphCSRView const&, - double const*, - int64_t, - int64_t const*, - int64_t const*, - double*); - -} // namespace cugraph diff --git a/cpp/src/link_prediction/legacy/overlap.cu b/cpp/src/link_prediction/legacy/overlap.cu deleted file mode 100644 index 67d7cd5e4c6..00000000000 --- a/cpp/src/link_prediction/legacy/overlap.cu +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (c) 
2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -namespace cugraph { -namespace detail { - -// Volume of neighboors (*weight_s) -// TODO: Identical kernel to jaccard_row_sum!! -template -__global__ void overlap_row_sum( - vertex_t n, edge_t const* csrPtr, vertex_t const* csrInd, weight_t const* v, weight_t* work) -{ - vertex_t row; - edge_t start, end, length; - weight_t sum; - - for (row = threadIdx.y + blockIdx.y * blockDim.y; row < n; row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - - // compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) work[row] = sum; - } else { - work[row] = static_cast(length); - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// TODO: Identical kernel to jaccard_row_sum!! -template -__global__ void overlap_is(vertex_t n, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, j, Ni, Nj; - vertex_t row, col; - vertex_t ref, cur, ref_col, cur_col, match; - weight_t ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[j] = min(work[row], work[col]); - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[j], ref_val); } - } - } - } -} - -// Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) -// Using list of node pairs -// NOTE: NOT the same as jaccard -template -__global__ void overlap_is_pairs(edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* v, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s) -{ - edge_t i, idx, Ni, Nj, match; - vertex_t row, col, ref, cur, ref_col, cur_col; - weight_t ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - - // find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - // compute new sum weights - weight_s[idx] = min(work[row], work[col]); - - // compute new intersection weights - // search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } else { - ref_val = 1.0; - } - - // binary search (column indices are sorted within each row) - edge_t left = csrPtr[cur]; - edge_t right = csrPtr[cur + 1] - 1; - while (left <= right) { - edge_t middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } else if (cur_col < ref_col) { - left = middle + 1; - } else { - match = middle; - break; - } - } - - // if the element with the same column index in the reference row has been found - if (match != -1) { atomicAdd(&weight_i[idx], ref_val); } - } - } -} - -// Overlap weights (*weight) -template -__global__ void overlap_jw(edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - edge_t j; - weight_t Wi, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; j < e; j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Wu = weight_s[j]; - weight_j[j] = (Wi / Wu); - } -} - -template -int overlap(vertex_t n, - edge_t e, - edge_t const* csrPtr, - vertex_t const* csrInd, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - - // launch kernel - overlap_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(e, weight_i, weight_t{0.0}); - - // setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - overlap_is - <<>>(n, csrPtr, csrInd, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(e, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - - // launch kernel - overlap_jw - <<>>(e, csrPtr, csrInd, weight_i, weight_s, weight_j); - - return 0; -} - -template -int overlap_pairs(vertex_t n, - edge_t num_pairs, - edge_t const* csrPtr, - vertex_t const* csrInd, - vertex_t const* first_pair, - vertex_t const* second_pair, - weight_t const* weight_in, - weight_t* work, - weight_t* weight_i, - weight_t* weight_s, - weight_t* weight_j) -{ - dim3 nthreads, nblocks; - int y = 4; - - // setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, vertex_t{CUDA_MAX_BLOCKS}); - nblocks.z = 1; - // launch kernel - - overlap_row_sum - <<>>(n, csrPtr, csrInd, weight_in, work); - cudaDeviceSynchronize(); - fill(num_pairs, weight_i, weight_t{0.0}); - // setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, vertex_t{CUDA_MAX_BLOCKS}); // 1; - - // launch kernel - 
overlap_is_pairs<<>>( - num_pairs, csrPtr, csrInd, first_pair, second_pair, weight_in, work, weight_i, weight_s); - - // setup launch configuration - nthreads.x = min(num_pairs, edge_t{CUDA_MAX_KERNEL_THREADS}); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, edge_t{CUDA_MAX_BLOCKS}); - nblocks.y = 1; - nblocks.z = 1; - // launch kernel - - overlap_jw - <<>>(num_pairs, csrPtr, csrInd, weight_i, weight_s, weight_j); - - return 0; -} -} // namespace detail - -template -void overlap(legacy::GraphCSRView const& graph, WT const* weights, WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - - rmm::device_vector weight_i(graph.number_of_edges); - rmm::device_vector weight_s(graph.number_of_edges); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::overlap(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::overlap(graph.number_of_vertices, - graph.number_of_edges, - graph.offsets, - graph.indices, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template -void overlap_list(legacy::GraphCSRView const& graph, - WT const* weights, - ET num_pairs, - VT const* first, - VT const* second, - WT* result) -{ - CUGRAPH_EXPECTS(result != nullptr, "Invalid input argument: result pointer is NULL"); - CUGRAPH_EXPECTS(first != nullptr, "Invalid input argument: first column is NULL"); - CUGRAPH_EXPECTS(second != nullptr, "Invalid input argument: second column is NULL"); - - rmm::device_vector weight_i(num_pairs); - rmm::device_vector weight_s(num_pairs); - rmm::device_vector work(graph.number_of_vertices); - - if (weights == nullptr) { - cugraph::detail::overlap_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } else { - cugraph::detail::overlap_pairs(graph.number_of_vertices, - num_pairs, - graph.offsets, - graph.indices, - first, - second, - weights, - work.data().get(), - weight_i.data().get(), - weight_s.data().get(), - result); - } -} - -template void overlap(legacy::GraphCSRView const&, - float const*, - float*); -template void overlap( - legacy::GraphCSRView const&, double const*, double*); -template void overlap(legacy::GraphCSRView const&, - float const*, - float*); -template void overlap( - legacy::GraphCSRView const&, double const*, double*); -template void overlap_list( - legacy::GraphCSRView const&, - float const*, - int32_t, - int32_t const*, - int32_t const*, - float*); -template void overlap_list( - legacy::GraphCSRView const&, - double const*, - int32_t, - int32_t const*, - int32_t const*, - double*); -template void overlap_list( - legacy::GraphCSRView const&, - float const*, - int64_t, - int64_t const*, - int64_t const*, - float*); -template void overlap_list( - legacy::GraphCSRView const&, - double const*, - int64_t, - int64_t const*, - int64_t const*, - double*); - -} // namespace cugraph diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 0ac160245ab..336f6c50519 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -160,15 +160,16 @@ int test_jaccard(const cugraph_resource_handle_t* handle) int 
test_weighted_jaccard(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(handle, h_src, @@ -216,15 +217,16 @@ int test_sorensen(const cugraph_resource_handle_t* handle) int test_weighted_sorensen(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(handle, h_src, @@ -272,15 +274,16 @@ int test_overlap(const cugraph_resource_handle_t* handle) int test_weighted_overlap(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(handle, h_src, diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index 20af3f3eccd..52f849ccd28 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -161,15 +161,16 @@ int test_jaccard() int test_weighted_jaccard() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 
7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -215,15 +216,16 @@ int test_sorensen() int test_weighted_sorensen() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(h_src, h_dst, @@ -269,15 +271,16 @@ int test_overlap() int test_weighted_overlap() { size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_vertices = 7; + size_t num_pairs = 3; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; - weight_t h_result[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // TODO: Fill in + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = {0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(h_src, h_dst, @@ -301,8 +304,8 @@ int main(int argc, char** argv) result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); - // result |= RUN_TEST(test_weighted_jaccard); - // result |= RUN_TEST(test_weighted_sorensen); - // result |= RUN_TEST(test_weighted_overlap); + result |= RUN_TEST(test_weighted_jaccard); + result |= RUN_TEST(test_weighted_sorensen); + result |= RUN_TEST(test_weighted_overlap); return result; } diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index f3b28623b12..ecfcb9b219f 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -89,7 +89,6 @@ 
add_subdirectory(cugraph/dask/structure) add_subdirectory(cugraph/internals) add_subdirectory(cugraph/layout) add_subdirectory(cugraph/linear_assignment) -add_subdirectory(cugraph/link_prediction) add_subdirectory(cugraph/structure) add_subdirectory(cugraph/tree) add_subdirectory(cugraph/utilities) diff --git a/python/cugraph/cugraph/community/induced_subgraph.py b/python/cugraph/cugraph/community/induced_subgraph.py index 29fe2f29c1e..3a901199b01 100644 --- a/python/cugraph/cugraph/community/induced_subgraph.py +++ b/python/cugraph/cugraph/community/induced_subgraph.py @@ -25,11 +25,10 @@ ) from cugraph.utilities.utils import import_optional -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index b3d688584a0..218e6206fc3 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index c47aeef3c72..5540be28fd1 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index bb5a3f44f39..24295ac330c 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -20,7 +20,7 @@ from cugraph.dask.common.input_utils import get_distributed_data from cugraph.utilities import renumber_vertex_pair -from pylibcugraph.experimental import ( +from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, ) from pylibcugraph import ResourceHandle diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index b96b760e634..2309a529047 100644 --- a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -48,30 +48,22 @@ experimental_warning_wrapper(EXPERIMENTAL__find_bicliques) ) -from cugraph.experimental.link_prediction.jaccard import ( - EXPERIMENTAL__jaccard, - EXPERIMENTAL__jaccard_coefficient, -) +from 
cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler -jaccard = experimental_warning_wrapper(EXPERIMENTAL__jaccard) -jaccard_coefficient = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficient) +BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) -from cugraph.experimental.link_prediction.sorensen import ( - EXPERIMENTAL__sorensen, - EXPERIMENTAL__sorensen_coefficient, -) -sorensen = experimental_warning_wrapper(EXPERIMENTAL__sorensen) -sorensen_coefficient = experimental_warning_wrapper(EXPERIMENTAL__sorensen_coefficient) +from cugraph.link_prediction.jaccard import jaccard, jaccard_coefficient -from cugraph.experimental.link_prediction.overlap import ( - EXPERIMENTAL__overlap, - EXPERIMENTAL__overlap_coefficient, -) +jaccard = promoted_experimental_warning_wrapper(jaccard) +jaccard_coefficient = promoted_experimental_warning_wrapper(jaccard_coefficient) -overlap = experimental_warning_wrapper(EXPERIMENTAL__overlap) -overlap_coefficient = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficient) +from cugraph.link_prediction.sorensen import sorensen, sorensen_coefficient -from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler +sorensen = promoted_experimental_warning_wrapper(sorensen) +sorensen_coefficient = promoted_experimental_warning_wrapper(sorensen_coefficient) -BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) +from cugraph.link_prediction.overlap import overlap, overlap_coefficient + +overlap = promoted_experimental_warning_wrapper(overlap) +overlap_coefficient = promoted_experimental_warning_wrapper(overlap_coefficient) diff --git a/python/cugraph/cugraph/experimental/link_prediction/__init__.py b/python/cugraph/cugraph/experimental/link_prediction/__init__.py deleted file mode 100644 index 081b2ae8260..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/link_prediction/jaccard.py b/python/cugraph/cugraph/experimental/link_prediction/jaccard.py deleted file mode 100644 index 2eba73b3824..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/jaccard.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
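With the change to cugraph/experimental/__init__.py above, the experimental import path for the link prediction algorithms still resolves, but it now simply forwards to the stable implementations and emits a promotion warning. A minimal before/after sketch of the caller-side migration (nothing here beyond what the hunk above shows):

# Old path: still importable, now wrapped by promoted_experimental_warning_wrapper,
# so each call warns that the API has been promoted out of experimental.
from cugraph.experimental import jaccard, sorensen, overlap

# New, stable locations promoted by this PR:
from cugraph import jaccard, jaccard_coefficient, sorensen, overlap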
- -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings - -from pylibcugraph.experimental import ( - jaccard_coefficients as pylibcugraph_jaccard_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__jaccard(G, vertex_pair=None, use_weight=False): - """ - Compute the Jaccard similarity between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context - of graphs, the neighborhood of a vertex is seen as a set. The Jaccard - similarity weight of each edge represents the strength of connection - between vertices based on the relative similarity of their neighbors. If - first is specified but second is not, or vice versa, an exception will be - thrown. - - NOTE: If the vertex_pair parameter is not specified then the behavior - of cugraph.jaccard is different from the behavior of - networkx.jaccard_coefficient. - - cugraph.jaccard, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the jaccard coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - networkx.jaccard_coefficient, in the absence of a specified vertex - pair list, will return an upper triangular dense matrix, excluding - the diagonal as well as vertex pairs that are directly connected - by an edge in the graph, of jaccard coefficients. Technically, networkx - returns a lazy iterator across this upper triangular matrix where - the actual jaccard coefficient is computed when the iterator is - dereferenced. Computing a dense matrix of results is not feasible - if the number of vertices in the graph is large (100,000 vertices - would result in 4.9 billion values in that iterator). - - If your graph is small enough (or you have enough memory and patience) - you can get the interesting (non-zero) values that are part of the networkx - solution by doing the following: - - >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> pairs = G.get_two_hop_neighbors() - >>> df = cugraph.jaccard(G, pairs) - - But please remember that cugraph will fill the dataframe with the entire - solution you request, so you'll need enough memory to store the 2-hop - neighborhood dataframe. - - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). 
The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Jaccard weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['jaccard_coeff'] : cudf.Series - The computed jaccard coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import jaccard as exp_jaccard - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_jaccard(G) - - """ - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, jaccard_coeff = pylibcugraph_jaccard_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["jaccard_coeff"] = cudf.Series(jaccard_coeff) - - return df - - -def EXPERIMENTAL__jaccard_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. See `jaccard` - - Parameters - ---------- - graph : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. 
- - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the jaccard coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the jaccard coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Jaccard weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - ddf['first']: dask_cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - ddf['second']: dask_cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - ddf['jaccard_coeff']: dask_cudf.Series - The computed jaccard coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import jaccard_coefficient as exp_jaccard_coefficient - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_jaccard_coefficient(G) - - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? - if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__jaccard(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="jaccard_coeff", src="first", dst="second" - ) - - return df diff --git a/python/cugraph/cugraph/experimental/link_prediction/overlap.py b/python/cugraph/cugraph/experimental/link_prediction/overlap.py deleted file mode 100644 index 0981ced4835..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/overlap.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings - -from pylibcugraph.experimental import ( - overlap_coefficients as pylibcugraph_overlap_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Overlap requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." 
- ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__overlap_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. See `overlap` - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Overlap coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the overlap coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the overlap weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - ddf['first']: dask_cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - ddf['second']: dask_cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - ddf['overlap_coeff']: dask_cudf.Series - The computed overlap coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import overlap_coefficient as exp_overlap_coefficient - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_overlap_coefficient(G) - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? - if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__overlap(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="overlap_coeff", src="first", dst="second" - ) - - return df - - -def EXPERIMENTAL__overlap(G, vertex_pair=None, use_weight=False): - """ - Compute the Overlap Coefficient between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the - context of graphs, the neighborhood of a vertex is seen as a set. The - Overlap Coefficient weight of each edge represents the strength of - connection between vertices based on the relative similarity of their - neighbors. If first is specified but second is not, or vice versa, an - exception will be thrown. - - cugraph.overlap, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the overlap coefficient for those vertex pairs. 
This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - adjacency list will be computed if not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the overlap coefficient is computed for the - given vertex pairs, else, it is computed for all vertex pairs. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Overlap coefficients. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['overlap_coeff'] : cudf.Series - The computed overlap coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import overlap as exp_overlap - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_overlap(G) - - """ - - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, overlap_coeff = pylibcugraph_overlap_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["overlap_coeff"] = cudf.Series(overlap_coeff) - - return df diff --git a/python/cugraph/cugraph/experimental/link_prediction/sorensen.py b/python/cugraph/cugraph/experimental/link_prediction/sorensen.py deleted file mode 100644 index ed27e4813d3..00000000000 --- a/python/cugraph/cugraph/experimental/link_prediction/sorensen.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.utilities import ( - ensure_cugraph_obj_for_nx, - df_edge_score_to_dictionary, - renumber_vertex_pair, -) -import cudf -import warnings -from pylibcugraph.experimental import ( - sorensen_coefficients as pylibcugraph_sorensen_coefficients, -) -from pylibcugraph import ResourceHandle - - -# FIXME: Move this function to the utility module so that it can be -# shared by other algos -def ensure_valid_dtype(input_graph, vertex_pair): - - vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] - vertex_pair_dtypes = vertex_pair.dtypes - - if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: - warning_msg = ( - "Sorensen requires 'vertex_pair' to match the graph's 'vertex' type. " - f"input graph's vertex type is: {vertex_dtype} and got " - f"'vertex_pair' of type: {vertex_pair_dtypes}." - ) - warnings.warn(warning_msg, UserWarning) - vertex_pair = vertex_pair.astype(vertex_dtype) - - return vertex_pair - - -def EXPERIMENTAL__sorensen(G, vertex_pair=None, use_weight=False): - """ - Compute the Sorensen coefficient between each pair of vertices connected by - an edge, or between arbitrary pairs of vertices specified by the user. - Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. - If first is specified but second is not, or vice versa, an exception will - be thrown. - - cugraph.sorensen, in the absence of a specified vertex pair list, will - compute the two_hop_neighbors of the entire graph to construct a vertex pair - list and will return the sorensen coefficient for those vertex pairs. This is - not advisable as the vertex_pairs can grow exponentially with respect to the - size of the datasets - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not supported yet for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - - This implementation only supports undirected, unweighted Graph. - - vertex_pair : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the Sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the Sorensen coefficient for all - adjacent vertices in the graph. - - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Sorensen index. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). 
- df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second - vertex ID. - - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import sorensen as exp_sorensen - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_sorensen(G) - - """ - if G.is_directed(): - raise ValueError("Input must be an undirected Graph.") - - if G.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if vertex_pair is None: - # Call two_hop neighbor of the entire graph - vertex_pair = G.get_two_hop_neighbors() - - v_p_num_col = len(vertex_pair.columns) - - if isinstance(vertex_pair, cudf.DataFrame): - vertex_pair = renumber_vertex_pair(G, vertex_pair) - vertex_pair = ensure_valid_dtype(G, vertex_pair) - src_col_name = vertex_pair.columns[0] - dst_col_name = vertex_pair.columns[1] - first = vertex_pair[src_col_name] - second = vertex_pair[dst_col_name] - - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - use_weight = False - first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( - resource_handle=ResourceHandle(), - graph=G._plc_graph, - first=first, - second=second, - use_weight=use_weight, - do_expensive_check=False, - ) - - if G.renumbered: - vertex_pair = G.unrenumber(vertex_pair, src_col_name, preserve_order=True) - vertex_pair = G.unrenumber(vertex_pair, dst_col_name, preserve_order=True) - - if v_p_num_col == 2: - # single column vertex - vertex_pair = vertex_pair.rename( - columns={src_col_name: "first", dst_col_name: "second"} - ) - - df = vertex_pair - df["sorensen_coeff"] = cudf.Series(sorensen_coeff) - - return df - - -def EXPERIMENTAL__sorensen_coefficient(G, ebunch=None, use_weight=False): - """ - For NetworkX Compatability. See `sorensen` - - Parameters - ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. - ebunch : cudf.DataFrame, optional (default=None) - A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the sorensen coefficient for all - adjacent vertices in the graph. - use_weight : bool, optional (default=False) - Currently not supported - - Returns - ------- - df : cudf.DataFrame - GPU data frame of size E (the default) or the size of the given pairs - (first, second) containing the Sorensen weights. The ordering is - relative to the adjacency list, or that given by the specified vertex - pairs. - - df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified). - df['second'] : cudf.Series - The second vertex ID of each pair (will be identical to second if - specified). - df['sorensen_coeff'] : cudf.Series - The computed sorensen coefficient between the first and the second - vertex ID. 
- - Examples - -------- - >>> from cugraph.datasets import karate - >>> from cugraph.experimental import sorensen_coefficient as exp_sorensen_coef - >>> G = karate.get_graph(download=True, ignore_weights=True) - >>> df = exp_sorensen_coef(G) - - """ - vertex_pair = None - - G, isNx = ensure_cugraph_obj_for_nx(G) - - # FIXME: What is the logic behind this since the docstrings mention that 'G' and - # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? - if isNx is True and ebunch is not None: - vertex_pair = cudf.DataFrame(ebunch) - - df = EXPERIMENTAL__sorensen(G, vertex_pair) - - if isNx is True: - df = df_edge_score_to_dictionary( - df, k="sorensen_coeff", src="first", dst="second" - ) - - return df diff --git a/python/cugraph/cugraph/link_prediction/CMakeLists.txt b/python/cugraph/cugraph/link_prediction/CMakeLists.txt deleted file mode 100644 index a117cf9afc3..00000000000 --- a/python/cugraph/cugraph/link_prediction/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources jaccard_wrapper.pyx overlap_wrapper.pyx) -set(linked_libraries cugraph::cugraph) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX link_prediction_ - ASSOCIATED_TARGETS cugraph -) diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index a6911d3b8ae..a8517ee7c0f 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,13 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
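The additions to python/cugraph/cugraph/link_prediction/__init__.py that follow keep the legacy weighted entry points (jaccard_w, overlap_w, sorensen_w) importable but route them through deprecated_warning_wrapper. As an illustration of the pattern only (a sketch, not cugraph's actual utility in cugraph.utilities.api_tools), such a wrapper can be as small as:

import functools
import warnings

def deprecated_warning_wrapper_sketch(func, replacement):
    # Hypothetical stand-in for the wrapper imported in the hunk below.
    @functools.wraps(func)
    def inner(*args, **kwargs):
        warnings.warn(
            f"{func.__name__} is deprecated and will be removed; "
            f"use {replacement} instead.",
            FutureWarning,
        )
        return func(*args, **kwargs)
    return inner

# e.g. jaccard_w would warn and then forward, pointing users at the
# unsuffixed function with use_weight=True added elsewhere in this PR.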
+ +from cugraph.utilities.api_tools import deprecated_warning_wrapper from cugraph.link_prediction.jaccard import jaccard from cugraph.link_prediction.jaccard import jaccard_coefficient + +from cugraph.link_prediction.sorensen import sorensen +from cugraph.link_prediction.sorensen import sorensen_coefficient + from cugraph.link_prediction.overlap import overlap +from cugraph.link_prediction.overlap import overlap_coefficient + +# To be deprecated from cugraph.link_prediction.wjaccard import jaccard_w + +jaccard_w = deprecated_warning_wrapper(jaccard_w) + from cugraph.link_prediction.woverlap import overlap_w + +overlap_w = deprecated_warning_wrapper(overlap_w) + from cugraph.link_prediction.wsorensen import sorensen_w -from cugraph.link_prediction.jaccard import jaccard_coefficient -from cugraph.link_prediction.sorensen import sorensen_coefficient -from cugraph.link_prediction.sorensen import sorensen -from cugraph.link_prediction.overlap import overlap_coefficient + +sorensen_w = deprecated_warning_wrapper(sorensen_w) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.pxd b/python/cugraph/cugraph/link_prediction/jaccard.pxd deleted file mode 100644 index 9e8c82ec3d8..00000000000 --- a/python/cugraph/cugraph/link_prediction/jaccard.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void jaccard[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - WT *result) except + - - cdef void jaccard_list[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - ET num_pairs, - const VT *first, - const VT *second, - WT *result) except + diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 334d57f9d80..27bfa58e6b0 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -11,16 +11,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -from cugraph.link_prediction import jaccard_wrapper from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + jaccard_coefficients as pylibcugraph_jaccard_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. 
+# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair -def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): +def jaccard( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -36,13 +74,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): of cugraph.jaccard is different from the behavior of networkx.jaccard_coefficient. - This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - cugraph.jaccard, in the absence of a specified vertex pair list, will - use the edges of the graph to construct a vertex pair list and will - return the jaccard coefficient for those vertex pairs. + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets networkx.jaccard_coefficient, in the absence of a specified vertex pair list, will return an upper triangular dense matrix, excluding @@ -59,9 +95,9 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): solution by doing the following: >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> pairs = G.get_two_hop_neighbors() - >>> df = cugraph.jaccard(G, pairs) + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> pairs = input_graph.get_two_hop_neighbors() + >>> df = cugraph.jaccard(input_graph, pairs) But please remember that cugraph will fill the dataframe with the entire solution you request, so you'll need enough memory to store the 2-hop @@ -72,10 +108,11 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. 
vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -84,9 +121,20 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): current implementation computes the jaccard coefficient for all adjacent vertices in the graph. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + Returns ------- @@ -99,7 +147,7 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): df['first'] : cudf.Series The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series - the second vertex ID of each pair (will be identical to second if + The second vertex ID of each pair (will be identical to second if specified). df['jaccard_coeff'] : cudf.Series The computed Jaccard coefficient between the first and the second @@ -108,65 +156,101 @@ def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.jaccard(G) + >>> from cugraph import jaccard + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = jaccard(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") - if type(vertex_pair) == cudf.DataFrame: + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") + raise ValueError("vertex_pair must be a cudf Dataframe") - df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair) + first, second, jaccard_coeff = pylibcugraph_jaccard_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + 
use_weight=use_weight,
+        do_expensive_check=False,
+    )
 
     if input_graph.renumbered:
-        df = input_graph.unrenumber(df, "first")
-        df = input_graph.unrenumber(df, "second")
+        vertex_pair = input_graph.unrenumber(
+            vertex_pair, src_col_name, preserve_order=True
+        )
+        vertex_pair = input_graph.unrenumber(
+            vertex_pair, dst_col_name, preserve_order=True
+        )
+
+    if v_p_num_col == 2:
+        # single column vertex
+        vertex_pair = vertex_pair.rename(
+            columns={src_col_name: "first", dst_col_name: "second"}
+        )
+
+    df = vertex_pair
+    df["jaccard_coeff"] = cudf.Series(jaccard_coeff)
 
     return df
 
 
-def jaccard_coefficient(G, ebunch=None, do_expensive_check=True):
+def jaccard_coefficient(
+    G: Union[Graph, "networkx.Graph"],
+    ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None,
+    do_expensive_check: bool = False,  # deprecated
+):
     """
     For NetworkX Compatability. See `jaccard`
 
-    NOTE: This algorithm doesn't currently support datasets with vertices that
-    are not (re)numebred vertices from 0 to V-1 where V is the total number of
-    vertices as this creates isolated vertices.
-
     Parameters
     ----------
-    graph : cugraph.Graph
-        cuGraph Graph instance, should contain the connectivity information
-        as an edge list (edge weights are not used for this algorithm). The
-        graph should be undirected where an undirected edge is represented by a
-        directed edge in both direction. The adjacency list will be computed if
-        not already present.
+    G : cugraph.Graph or NetworkX.Graph
+        cuGraph or NetworkX Graph instance, should contain the connectivity
+        information as an edge list. The graph should be undirected where an
+        undirected edge is represented by a directed edge in both directions.
+        The adjacency list will be computed if not already present.
 
-    ebunch : cudf.DataFrame, optional (default=None)
+        This implementation only supports undirected, non-multi Graphs.
+
+    ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None)
         A GPU dataframe consisting of two columns representing pairs of
-        vertices. If provided, the jaccard coefficient is computed for the
-        given vertex pairs. If the vertex_pair is not provided then the
-        current implementation computes the jaccard coefficient for all
-        adjacent vertices in the graph.
+        vertices or iterable of 2-tuples (u, v) where u and v are nodes in
+        the graph.
+
+        If provided, the Jaccard coefficient is computed for the given vertex
+        pairs. Otherwise, the current implementation computes the jaccard
+        coefficient for all adjacent vertices in the graph.
+
+    do_expensive_check : bool, optional (default=False)
+        Deprecated.
+        This option added a check to ensure integer vertex IDs are sequential
+        values from 0 to V-1. That check is now redundant because cugraph
+        unconditionally renumbers and un-renumbers integer vertex IDs for
+        optimal performance, therefore this option is deprecated and will be
+        removed in a future version.
 
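Because renumbering and un-renumbering now happen unconditionally inside the algorithm (the reason do_expensive_check is deprecated above), arbitrary integer vertex IDs work without any manual relabeling, and edge weights are only consulted when use_weight=True. A small usage sketch with made-up data and column names (illustrative, not taken from the patch):

import cudf
import cugraph

# Non-contiguous vertex IDs are fine; cugraph renumbers internally and
# un-renumbers the results before returning them.
edgelist = cudf.DataFrame(
    {
        "src": [10, 10, 20, 30],
        "dst": [20, 30, 30, 40],
        "wgt": [0.5, 2.0, 1.0, 3.5],
    }
)

G = cugraph.Graph(directed=False)
G.from_cudf_edgelist(edgelist, source="src", destination="dst", edge_attr="wgt")

pairs = cudf.DataFrame({"first": [10, 20], "second": [20, 40]})

df = cugraph.jaccard(G, pairs)                      # weights not used by default
df_w = cugraph.jaccard(G, pairs, use_weight=True)   # uses the "wgt" values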
Returns ------- @@ -188,10 +272,18 @@ def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate + >>> from cugraph import jaccard_coefficient >>> G = karate.get_graph(download=True) - >>> df = cugraph.jaccard_coefficient(G) + >>> df = jaccard_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) diff --git a/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx b/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx deleted file mode 100644 index e66d8bf0b5c..00000000000 --- a/python/cugraph/cugraph/link_prediction/jaccard_wrapper.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.link_prediction.jaccard cimport jaccard as c_jaccard -from cugraph.link_prediction.jaccard cimport jaccard_list as c_jaccard_list -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -from libc.stdint cimport uintptr_t -import cudf -import numpy as np - - -def jaccard(input_graph, weights_arr=None, vertex_pair=None): - """ - Call jaccard or jaccard_list - """ - offsets = None - indices = None - - if input_graph.adjlist: - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - elif input_graph.transposedadjlist: - # - # NOTE: jaccard ONLY operates on an undirected graph, so CSR and CSC should be - # equivalent. The undirected check has already happened, so we'll just use - # the CSC as if it were CSR. 
- # - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, - input_graph.transposedadjlist.indices], [np.int32]) - else: - input_graph.view_adj_list() - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, - input_graph.adjlist.indices], [np.int32]) - - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - - first = None - second = None - - cdef uintptr_t c_result_col = NULL - cdef uintptr_t c_first_col = NULL - cdef uintptr_t c_second_col = NULL - cdef uintptr_t c_src_index_col = NULL - cdef uintptr_t c_dst_index_col = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double - - weight_type = np.float32 - - if weights_arr is not None: - [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) - c_weights = weights.__cuda_array_interface__['data'][0] - weight_type = weights.dtype - - if type(vertex_pair) == cudf.DataFrame: - result_size = len(vertex_pair) - result = cudf.Series(np.ones(result_size, dtype=weight_type)) - c_result_col = result.__cuda_array_interface__['data'][0] - - df = cudf.DataFrame() - df['jaccard_coeff'] = result - - cols = vertex_pair.columns.to_list() - first = vertex_pair[cols[0]].astype(np.int32) - second = vertex_pair[cols[1]].astype(np.int32) - - # FIXME: multi column support - df['first'] = first - df['second'] = second - c_first_col = first.__cuda_array_interface__['data'][0] - c_second_col = second.__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_jaccard_list[int,int,float](graph_float, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_jaccard_list[int,int,double](graph_double, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - - return df - else: - # error check performed in jaccard.py - assert vertex_pair is None - - df = cudf.DataFrame() - df['first'] = cudf.Series(np.zeros(num_edges, indices.dtype)) - df['second'] = indices - - c_src_index_col = df['first'].__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - df['jaccard_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float32), - nan_as_null=False) - c_result_col = df['jaccard_coeff'].__cuda_array_interface__['data'][0] - - graph_float = GraphCSRView[int,int,float](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_jaccard[int,int,float](graph_float, - c_weights, - c_result_col) - - graph_float.get_source_indices(c_src_index_col) - else: - df['jaccard_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float64), - nan_as_null=False) - c_result_col = df['jaccard_coeff'].__cuda_array_interface__['data'][0] - - graph_double = GraphCSRView[int,int,double](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_jaccard[int,int,double](graph_double, - c_weights, - c_result_col) - - graph_double.get_source_indices(c_src_index_col) - - return df diff --git a/python/cugraph/cugraph/link_prediction/overlap.pxd b/python/cugraph/cugraph/link_prediction/overlap.pxd deleted file mode 100644 index 
f0654472587..00000000000 --- a/python/cugraph/cugraph/link_prediction/overlap.pxd +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef void overlap[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - WT *result) except + - - cdef void overlap_list[VT,ET,WT]( - const GraphCSRView[VT,ET,WT] &graph, - const WT *weights, - ET num_pairs, - const VT *first, - const VT *second, - WT *result) except + diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index 9bb7b76b0ca..3a25526679c 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -11,28 +11,120 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.link_prediction import overlap_wrapper -import cudf from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + overlap_coefficients as pylibcugraph_overlap_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes -def overlap_coefficient(G, ebunch=None, do_expensive_check=True): + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Overlap requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def overlap_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ - For NetworkX Compatability. See `overlap` + Compute overlap coefficient. 
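# Illustrative sketch (not part of this patch): the ensure_valid_dtype helper
# above only warns and then casts the caller's 'vertex_pair' to the graph's
# vertex dtype instead of rejecting it. The same behavior in isolation,
# assuming an int32-vertex graph and an int64 vertex_pair (all names and
# values below are hypothetical):
import warnings
import cudf
import numpy as np

vertex_dtype = np.dtype("int32")  # stand-in for the graph's vertex dtype
vertex_pair = cudf.DataFrame({"first": [0, 1, 2], "second": [3, 4, 5]}).astype("int64")

if (vertex_pair.dtypes != vertex_dtype).any():
    warnings.warn("casting 'vertex_pair' to the graph's vertex type", UserWarning)
    vertex_pair = vertex_pair.astype(vertex_dtype)  # cast rather than raise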
+ + Parameters + ---------- + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi edge Graph. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. + + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the overlap weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['overlap_coeff']: dask_cudf.Series + The computed overlap coefficient between the first and the second + vertex ID. + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import overlap_coefficient + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) + # FIXME: What is the logic behind this since the docstrings mention that 'G' and + # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? if isNx is True and ebunch is not None: vertex_pair = cudf.DataFrame(ebunch) @@ -46,7 +138,12 @@ def overlap_coefficient(G, ebunch=None, do_expensive_check=True): return df -def overlap(input_graph, vertex_pair=None, do_expensive_check=True): +def overlap( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -58,25 +155,39 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=True): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. 
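# Illustrative sketch (not part of this patch): per the FIXME above, 'ebunch'
# is only turned into a vertex_pair when the input is a NetworkX graph. A
# hedged usage example mirroring the NetworkX jaccard test later in this
# patch; the graph and pair values are hypothetical:
import networkx as nx
import cugraph

Gnx = nx.karate_club_graph()        # any undirected NetworkX graph
ebunch = [(0, 1), (0, 2), (1, 2)]   # pairs to score
result = cugraph.overlap_coefficient(Gnx, ebunch=ebunch)
# per-pair overlap scores for 'ebunch' (converted back to a NetworkX-friendly
# result when the input graph is a NetworkX graph)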
- NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + cugraph.overlap, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the overlap coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets Parameters ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - adjacency list will be computed if not already present. + as an edge list. The adjacency list will be computed if not already + present. + This implementation only supports undirected, non-multi edge Graph. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. 
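# Illustrative sketch (not part of this patch): as the note above says,
# omitting 'vertex_pair' makes overlap score every two-hop pair in the graph,
# which can blow up on large datasets, so passing explicit pairs is preferred.
# Column names follow the karate edge list used by the tests in this patch:
import cugraph
from cugraph.datasets import karate

G = karate.get_graph(download=True, ignore_weights=True)
# score only a handful of known pairs instead of the full two-hop expansion
vertex_pair = karate.get_edgelist()[["src", "dst"]].head(5)
df = cugraph.overlap(G, vertex_pair=vertex_pair)
# df columns: 'first', 'second', 'overlap_coeff'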
+ + Returns ------- @@ -98,35 +209,62 @@ def overlap(input_graph, vertex_pair=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.overlap(G) + >>> from cugraph import overlap + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = overlap(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(vertex_pair) == cudf.DataFrame: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - df = overlap_wrapper.overlap(input_graph, None, vertex_pair) + first, second, overlap_coeff = pylibcugraph_overlap_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["overlap_coeff"] = cudf.Series(overlap_coeff) return df diff --git a/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx b/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx deleted file mode 100644 index 0f61460a72f..00000000000 --- a/python/cugraph/cugraph/link_prediction/overlap_wrapper.pyx +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
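# Illustrative sketch (not part of this patch): overlap (like jaccard and
# sorensen in this patch) now rejects directed graphs up front, so callers
# must build an undirected Graph. The edge list below is hypothetical:
import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 1, 2], "dst": [1, 2, 0]})
G_dir = cugraph.Graph(directed=True)
G_dir.from_cudf_edgelist(edges, source="src", destination="dst")
try:
    cugraph.overlap(G_dir)
except ValueError:
    pass  # "Input must be an undirected Graph."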
- -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.link_prediction.overlap cimport overlap as c_overlap -from cugraph.link_prediction.overlap cimport overlap_list as c_overlap_list -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -from libc.stdint cimport uintptr_t -import cudf -import numpy as np - - -def overlap(input_graph, weights_arr=None, vertex_pair=None): - """ - Call overlap or overlap_list - """ - - if not input_graph.adjlist: - input_graph.view_adj_list() - - [offsets, indices] = graph_primtypes_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) - - num_verts = input_graph.number_of_vertices() - num_edges = input_graph.number_of_edges(directed_edges=True) - - first = None - second = None - - cdef uintptr_t c_result_col = NULL - cdef uintptr_t c_first_col = NULL - cdef uintptr_t c_second_col = NULL - cdef uintptr_t c_src_index_col = NULL - cdef uintptr_t c_dst_index_col = NULL - cdef uintptr_t c_weights = NULL - cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] - - cdef GraphCSRView[int,int,float] graph_float - cdef GraphCSRView[int,int,double] graph_double - - weight_type = np.float32 - - if weights_arr is not None: - [weights] = graph_primtypes_wrapper.datatype_cast([weights_arr], [np.float32, np.float64]) - c_weights = weights.__cuda_array_interface__['data'][0] - weight_type = weights.dtype - - if type(vertex_pair) == cudf.DataFrame: - result_size = len(vertex_pair) - result = cudf.Series(np.ones(result_size, dtype=np.float32)) - c_result_col = result.__cuda_array_interface__['data'][0] - - df = cudf.DataFrame() - df['overlap_coeff'] = result - - cols = vertex_pair.columns.to_list() - first = vertex_pair[cols[0]] - second = vertex_pair[cols[1]] - - # FIXME: multi column support - df['first'] = first - df['second'] = second - c_first_col = first.__cuda_array_interface__['data'][0] - c_second_col = second.__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - graph_float = GraphCSRView[int,int,float](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_overlap_list[int,int,float](graph_float, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - else: - graph_double = GraphCSRView[int,int,double](c_offsets, c_indices, - c_weights, num_verts, num_edges) - c_overlap_list[int,int,double](graph_double, - c_weights, - result_size, - c_first_col, - c_second_col, - c_result_col) - - return df - else: - # error check performed in overlap.py - assert vertex_pair is None - - df = cudf.DataFrame() - df['first'] = cudf.Series(np.zeros(num_edges, indices.dtype)) - df['second'] = indices - - c_src_index_col = df['first'].__cuda_array_interface__['data'][0] - - if weight_type == np.float32: - df['overlap_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float32), - nan_as_null=False) - c_result_col = df['overlap_coeff'].__cuda_array_interface__['data'][0] - - graph_float = GraphCSRView[int,int,float](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_overlap[int,int,float](graph_float, - c_weights, - c_result_col) - - graph_float.get_source_indices(c_src_index_col) - else: - df['overlap_coeff'] = cudf.Series(np.ones(num_edges, dtype=np.float64), - nan_as_null=False) - c_result_col = df['overlap_coeff'].__cuda_array_interface__['data'][0] - - graph_double = 
GraphCSRView[int,int,double](c_offsets, - c_indices, - c_weights, - num_verts, - num_edges) - c_overlap[int,int,double](graph_double, - c_weights, - c_result_col) - - graph_double.get_source_indices(c_src_index_col) - - return df diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index ef2bd8d674d..a8ccced1e68 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -11,17 +11,54 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper from cugraph.utilities import ( ensure_cugraph_obj_for_nx, df_edge_score_to_dictionary, renumber_vertex_pair, ) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + sorensen_coefficients as pylibcugraph_sorensen_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Sorensen requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair -def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): +def sorensen( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated + use_weight: bool = False, +): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -30,22 +67,20 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): If first is specified but second is not, or vice versa, an exception will be thrown. - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. - cugraph.sorensen, in the absence of a specified vertex pair list, will - use the edges of the graph to construct a vertex pair list and will - return the sorensen coefficient for those vertex pairs. + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the sorensen coefficient for those vertex pairs. This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets Parameters ---------- input_graph : cugraph.Graph cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). 
The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. The adjacency list will be computed if - not already present. + as an edge list. The adjacency list will be computed if not already + present. + + This implementation only supports undirected, non-multi edge Graph. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of @@ -54,9 +89,18 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): current implementation computes the Sorensen coefficient for all adjacent vertices in the graph. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -67,79 +111,112 @@ def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): pairs. df['first'] : cudf.Series - The first vertex ID of each pair (will be identical to first if specified) - + The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series The second vertex ID of each pair (will be identical to second if - specified) - + specified). df['sorensen_coeff'] : cudf.Series - The computed Sorensen coefficient between the first and the second + The computed sorensen coefficient between the first and the second vertex ID. 
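# Illustrative worked example (not part of this patch): the legacy wrapper
# removed below derived Sorensen from Jaccard as S = 2J / (1 + J), which
# follows from J = |A & B| / |A | B| and S = 2|A & B| / (|A| + |B|). The
# neighbor sets here are hypothetical:
a = {1, 2, 3, 4}
b = {3, 4, 5}
jaccard = len(a & b) / len(a | b)               # 2 / 5 = 0.4
sorensen = 2 * len(a & b) / (len(a) + len(b))   # 4 / 7, about 0.571
assert abs(sorensen - (2 * jaccard) / (1 + jaccard)) < 1e-12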
Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.sorensen(G) + >>> from cugraph import sorensen + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen(input_graph) """ if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + elif vertex_pair is not None: raise ValueError("vertex_pair must be a cudf dataframe") - df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair) - df.jaccard_coeff = (2 * df.jaccard_coeff) / (1 + df.jaccard_coeff) - df.rename({"jaccard_coeff": "sorensen_coeff"}, axis=1, inplace=True) + first, second, sorensen_coeff = pylibcugraph_sorensen_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) + if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["sorensen_coeff"] = cudf.Series(sorensen_coeff) return df -def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): +def sorensen_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, + do_expensive_check: bool = False, # deprecated +): """ - For NetworkX Compatability. See `sorensen` - - NOTE: This algorithm doesn't currently support datasets with vertices that - are not (re)numebred vertices from 0 to V-1 where V is the total number of - vertices as this creates isolated vertices. + Compute sorensen coefficient. Parameters ---------- - G : cugraph.Graph - cuGraph Graph instance, should contain the connectivity information - as an edge list (edge weights are not used for this algorithm). The - graph should be undirected where an undirected edge is represented by a - directed edge in both direction. 
The adjacency list will be computed if - not already present. - ebunch : cudf.DataFrame, optional (default=None) + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) A GPU dataframe consisting of two columns representing pairs of - vertices. If provided, the sorensen coefficient is computed for the - given vertex pairs. If the vertex_pair is not provided then the - current implementation computes the sorensen coefficient for all - adjacent vertices in the graph. + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. + + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -152,7 +229,7 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): df['first'] : cudf.Series The first vertex ID of each pair (will be identical to first if specified). df['second'] : cudf.Series - the second vertex ID of each pair (will be identical to second if + The second vertex ID of each pair (will be identical to second if specified). df['sorensen_coeff'] : cudf.Series The computed Sorensen coefficient between the first and the second @@ -161,14 +238,24 @@ def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): Examples -------- >>> from cugraph.datasets import karate - >>> G = karate.get_graph(download=True) - >>> df = cugraph.sorensen_coefficient(G) + >>> from cugraph import sorensen_coefficient + >>> G = karate.get_graph(download=True, ignore_weights=True) + >>> df = sorensen_coefficient(G) """ + if do_expensive_check: + warnings.warn( + "do_expensive_check is deprecated since vertex IDs are no longer " + "required to be consecutively numbered", + FutureWarning, + ) + vertex_pair = None G, isNx = ensure_cugraph_obj_for_nx(G) + # FIXME: What is the logic behind this since the docstrings mention that 'G' and + # 'ebunch'(if not None) are respectively of type cugraph.Graph and cudf.DataFrame? if isNx is True and ebunch is not None: vertex_pair = cudf.DataFrame(ebunch) diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index e3486473fe5..ec538bbc0ed 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -11,13 +11,45 @@ # See the License for the specific language governing permissions and # limitations under the License. 
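# Illustrative sketch (not part of this patch): the functions in this patch
# that still accept do_expensive_check only emit a FutureWarning for it now.
# Observing (or silencing) that warning, assuming the karate dataset used in
# the docstrings above:
import warnings
import cugraph
from cugraph.datasets import karate

G = karate.get_graph(download=True, ignore_weights=True)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    cugraph.sorensen_coefficient(G, do_expensive_check=True)
assert any(issubclass(w.category, FutureWarning) for w in caught)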
-from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper +from cugraph.link_prediction import jaccard import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if vertex_pair_dtypes[0] != vertex_dtype or vertex_pair_dtypes[1] != vertex_dtype: + warning_msg = ( + "Jaccard requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def jaccard_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -55,9 +87,13 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the jaccard coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. 
Returns ------- @@ -95,47 +131,9 @@ def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): >>> df = cugraph.jaccard_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - # The 'vertex' column of the cudf 'weights' also needs to be renumbered - # if the graph was renumbered - vertex_size = input_graph.vertex_column_size() - # single-column vertices i.e only one src and dst columns - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - # multi-column vertices i.e more than one src and dst columns - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - - jaccard_weights = weights["weight"] - df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + "jaccard_w is deprecated. To compute weighted jaccard, please use " + "jaccard(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return jaccard(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index d7ebc5fc684..5f43ad0670b 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -11,12 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.link_prediction import overlap_wrapper +from cugraph.link_prediction import overlap import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +def overlap_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -55,9 +69,13 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. 
If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -96,43 +114,9 @@ def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): ... len(weights['vertex']))] >>> df = cugraph.overlap_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - vertex_size = input_graph.vertex_column_size() - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - - overlap_weights = weights["weight"] - - overlap_weights = overlap_weights.astype("float32") - - df = overlap_wrapper.overlap(input_graph, overlap_weights, vertex_pair) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + " overlap_w is deprecated. To compute weighted overlap, please use " + "overlap(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return overlap(input_graph, vertex_pair, do_expensive_check, use_weight=True) diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index 8337b4602de..ff502b36837 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -11,13 +11,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph_classes import Graph -from cugraph.link_prediction import jaccard_wrapper +from cugraph.link_prediction import sorensen import cudf -from cugraph.utilities import renumber_vertex_pair +import warnings +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional -def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. 
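# Illustrative sketch (not part of this patch): the *_w wrappers in this patch
# now only warn and delegate, so new code can call the base functions directly
# with use_weight=True on a weighted graph. Edge-list column names follow the
# tests later in this patch; the chosen pairs are hypothetical:
import cugraph
from cugraph.datasets import karate

G = karate.get_graph(download=True)                    # keeps edge weights
pairs = karate.get_edgelist()[["src", "dst"]].head(5)  # pairs to score

# old (deprecated): cugraph.jaccard_w(G, weights=None, vertex_pair=pairs)
df = cugraph.jaccard(G, vertex_pair=pairs, use_weight=True)
# likewise overlap(..., use_weight=True) and sorensen(..., use_weight=True)
# replace overlap_w and sorensen_w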
+networkx = import_optional("networkx") + + +def sorensen_w( + input_graph: Graph, + weights: cudf.DataFrame = None, # deprecated + vertex_pair: cudf.DataFrame = None, + do_expensive_check: bool = False, # deprecated +): """ Compute the weighted Sorensen similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -51,9 +64,13 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): vertices. If provided, the sorensen coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. - do_expensive_check: bool (default=True) - When set to True, check if the vertices in the graph are (re)numbered - from 0 to V-1 where V is the total number of vertices. + do_expensive_check : bool, optional (default=False) + Deprecated. + This option added a check to ensure integer vertex IDs are sequential + values from 0 to V-1. That check is now redundant because cugraph + unconditionally renumbers and un-renumbers integer vertex IDs for + optimal performance, therefore this option is deprecated and will be + removed in a future version. Returns ------- @@ -93,44 +110,9 @@ def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): >>> df = cugraph.sorensen_w(G, weights) """ - if do_expensive_check: - if not input_graph.renumbered: - input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] - max_vertex = input_df.max().max() - expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( - input_df.dtypes[0] - ) - nodes = ( - cudf.concat([input_df["src"], input_df["dst"]]) - .unique() - .sort_values() - .reset_index(drop=True) - ) - if not expected_nodes.equals(nodes): - raise ValueError("Unrenumbered vertices are not supported.") - - if type(input_graph) is not Graph: - raise TypeError("input graph must a Graph") - - if type(vertex_pair) == cudf.DataFrame: - vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) - elif vertex_pair is not None: - raise ValueError("vertex_pair must be a cudf dataframe") - - if input_graph.renumbered: - vertex_size = input_graph.vertex_column_size() - if vertex_size == 1: - weights = input_graph.add_internal_vertex_id(weights, "vertex", "vertex") - else: - cols = weights.columns[:vertex_size].to_list() - weights = input_graph.add_internal_vertex_id(weights, "vertex", cols) - jaccard_weights = weights["weight"] - df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair) - df.jaccard_coeff = (2 * df.jaccard_coeff) / (1 + df.jaccard_coeff) - df.rename({"jaccard_coeff": "sorensen_coeff"}, axis=1, inplace=True) - - if input_graph.renumbered: - df = input_graph.unrenumber(df, "first") - df = input_graph.unrenumber(df, "second") - - return df + warning_msg = ( + "sorensen_w is deprecated. To compute weighted sorensen, please use " + "sorensen(input_graph, vertex_pair=False, use_weight=True)" + ) + warnings.warn(warning_msg, FutureWarning) + return sorensen(input_graph, vertex_pair, use_weight=True) diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index 015c05d1b08..7b04dba82a5 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -25,11 +25,10 @@ from cugraph.utilities.utils import import_optional from typing import Union, Tuple -# FIXME: the networkx.Graph type used in the type annotation for -# induced_subgraph() is specified using a string literal to avoid depending on -# and importing networkx. 
Instead, networkx is imported optionally, which may -# cause a problem for a type checker if run in an environment where networkx is -# not installed. +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. networkx = import_optional("networkx") diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index cd883fb88f2..7ce7d263eda 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +# FIXME: Can we use global variables for column names instead of hardcoded ones? + import gc import pytest @@ -20,12 +22,19 @@ import cugraph from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph.experimental import jaccard as exp_jaccard -from cudf.testing import assert_series_equal, assert_frame_equal -from cugraph.experimental import jaccard_coefficient as exp_jaccard_coefficient - +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal -print("Networkx version : {} ".format(nx.__version__)) +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +JACCARD_COEFF_COL = "jaccard_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -38,65 +47,79 @@ def setup_function(): # ============================================================================= # Helper functions # ============================================================================= -def compare_jaccard_two_hop(G, Gnx, edgevals=True): + + +def compare_jaccard_two_hop(G, Gnx, use_weight=False): """ Compute both cugraph and nx jaccard after extracting the two hop neighbors from G and compare both results """ pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) - nx_pairs = list(pairs.to_records(index=False)) - preds = nx.jaccard_coefficient(Gnx, nx_pairs) - nx_coeff = [] - for u, v, p in preds: - # print(u, " ", v, " ", p) - nx_coeff.append(p) df = cugraph.jaccard(G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental jaccard currently only supports unweighted graphs - df_exp = exp_jaccard(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) - assert len(nx_coeff) == len(df) - for i in range(len(df)): - diff = abs(nx_coeff[i] - df["jaccard_coeff"].iloc[i]) - assert diff < 1.0e-6 + if not use_weight: + nx_pairs = list(pairs.to_records(index=False)) + preds = nx.jaccard_coefficient(Gnx, nx_pairs) + nx_coeff = [] + for u, v, p in preds: + nx_coeff.append(p) + + assert len(nx_coeff) == len(df) + for i in range(len(df)): + diff = abs(nx_coeff[i] - 
df[JACCARD_COEFF_COL].iloc[i]) + assert diff < 1.0e-6 + else: + # FIXME: compare results against resultset api + pass -def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): +def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=not edgevals) + G = graph_file.get_graph(ignore_weights=not use_weight) # If no vertex_pair is passed as input, 'cugraph.jaccard' will # compute the 'jaccard_similarity' with the two_hop_neighbor of the # entire graph while nx compute with the one_hop_neighbor. For better # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.jaccard' # and pass it as vertex_pair - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + if isinstance(input_df, cudf.DataFrame): + vertex_pair = input_df.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] + else: + vertex_pair = cudf.DataFrame( + columns=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL], + dtype=G.edgelist.edgelist_df["src"].dtype, + ) # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard, G, vertex_pair=vertex_pair) + df = benchmark_callable( + cugraph.jaccard, G, vertex_pair=vertex_pair, use_weight=use_weight + ) - df = df.sort_values(["first", "second"]).reset_index(drop=True) + df = df.sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) return ( - df["first"].to_numpy(), - df["second"].to_numpy(), - df["jaccard_coeff"].to_numpy(), + df[VERTEX_PAIR_FIRST_COL].to_numpy(), + df[VERTEX_PAIR_SECOND_COL].to_numpy(), + df[JACCARD_COEFF_COL].to_numpy(), ) def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] + sources = M[SRC_COL] + destinations = M[DST_COL] edges = [] for i in range(len(M)): edges.append((sources[i], destinations[i])) @@ -108,7 +131,11 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... 
") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) # Networkx Jaccard Call @@ -144,118 +171,130 @@ def read_csv(request): @pytest.mark.sg -def test_jaccard(read_csv, gpubenchmark): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard(read_csv, gpubenchmark, use_weight): M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) - nx_src, nx_dst, nx_coeff = networkx_call(M) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + ) + if not use_weight: + nx_src, nx_dst, nx_coeff = networkx_call(M) - # Calculating mismatch - err = 0 - tol = 1.0e-06 + # Calculating mismatch + err = 0 + tol = 1.0e-06 - assert len(cu_coeff) == len(nx_coeff) - for i in range(len(cu_coeff)): - if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: - err += 1 + assert len(cu_coeff) == len(nx_coeff) + for i in range(len(cu_coeff)): + if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: + err += 1 - print("Mismatches: %d" % err) - assert err == 0 + print("Mismatches: %d" % err) + assert err == 0 + else: + G = graph_file.get_graph() + res_w_jaccard = cugraph.jaccard_w(G, vertex_pair=M_cu[[SRC_COL, DST_COL]]) + res_w_jaccard = res_w_jaccard.sort_values( + [VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL] + ).reset_index(drop=True) + res_jaccard = cudf.DataFrame() + res_jaccard[VERTEX_PAIR_FIRST_COL] = cu_src + res_jaccard[VERTEX_PAIR_SECOND_COL] = cu_dst + res_jaccard[JACCARD_COEFF_COL] = cu_coeff + assert_frame_equal( + res_w_jaccard, res_jaccard, check_dtype=False, check_like=True + ) + + # FIXME: compare weighted jaccard results against resultset api @pytest.mark.sg -def test_directed_graph_check(read_csv): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(read_csv, use_weight): _, M, _ = read_csv cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph(directed=True) - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) + + vertex_pair = cu_M[[SRC_COL, DST_COL]] - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] vertex_pair = vertex_pair[:5] with pytest.raises(ValueError): - cugraph.jaccard(G1, vertex_pair) + cugraph.jaccard(G1, vertex_pair, use_weight) @pytest.mark.sg def test_nx_jaccard_time(read_csv, gpubenchmark): - _, M, _ = read_csv nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) -@pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_jaccard_edgevals(gpubenchmark, graph_file): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_edgevals(gpubenchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, edgevals=True, input_df=M_cu + gpubenchmark, netscience, 
input_df=M_cu, use_weight=use_weight ) - nx_src, nx_dst, nx_coeff = networkx_call(M) + if not use_weight: + nx_src, nx_dst, nx_coeff = networkx_call(M) - # Calculating mismatch - err = 0 - tol = 1.0e-06 - - assert len(cu_coeff) == len(nx_coeff) - for i in range(len(cu_coeff)): - if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: - err += 1 - - print("Mismatches: %d" % err) - assert err == 0 + # Calculating mismatch + err = 0 + tol = 1.0e-06 + assert len(cu_coeff) == len(nx_coeff) + for i in range(len(cu_coeff)): + if abs(cu_coeff[i] - nx_coeff[i]) > tol * 1.1: + err += 1 -@pytest.mark.sg -def test_jaccard_two_hop(read_csv): - - _, M, graph_file = read_csv - - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - G = graph_file.get_graph(ignore_weights=True) - - compare_jaccard_two_hop(G, Gnx) + print("Mismatches: %d" % err) + assert err == 0 + else: + # FIXME: compare results against resultset api + pass @pytest.mark.sg -def test_jaccard_two_hop_edge_vals(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_two_hop(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() ) + G = graph_file.get_graph(ignore_weights=not use_weight) - G = graph_file.get_graph() - - compare_jaccard_two_hop(G, Gnx, edgevals=True) + compare_jaccard_two_hop(G, Gnx, use_weight) @pytest.mark.sg def test_jaccard_nx(read_csv): - M_cu, M, _ = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) nx_j = nx.jaccard_coefficient(Gnx) nv_js = sorted(nx_j, key=len, reverse=True) - ebunch = M_cu.rename(columns={"0": "first", "1": "second"}) - ebunch = ebunch[["first", "second"]] + ebunch = M_cu.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + ebunch = ebunch[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] cg_j = cugraph.jaccard_coefficient(Gnx, ebunch=ebunch) - cg_j_exp = exp_jaccard_coefficient(Gnx, ebunch=ebunch) assert len(nv_js) > len(cg_j) - assert len(nv_js) > len(cg_j_exp) # FIXME: Nx does a full all-pair Jaccard. 
# cuGraph does a limited 1-hop Jaccard @@ -263,68 +302,58 @@ def test_jaccard_nx(read_csv): @pytest.mark.sg -def test_jaccard_multi_column(read_csv): - - _, M, _ = read_csv +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_jaccard_multi_column(graph_file, use_weight): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.jaccard(G1, vertex_pair) - df_plc_exp = exp_jaccard(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - - jaccard_res = df_res["jaccard_coeff"].sort_values().reset_index(drop=True) - jaccard_plc_exp = df_plc_exp["jaccard_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(jaccard_res, jaccard_plc_exp) + df_multi_col_res = cugraph.jaccard(G1, vertex_pair) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.jaccard( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[JACCARD_COEFF_COL], expected[JACCARD_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_jaccard(): +def test_weighted_jaccard(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_jaccard(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_jaccard(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_jaccard(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.jaccard(G) + cugraph.jaccard(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 586d534cd42..e24deaa61ac 
100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -20,8 +20,19 @@ import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS -from cugraph.experimental import overlap as exp_overlap -from cudf.testing import assert_series_equal, assert_frame_equal +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal + +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +OVERLAP_COEFF_COL = "overlap_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -35,7 +46,6 @@ def setup_function(): # Helper functions # ============================================================================= def compare_overlap(cu_coeff, cpu_coeff): - assert len(cu_coeff) == len(cpu_coeff) for i in range(len(cu_coeff)): if np.isnan(cpu_coeff[i]): @@ -47,21 +57,21 @@ def compare_overlap(cu_coeff, cpu_coeff): assert diff < 1.0e-6 -def cugraph_call(benchmark_callable, graph_file, pairs, edgevals=False): +def cugraph_call(benchmark_callable, graph_file, pairs, use_weight=False): # Device data G = graph_file.get_graph( - create_using=cugraph.Graph(directed=False), ignore_weights=not edgevals + create_using=cugraph.Graph(directed=False), ignore_weights=not use_weight ) # cugraph Overlap Call df = benchmark_callable(cugraph.overlap, G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental overlap currently only supports unweighted graphs - df_exp = exp_overlap(G, pairs) - df_exp = df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) + if use_weight: + res_w_overlap = cugraph.overlap_w(G, vertex_pair=pairs) + assert_frame_equal(res_w_overlap, df, check_dtype=False, check_like=True) - return df["overlap_coeff"].to_numpy() + return df[OVERLAP_COEFF_COL].to_numpy() def intersection(a, b, M): @@ -120,8 +130,10 @@ def read_csv(request): dataset_path = graph_file.get_path() Mnx = utils.read_csv_for_nx(dataset_path) - N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 - M = scipy.sparse.csr_matrix((Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)) + N = max(max(Mnx[SRC_COL]), max(Mnx[DST_COL])) + 1 + M = scipy.sparse.csr_matrix( + (Mnx.weight, (Mnx[SRC_COL], Mnx[DST_COL])), shape=(N, N) + ) return M, graph_file @@ -135,7 +147,7 @@ def extract_two_hop(read_csv): G = graph_file.get_graph(ignore_weights=True) pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) @@ -144,93 +156,91 @@ def extract_two_hop(read_csv): # Test @pytest.mark.sg -def test_overlap(gpubenchmark, read_csv, extract_two_hop): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_overlap(gpubenchmark, read_csv, extract_two_hop, use_weight): M, graph_file = read_csv pairs = extract_two_hop - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) + cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, use_weight=use_weight) + cpu_coeff = cpu_call(M, pairs[VERTEX_PAIR_FIRST_COL], 
pairs[VERTEX_PAIR_SECOND_COL]) compare_overlap(cu_coeff, cpu_coeff) -# Test @pytest.mark.sg -def test_overlap_edge_vals(gpubenchmark, read_csv, extract_two_hop): +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(graph_file, use_weight): + M = utils.read_csv_for_nx(graph_file.get_path()) + cu_M = cudf.DataFrame() + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) - M, graph_file = read_csv - pairs = extract_two_hop + G1 = cugraph.Graph(directed=True) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, edgevals=True) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) + vertex_pair = cu_M[[SRC_COL, DST_COL]] - compare_overlap(cu_coeff, cpu_coeff) + vertex_pair = vertex_pair[:5] + with pytest.raises(ValueError): + cugraph.overlap(G1, vertex_pair, use_weight) @pytest.mark.sg @pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_overlap_multi_column(graph_file): +@pytest.mark.parametrize("use_weight", [False, True]) +def test_overlap_multi_column(graph_file, use_weight): dataset_path = graph_file.get_path() M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.overlap(G1, vertex_pair) - df_plc_exp = exp_overlap(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - overlap_res = df_res["overlap_coeff"].sort_values().reset_index(drop=True) - overlap_plc_exp = df_plc_exp["overlap_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(overlap_res, overlap_plc_exp) - + df_multi_col_res = cugraph.overlap(G1, vertex_pair, use_weight=use_weight) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.overlap(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.overlap( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) + actual = 
df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[OVERLAP_COEFF_COL], expected[OVERLAP_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_overlap(): +def test_weighted_overlap(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_overlap(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_overlap(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_overlap(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.overlap(G) + cugraph.overlap(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 3da33a3e853..6b4074fce30 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -20,11 +20,19 @@ import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from cugraph.datasets import netscience -from cugraph.experimental import sorensen as exp_sorensen -from cudf.testing import assert_series_equal, assert_frame_equal +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal - -print("Networkx version : {} ".format(nx.__version__)) +SRC_COL = "0" +DST_COL = "1" +VERTEX_PAIR_FIRST_COL = "first" +VERTEX_PAIR_SECOND_COL = "second" +SORENSEN_COEFF_COL = "sorensen_coeff" +EDGE_ATT_COL = "weight" +MULTI_COL_SRC_0_COL = "src_0" +MULTI_COL_DST_0_COL = "dst_0" +MULTI_COL_SRC_1_COL = "src_1" +MULTI_COL_DST_1_COL = "dst_1" # ============================================================================= @@ -37,68 +45,89 @@ def setup_function(): # ============================================================================= # Helper functions # ============================================================================= -def compare_sorensen_two_hop(G, Gnx, edgevals=False): +def compare_sorensen_two_hop(G, Gnx, use_weight=False): """ Compute both cugraph and nx sorensen after extracting the two hop neighbors from G and compare both results """ pairs = ( G.get_two_hop_neighbors() - .sort_values(["first", "second"]) + .sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]) .reset_index(drop=True) ) - nx_pairs = [] - nx_pairs = list(pairs.to_records(index=False)) - preds = nx.jaccard_coefficient(Gnx, nx_pairs) - nx_coeff = [] - for u, v, p in preds: + + # print(f'G = {G.edgelist.edgelist_df}') + + df = cugraph.sorensen(G, pairs) + df = df.sort_values(by=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) + + if not use_weight: + nx_pairs = list(pairs.to_records(index=False)) + + # print(f'nx_pairs = {len(nx_pairs)}') + + preds = nx.jaccard_coefficient(Gnx, nx_pairs) + # FIXME: Use known correct values of Sorensen for few graphs, # hardcode it and compare to Cugraph Sorensen to get a more robust test # Conversion from Networkx Jaccard to Sorensen # No networkX equivalent - nx_coeff.append((2 * p) / (1 + p)) - df = cugraph.sorensen(G, pairs) - df = df.sort_values(by=["first", "second"]).reset_index(drop=True) - if not edgevals: - # experimental sorensen currently only supports unweighted graphs - df_exp = exp_sorensen(G, pairs) - df_exp = 
df_exp.sort_values(by=["first", "second"]).reset_index(drop=True) - assert_frame_equal(df, df_exp, check_dtype=False, check_like=True) - assert len(nx_coeff) == len(df) - for i in range(len(df)): - diff = abs(nx_coeff[i] - df["sorensen_coeff"].iloc[i]) - assert diff < 1.0e-6 - - -def cugraph_call(benchmark_callable, graph_file, edgevals=False, input_df=None): + + nx_coeff = list(map(lambda x: (2 * x[2]) / (1 + x[2]), preds)) + + assert len(nx_coeff) == len(df) + for i in range(len(df)): + diff = abs(nx_coeff[i] - df[SORENSEN_COEFF_COL].iloc[i]) + assert diff < 1.0e-6 + else: + # FIXME: compare results against resultset api + res_w_sorensen = cugraph.sorensen_w(G, vertex_pair=pairs) + res_w_sorensen = res_w_sorensen.sort_values( + [VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL] + ).reset_index(drop=True) + assert_frame_equal(res_w_sorensen, df, check_dtype=False, check_like=True) + + +def cugraph_call(benchmark_callable, graph_file, input_df=None, use_weight=False): G = cugraph.Graph() - G = graph_file.get_graph(ignore_weights=not edgevals) + G = graph_file.get_graph(ignore_weights=not use_weight) # If no vertex_pair is passed as input, 'cugraph.sorensen' will # compute the 'sorensen_similarity' with the two_hop_neighbor of the # entire graph while nx compute with the one_hop_neighbor. For better # comparaison, get the one_hop_neighbor of the entire graph for 'cugraph.sorensen' # and pass it as vertex_pair - vertex_pair = input_df.rename(columns={"0": "first", "1": "second"}) - vertex_pair = vertex_pair[["first", "second"]] + if isinstance(input_df, cudf.DataFrame): + vertex_pair = input_df.rename( + columns={SRC_COL: VERTEX_PAIR_FIRST_COL, DST_COL: VERTEX_PAIR_SECOND_COL} + ) + vertex_pair = vertex_pair[[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]] + else: + vertex_pair = cudf.DataFrame( + columns=[VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL], + dtype=G.edgelist.edgelist_df["src"].dtype, + ) # cugraph Sorensen Call df = benchmark_callable(cugraph.sorensen, G, vertex_pair=vertex_pair) - df = df.sort_values(["first", "second"]).reset_index(drop=True) + df = df.sort_values([VERTEX_PAIR_FIRST_COL, VERTEX_PAIR_SECOND_COL]).reset_index( + drop=True + ) return ( - df["first"].to_numpy(), - df["second"].to_numpy(), - df["sorensen_coeff"].to_numpy(), + df[VERTEX_PAIR_FIRST_COL].to_numpy(), + df[VERTEX_PAIR_SECOND_COL].to_numpy(), + df[SORENSEN_COEFF_COL].to_numpy(), ) def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] + sources = M[SRC_COL] + destinations = M[DST_COL] edges = [] for i in range(len(M)): edges.append((sources[i], destinations[i])) @@ -110,7 +139,11 @@ def networkx_call(M, benchmark_callable=None): print("Format conversion ... 
") Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) # Networkx Jaccard Call @@ -149,10 +182,12 @@ def read_csv(request): @pytest.mark.sg -def test_sorensen(gpubenchmark, read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen(gpubenchmark, read_csv, use_weight): M_cu, M, graph_file = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file, input_df=M_cu) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, input_df=M_cu, use_weight=use_weight + ) nx_src, nx_dst, nx_coeff = networkx_call(M) # Calculating mismatch @@ -170,20 +205,42 @@ def test_sorensen(gpubenchmark, read_csv): @pytest.mark.sg def test_nx_sorensen_time(gpubenchmark, read_csv): - _, M, _ = read_csv nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) +@pytest.mark.sg +@pytest.mark.parametrize("use_weight", [False, True]) +def test_directed_graph_check(read_csv, use_weight): + _, M, _ = read_csv + + cu_M = cudf.DataFrame() + cu_M[SRC_COL] = cudf.Series(M[SRC_COL]) + cu_M[DST_COL] = cudf.Series(M[DST_COL]) + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + + G1 = cugraph.Graph(directed=True) + weight = EDGE_ATT_COL if use_weight else None + G1.from_cudf_edgelist(cu_M, source=SRC_COL, destination=DST_COL, weight=weight) + + vertex_pair = cu_M[[SRC_COL, DST_COL]] + + vertex_pair = vertex_pair[:5] + with pytest.raises(ValueError): + cugraph.sorensen(G1, vertex_pair, use_weight) + + @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) +@pytest.mark.parametrize("use_weight", [False, True]) @pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") -def test_sorensen_edgevals(gpubenchmark, graph_file): +def test_sorensen_edgevals(gpubenchmark, graph_file, use_weight): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) M_cu = utils.read_csv_file(dataset_path) cu_src, cu_dst, cu_coeff = cugraph_call( - gpubenchmark, netscience, edgevals=True, input_df=M_cu + gpubenchmark, netscience, input_df=M_cu, use_weight=use_weight ) nx_src, nx_dst, nx_coeff = networkx_call(M) @@ -201,92 +258,89 @@ def test_sorensen_edgevals(gpubenchmark, graph_file): @pytest.mark.sg -def test_sorensen_two_hop(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_two_hop(read_csv, use_weight): _, M, graph_file = read_csv - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - G = graph_file.get_graph(ignore_weights=True) + Gnx = nx.from_pandas_edgelist( + M, source=SRC_COL, target=DST_COL, create_using=nx.Graph() + ) + G = graph_file.get_graph(ignore_weights=not use_weight) - compare_sorensen_two_hop(G, Gnx) + compare_sorensen_two_hop(G, Gnx, use_weight=use_weight) @pytest.mark.sg -def test_sorensen_two_hop_edge_vals(read_csv): - +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_two_hop_edge_vals(read_csv, use_weight): _, M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( - M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() + M, + source=SRC_COL, + target=DST_COL, + edge_attr=EDGE_ATT_COL, + create_using=nx.Graph(), ) - G = graph_file.get_graph() + G = graph_file.get_graph(ignore_weights=not use_weight) - compare_sorensen_two_hop(G, Gnx, edgevals=True) + compare_sorensen_two_hop(G, Gnx, use_weight=use_weight) @pytest.mark.sg -def 
test_sorensen_multi_column(read_csv): - - _, M, _ = read_csv +@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) +@pytest.mark.parametrize("use_weight", [False, True]) +def test_sorensen_multi_column(graph_file, use_weight): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 + cu_M[MULTI_COL_SRC_0_COL] = cudf.Series(M[SRC_COL]) + cu_M[MULTI_COL_DST_0_COL] = cudf.Series(M[DST_COL]) + cu_M[MULTI_COL_SRC_1_COL] = cu_M[MULTI_COL_SRC_0_COL] + 1000 + cu_M[MULTI_COL_DST_1_COL] = cu_M[MULTI_COL_DST_0_COL] + 1000 + if use_weight: + cu_M[EDGE_ATT_COL] = cudf.Series(M[EDGE_ATT_COL]) + G1 = cugraph.Graph() + weight = EDGE_ATT_COL if use_weight else None G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] + cu_M, + source=[MULTI_COL_SRC_0_COL, MULTI_COL_SRC_1_COL], + destination=[MULTI_COL_DST_0_COL, MULTI_COL_DST_1_COL], + weight=weight, ) - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] + vertex_pair = cu_M[ + [ + MULTI_COL_SRC_0_COL, + MULTI_COL_SRC_1_COL, + MULTI_COL_DST_0_COL, + MULTI_COL_DST_1_COL, + ] + ] vertex_pair = vertex_pair[:5] - df_res = cugraph.sorensen(G1, vertex_pair) - df_plc_exp = exp_sorensen(G1, vertex_pair) - - df_plc_exp = df_plc_exp.rename( - columns={ - "0_src": "0_source", - "0_dst": "0_destination", - "1_src": "1_source", - "1_dst": "1_destination", - } - ) - sorensen_res = df_res["sorensen_coeff"].sort_values().reset_index(drop=True) - sorensen_plc_exp = df_plc_exp["sorensen_coeff"].sort_values().reset_index(drop=True) - assert_series_equal(sorensen_res, sorensen_plc_exp) + df_multi_col_res = cugraph.sorensen(G1, vertex_pair) G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - df_exp = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]]) + G2.from_cudf_edgelist( + cu_M, source=MULTI_COL_SRC_0_COL, destination=MULTI_COL_DST_0_COL, weight=weight + ) + df_single_col_res = cugraph.sorensen( + G2, vertex_pair[[MULTI_COL_SRC_0_COL, MULTI_COL_DST_0_COL]] + ) # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) + actual = df_multi_col_res.sort_values("0_src").reset_index() + expected = df_single_col_res.sort_values(VERTEX_PAIR_FIRST_COL).reset_index() + assert_series_equal(actual[SORENSEN_COEFF_COL], expected[SORENSEN_COEFF_COL]) @pytest.mark.sg -def test_weighted_exp_sorensen(): +def test_weighted_sorensen(): karate = UNDIRECTED_DATASETS[0] - G = karate.get_graph() - with pytest.raises(ValueError): - exp_sorensen(G) - G = karate.get_graph(ignore_weights=True) - use_weight = True - with pytest.raises(ValueError): - exp_sorensen(G, use_weight=use_weight) - - -@pytest.mark.sg -def test_invalid_datasets_sorensen(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") with pytest.raises(ValueError): - cugraph.sorensen(G) + cugraph.sorensen(G, use_weight=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py deleted file mode 100644 index 36a21df46b8..00000000000 --- 
a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import numpy as np -import networkx as nx - -import cudf -import cugraph -from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file): - # Device data - cu_M = graph_file.get_edgelist() - weight_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weight_arr), dtype=np.int32) - weights["weight"] = weight_arr - - G = graph_file.get_graph(ignore_weights=True) - - # cugraph Jaccard Call - df = benchmark_callable(cugraph.jaccard_w, G, weights) - - df = df.sort_values(["first", "second"]).reset_index(drop=True) - - return df["jaccard_coeff"] - - -def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] - edges = [] - for i in range(len(sources)): - edges.append((sources[i], destinations[i])) - edges.append((destinations[i], sources[i])) - edges = list(dict.fromkeys(edges)) - edges = sorted(edges) - # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this - # explicitly - print("Format conversion ... ") - - # NetworkX graph - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - # Networkx Jaccard Call - print("Solving... 
") - if benchmark_callable is not None: - preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges) - else: - preds = nx.jaccard_coefficient(Gnx, edges) - - coeff = [] - for u, v, p in preds: - coeff.append(p) - return coeff - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_wjaccard(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_nx_wjaccard_time(gpubenchmark, read_csv): - - M, _ = read_csv - networkx_call(M, gpubenchmark) - - -@pytest.mark.sg -def test_wjaccard_multi_column_weights(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_wjaccard_multi_column(read_csv): - - M, _ = read_csv - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.jaccard_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.jaccard_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_jaccard_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.jaccard_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py deleted file mode 100644 index 1dffb9fca41..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc - -import pytest -import scipy -import numpy as np - -import cudf -import cugraph -from cudf.testing import assert_series_equal -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file, pairs): - # Device data - cu_M = graph_file.get_edgelist() - weights_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weights_arr), dtype=np.int32) - weights["weight"] = weights_arr - - G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) - - # cugraph Overlap Call - df = benchmark_callable(cugraph.overlap_w, G, weights, pairs) - - df = df.sort_values(by=["first", "second"]) - return df["overlap_coeff"].to_numpy() - - -def intersection(a, b, M): - count = 0 - a_idx = M.indptr[a] - b_idx = M.indptr[b] - - while (a_idx < M.indptr[a + 1]) and (b_idx < M.indptr[b + 1]): - a_vertex = M.indices[a_idx] - b_vertex = M.indices[b_idx] - - if a_vertex == b_vertex: - count += 1 - a_idx += 1 - b_idx += 1 - elif a_vertex < b_vertex: - a_idx += 1 - else: - b_idx += 1 - - return count - - -def degree(a, M): - return M.indptr[a + 1] - M.indptr[a] - - -def overlap(a, b, M): - b_sum = degree(b, M) - if b_sum == 0: - return float("NaN") - - i = intersection(a, b, M) - a_sum = degree(a, M) - total = min(a_sum, b_sum) - return i / total - - -def cpu_call(M, first, second): - result = [] - for i in range(len(first)): - result.append(overlap(first[i], second[i], M)) - return result - - -@pytest.mark.sg -@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_woverlap(gpubenchmark, graph_file): - dataset_path = graph_file.get_path() - Mnx = utils.read_csv_for_nx(dataset_path) - N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 - M = scipy.sparse.csr_matrix((Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N)) - - G = graph_file.get_graph(ignore_weights=True) - pairs = ( - G.get_two_hop_neighbors() - .sort_values(["first", "second"]) - .reset_index(drop=True) - ) - - cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) - cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) - assert len(cu_coeff) == len(cpu_coeff) - for i in range(len(cu_coeff)): - if np.isnan(cpu_coeff[i]): - assert np.isnan(cu_coeff[i]) - elif np.isnan(cu_coeff[i]): - assert cpu_coeff[i] == cu_coeff[i] - else: - diff = abs(cpu_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -@pytest.mark.parametrize("graph_file", UNDIRECTED_DATASETS) -def test_woverlap_multi_column(graph_file): - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - 
G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.overlap_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.overlap_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_overlap_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.overlap_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py deleted file mode 100644 index 8d09b3e25b3..00000000000 --- a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc - -import pytest -import numpy as np -import networkx as nx - -import cudf -import cugraph -from cudf.testing import assert_series_equal -from cugraph.testing import utils, UNDIRECTED_DATASETS - - -print("Networkx version : {} ".format(nx.__version__)) - - -# ============================================================================= -# Pytest Setup / Teardown - called for each test function -# ============================================================================= -def setup_function(): - gc.collect() - - -def cugraph_call(benchmark_callable, graph_file): - # Device data - cu_M = graph_file.get_edgelist() - weight_arr = cudf.Series( - np.ones(max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) - ) - weights = cudf.DataFrame() - weights["vertex"] = np.arange(len(weight_arr), dtype=np.int32) - weights["weight"] = weight_arr - - G = graph_file.get_graph(ignore_weights=True) - - # cugraph Sorensen Call - df = benchmark_callable(cugraph.sorensen_w, G, weights) - - df = df.sort_values(["first", "second"]).reset_index(drop=True) - - return df["sorensen_coeff"] - - -def networkx_call(M, benchmark_callable=None): - - sources = M["0"] - destinations = M["1"] - edges = [] - for i in range(len(sources)): - edges.append((sources[i], destinations[i])) - edges.append((destinations[i], sources[i])) - edges = list(dict.fromkeys(edges)) - edges = sorted(edges) - # in NVGRAPH tests we read as CSR and feed as CSC, so here we doing this - # explicitly - print("Format conversion ... ") - - # NetworkX graph - Gnx = nx.from_pandas_edgelist(M, source="0", target="1", create_using=nx.Graph()) - # Networkx Jaccard Call - print("Solving... ") - if benchmark_callable is not None: - preds = benchmark_callable(nx.jaccard_coefficient, Gnx, edges) - else: - preds = nx.jaccard_coefficient(Gnx, edges) - coeff = [] - for u, v, p in preds: - # FIXME: Use known correct values of WSorensen for few graphs, - # hardcode it and compare to Cugraph WSorensen - # to get a more robust test - - # Conversion from Networkx Jaccard to Sorensen - coeff.append((2 * p) / (1 + p)) - return coeff - - -# ============================================================================= -# Pytest Fixtures -# ============================================================================= -@pytest.fixture(scope="module", params=UNDIRECTED_DATASETS) -def read_csv(request): - """ - Read csv file for both networkx and cugraph - """ - graph_file = request.param - dataset_path = graph_file.get_path() - M = utils.read_csv_for_nx(dataset_path) - - return M, graph_file - - -@pytest.mark.sg -def test_wsorensen(gpubenchmark, read_csv): - - M, graph_file = read_csv - - cu_coeff = cugraph_call(gpubenchmark, graph_file) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_nx_wsorensen_time(gpubenchmark, read_csv): - - M, _ = read_csv - networkx_call(M, gpubenchmark) - - -@pytest.mark.sg -def test_wsorensen_multi_column_weights(gpubenchmark, read_csv): - - M, cu_M = read_csv - - cu_coeff = cugraph_call(gpubenchmark, cu_M) - nx_coeff = networkx_call(M) - for i in range(len(cu_coeff)): - diff = abs(nx_coeff[i] - cu_coeff[i]) - assert diff < 1.0e-6 - - -@pytest.mark.sg -def test_wsorensen_multi_column(read_csv): - - M, _ = read_csv - - cu_M = cudf.DataFrame() - cu_M["src_0"] = cudf.Series(M["0"]) - cu_M["dst_0"] = cudf.Series(M["1"]) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - G1 = cugraph.Graph() - 
G1.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"] - ) - - G2 = cugraph.Graph() - G2.from_cudf_edgelist(cu_M, source="src_0", destination="dst_0") - - vertex_pair = cu_M[["src_0", "src_1", "dst_0", "dst_1"]] - vertex_pair = vertex_pair[:5] - - weight_arr = cudf.Series(np.ones(G2.number_of_vertices(), dtype=np.float32)) - weights = cudf.DataFrame() - weights["vertex"] = G2.nodes() - weights["vertex_"] = weights["vertex"] + 1000 - weights["weight"] = weight_arr - - df_res = cugraph.sorensen_w(G1, weights, vertex_pair) - - weights = weights[["vertex", "weight"]] - df_exp = cugraph.sorensen_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) - - # Calculating mismatch - actual = df_res.sort_values("0_first").reset_index() - expected = df_exp.sort_values("first").reset_index() - assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) - - -@pytest.mark.sg -def test_invalid_datasets_sorensen_w(): - karate = UNDIRECTED_DATASETS[0] - df = karate.get_edgelist() - df = df.add(1) - G = cugraph.Graph(directed=False) - G.from_cudf_edgelist(df, source="src", destination="dst") - with pytest.raises(ValueError): - cugraph.sorensen_w(G, None) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 711652bbae6..45f6de2f663 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -87,6 +87,13 @@ from pylibcugraph.generate_rmat_edgelists import generate_rmat_edgelists +from pylibcugraph.jaccard_coefficients import jaccard_coefficients + +from pylibcugraph.overlap_coefficients import overlap_coefficients + +from pylibcugraph.sorensen_coefficients import sorensen_coefficients + + from pylibcugraph import exceptions __version__ = "23.10.00" diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 1b93f9322af..6194ace5956 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -74,18 +74,17 @@ from pylibcugraph.node2vec import node2vec -node2vec = promoted_experimental_warning_wrapper(node2vec) -from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients +# from pylibcugraph.jaccard_coefficients import EXPERIMENTAL__jaccard_coefficients -jaccard_coefficients = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficients) +# jaccard_coefficients = experimental_warning_wrapper(EXPERIMENTAL__jaccard_coefficients) -from pylibcugraph.overlap_coefficients import EXPERIMENTAL__overlap_coefficients +# from pylibcugraph.overlap_coefficients import EXPERIMENTAL__overlap_coefficients -overlap_coefficients = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficients) +# overlap_coefficients = experimental_warning_wrapper(EXPERIMENTAL__overlap_coefficients) -from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients +# from pylibcugraph.sorensen_coefficients import EXPERIMENTAL__sorensen_coefficients -sorensen_coefficients = experimental_warning_wrapper( - EXPERIMENTAL__sorensen_coefficients -) +# sorensen_coefficients = experimental_warning_wrapper( +# EXPERIMENTAL__sorensen_coefficients +# ) diff --git a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx index 805ee821eab..59e94aeb615 100644 --- a/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/jaccard_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,6 +15,8 @@ # cython: language_level = 3 from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -57,7 +59,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, +def jaccard_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -83,8 +85,10 @@ def EXPERIMENTAL__jaccard_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure diff --git a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx index 6af71116469..28360121c64 100644 --- a/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/overlap_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__overlap_coefficients(ResourceHandle resource_handle, +def overlap_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -84,8 +84,10 @@ def EXPERIMENTAL__overlap_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure diff --git a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx index 12647baccb2..983a635012f 100644 --- a/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/sorensen_coefficients.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -57,7 +57,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__sorensen_coefficients(ResourceHandle resource_handle, +def sorensen_coefficients(ResourceHandle resource_handle, _GPUGraph graph, first, second, @@ -83,8 +83,10 @@ def EXPERIMENTAL__sorensen_coefficients(ResourceHandle resource_handle, second : Destination of the vertex pair. - use_weight : bool, optional (default=False) - Currently not supported + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). 
+ Otherwise, computed un-weighted jaccard_coefficients do_expensive_check : bool If True, performs more extensive tests on the inputs to ensure From db5073da6c69ac3ee44d7130d8799177ec69a0ef Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Thu, 21 Sep 2023 13:52:57 -0500 Subject: [PATCH 44/72] Update image names (#3867) PR updates `rapidsai/ci` references to `rapidsai/ci-conda` Authors: - Jake Awe (https://github.com/AyodeAwe) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cugraph/pull/3867 --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2d0d58315a0..02b357c7c88 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" + container_image: "rapidsai/ci-conda:cuda11.8.0-ubuntu22.04-py3.10" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 005fe4a0267..d2d24d90fbe 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -63,7 +63,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" + container_image: "rapidsai/ci-conda:cuda11.8.0-ubuntu22.04-py3.10" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -73,7 +73,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10" + container_image: "rapidsai/ci-conda:cuda11.8.0-ubuntu22.04-py3.10" run_script: "ci/build_docs.sh" wheel-build-pylibcugraph: needs: checks From a7047e3f0049597b4da625138107830ff78405e5 Mon Sep 17 00:00:00 2001 From: Don Acosta <97529984+acostadon@users.noreply.github.com> Date: Thu, 21 Sep 2023 17:19:30 -0400 Subject: [PATCH 45/72] adding dining preference dataset (#3866) This dataset is very small, and uses strings as node names. It will be used to test force atlas, and in a new link prediction/similarity notebook. the licensing is contained here. 
http://networkdata.ics.uci.edu/netdata/html/Dining-table_partners.html Authors: - Don Acosta (https://github.com/acostadon) - Brad Rees (https://github.com/BradReesWork) Approvers: - Brad Rees (https://github.com/BradReesWork) - ralph (https://github.com/nv-rliu) URL: https://github.com/rapidsai/cugraph/pull/3866 --- python/cugraph/cugraph/datasets/__init__.py | 1 + .../datasets/metadata/dining_prefs.yaml | 23 +++++++++++++++++++ python/cugraph/cugraph/testing/__init__.py | 3 +++ 3 files changed, 27 insertions(+) create mode 100644 python/cugraph/cugraph/datasets/metadata/dining_prefs.yaml diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index 7ba274c5960..65a820f108b 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -27,6 +27,7 @@ meta_path = Path(__file__).parent / "metadata" cyber = Dataset(meta_path / "cyber.yaml") +dining_prefs = Dataset(meta_path / "dining_prefs.yaml") dolphins = Dataset(meta_path / "dolphins.yaml") email_Eu_core = Dataset(meta_path / "email_Eu_core.yaml") karate = Dataset(meta_path / "karate.yaml") diff --git a/python/cugraph/cugraph/datasets/metadata/dining_prefs.yaml b/python/cugraph/cugraph/datasets/metadata/dining_prefs.yaml new file mode 100644 index 00000000000..e7ec85d7a1f --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/dining_prefs.yaml @@ -0,0 +1,23 @@ +name: dining_prefs +file_type: .csv +description: Classic social networking dataset describes dining preferences for a dormitory in New York state. +author: J.L. Moreno +refs: + J. L. Moreno (1960). The Sociometry Reader. The Free Press, Glencoe, Illinois, pg.35 +delim: " " +header: None +col_names: + - src + - dst + - wgt +col_types: + - string + - string + - int +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 42 +number_of_nodes: 26 +url: https://data.rapids.ai/cugraph/datasets/dining_prefs.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py index bde398aadbd..f5f0bcb06eb 100644 --- a/python/cugraph/cugraph/testing/__init__.py +++ b/python/cugraph/cugraph/testing/__init__.py @@ -23,6 +23,7 @@ ) from cugraph.datasets import ( cyber, + dining_prefs, dolphins, karate, karate_disjoint, @@ -42,6 +43,7 @@ UNDIRECTED_DATASETS = [karate, dolphins] SMALL_DATASETS = [karate, dolphins, polbooks] WEIGHTED_DATASETS = [ + dining_prefs, dolphins, karate, karate_disjoint, @@ -51,6 +53,7 @@ small_tree, ] ALL_DATASETS = [ + dining_prefs, dolphins, karate, karate_disjoint, From 367f36cfd4719fb522f12dbb74cec5b8a1e61aa6 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:55:52 -0400 Subject: [PATCH 46/72] Add file to update-version.sh [skip ci] (#3870) Add a new file to `update-version.sh`. 
Tested locally Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Jake Awe (https://github.com/AyodeAwe) --- ci/release/update-version.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 2c8735079f0..bd3aa6bc370 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -126,3 +126,5 @@ for FILE in .github/workflows/*.yaml; do sed_runner "s/dask-cuda.git@branch-[0-9][0-9].[0-9][0-9]/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}" done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh + +sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" python/nx-cugraph/README.md From f53bb56dc3245f64523aeeb997430c8f49de4624 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Fri, 22 Sep 2023 12:44:31 -0400 Subject: [PATCH 47/72] Fix torch seed in `cugraph-dgl` and `-pyg` tests for conv layers (#3869) Fixes https://github.com/rapidsai/graph_dl/issues/325 Recently, a few CI runs (ex. [1](https://github.com/rapidsai/cugraph/actions/runs/6254253684/job/16983164330?pr=3828#step:7:5078), [2](https://github.com/rapidsai/cugraph/actions/runs/6224345348/job/16896416094?pr=3843)) failed when comparing results from cugraph-ops-based conv layers against results from upstream frameworks. The tests pass most of the time, but occasionally fail due to a combination of using a strict tolerance and bad numerics (floating point error). This PR fixes the seed used for generating random feature tensors so that CI behaves consistently across different runs. Authors: - Tingyu Wang (https://github.com/tingyu66) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3869 --- python/cugraph-dgl/tests/nn/test_gatconv.py | 2 ++ python/cugraph-dgl/tests/nn/test_gatv2conv.py | 2 ++ python/cugraph-dgl/tests/nn/test_relgraphconv.py | 15 +++++++++++---- python/cugraph-dgl/tests/nn/test_sageconv.py | 1 + .../cugraph-dgl/tests/nn/test_transformerconv.py | 1 + .../cugraph_pyg/tests/nn/test_gat_conv.py | 1 + .../cugraph_pyg/tests/nn/test_gatv2_conv.py | 1 + .../cugraph_pyg/tests/nn/test_rgcn_conv.py | 1 + .../cugraph_pyg/tests/nn/test_sage_conv.py | 1 + .../cugraph_pyg/tests/nn/test_transformer_conv.py | 1 + 10 files changed, 22 insertions(+), 4 deletions(-) diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index ef3047dc2cd..ce145b2bc87 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -35,6 +35,7 @@ def test_gatconv_equality( ): from dgl.nn.pytorch import GATConv + torch.manual_seed(12345) g = create_graph1().to("cuda") if idtype_int: @@ -121,6 +122,7 @@ def test_gatconv_equality( def test_gatconv_edge_feats( bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats ): + torch.manual_seed(12345) g = create_graph1().to("cuda") if to_block: diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py index cc46a6e4b39..52003edacca 100644 --- a/python/cugraph-dgl/tests/nn/test_gatv2conv.py +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -35,6 +35,7 @@ def test_gatv2conv_equality( ): from dgl.nn.pytorch import GATv2Conv + torch.manual_seed(12345) g = create_graph1().to("cuda") if idtype_int: @@ -109,6 +110,7 @@ def test_gatv2conv_equality( def test_gatv2conv_edge_feats( bias, bipartite, concat, 
max_in_degree, num_heads, to_block, use_edge_feats ): + torch.manual_seed(12345) g = create_graph1().to("cuda") if to_block: diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index 901f9ba1433..bdaa89e57f2 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -41,6 +41,7 @@ def test_relgraphconv_equality( ): from dgl.nn.pytorch import RelGraphConv + torch.manual_seed(12345) in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) kwargs = { @@ -75,12 +76,18 @@ def test_relgraphconv_equality( size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" ) - torch.manual_seed(0) conv1 = RelGraphConv(*args, **kwargs).cuda() + conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).cuda() - torch.manual_seed(0) - kwargs["apply_norm"] = False - conv2 = CuGraphRelGraphConv(*args, **kwargs).cuda() + with torch.no_grad(): + if self_loop: + conv2.W.data[:-1] = conv1.linear_r.W.data + conv2.W.data[-1] = conv1.loop_weight.data + else: + conv2.W.data = conv1.linear_r.W.data.detach().clone() + + if regularizer is not None: + conv2.coeff.data = conv1.linear_r.coeff.data.detach().clone() out1 = conv1(g, feat, g.edata[dgl.ETYPE]) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index e2acf9e6596..b5d0a44b868 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -35,6 +35,7 @@ def test_sageconv_equality( ): from dgl.nn.pytorch import SAGEConv + torch.manual_seed(12345) kwargs = {"aggregator_type": aggr, "bias": bias} g = create_graph1().to("cuda") diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index b2b69cb35ab..5ac4fd7bea7 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -41,6 +41,7 @@ def test_transformerconv( use_edge_feats, sparse_format, ): + torch.manual_seed(12345) device = "cuda" g = create_graph1().to(device) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py index 21c43bad38c..62bebb9211d 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py @@ -32,6 +32,7 @@ def test_gat_conv_equality( import torch from torch_geometric.nn import GATConv + torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py index 6b11e87154a..a4794628410 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py @@ -28,6 +28,7 @@ def test_gatv2_conv_equality(bipartite, concat, heads, use_edge_attr, graph, req import torch from torch_geometric.nn import GATv2Conv + torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py index 233c6aa2836..ded4f300c0c 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py @@ -31,6 +31,7 @@ def 
test_rgcn_conv_equality( import torch from torch_geometric.nn import FastRGCNConv as RGCNConv + torch.manual_seed(12345) in_channels, out_channels, num_relations = (4, 2, 3) kwargs = dict(aggr=aggr, bias=bias, num_bases=num_bases, root_weight=root_weight) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py index 7f73cddbdbb..b2977d1d175 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py @@ -32,6 +32,7 @@ def test_sage_conv_equality( import torch from torch_geometric.nn import SAGEConv + torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() csc = CuGraphSAGEConv.to_csc(edge_index, size) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py index 7dba1a6d515..fbdb244898b 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py @@ -27,6 +27,7 @@ def test_transformer_conv_equality(bipartite, concat, heads, graph, request): import torch from torch_geometric.nn import TransformerConv + torch.manual_seed(12345) edge_index, size = request.getfixturevalue(graph) edge_index = edge_index.cuda() csc = CuGraphTransformerConv.to_csc(edge_index, size) From fe17abc6da469d810ea512d1d887407032613405 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Mon, 25 Sep 2023 10:48:40 -0400 Subject: [PATCH 48/72] cuGraph-PyG Loader Improvements (#3795) Consolidates various speed improvements tested while running performance benchmarks. Avoids copying batch data, removes redundant data loading code, simplifies and improves de-offsetting, even though that is now being bypassed entirely for homogeneous graphs. Removes extra host to device copy. Properly flips the src/dst columns in the returned `HeteroData` minibatch objects, avoid exposing this to the end user. I've confirmed this cuts the MFG time by a factor of 4. 
Closes #3807 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Don Acosta (https://github.com/acostadon) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3795 --- .../cugraph_pyg/data/cugraph_store.py | 218 ++++++++++---- .../cugraph_pyg/loader/cugraph_node_loader.py | 107 +++++-- .../cugraph-pyg/cugraph_pyg/loader/filter.py | 57 ---- .../cugraph_pyg/sampler/cugraph_sampler.py | 281 +++++++++++------- .../tests/mg/test_mg_cugraph_loader.py | 4 +- .../tests/mg/test_mg_cugraph_sampler.py | 28 +- .../tests/mg/test_mg_cugraph_store.py | 6 +- .../cugraph_pyg/tests/test_cugraph_loader.py | 158 ++++++---- .../cugraph_pyg/tests/test_cugraph_sampler.py | 28 +- .../cugraph_pyg/tests/test_cugraph_store.py | 2 +- 10 files changed, 548 insertions(+), 341 deletions(-) delete mode 100644 python/cugraph-pyg/cugraph_pyg/loader/filter.py diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py index 8d5d2fd4894..e0d318adbe0 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py @@ -25,6 +25,7 @@ import pandas import cudf import cugraph +import warnings from cugraph.utilities.utils import import_optional, MissingModule @@ -211,7 +212,9 @@ def __init__( F: cugraph.gnn.FeatureStore, G: Union[Dict[str, Tuple[TensorType]], Dict[str, int]], num_nodes_dict: Dict[str, int], + *, multi_gpu: bool = False, + order: str = "CSC", ): """ Constructs a new CuGraphStore from the provided @@ -256,11 +259,20 @@ def __init__( multi_gpu: bool (Optional, default = False) Whether the store should be backed by a multi-GPU graph. Requires dask to have been set up. + + order: str (Optional ["CSR", "CSC"], default = CSC) + The order to use for sampling. Should nearly always be CSC + unless there is a specific expectation of "reverse" sampling. + It is also not uncommon to use CSR order for correctness + testing, which some cuGraph-PyG tests do. """ if None in G: raise ValueError("Unspecified edge types not allowed in PyG") + if order != "CSR" and order != "CSC": + raise ValueError("invalid valid for order") + self.__vertex_dtype = torch.int64 self._tensor_attr_cls = CuGraphTensorAttr @@ -289,6 +301,7 @@ def __init__( self.__features = F self.__graph = None self.__is_graph_owner = False + self.__order = order if construct_graph: if multi_gpu: @@ -297,7 +310,9 @@ def __init__( ) if self.__graph is None: - self.__graph = self.__construct_graph(G, multi_gpu=multi_gpu) + self.__graph = self.__construct_graph( + G, multi_gpu=multi_gpu, order=order + ) self.__is_graph_owner = True self.__subgraphs = {} @@ -347,6 +362,7 @@ def __construct_graph( self, edge_info: Dict[Tuple[str, str, str], List[TensorType]], multi_gpu: bool = False, + order: str = "CSC", ) -> cugraph.MultiGraph: """ This function takes edge information and uses it to construct @@ -363,6 +379,14 @@ def __construct_graph( multi_gpu: bool (Optional, default=False) Whether to construct a single-GPU or multi-GPU cugraph Graph. Defaults to a single-GPU graph. + + order: str (CSC or CSR) + Essentially whether to reverse edges so that the cuGraph + sampling algorithm operates on the CSC matrix instead of + the CSR matrix. Should nearly always be CSC unless there + is a specific expectation of reverse sampling, or correctness + testing is being performed. + Returns ------- A newly-constructed directed cugraph.MultiGraph object. 
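To make the new `order` option concrete, a minimal usage sketch follows (the "paper"/"cites" names, the toy edge list, and the feature values are illustrative assumptions and are not part of this patch; the FeatureStore calls assume the cugraph.gnn API as used elsewhere in the cuGraph-PyG tests):

    import numpy as np
    import torch
    from cugraph.gnn import FeatureStore
    from cugraph_pyg.data import CuGraphStore

    # Three "paper" vertices, each with a single scalar feature.
    F = FeatureStore(backend="numpy")
    F.add_data(np.arange(3), "paper", "x")

    # One relation: 0 -> 1, 1 -> 2, 2 -> 0.
    G = {("paper", "cites", "paper"): [torch.tensor([0, 1, 2]),
                                       torch.tensor([1, 2, 0])]}
    N = {"paper": 3}

    # Default CSC order stores the edge list reversed internally so that
    # cuGraph's samplers effectively walk incoming edges, matching PyG.
    store_csc = CuGraphStore(F, G, N)
    # CSR keeps the input direction; mainly useful for correctness tests.
    store_csr = CuGraphStore(F, G, N, order="CSR")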
@@ -371,6 +395,9 @@ def __construct_graph( # Ensure the original dict is not modified. edge_info_cg = {} + if order != "CSR" and order != "CSC": + raise ValueError("Order must be either CSC (default) or CSR!") + # Iterate over the keys in sorted order so that the created # numerical types correspond to the lexicographic order # of the keys, which is critical to converting the numeric @@ -430,20 +457,43 @@ def __construct_graph( df = pandas.DataFrame( { - "src": pandas.Series(na_src), - "dst": pandas.Series(na_dst), + "src": pandas.Series(na_dst) + if order == "CSC" + else pandas.Series(na_src), + "dst": pandas.Series(na_src) + if order == "CSC" + else pandas.Series(na_dst), "etp": pandas.Series(na_etp), } ) + vertex_dtype = df.src.dtype if multi_gpu: nworkers = len(distributed.get_client().scheduler_info()["workers"]) - df = dd.from_pandas(df, npartitions=nworkers).persist() - df = df.map_partitions(cudf.DataFrame.from_pandas) - else: - df = cudf.from_pandas(df) + df = dd.from_pandas(df, npartitions=nworkers if len(df) > 32 else 1) + + # Ensure the dataframe is constructed on each partition + # instead of adding additional synchronization head from potential + # host to device copies. + def get_empty_df(): + return cudf.DataFrame( + { + "src": cudf.Series([], dtype=vertex_dtype), + "dst": cudf.Series([], dtype=vertex_dtype), + "etp": cudf.Series([], dtype="int32"), + } + ) - df = df.reset_index(drop=True) + # Have to check for empty partitions and handle them appropriately + df = df.persist() + df = df.map_partitions( + lambda f: cudf.DataFrame.from_pandas(f) + if len(f) > 0 + else get_empty_df(), + meta=get_empty_df(), + ).reset_index(drop=True) + else: + df = cudf.from_pandas(df).reset_index(drop=True) graph = cugraph.MultiGraph(directed=True) if multi_gpu: @@ -468,6 +518,10 @@ def __construct_graph( def _edge_types_to_attrs(self) -> dict: return dict(self.__edge_types_to_attrs) + @property + def order(self) -> str: + return self.__order + @property def node_types(self) -> List[NodeType]: return list(self.__vertex_type_offsets["type"]) @@ -557,6 +611,7 @@ def _get_edge_index(self, attr: CuGraphEdgeAttr) -> Tuple[TensorType, TensorType raise ValueError("Graph is not in memory, cannot access edge index!") if attr.layout != EdgeLayout.COO: + # TODO support returning CSR/CSC (Issue #3802) raise TypeError("Only COO direct access is supported!") # Currently, graph creation enforces that input vertex ids are always of @@ -566,12 +621,14 @@ def _get_edge_index(self, attr: CuGraphEdgeAttr) -> Tuple[TensorType, TensorType # This may change in the future if/when renumbering or the graph # creation process is refactored. # See Issue #3201 for more details. + # Also note src/dst are flipped so that cuGraph sampling is done in + # CSC format rather than CSR format. if self._is_delayed: - src_col_name = self.__graph.renumber_map.renumbered_src_col_name - dst_col_name = self.__graph.renumber_map.renumbered_dst_col_name + dst_col_name = self.__graph.renumber_map.renumbered_src_col_name + src_col_name = self.__graph.renumber_map.renumbered_dst_col_name else: - src_col_name = self.__graph.srcCol - dst_col_name = self.__graph.dstCol + dst_col_name = self.__graph.srcCol + src_col_name = self.__graph.dstCol # If there is only one edge type (homogeneous graph) then # bypass the edge filters for a significant speed improvement. 
@@ -785,29 +842,73 @@ def _get_renumbered_edge_groups_from_sample( """ row_dict = {} col_dict = {} - if len(self.__edge_types_to_attrs) == 1: + # If there is only 1 edge type (includes heterogeneous graphs) + if len(self.edge_types) == 1: t_pyg_type = list(self.__edge_types_to_attrs.values())[0].edge_type src_type, _, dst_type = t_pyg_type - dst_id_table = noi_index[dst_type] - dst_id_map = ( - cudf.Series(cupy.asarray(dst_id_table), name="dst") - .reset_index() - .rename(columns={"index": "new_id"}) - .set_index("dst") - ) - dst = dst_id_map["new_id"].loc[sampling_results.destinations] - col_dict[t_pyg_type] = torch.as_tensor(dst.values, device="cuda") - - src_id_table = noi_index[src_type] - src_id_map = ( - cudf.Series(cupy.asarray(src_id_table), name="src") - .reset_index() - .rename(columns={"index": "new_id"}) - .set_index("src") - ) - src = src_id_map["new_id"].loc[sampling_results.sources] - row_dict[t_pyg_type] = torch.as_tensor(src.values, device="cuda") + # If there is only 1 node type (homogeneous) + # This should only occur if the cuGraph loader was + # not used. This logic is deprecated. + if len(self.node_types) == 1: + warnings.warn( + "Renumbering after sampling for homogeneous graphs is deprecated.", + FutureWarning, + ) + + # Create a dataframe mapping old ids to new ids. + vtype = src_type + id_table = noi_index[vtype] + id_map = cudf.Series( + cupy.arange(id_table.shape[0], dtype="int32"), + name="new_id", + index=cupy.asarray(id_table), + ).sort_index() + + # Renumber the sources using binary search + # Step 1: get the index of the new id + ix_r = torch.searchsorted( + torch.as_tensor(id_map.index.values, device="cuda"), + torch.as_tensor(sampling_results.sources.values, device="cuda"), + ) + # Step 2: Go from id indices to actual ids + row_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[ + ix_r + ] + + # Renumber the destinations using binary search + # Step 1: get the index of the new id + ix_c = torch.searchsorted( + torch.as_tensor(id_map.index.values, device="cuda"), + torch.as_tensor( + sampling_results.destinations.values, device="cuda" + ), + ) + # Step 2: Go from id indices to actual ids + col_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[ + ix_c + ] + else: + # Handle the heterogeneous case where there is only 1 edge type + dst_id_table = noi_index[dst_type] + dst_id_map = cudf.DataFrame( + { + "dst": cupy.asarray(dst_id_table), + "new_id": cupy.arange(dst_id_table.shape[0]), + } + ).set_index("dst") + dst = dst_id_map["new_id"].loc[sampling_results.destinations] + col_dict[t_pyg_type] = torch.as_tensor(dst.values, device="cuda") + + src_id_table = noi_index[src_type] + src_id_map = cudf.DataFrame( + { + "src": cupy.asarray(src_id_table), + "new_id": cupy.arange(src_id_table.shape[0]), + } + ).set_index("src") + src = src_id_map["new_id"].loc[sampling_results.sources] + row_dict[t_pyg_type] = torch.as_tensor(src.values, device="cuda") else: # This will retrieve the single string representation. 
@@ -822,36 +923,18 @@ def _get_renumbered_edge_groups_from_sample( for pyg_can_edge_type_str, ix in eoi_types.items(): pyg_can_edge_type = tuple(pyg_can_edge_type_str.split("__")) - src_type, _, dst_type = pyg_can_edge_type - - # Get the de-offsetted sources - sources = torch.as_tensor( - sampling_results.sources.iloc[ix].values, device="cuda" - ) - sources_ix = torch.searchsorted( - self.__vertex_type_offsets["stop"], sources - ) - sources -= self.__vertex_type_offsets["start"][sources_ix] - # Create the row entry for this type - src_id_table = noi_index[src_type] - src_id_map = ( - cudf.Series(cupy.asarray(src_id_table), name="src") - .reset_index() - .rename(columns={"index": "new_id"}) - .set_index("src") - ) - src = src_id_map["new_id"].loc[cupy.asarray(sources)] - row_dict[pyg_can_edge_type] = torch.as_tensor(src.values, device="cuda") + if self.__order == "CSR": + src_type, _, dst_type = pyg_can_edge_type + else: # CSC + dst_type, _, src_type = pyg_can_edge_type # Get the de-offsetted destinations + dst_num_type = self._numeric_vertex_type_from_name(dst_type) destinations = torch.as_tensor( sampling_results.destinations.iloc[ix].values, device="cuda" ) - destinations_ix = torch.searchsorted( - self.__vertex_type_offsets["stop"], destinations - ) - destinations -= self.__vertex_type_offsets["start"][destinations_ix] + destinations -= self.__vertex_type_offsets["start"][dst_num_type] # Create the col entry for this type dst_id_table = noi_index[dst_type] @@ -864,6 +947,24 @@ def _get_renumbered_edge_groups_from_sample( dst = dst_id_map["new_id"].loc[cupy.asarray(destinations)] col_dict[pyg_can_edge_type] = torch.as_tensor(dst.values, device="cuda") + # Get the de-offsetted sources + src_num_type = self._numeric_vertex_type_from_name(src_type) + sources = torch.as_tensor( + sampling_results.sources.iloc[ix].values, device="cuda" + ) + sources -= self.__vertex_type_offsets["start"][src_num_type] + + # Create the row entry for this type + src_id_table = noi_index[src_type] + src_id_map = ( + cudf.Series(cupy.asarray(src_id_table), name="src") + .reset_index() + .rename(columns={"index": "new_id"}) + .set_index("src") + ) + src = src_id_map["new_id"].loc[cupy.asarray(sources)] + row_dict[pyg_can_edge_type] = torch.as_tensor(src.values, device="cuda") + return row_dict, col_dict def put_tensor(self, tensor, attr) -> None: @@ -959,9 +1060,7 @@ def _get_tensor(self, attr: CuGraphTensorAttr) -> TensorType: t = t[-1] if isinstance(t, np.ndarray): - t = torch.as_tensor(t, device="cuda") - else: - t = t.cuda() + t = torch.as_tensor(t, device="cpu") return t @@ -979,7 +1078,6 @@ def _get_tensor(self, attr: CuGraphTensorAttr) -> TensorType: t = torch.concatenate([t, u]) - t = t.cuda() return t def _multi_get_tensor(self, attrs: List[CuGraphTensorAttr]) -> List[TensorType]: diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py index 8d79685965f..cf7eb330d67 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py @@ -23,12 +23,15 @@ from cugraph.utilities.utils import import_optional, MissingModule from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.loader.filter import _filter_cugraph_store -from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results +from cugraph_pyg.sampler.cugraph_sampler import ( + _sampler_output_from_sampling_results_heterogeneous, + 
_sampler_output_from_sampling_results_homogeneous, +) from typing import Union, Tuple, Sequence, List, Dict torch_geometric = import_optional("torch_geometric") +torch = import_optional("torch") InputNodes = ( Sequence if isinstance(torch_geometric, MissingModule) @@ -253,55 +256,97 @@ def __next__(self): raw_sample_data = cudf.read_parquet(parquet_path) if "map" in raw_sample_data.columns: - self.__renumber_map = raw_sample_data["map"] + num_batches = end_inclusive - self.__start_inclusive + 1 + + map_end = raw_sample_data["map"].iloc[num_batches] + + map = torch.as_tensor( + raw_sample_data["map"].iloc[0:map_end], device="cuda" + ) raw_sample_data.drop("map", axis=1, inplace=True) + + self.__renumber_map_offsets = map[0 : num_batches + 1] - map[0] + self.__renumber_map = map[num_batches + 1 :] + else: self.__renumber_map = None self.__data = raw_sample_data[list(columns.keys())].astype(columns) self.__data.dropna(inplace=True) + if ( + len(self.__graph_store.edge_types) == 1 + and len(self.__graph_store.node_types) == 1 + ): + group_cols = ["batch_id", "hop_id"] + self.__data_index = self.__data.groupby(group_cols, as_index=True).agg( + {"sources": "max", "destinations": "max"} + ) + self.__data_index.rename( + columns={"sources": "src_max", "destinations": "dst_max"}, + inplace=True, + ) + self.__data_index = self.__data_index.to_dict(orient="index") + # Pull the next set of sampling results out of the dataframe in memory f = self.__data["batch_id"] == self.__next_batch if self.__renumber_map is not None: i = self.__next_batch - self.__start_inclusive - ix = self.__renumber_map.iloc[[i, i + 1]] - ix_start, ix_end = ix.iloc[0], ix.iloc[1] - current_renumber_map = self.__renumber_map.iloc[ix_start:ix_end] - if len(current_renumber_map) != ix_end - ix_start: - raise ValueError("invalid renumber map") - else: - current_renumber_map = None - sampler_output = _sampler_output_from_sampling_results( - self.__data[f], current_renumber_map, self.__graph_store - ) + # this should avoid d2h copy + current_renumber_map = self.__renumber_map[ + self.__renumber_map_offsets[i] : self.__renumber_map_offsets[i + 1] + ] - # Get ready for next iteration - self.__next_batch += 1 + else: + current_renumber_map = None # Get and return the sampled subgraph - if isinstance(torch_geometric, MissingModule): - noi_index, row_dict, col_dict, edge_dict = sampler_output["out"] - return _filter_cugraph_store( - self.__feature_store, + if ( + len(self.__graph_store.edge_types) == 1 + and len(self.__graph_store.node_types) == 1 + ): + sampler_output = _sampler_output_from_sampling_results_homogeneous( + self.__data[f], + current_renumber_map, self.__graph_store, - noi_index, - row_dict, - col_dict, - edge_dict, + self.__data_index, + self.__next_batch, ) else: - out = torch_geometric.loader.utils.filter_custom_store( - self.__feature_store, - self.__graph_store, - sampler_output.node, - sampler_output.row, - sampler_output.col, - sampler_output.edge, + sampler_output = _sampler_output_from_sampling_results_heterogeneous( + self.__data[f], current_renumber_map, self.__graph_store ) - return out + # Get ready for next iteration + self.__next_batch += 1 + + # Create a PyG HeteroData object, loading the required features + out = torch_geometric.loader.utils.filter_custom_store( + self.__feature_store, + self.__graph_store, + sampler_output.node, + sampler_output.row, + sampler_output.col, + sampler_output.edge, + ) + + # Account for CSR format in cuGraph vs. 
CSC format in PyG + if self.__graph_store.order == "CSC": + for node_type in out.edge_index_dict: + out[node_type].edge_index[0], out[node_type].edge_index[1] = ( + out[node_type].edge_index[1], + out[node_type].edge_index[0], + ) + + out.set_value_dict("num_sampled_nodes", sampler_output.num_sampled_nodes) + out.set_value_dict("num_sampled_edges", sampler_output.num_sampled_edges) + + return out + + @property + def _starting_batch_id(self): + return self.__starting_batch_id def __iter__(self): return self diff --git a/python/cugraph-pyg/cugraph_pyg/loader/filter.py b/python/cugraph-pyg/cugraph_pyg/loader/filter.py deleted file mode 100644 index f519ba7cfc9..00000000000 --- a/python/cugraph-pyg/cugraph_pyg/loader/filter.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cupy - -from cugraph_pyg.data import CuGraphStore - -from typing import ( - Dict, - Sequence, -) - - -def _filter_cugraph_store( - feature_store: CuGraphStore, - graph_store: CuGraphStore, - node_dict: Dict[str, Sequence], - row_dict: Dict[str, Sequence], - col_dict: Dict[str, Sequence], - edge_dict: Dict[str, Sequence], -) -> dict: - """ - Primarily for testing without torch and torch_geometric. - Returns a dictionary containing the sampled subgraph. - """ - data = {} - - for attr in graph_store.get_all_edge_attrs(): - key = attr.edge_type - if key in row_dict and key in col_dict: - edge_index = cupy.stack([row_dict[key], col_dict[key]]) - data[attr.edge_type] = {} - data[attr.edge_type]["edge_index"] = edge_index - - # Filter node storage: - required_attrs = [] - for attr in feature_store.get_all_tensor_attrs(): - if attr.group_name in node_dict: - attr.index = node_dict[attr.group_name] - required_attrs.append(attr) - data[attr.group_name] = {} - data["num_nodes"] = attr.index.size - tensors = feature_store.multi_get_tensor(required_attrs) - for i, attr in enumerate(required_attrs): - data[attr.group_name][attr.attr_name] = tensors[i] - - return data diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py index d4f600006be..6e8c4322418 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py @@ -12,26 +12,21 @@ # limitations under the License. 
-from typing import Sequence +from typing import Sequence, Dict, Tuple from cugraph_pyg.data import CuGraphStore -from cugraph.utilities.utils import import_optional, MissingModule +from cugraph.utilities.utils import import_optional import cudf dask_cudf = import_optional("dask_cudf") torch_geometric = import_optional("torch_geometric") torch = import_optional("torch") +HeteroSamplerOutput = torch_geometric.sampler.base.HeteroSamplerOutput -HeteroSamplerOutput = ( - None - if isinstance(torch_geometric, MissingModule) - else torch_geometric.sampler.base.HeteroSamplerOutput -) - -def _count_unique_nodes( +def _get_unique_nodes( sampling_results: cudf.DataFrame, graph_store: CuGraphStore, node_type: str, @@ -54,8 +49,8 @@ def _count_unique_nodes( Returns ------- - int - The number of unique nodes of the given node type. + cudf.Series + The unique nodes of the given node type. """ if node_position == "src": edge_index = "sources" @@ -78,12 +73,111 @@ def _count_unique_nodes( sampling_results_node = sampling_results[f] else: - return 0 + return cudf.Series([], dtype="int64") - return sampling_results_node[edge_index].nunique() + return sampling_results_node[edge_index] -def _sampler_output_from_sampling_results( +def _sampler_output_from_sampling_results_homogeneous( + sampling_results: cudf.DataFrame, + renumber_map: torch.Tensor, + graph_store: CuGraphStore, + data_index: Dict[Tuple[int, int], Dict[str, int]], + batch_id: int, + metadata: Sequence = None, +) -> HeteroSamplerOutput: + """ + Parameters + ---------- + sampling_results: cudf.DataFrame + The dataframe containing sampling results. + renumber_map: torch.Tensor + The tensor containing the renumber map, or None if there + is no renumber map. + graph_store: CuGraphStore + The graph store containing the structure of the sampled graph. + data_index: Dict[Tuple[int, int], Dict[str, int]] + Dictionary where keys are the batch id and hop id, + and values are dictionaries containing the max src + and max dst node ids for the batch and hop. + batch_id: int + The current batch id, whose samples are being retrieved + from the sampling results and data index. + metadata: Tensor + The metadata for the sampled batch. 
+ + Returns + ------- + HeteroSamplerOutput + """ + + if len(graph_store.edge_types) > 1 or len(graph_store.node_types) > 1: + raise ValueError("Graph is heterogeneous") + + hops = torch.arange( + sampling_results.hop_id.iloc[len(sampling_results) - 1] + 1, device="cuda" + ) + hops = torch.searchsorted( + torch.as_tensor(sampling_results.hop_id, device="cuda"), hops + ) + + node_type = graph_store.node_types[0] + edge_type = graph_store.edge_types[0] + + num_nodes_per_hop_dict = {node_type: torch.zeros(len(hops) + 1, dtype=torch.int64)} + num_edges_per_hop_dict = {edge_type: torch.zeros(len(hops), dtype=torch.int64)} + + if renumber_map is None: + raise ValueError("Renumbered input is expected for homogeneous graphs") + + noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")} + + row_dict = { + edge_type: torch.as_tensor(sampling_results.sources, device="cuda"), + } + + col_dict = { + edge_type: torch.as_tensor(sampling_results.destinations, device="cuda"), + } + + num_nodes_per_hop_dict[node_type][0] = data_index[batch_id, 0]["src_max"] + 1 + for hop in range(len(hops)): + hop_ix_start = hops[hop] + hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results) + + if num_nodes_per_hop_dict[node_type][hop] > 0: + max_id_hop = data_index[batch_id, hop]["dst_max"] + max_id_prev_hop = ( + data_index[batch_id, hop - 1]["dst_max"] + if hop > 0 + else data_index[batch_id, 0]["src_max"] + ) + + if max_id_hop > max_id_prev_hop: + num_nodes_per_hop_dict[node_type][hop + 1] = ( + max_id_hop - max_id_prev_hop + ) + else: + num_nodes_per_hop_dict[node_type][hop + 1] = 0 + # will default to 0 if the previous hop was 0, since this is a PyG requirement + + num_edges_per_hop_dict[edge_type][hop] = hop_ix_end - hop_ix_start + + if HeteroSamplerOutput is None: + raise ImportError("Error importing from pyg") + + return HeteroSamplerOutput( + node=noi_index, + row=row_dict, + col=col_dict, + edge=None, + num_sampled_nodes=num_nodes_per_hop_dict, + num_sampled_edges=num_edges_per_hop_dict, + metadata=metadata, + ) + + +def _sampler_output_from_sampling_results_heterogeneous( sampling_results: cudf.DataFrame, renumber_map: cudf.Series, graph_store: CuGraphStore, @@ -109,7 +203,7 @@ def _sampler_output_from_sampling_results( hops = torch.arange(sampling_results.hop_id.max() + 1, device="cuda") hops = torch.searchsorted( - torch.as_tensor(sampling_results.hop_id.values, device="cuda"), hops + torch.as_tensor(sampling_results.hop_id, device="cuda"), hops ) num_nodes_per_hop_dict = {} @@ -119,13 +213,11 @@ def _sampler_output_from_sampling_results( sampling_results_hop_0 = sampling_results.iloc[ 0 : (hops[1] if len(hops) > 1 else len(sampling_results)) ] + for node_type in graph_store.node_types: - if len(graph_store.node_types) == 1: - num_unique_nodes = sampling_results_hop_0.sources.nunique() - else: - num_unique_nodes = _count_unique_nodes( - sampling_results_hop_0, graph_store, node_type, "src" - ) + num_unique_nodes = _get_unique_nodes( + sampling_results_hop_0, graph_store, node_type, "src" + ).nunique() if num_unique_nodes > 0: num_nodes_per_hop_dict[node_type] = torch.zeros( @@ -134,112 +226,87 @@ def _sampler_output_from_sampling_results( num_nodes_per_hop_dict[node_type][0] = num_unique_nodes if renumber_map is not None: - if len(graph_store.node_types) > 1 or len(graph_store.edge_types) > 1: - raise ValueError( - "Precomputing the renumber map is currently " - "unsupported for heterogeneous graphs." 
- ) + raise ValueError( + "Precomputing the renumber map is currently " + "unsupported for heterogeneous graphs." + ) - node_type = graph_store.node_types[0] - if not isinstance(node_type, str): - raise ValueError("Node types must be strings") - noi_index = {node_type: torch.as_tensor(renumber_map.values, device="cuda")} - - edge_type = graph_store.edge_types[0] - if ( - not isinstance(edge_type, tuple) - or not isinstance(edge_type[0], str) - or len(edge_type) != 3 - ): - raise ValueError("Edge types must be 3-tuples of strings") - if edge_type[0] != node_type or edge_type[2] != node_type: - raise ValueError("Edge src/dst type must match for homogeneous graphs") - row_dict = { - edge_type: torch.as_tensor(sampling_results.sources.values, device="cuda"), - } - col_dict = { - edge_type: torch.as_tensor( - sampling_results.destinations.values, device="cuda" + # Calculate nodes of interest based on unique nodes in order of appearance + # Use hop 0 sources since those are the only ones not included in destinations + # Use torch.concat based on benchmark performance (vs. cudf.concat) + + if sampling_results_hop_0 is None: + sampling_results_hop_0 = sampling_results.iloc[ + 0 : (hops[1] if len(hops) > 1 else len(sampling_results)) + ] + + nodes_of_interest = ( + cudf.Series( + torch.concat( + [ + torch.as_tensor(sampling_results_hop_0.sources, device="cuda"), + torch.as_tensor(sampling_results.destinations, device="cuda"), + ] ), - } - else: - # Calculate nodes of interest based on unique nodes in order of appearance - # Use hop 0 sources since those are the only ones not included in destinations - # Use torch.concat based on benchmark performance (vs. cudf.concat) - nodes_of_interest = ( - cudf.Series( - torch.concat( - [ - torch.as_tensor( - sampling_results_hop_0.sources.values, device="cuda" - ), - torch.as_tensor( - sampling_results.destinations.values, device="cuda" - ), - ] - ), - name="nodes_of_interest", - ) - .drop_duplicates() - .sort_index() + name="nodes_of_interest", ) - del sampling_results_hop_0 + .drop_duplicates() + .sort_index() + ) - # Get the grouped node index (for creating the renumbered grouped edge index) - noi_index = graph_store._get_vertex_groups_from_sample( - torch.as_tensor(nodes_of_interest.values, device="cuda") - ) - del nodes_of_interest + # Get the grouped node index (for creating the renumbered grouped edge index) + noi_index = graph_store._get_vertex_groups_from_sample( + torch.as_tensor(nodes_of_interest, device="cuda") + ) + del nodes_of_interest - # Get the new edge index (by type as expected for HeteroData) - # FIXME handle edge ids/types after the C++ updates - row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample( - sampling_results, noi_index - ) + # Get the new edge index (by type as expected for HeteroData) + # FIXME handle edge ids/types after the C++ updates + row_dict, col_dict = graph_store._get_renumbered_edge_groups_from_sample( + sampling_results, noi_index + ) for hop in range(len(hops)): hop_ix_start = hops[hop] hop_ix_end = hops[hop + 1] if hop < len(hops) - 1 else len(sampling_results) - sampling_results_hop = sampling_results.iloc[hop_ix_start:hop_ix_end] + sampling_results_to_hop = sampling_results.iloc[0:hop_ix_end] for node_type in graph_store.node_types: - if len(graph_store.node_types) == 1: - num_unique_nodes = sampling_results_hop.destinations.nunique() - else: - num_unique_nodes = _count_unique_nodes( - sampling_results_hop, graph_store, node_type, "dst" - ) + unique_nodes_hop = _get_unique_nodes( + 
sampling_results_to_hop, graph_store, node_type, "dst" + ) + + unique_nodes_0 = _get_unique_nodes( + sampling_results_hop_0, graph_store, node_type, "src" + ) + + num_unique_nodes = cudf.concat([unique_nodes_0, unique_nodes_hop]).nunique() if num_unique_nodes > 0: if node_type not in num_nodes_per_hop_dict: num_nodes_per_hop_dict[node_type] = torch.zeros( len(hops) + 1, dtype=torch.int64 ) - num_nodes_per_hop_dict[node_type][hop + 1] = num_unique_nodes + num_nodes_per_hop_dict[node_type][hop + 1] = num_unique_nodes - int( + num_nodes_per_hop_dict[node_type][: hop + 1].sum(0) + ) - if len(graph_store.edge_types) == 1: - edge_type = graph_store.edge_types[0] - if edge_type not in num_edges_per_hop_dict: - num_edges_per_hop_dict[edge_type] = torch.zeros( + numeric_etypes, counts = torch.unique( + torch.as_tensor( + sampling_results.iloc[hop_ix_start:hop_ix_end].edge_type, + device="cuda", + ), + return_counts=True, + ) + numeric_etypes = list(numeric_etypes) + counts = list(counts) + for num_etype, count in zip(numeric_etypes, counts): + can_etype = graph_store.numeric_edge_type_to_canonical(num_etype) + if can_etype not in num_edges_per_hop_dict: + num_edges_per_hop_dict[can_etype] = torch.zeros( len(hops), dtype=torch.int64 ) - num_edges_per_hop_dict[graph_store.edge_types[0]][hop] = len( - sampling_results_hop - ) - else: - numeric_etypes, counts = torch.unique( - torch.as_tensor(sampling_results_hop.edge_type.values, device="cuda"), - return_counts=True, - ) - numeric_etypes = list(numeric_etypes) - counts = list(counts) - for num_etype, count in zip(numeric_etypes, counts): - can_etype = graph_store.numeric_edge_type_to_canonical(num_etype) - if can_etype not in num_edges_per_hop_dict: - num_edges_per_hop_dict[can_etype] = torch.zeros( - len(hops), dtype=torch.int64 - ) - num_edges_per_hop_dict[can_etype][hop] = count + num_edges_per_hop_dict[can_etype][hop] = count if HeteroSamplerOutput is None: raise ImportError("Error importing from pyg") diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py index e29f3aea512..55aebf305da 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py @@ -24,7 +24,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_basic(dask_client, karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") loader = CuGraphNeighborLoader( (cugraph_store, cugraph_store), torch.arange(N["type0"] + N["type1"], dtype=torch.int64), @@ -52,7 +52,7 @@ def test_cugraph_loader_basic(dask_client, karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_hetero(dask_client, karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") loader = CuGraphNeighborLoader( (cugraph_store, cugraph_store), input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")), diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py index 550852a3303..a1a72a44d0c 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py +++ 
b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py @@ -17,7 +17,9 @@ import pytest from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results +from cugraph_pyg.sampler.cugraph_sampler import ( + _sampler_output_from_sampling_results_heterogeneous, +) from cugraph.gnn import FeatureStore @@ -31,7 +33,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample(dask_client, basic_graph_1): F, G, N = basic_graph_1 - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") batches = cudf.DataFrame( { @@ -56,7 +58,7 @@ def test_neighbor_sample(dask_client, basic_graph_1): .sort_values(by=["sources", "destinations"]) ) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, renumber_map=None, graph_store=cugraph_store, @@ -84,7 +86,7 @@ def test_neighbor_sample(dask_client, basic_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 1 - assert out.num_sampled_nodes["vt1"].tolist() == [4, 4] + assert out.num_sampled_nodes["vt1"].tolist() == [4, 1] assert len(out.num_sampled_edges) == 1 assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6] @@ -95,7 +97,7 @@ def test_neighbor_sample(dask_client, basic_graph_1): @pytest.mark.skip(reason="broken") def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") batches = cudf.DataFrame( { @@ -119,7 +121,7 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph .compute() ) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, renumber_map=None, graph_store=cugraph_store, @@ -144,8 +146,8 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph # check the hop dictionaries assert len(out.num_sampled_nodes) == 2 - assert out.num_sampled_nodes["black"].tolist() == [2, 2] - assert out.num_sampled_nodes["brown"].tolist() == [3, 2] + assert out.num_sampled_nodes["black"].tolist() == [2, 0] + assert out.num_sampled_nodes["brown"].tolist() == [3, 0] assert len(out.num_sampled_edges) == 5 assert out.num_sampled_edges[("brown", "horse", "brown")].tolist() == [2] @@ -186,7 +188,7 @@ def test_neighbor_sample_mock_sampling_results(dask_client): torch.tensor([3.2, 2.1], dtype=torch.float32), type_name="A", feat_name="prop1" ) - graph_store = CuGraphStore(F, G, N, multi_gpu=True) + graph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( @@ -198,7 +200,7 @@ def test_neighbor_sample_mock_sampling_results(dask_client): } ) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( mock_sampling_results, None, graph_store, None ) @@ -218,9 +220,9 @@ def test_neighbor_sample_mock_sampling_results(dask_client): assert out.col[("B", "ba", "A")].tolist() == [1, 1] assert len(out.num_sampled_nodes) == 3 - assert out.num_sampled_nodes["A"].tolist() == [2, 0, 1, 0, 1] - assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 1, 0] - assert out.num_sampled_nodes["C"].tolist() 
== [0, 0, 2, 0, 2] + assert out.num_sampled_nodes["A"].tolist() == [2, 0, 0, 0, 0] + assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 0, 0] + assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 1] assert len(out.num_sampled_edges) == 3 assert out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index a5a59623710..43b1e5da5a0 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -117,8 +117,8 @@ def test_get_edge_index(graph, edge_index_type, dask_client): G[et][1] = cudf.Series(G[et][1]) elif edge_index_type == "dask-cudf": for et in list(G.keys()): - G[et][0] = dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=2) - G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=2) + G[et][0] = dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=1) + G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=1) cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) @@ -215,7 +215,7 @@ def test_renumber_vertices_multi_edge_multi_vertex( def test_renumber_edges(abc_graph, dask_client): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N, multi_gpu=True) + graph_store = CuGraphStore(F, G, N, multi_gpu=True, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index 620f1a5eb85..48a21cb7fd6 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -26,12 +26,14 @@ from cugraph.utilities.utils import import_optional, MissingModule torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") +trim_to_layer = import_optional("torch_geometric.utils.trim_to_layer") @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_basic(karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") loader = CuGraphNeighborLoader( (cugraph_store, cugraph_store), torch.arange(N["type0"] + N["type1"], dtype=torch.int64), @@ -57,7 +59,7 @@ def test_cugraph_loader_basic(karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_hetero(karate_gnn): F, G, N = karate_gnn - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") loader = CuGraphNeighborLoader( (cugraph_store, cugraph_store), input_nodes=("type1", torch.tensor([0, 1, 2, 5], device="cuda")), @@ -82,23 +84,29 @@ def test_cugraph_loader_hetero(karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_from_disk(): + m = [2, 9, 99, 82, 9, 3, 18, 1, 12] + n = torch.arange(1, 1 + len(m), dtype=torch.int32) + x = torch.zeros(256, dtype=torch.int32) + x[torch.tensor(m, dtype=torch.int32)] = n F = FeatureStore() - F.add_data(torch.tensor([1, 2, 3, 4, 5, 6, 7]), "t0", "x") + F.add_data(x, "t0", "x") - G = {("t0", "knows", "t0"): 7} - N = {"t0": 7} + G = {("t0", "knows", "t0"): 9080} + N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") bogus_samples = 
cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6], - "destinations": [6, 4, 3, 2, 2, 1, 5], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 2, 2], dtype="int32"), + "sources": [0, 1, 2, 3, 4, 5, 6, 6], + "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), + "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], + "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), } ) + map = cudf.Series(m, name="map") + bogus_samples = bogus_samples.join(map, how="outer").sort_index() tempdir = tempfile.TemporaryDirectory() for s in range(256): @@ -115,32 +123,49 @@ def test_cugraph_loader_from_disk(): for sample in loader: num_samples += 1 assert sample["t0"]["num_nodes"] == 7 - # correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6] - assert sample["t0"]["x"].tolist() == [1, 2, 3, 7, 5, 4, 6] - assert list(sample[("t0", "knows", "t0")]["edge_index"].shape) == [2, 7] + # correct vertex order is [0, 1, 2, 5, 4, 3, 6]; x = [1, 2, 3, 6, 5, 4, 7] + assert sample["t0"]["x"].tolist() == [3, 4, 5, 6, 7, 8, 9] + + edge_index = sample[("t0", "knows", "t0")]["edge_index"] + assert list(edge_index.shape) == [2, 8] + + assert ( + edge_index[0].tolist() + == bogus_samples.sources.dropna().values_host.tolist() + ) + assert ( + edge_index[1].tolist() + == bogus_samples.destinations.dropna().values_host.tolist() + ) assert num_samples == 256 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_cugraph_loader_from_disk_subset(): + m = [2, 9, 99, 82, 9, 3, 18, 1, 12] + n = torch.arange(1, 1 + len(m), dtype=torch.int32) + x = torch.zeros(256, dtype=torch.int32) + x[torch.tensor(m, dtype=torch.int32)] = n F = FeatureStore() - F.add_data(torch.tensor([1, 2, 3, 4, 5, 6, 7]), "t0", "x") + F.add_data(x, "t0", "x") - G = {("t0", "knows", "t0"): 7} - N = {"t0": 7} + G = {("t0", "knows", "t0"): 9080} + N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") bogus_samples = cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6], - "destinations": [6, 4, 3, 2, 2, 1, 5], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 2, 2], dtype="int32"), + "sources": [0, 1, 2, 3, 4, 5, 6, 6], + "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), + "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], + "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), } ) + map = cudf.Series(m, name="map") + bogus_samples = bogus_samples.join(map, how="outer").sort_index() tempdir = tempfile.TemporaryDirectory() for s in range(256): @@ -159,33 +184,45 @@ def test_cugraph_loader_from_disk_subset(): num_samples += 1 assert sample["t0"]["num_nodes"] == 7 # correct vertex order is [0, 1, 2, 6, 4, 3, 5]; x = [1, 2, 3, 7, 5, 4, 6] - assert sample["t0"]["x"].tolist() == [1, 2, 3, 7, 5, 4, 6] - assert list(sample[("t0", "knows", "t0")]["edge_index"].shape) == [2, 7] + assert sample["t0"]["x"].tolist() == [3, 4, 5, 6, 7, 8, 9] + + edge_index = sample[("t0", "knows", "t0")]["edge_index"] + assert list(edge_index.shape) == [2, 8] + + assert ( + edge_index[0].tolist() + == bogus_samples.sources.dropna().values_host.tolist() + ) + assert ( + edge_index[1].tolist() + == bogus_samples.destinations.dropna().values_host.tolist() + ) assert num_samples 
== 100 @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -def test_cugraph_loader_from_disk_subset_renumbered(): +def test_cugraph_loader_e2e_coo(): + m = [2, 9, 99, 82, 9, 3, 18, 1, 12] + x = torch.randint(3000, (256, 256)).to(torch.float32) F = FeatureStore() - F.add_data(torch.tensor([1, 2, 3, 4, 5, 6, 7]), "t0", "x") + F.add_data(x, "t0", "x") - G = {("t0", "knows", "t0"): 7} - N = {"t0": 7} + G = {("t0", "knows", "t0"): 9999} + N = {"t0": 256} - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") bogus_samples = cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6], - "destinations": [6, 4, 3, 2, 2, 1, 5], - "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0], dtype="int32"), - "edge_id": [5, 10, 15, 20, 25, 30, 35], - "hop_id": cudf.Series([0, 0, 0, 1, 1, 2, 2], dtype="int32"), + "sources": [0, 1, 2, 3, 4, 5, 6, 6], + "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), + "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], + "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), } ) - - map = cudf.Series([2, 9, 0, 2, 1, 3, 4, 6, 5], name="map") + map = cudf.Series(m, name="map") bogus_samples = bogus_samples.join(map, how="outer").sort_index() tempdir = tempfile.TemporaryDirectory() @@ -200,22 +237,35 @@ def test_cugraph_loader_from_disk_subset_renumbered(): input_files=list(os.listdir(tempdir.name))[100:200], ) - num_samples = 0 - for sample in loader: - num_samples += 1 - assert sample["t0"]["num_nodes"] == 7 - # correct vertex order is [0, 2, 1, 3, 4, 6, 5]; x = [1, 3, 2, 4, 5, 7, 6] - assert sample["t0"]["x"].tolist() == [1, 3, 2, 4, 5, 7, 6] + convs = [ + torch_geometric.nn.SAGEConv(256, 64, aggr="mean").cuda(), + torch_geometric.nn.SAGEConv(64, 8, aggr="mean").cuda(), + torch_geometric.nn.SAGEConv(8, 1, aggr="mean").cuda(), + ] - edge_index = sample[("t0", "knows", "t0")]["edge_index"] - assert list(edge_index.shape) == [2, 7] - assert ( - edge_index[0].tolist() - == bogus_samples.sources.dropna().values_host.tolist() - ) - assert ( - edge_index[1].tolist() - == bogus_samples.destinations.dropna().values_host.tolist() - ) + trim = trim_to_layer.TrimToLayer() + relu = torch.nn.functional.relu + dropout = torch.nn.functional.dropout - assert num_samples == 100 + for hetero_data in loader: + ei = hetero_data["t0", "knows", "t0"]["edge_index"] + x = hetero_data["t0"]["x"].cuda() + num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"] + num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"] + + print(num_sampled_nodes, num_sampled_edges) + + for i in range(len(convs)): + x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) + + s = x.shape[0] + + x = convs[i](x, ei, size=(s, s)) + x = relu(x) + x = dropout(x, p=0.5) + print(x.shape) + + print(x.shape) + x = x.narrow(dim=0, start=0, length=x.shape[0] - num_sampled_nodes[1]) + + assert list(x.shape) == [3, 1] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py index 08a8625b33b..84f62e80c9d 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py @@ -17,7 +17,9 @@ import pytest from cugraph_pyg.data import CuGraphStore -from cugraph_pyg.sampler.cugraph_sampler import _sampler_output_from_sampling_results +from cugraph_pyg.sampler.cugraph_sampler import ( + 
_sampler_output_from_sampling_results_heterogeneous, +) from cugraph.utilities.utils import import_optional, MissingModule from cugraph import uniform_neighbor_sample @@ -29,7 +31,7 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample(basic_graph_1): F, G, N = basic_graph_1 - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") batches = cudf.DataFrame( { @@ -49,7 +51,7 @@ def test_neighbor_sample(basic_graph_1): return_offsets=False, ).sort_values(by=["sources", "destinations"]) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, renumber_map=None, graph_store=cugraph_store, @@ -77,7 +79,7 @@ def test_neighbor_sample(basic_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 1 - assert out.num_sampled_nodes["vt1"].tolist() == [4, 4] + assert out.num_sampled_nodes["vt1"].tolist() == [4, 1] assert len(out.num_sampled_edges) == 1 assert out.num_sampled_edges[("vt1", "pig", "vt1")].tolist() == [6] @@ -87,7 +89,7 @@ def test_neighbor_sample(basic_graph_1): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): F, G, N = multi_edge_multi_vertex_graph_1 - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSR") batches = cudf.DataFrame( { @@ -107,7 +109,7 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): with_batch_ids=True, ).sort_values(by=["sources", "destinations"]) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, renumber_map=None, graph_store=cugraph_store, @@ -132,8 +134,8 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): # check the hop dictionaries assert len(out.num_sampled_nodes) == 2 - assert out.num_sampled_nodes["black"].tolist() == [2, 2] - assert out.num_sampled_nodes["brown"].tolist() == [3, 2] + assert out.num_sampled_nodes["black"].tolist() == [2, 0] + assert out.num_sampled_nodes["brown"].tolist() == [3, 0] assert len(out.num_sampled_edges) == 5 assert out.num_sampled_edges[("brown", "horse", "brown")].tolist() == [2] @@ -147,7 +149,7 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): def test_neighbor_sample_mock_sampling_results(abc_graph): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N) + graph_store = CuGraphStore(F, G, N, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( @@ -159,7 +161,7 @@ def test_neighbor_sample_mock_sampling_results(abc_graph): } ) - out = _sampler_output_from_sampling_results( + out = _sampler_output_from_sampling_results_heterogeneous( mock_sampling_results, None, graph_store, None ) @@ -179,9 +181,9 @@ def test_neighbor_sample_mock_sampling_results(abc_graph): assert out.col[("B", "ba", "A")].tolist() == [1, 1] assert len(out.num_sampled_nodes) == 3 - assert out.num_sampled_nodes["A"].tolist() == [2, 0, 1, 0, 1] - assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 1, 0] - assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 2] + assert out.num_sampled_nodes["A"].tolist() == [2, 0, 0, 0, 0] + assert out.num_sampled_nodes["B"].tolist() == [0, 2, 0, 0, 0] + assert out.num_sampled_nodes["C"].tolist() == [0, 0, 2, 0, 1] assert len(out.num_sampled_edges) == 3 assert 
out.num_sampled_edges[("A", "ab", "B")].tolist() == [3, 0, 1, 0] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py index 289dd69a829..e815b813050 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py @@ -199,7 +199,7 @@ def test_renumber_vertices_multi_edge_multi_vertex(multi_edge_multi_vertex_graph def test_renumber_edges(abc_graph): F, G, N = abc_graph - graph_store = CuGraphStore(F, G, N) + graph_store = CuGraphStore(F, G, N, order="CSR") # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( From 9c96b2613f029eb9616c19aefb38a689c1267bae Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Mon, 25 Sep 2023 07:49:19 -0700 Subject: [PATCH 49/72] Update dgl benchmarks (#3775) This PR adds upstream DGL benchmarks , I will expand this to add `cugraph-dgl` soon. image CC: @tingyu66 , @BradReesWork Authors: - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Brad Rees (https://github.com/BradReesWork) - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3775 --- .../cugraph-dgl/scale-benchmarks/__init__.py | 0 .../scale-benchmarks/dgl_benchmark.py | 152 ++++++++++++++++++ .../scale-benchmarks/load_graph_feats.py | 123 ++++++++++++++ .../cugraph-dgl/scale-benchmarks/model.py | 110 +++++++++++++ 4 files changed, 385 insertions(+) create mode 100644 benchmarks/cugraph-dgl/scale-benchmarks/__init__.py create mode 100644 benchmarks/cugraph-dgl/scale-benchmarks/dgl_benchmark.py create mode 100644 benchmarks/cugraph-dgl/scale-benchmarks/load_graph_feats.py create mode 100644 benchmarks/cugraph-dgl/scale-benchmarks/model.py diff --git a/benchmarks/cugraph-dgl/scale-benchmarks/__init__.py b/benchmarks/cugraph-dgl/scale-benchmarks/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/benchmarks/cugraph-dgl/scale-benchmarks/dgl_benchmark.py b/benchmarks/cugraph-dgl/scale-benchmarks/dgl_benchmark.py new file mode 100644 index 00000000000..3762226d570 --- /dev/null +++ b/benchmarks/cugraph-dgl/scale-benchmarks/dgl_benchmark.py @@ -0,0 +1,152 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import time +import dgl +from dgl.dataloading import MultiLayerNeighborSampler, DataLoader +import pandas as pd +import torch +from model import run_1_epoch +from argparse import ArgumentParser +from load_graph_feats import load_edges_from_disk, load_node_labels, load_node_features + +class DataLoaderArgs: + def __init__(self, args): + self.dataset_path = args.dataset_path + self.replication_factors = [int(x) for x in args.replication_factors.split(",")] + self.fanouts = [[int(y) for y in x.split("_")] for x in args.fanouts.split(",")] + self.batch_sizes = [int(x) for x in args.batch_sizes.split(",")] + self.use_uva = not args.do_not_use_uva + + + +def create_dataloader(g, train_idx, batch_size, fanouts, use_uva): + print("Creating dataloader", flush=True) + st = time.time() + if use_uva: + train_idx = {k: v.to("cuda") for k, v in train_idx.items()} + sampler = MultiLayerNeighborSampler(fanouts=fanouts) + dataloader = DataLoader( + g, + train_idx, + sampler, + num_workers=0, + batch_size=batch_size, + use_uva=use_uva, + shuffle=False, + drop_last=False, + ) + et = time.time() + print(f"Time to create dataloader = {et - st:.2f} seconds", flush=True) + return dataloader + + + +def create_dgl_graph_from_disk(dataset_path, replication_factor=1): + """ + Create a DGL graph from a dataset on disk. + Args: + dataset_path: Path to the dataset on disk. + replication_factor: Number of times to replicate the edges. + Returns: + DGLGraph: DGLGraph with the loaded dataset. + """ + with open(os.path.join(dataset_path, "meta.json"), "r") as f: + input_meta = json.load(f) + + parquet_path = os.path.join(dataset_path, "parquet") + graph_data = load_edges_from_disk( + parquet_path, replication_factor, input_meta + ) + label_data = load_node_labels(dataset_path, replication_factor, input_meta) + if replication_factor <8 : + feat_data = load_node_features(dataset_path, replication_factor, node_type='paper') + else: + feat_data = None + print("labels and features loaded ", flush=True) + + g = dgl.heterograph(graph_data) + + return g, label_data, feat_data + + +def main(args): + print(f"Running dgl dataloading benchmark with the following parameters:\n" + f"Dataset path = {args.dataset_path}\n" + f"Replication factors = {args.replication_factors}\n" + f"Fanouts = {args.fanouts}\n" + f"Batch sizes = {args.batch_sizes}\n" + f"Use UVA = {args.use_uva}\n" + f"{'=' * 30}") + + time_ls = [] + for replication_factor in args.replication_factors: + start_time = time.time() + g, label_data, feat_data = create_dgl_graph_from_disk(args.dataset_path, replication_factor) + elapsed_time = time.time() - start_time + + print(f"Replication factor = {replication_factor}\n" + f"G has {g.num_edges():,} edges and took {elapsed_time:.2f} seconds to load", flush=True) + + train_idx = {"paper": label_data["paper"]["train_idx"]} + y = label_data["paper"]["y"] + r_time_ls = e2e_benchmark(g, feat_data, y, train_idx, args.fanouts, args.batch_sizes, use_uva=args.use_uva) + [x.update({"replication_factor": replication_factor}) for x in r_time_ls] + [x.update({"num_edges": g.num_edges()}) for x in r_time_ls] + time_ls.extend(r_time_ls) + + print(f"Benchmark completed for replication factor = {replication_factor}\n{'=' * 30}", flush=True) + + df = pd.DataFrame(time_ls) + df.to_csv("dgl_e2e_benchmark.csv", index=False) + print(f"Benchmark completed for all replication factors\n{'=' * 30}", flush=True) + + +def e2e_benchmark(g, feat, y, train_idx, fanouts, batch_sizes, use_uva): + """ + Run the e2e_benchmark + Args: + 
g: DGLGraph + feat: Tensor containing the features. + y: Tensor containing the labels. + train_idx: Tensor containing the training indices. + fanouts: List of fanouts to use for the dataloader. + batch_sizes: List of batch sizes to use for the dataloader. + use_uva: Whether to use unified virtual address space. + model_backend: Backend of model to use. + """ + time_ls = [] + for fanout in fanouts: + for batch_size in batch_sizes: + dataloader = create_dataloader(g, train_idx, batch_size, fanout, use_uva) + time_d = run_1_epoch(dataloader, feat, y, fanout, batch_size, model_backend='dgl') + time_ls.append(time_d) + print("="*30) + return time_ls + + + +def parse_arguments(): + parser = ArgumentParser() + parser.add_argument("--dataset_path", type=str, default="/raid/vjawa/ogbn_papers100M") + parser.add_argument("--replication_factors", type=str, default="2") + parser.add_argument("--fanouts", type=str, default="10_10_10") + parser.add_argument("--batch_sizes", type=str, default="512,1024,8192,16384") + parser.add_argument("--do_not_use_uva", action="store_true") + return parser.parse_args() + +if __name__ == "__main__": + arguments = parse_arguments() + main(DataLoaderArgs(arguments)) diff --git a/benchmarks/cugraph-dgl/scale-benchmarks/load_graph_feats.py b/benchmarks/cugraph-dgl/scale-benchmarks/load_graph_feats.py new file mode 100644 index 00000000000..4f0f81c70e1 --- /dev/null +++ b/benchmarks/cugraph-dgl/scale-benchmarks/load_graph_feats.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import torch +import os + + +def load_edges_from_disk(parquet_path, replication_factor, input_meta): + """ + Load the edges from disk into a graph data dictionary. + Args: + parquet_path: Path to the parquet directory. + replication_factor: Number of times to replicate the edges. + input_meta: Input meta data. 
+ Returns: + dict: Dictionary of edge types to a tuple of (src, dst) + """ + graph_data = {} + + for edge_type in input_meta["num_edges"].keys(): + print(f"Loading edge index for edge type {edge_type} for replication factor = {replication_factor}") + + canonical_edge_type = tuple(edge_type.split("__")) + edge_index = pd.read_parquet(os.path.join(parquet_path, edge_type, "edge_index.parquet")) + edge_index = { + "src": torch.from_numpy(edge_index.src.values), + "dst": torch.from_numpy(edge_index.dst.values), + } + + if replication_factor > 1: + src_list, dst_list = replicate_edges(edge_index, canonical_edge_type, replication_factor, input_meta) + edge_index["src"] = torch.cat(src_list).contiguous() + edge_index["dst"] = torch.cat(dst_list).contiguous() + + graph_data[canonical_edge_type] = edge_index["src"], edge_index["dst"] + + print("Read Edge Data") + return graph_data + + +def replicate_edges(edge_index, canonical_edge_type, replication_factor, input_meta): + src_list = [edge_index["src"]] + dst_list = [edge_index["dst"]] + + for r in range(1, replication_factor): + new_src = edge_index["src"] + (r * input_meta["num_nodes"][canonical_edge_type[0]]) + new_dst = edge_index["dst"] + (r * input_meta["num_nodes"][canonical_edge_type[2]]) + src_list.append(new_src) + dst_list.append(new_dst) + + return src_list, dst_list + + + + +def load_node_labels(dataset_path, replication_factor, input_meta): + num_nodes_dict = {node_type: t * replication_factor for node_type, t in input_meta["num_nodes"].items()} + node_data = {} + + for node_type in input_meta["num_nodes"].keys(): + node_data[node_type] = {} + label_path = os.path.join(dataset_path, "parquet", node_type, "node_label.parquet") + + if os.path.exists(label_path): + node_data[node_type] = process_node_label(label_path, node_type, replication_factor, num_nodes_dict, input_meta) + + else: + node_data[node_type]["num_nodes"] = num_nodes_dict[node_type] + + print("Loaded node labels", flush=True) + return node_data + +def process_node_label(label_path, node_type, replication_factor, num_nodes_dict, input_meta): + node_label = pd.read_parquet(label_path) + + if replication_factor > 1: + node_label = replicate_node_label(node_label, node_type, replication_factor, input_meta) + + node_label_tensor = torch.full((num_nodes_dict[node_type],), -1, dtype=torch.float32) + node_label_tensor[torch.as_tensor(node_label.node.values)] = torch.as_tensor(node_label.label.values) + + del node_label + + return { + "train_idx": (node_label_tensor > -1).contiguous().nonzero().view(-1), + "y": node_label_tensor.contiguous().long() + } + + +def replicate_node_label(node_label, node_type, replication_factor, input_meta): + base_num_nodes = input_meta["num_nodes"][node_type] + + replicated_df = pd.DataFrame({ + "node": pd.concat([node_label.node + (r * base_num_nodes) for r in range(1, replication_factor)]), + "label": pd.concat([node_label.label for _ in range(1, replication_factor)]) + }) + + return pd.concat([node_label, replicated_df]).reset_index(drop=True) + + +def load_node_features(dataset_path, replication_factor, node_type): + print("Loading node features", flush=True) + node_type_path = os.path.join(dataset_path, "npy", node_type) + if replication_factor == 1: + fname = os.path.join(node_type_path, "node_feat.npy") + else: + fname = os.path.join(node_type_path, f"node_feat_{replication_factor}x.npy") + + feat = torch.from_numpy(np.load(fname)) + print("Loaded node features", flush=True) + return feat diff --git 
a/benchmarks/cugraph-dgl/scale-benchmarks/model.py b/benchmarks/cugraph-dgl/scale-benchmarks/model.py new file mode 100644 index 00000000000..506e3bd5227 --- /dev/null +++ b/benchmarks/cugraph-dgl/scale-benchmarks/model.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F +import time + + +class GNN(torch.nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, num_layers, model_backend='dgl'): + if model_backend == 'dgl': + from dgl.nn import SAGEConv + else: + from cugraph_dgl.nn import SAGEConv + + super(GNN, self).__init__() + self.convs = torch.nn.ModuleList() + for _ in range(num_layers - 1): + self.convs.append(SAGEConv(in_channels, hidden_channels, aggregator_type='mean')) + in_channels = hidden_channels + self.convs.append(SAGEConv(hidden_channels, out_channels, aggregator_type='mean')) + + def forward(self, blocks, x): + for i, conv in enumerate(self.convs): + x = conv(blocks[i], x) + if i != len(self.convs) - 1: + x = F.relu(x) + return x + + +def create_model(feat_size, num_classes, num_layers, model_backend='dgl'): + model = GNN(feat_size, 64, num_classes, num_layers, model_backend=model_backend) + model = model.to('cuda') + model.train() + return model + +def train_model(model, dataloader, opt, feat, y): + times = {key: 0 for key in ['mfg_creation', 'feature', 'm_fwd', 'm_bkwd']} + epoch_st = time.time() + mfg_st = time.time() + for input_nodes, output_nodes, blocks in dataloader: + times['mfg_creation'] += time.time() - mfg_st + if feat is not None: + fst = time.time() + input_nodes = input_nodes.to('cpu') + input_feat = feat[input_nodes] + input_feat = input_feat.to('cuda') + if isinstance(output_nodes, dict): + output_nodes = output_nodes['paper'] + output_nodes = output_nodes.to(y.device) + y_batch = y[output_nodes].to('cuda') + times['feature'] += time.time() - fst + + m_fwd_st = time.time() + y_hat = model(blocks, input_feat) + times['m_fwd'] += time.time() - m_fwd_st + + m_bkwd_st = time.time() + loss = F.cross_entropy(y_hat, y_batch) + opt.zero_grad() + loss.backward() + opt.step() + times['m_bkwd'] += time.time() - m_bkwd_st + mfg_st = time.time() + + print(f"Epoch time = {time.time() - epoch_st:.2f} seconds") + + return times + +def analyze_time(dataloader, times, epoch_time, fanout, batch_size): + num_batches = len(dataloader) + time_d = { + "fanout": fanout, + "batch_size": batch_size, + "epoch_time": epoch_time, + "epoch_time_per_batch": epoch_time / num_batches, + "num_batches": num_batches, + } + for key, value in times.items(): + time_d[f"{key}_time_per_epoch"] = value + time_d[f"{key}_time_per_batch"] = value / num_batches + + print(f"Time analysis for fanout = {fanout}, batch_size = {batch_size}") + for k in time_d.keys(): + if 'time_per_epoch' in str(k): + print(f"{k} = {time_d[k]:.2f} seconds") + return time_d + +def run_1_epoch(dataloader, feat, y, fanout, batch_size, model_backend): + if feat is not None: + model = 
create_model(feat.shape[1], 172, len(fanout), model_backend=model_backend) + opt = torch.optim.Adam(model.parameters(), lr=0.01) + else: + model = None + opt = None + epoch_st = time.time() + times = train_model(model, dataloader, opt, feat, y) + epoch_time = time.time() - epoch_st + time_d = analyze_time(dataloader, times, epoch_time, fanout, batch_size) + return time_d From c11eff23926dd483d23444a4757629e8ed069683 Mon Sep 17 00:00:00 2001 From: Don Acosta <97529984+acostadon@users.noreply.github.com> Date: Mon, 25 Sep 2023 21:59:29 -0400 Subject: [PATCH 50/72] similarity notebook to compare link prediction algos (#3868) New notebook to compare link prediction Dependent on dining_prefs being added to datasets api in PR #3866 Authors: - Don Acosta (https://github.com/acostadon) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3868 --- .../link_prediction/similarity_combined.ipynb | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 notebooks/algorithms/link_prediction/similarity_combined.ipynb diff --git a/notebooks/algorithms/link_prediction/similarity_combined.ipynb b/notebooks/algorithms/link_prediction/similarity_combined.ipynb new file mode 100644 index 00000000000..cd80ee34002 --- /dev/null +++ b/notebooks/algorithms/link_prediction/similarity_combined.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Similarity Compared\n", + "----\n", + "\n", + "In this notebook, we will compute vertex similarity scores using the various cuGraph algorithms. We will then compare the similarities scores in tables.\n", + "\n", + "| Author Credit | Date | Update | cuGraph Version | Test Hardware |\n", + "| --------------|------------|------------------|-----------------|-----------------------|\n", + "| Don Acosta | 09/25/2023 | created | 23.10 nightly | AMPERE A6000 CUDA 11.7|\n", + "\n", + "\n", + "**Note: On large graphs these algorithms can take prohibitive time or memory. The notebook will show how to run on defined pairs instead.**\n", + "\n", + "The Similarity algorithms in cuGraph use different methods to compare pairs of vertices. All of them use the intersection of the set of adjacent nodes for the set overlap. However each of the three algorithms differ on the denominator to determine the similarity coefficients. All three are normalized between zero and one. where zero is no overlap at all and one means identical adjacencies.\n", + "\n", + "__Jaccard Similarity__
\n", + "The [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index) measure was developed by botonist, Paul Jaccard who used the measure to compare plant species. His work popularized the measure's use in in other fields as well.\n", + "\n", + "It can be expressed as:
\n", + "$\\text{Jaccard similarity} = \\frac{|A \\cap B|}{|A \\cup B|}$\n", + "\n", + "__Overlap Similarity__
\n", + "The [Overlap Similarity](https://en.wikipedia.org/wiki/Overlap_coefficient) is also known as the Szymkiewicz–Simpson coefficient. It is often used to compare binary and categorical data in the fields of Genome analysis, recommender systems and anomaly detection. It differs from the Jaccard measure above in that it uses the size of the smaller of the two set sizes as the denominator.\n", + "\n", + "It can be expressed as\n", + "\n", + "$oc(A,B)=\\frac{|A|\\cap|B|}{min(|A|,|B|)}$\n", + "\n", + "__Sørensen-Dice Coefficient__
\n", + "The [Sørensen coefficient](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient#) is known as the Sørensen-Dice coefficient. It was independently developed for use by botonists Lee Raymond Dice and Thorvald Sørensen. Although originating in the field of Botony, the coefficient is now used in computer vision, Natural Language Processing(NLP) and Data Mining among other fields.\n", + "It differs from Jaccard and Overlap in that the calculation doubles the intersection size and divides it by the sum of the two set sizes.\n", + "\n", + "It can be expressed as\n", + "\n", + "Sørensen coefficient = $\\left(2 * |A \\cap B| \\right) \\over \\left(|A| + |B| \\right)$\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Now for the code !" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the required dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cugraph\n", + "from cugraph.datasets import dining_prefs\n", + "# only needed to display results in a table \n", + "from IPython.display import display_html " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Function that calls all the cuGraph similarity/link prediction algorithms " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_similarity(G,pairs=None):\n", + " _jdf = cugraph.jaccard(G,pairs)\n", + " _jdf2 = _jdf[ (_jdf['first'] != _jdf['second'] ) ]\n", + " _odf = cugraph.overlap(G,pairs)\n", + " _odf2 = _odf[ (_odf['first'] != _odf['second'] ) ]\n", + " _sdf = cugraph.sorensen_coefficient(G,pairs)\n", + " _sdf2 = _sdf[ (_sdf['first'] != _sdf['second'] ) ]\n", + " return _jdf2, _odf2, _sdf2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Function to put all the results in a convenient table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Print function\n", + "def print_similarity(jdf,odf,sdf,num_records=5):\n", + "\n", + " js_top = jdf.sort_values(by='jaccard_coeff', ascending=False).head(num_records).to_pandas()\n", + " os_top = odf.sort_values(by='overlap_coeff', ascending=False).head(num_records).to_pandas()\n", + " ss_top = sdf.sort_values(by='sorensen_coeff', ascending=False).head(num_records).to_pandas()\n", + " \n", + " df1_styler = js_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Jaccard').hide(axis='index')\n", + " df2_styler = os_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Overlap').hide(axis='index')\n", + " df3_styler = ss_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Sørensen').hide(axis='index')\n", + "\n", + " display_html(df1_styler._repr_html_()+df2_styler._repr_html_()+df3_styler._repr_html_(), raw=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the graph from the Dining preferences data set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "G = dining_prefs.get_graph(download=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the three similarity Algorithms and print out the five links with the highest scores." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "jdf, odf, sdf = compute_similarity(G)\n", + "print_similarity(jdf,odf,sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now find the the complete set of two-hop neigbors and compare them instead of just using the existing one-hop edges. In a larger graph, this will run considerably faster since the default " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this cugraph algorithm pulls a set containing every pair of vertices\n", + "# that are within 2-hops of each other\n", + "two_hops_pairs = G.get_two_hop_neighbors()\n", + "\n", + "jdf_hops, odf_hops, sdf_hops = compute_similarity(G,pairs=two_hops_pairs)\n", + "print_similarity(jdf_hops,odf_hops,sdf_hops)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### It's that easy with cuGraph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "Copyright (c) 2023, NVIDIA CORPORATION.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cugraph_0802", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b199bf01a2dbb3a8bc89198c6d35fd5a0444e213 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Tue, 26 Sep 2023 05:33:13 -0700 Subject: [PATCH 51/72] [REVIEW] Add Pure DGL Dataloading benchmark (#3660) This PR adds the DGL data loading benchmark: Arguments supported: - dataset_path: path to the dataset - replication_factors: replication factors for number of edges - fanouts: fanouts - batch_sizes: batch sizes ```bash python3 dgl_dataloading.py --dataset_path "/datasets/abarghi/ogbn_papers100M" \ --replication_factors "1,2,4" \ --fanouts "25_25,10_10_10,5_10_20" \ --batch_sizes "512,1024" ``` This produces the following results on a V100: | Fanout | Batch Size | Data Loading Time per Epoch | Data Loading Time per Batch | Number of Edges | Number of Batches | Replication Factor | |--------|------------|-----------------------------|-----------------------------|-----------------|-------------------|--------------------| | [25, 25] | 512 | 9.48 | 0.0031 | 1615685872 | 3022 | 1 | | [25, 25] | 1024 | 6.39 | 0.0042 | 1615685872 | 1511 | 1 | | [10, 10, 10] | 512 | 15.91 | 0.0053 | 1615685872 | 3022 | 1 | | [10, 10, 10] | 1024 | 11.64 | 0.0077 | 1615685872 | 1511 | 1 | | [5, 10, 20] | 512 | 17.73 | 0.0059 | 1615685872 | 3022 | 1 | | [5, 10, 20] | 1024 | 13.52 | 0.0089 | 1615685872 | 1511 | 1 | | [25, 25] | 512 | 19.44 | 0.0032 | 3231371744 | 6043 | 2 | | [25, 25] | 1024 | 12.98 | 0.0043 | 3231371744 | 3022 | 2 | | 
[10, 10, 10] | 512 | 32.88 | 0.0054 | 3231371744 | 6043 | 2 | | [10, 10, 10] | 1024 | 24.35 | 0.0081 | 3231371744 | 3022 | 2 | | [5, 10, 20] | 512 | 38.35 | 0.0063 | 3231371744 | 6043 | 2 | | [5, 10, 20] | 1024 | 28.93 | 0.0096 | 3231371744 | 3022 | 2 | | [25, 25] | 512 | 37.31 | 0.0031 | 6462743488 | 12085 | 4 | | [25, 25] | 1024 | 25.15 | 0.0042 | 6462743488 | 6043 | 4 | | [10, 10, 10] | 512 | 64.29 | 0.0053 | 6462743488 | 12085 | 4 | | [10, 10, 10] | 1024 | 47.13 | 0.0078 | 6462743488 | 6043 | 4 | | [5, 10, 20] | 512 | 72.90 | 0.0060 | 6462743488 | 12085 | 4 | | [5, 10, 20] | 1024 | 56.70 | 0.0094 | 6462743488 | 6043 | 4 | | [25, 25] | 512 | 80.99 | 0.0034 | 12925486976 | 24169 | 8 | | [25, 25] | 1024 | 50.89 | 0.0042 | 12925486976 | 12085 | 8 | | [10, 10, 10] | 512 | 129.49 | 0.0054 | 12925486976 | 24169 | 8 | | [10, 10, 10] | 1024 | 93.66 | 0.0078 | 12925486976 | 12085 | 8 | | [5, 10, 20] | 512 | 143.45 | 0.0059 | 12925486976 | 24169 | 8 | | [5, 10, 20] | 1024 | 110.22 | 0.0091 | 12925486976 | 12085 | 8 | Authors: - Vibhu Jawa (https://github.com/VibhuJawa) - Alex Barghi (https://github.com/alexbarghi-nv) - Brad Rees (https://github.com/BradReesWork) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Tingyu Wang (https://github.com/tingyu66) URL: https://github.com/rapidsai/cugraph/pull/3660 --- .../dgl_benchmark.py | 291 ++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 benchmarks/cugraph-dgl/python-script/dgl_dataloading_benchmark/dgl_benchmark.py diff --git a/benchmarks/cugraph-dgl/python-script/dgl_dataloading_benchmark/dgl_benchmark.py b/benchmarks/cugraph-dgl/python-script/dgl_dataloading_benchmark/dgl_benchmark.py new file mode 100644 index 00000000000..0a52703c546 --- /dev/null +++ b/benchmarks/cugraph-dgl/python-script/dgl_dataloading_benchmark/dgl_benchmark.py @@ -0,0 +1,291 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import dgl +import torch +import pandas as pd +import os +import time +import json +import random +import numpy as np +from argparse import ArgumentParser + + +def load_edges_from_disk(parquet_path, replication_factor, input_meta): + """ + Load the edges from disk into a graph data dictionary. + Args: + parquet_path: Path to the parquet directory. + replication_factor: Number of times to replicate the edges. + input_meta: Input meta data. 
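+            (the dictionary parsed from the dataset's meta.json; its per-type
+            "num_nodes" counts drive the vertex-ID offsets applied when
+            replicating edges, so every replica is a disjoint copy)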
+ Returns: + dict: Dictionary of edge types to a tuple of (src, dst) + """ + graph_data = {} + for edge_type in input_meta["num_edges"].keys(): + print( + f"Loading edge index for edge type {edge_type}" + f"for replication factor = {replication_factor}" + ) + can_edge_type = tuple(edge_type.split("__")) + # TODO: Rename `edge_index` to a better name + ei = pd.read_parquet( + os.path.join(parquet_path, edge_type, "edge_index.parquet") + ) + ei = { + "src": torch.from_numpy(ei.src.values), + "dst": torch.from_numpy(ei.dst.values), + } + if replication_factor > 1: + src_ls = [ei["src"]] + dst_ls = [ei["dst"]] + for r in range(1, replication_factor): + new_src = ei["src"] + ( + r * input_meta["num_nodes"][can_edge_type[0]] + ) + src_ls.append(new_src) + new_dst = ei["dst"] + ( + r * input_meta["num_nodes"][can_edge_type[2]] + ) + dst_ls.append(new_dst) + + ei["src"] = torch.cat(src_ls).contiguous() + ei["dst"] = torch.cat(dst_ls).contiguous() + graph_data[can_edge_type] = ei["src"], ei["dst"] + print("Graph Data compiled") + return graph_data + + +def load_node_labels(dataset_path, replication_factor, input_meta): + num_nodes_dict = { + node_type: t * replication_factor + for node_type, t in input_meta["num_nodes"].items() + } + node_data = {} + for node_type in input_meta["num_nodes"].keys(): + node_data[node_type] = {} + label_path = os.path.join( + dataset_path, "parquet", node_type, "node_label.parquet" + ) + if os.path.exists(label_path): + node_label = pd.read_parquet(label_path) + if replication_factor > 1: + base_num_nodes = input_meta["num_nodes"][node_type] + dfr = pd.DataFrame( + { + "node": pd.concat( + [ + node_label.node + (r * base_num_nodes) + for r in range(1, replication_factor) + ] + ), + "label": pd.concat( + [ + node_label.label + for r in range(1, replication_factor) + ] + ), + } + ) + node_label = pd.concat([node_label, dfr]).reset_index( + drop=True + ) + + node_label_tensor = torch.full( + (num_nodes_dict[node_type],), -1, dtype=torch.float32 + ) + node_label_tensor[ + torch.as_tensor(node_label.node.values) + ] = torch.as_tensor(node_label.label.values) + + del node_label + node_data[node_type]["train_idx"] = ( + (node_label_tensor > -1).contiguous().nonzero().view(-1) + ) + node_data[node_type]["y"] = node_label_tensor.contiguous() + else: + node_data[node_type]["num_nodes"] = num_nodes_dict[node_type] + return node_data + + +def create_dgl_graph_from_disk(dataset_path, replication_factor=1): + """ + Create a DGL graph from a dataset on disk. + Args: + dataset_path: Path to the dataset on disk. + replication_factor: Number of times to replicate the edges. + Returns: + DGLGraph: DGLGraph with the loaded dataset. + """ + with open(os.path.join(dataset_path, "meta.json"), "r") as f: + input_meta = json.load(f) + + parquet_path = os.path.join(dataset_path, "parquet") + graph_data = load_edges_from_disk( + parquet_path, replication_factor, input_meta + ) + node_data = load_node_labels(dataset_path, replication_factor, input_meta) + g = dgl.heterograph(graph_data) + + return g, node_data + + +def create_dataloader(g, train_idx, batch_size, fanouts, use_uva): + """ + Create a DGL dataloader from a DGL graph. + Args: + g: DGLGraph to create the dataloader from. + train_idx: Tensor containing the training indices. + batch_size: Batch size to use for the dataloader. + fanouts: List of fanouts to use for the dataloader. + use_uva: Whether to use unified virtual address space. + Returns: + DGLGraph: DGLGraph with the loaded dataset. 
+ """ + + print("Creating dataloader", flush=True) + st = time.time() + if use_uva: + train_idx = {k: v.to("cuda") for k, v in train_idx.items()} + sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts=fanouts) + dataloader = dgl.dataloading.DataLoader( + g, + train_idx, + sampler, + num_workers=0, + batch_size=batch_size, + use_uva=use_uva, + shuffle=False, + drop_last=False, + ) + et = time.time() + print(f"Time to create dataloader = {et - st:.2f} seconds") + return dataloader + + +def dataloading_benchmark(g, train_idx, fanouts, batch_sizes, use_uva): + """ + Run the dataloading benchmark. + Args: + g: DGLGraph + train_idx: Tensor containing the training indices. + fanouts: List of fanouts to use for the dataloader. + batch_sizes: List of batch sizes to use for the dataloader. + use_uva: Whether to use unified virtual address space. + """ + time_ls = [] + for fanout in fanouts: + for batch_size in batch_sizes: + dataloader = create_dataloader( + g, + train_idx, + batch_size=batch_size, + fanouts=fanout, + use_uva=use_uva, + ) + dataloading_st = time.time() + for input_nodes, output_nodes, blocks in dataloader: + pass + dataloading_et = time.time() + dataloading_time = dataloading_et - dataloading_st + time_d = { + "fanout": fanout, + "batch_size": batch_size, + "dataloading_time_per_epoch": dataloading_time, + "dataloading_time_per_batch": dataloading_time / len(dataloader), + "num_edges": g.num_edges(), + "num_batches": len(dataloader), + } + time_ls.append(time_d) + + print("Dataloading completed") + print(f"Fanout = {fanout}, batch_size = {batch_size}") + print( + f"Time taken {dataloading_time:.2f} ", + f"seconds for num batches {len(dataloader)}", + flush=True, + ) + print("==============================================") + return time_ls + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument( + "--dataset_path", type=str, default="/datasets/abarghi/ogbn_papers100M" + ) + parser.add_argument("--replication_factors", type=str, default="1,2,4,8") + parser.add_argument( + "--fanouts", type=str, default="25_25,10_10_10,5_10_20" + ) + parser.add_argument("--batch_sizes", type=str, default="512,1024") + parser.add_argument("--do_not_use_uva", action="store_true") + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + if args.do_not_use_uva: + use_uva = False + else: + use_uva = True + set_seed(args.seed) + replication_factors = [int(x) for x in args.replication_factors.split(",")] + fanouts = [[int(y) for y in x.split("_")] for x in args.fanouts.split(",")] + batch_sizes = [int(x) for x in args.batch_sizes.split(",")] + + print("Running dgl dataloading benchmark with the following parameters:") + print(f"Dataset path = {args.dataset_path}") + print(f"Replication factors = {replication_factors}") + print(f"Fanouts = {fanouts}") + print(f"Batch sizes = {batch_sizes}") + print(f"Use UVA = {use_uva}") + print("==============================================") + + time_ls = [] + for replication_factor in replication_factors: + st = time.time() + g, node_data = create_dgl_graph_from_disk( + dataset_path=args.dataset_path, + replication_factor=replication_factor, + ) + et = time.time() + print(f"Replication factor = {replication_factor}") + print( + f"G has {g.num_edges()} edges and took", + f" {et - st:.2f} seconds to load" + ) + train_idx = {"paper": node_data["paper"]["train_idx"]} + r_time_ls = 
dataloading_benchmark( + g, train_idx, fanouts, batch_sizes, use_uva=use_uva + ) + print( + "Benchmark completed for replication factor = ", replication_factor + ) + print("==============================================") + # Add replication factor to the time list + [ + x.update({"replication_factor": replication_factor}) + for x in r_time_ls + ] + time_ls.extend(r_time_ls) + + df = pd.DataFrame(time_ls) + df.to_csv("dgl_dataloading_benchmark.csv", index=False) + print("Benchmark completed for all replication factors") + print("==============================================") From 8b02e241617df8bac33d0fd69b03046d2ddaf2d9 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Tue, 26 Sep 2023 15:43:02 +0200 Subject: [PATCH 52/72] Enable temporarily disabled MG tests (#3837) Enable TEMPORARILY disable single-GPU "MG" tests And Skip deleting copied Dataframe while creating distributed graph from cudf edge-lists. Ideally we would like to merger this PR once the [issue 3790](https://github.com/rapidsai/cugraph/issues/3790) is closed, but me might need to merger it if the issue is not resolved before the next release. Authors: - Naim (https://github.com/naimnv) Approvers: - Rick Ratzel (https://github.com/rlratzel) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cugraph/pull/3837 --- ci/test_python.sh | 6 +----- ci/test_wheel.sh | 4 +--- .../graph_implementation/simpleDistributedGraph.py | 5 ++++- python/cugraph/cugraph/tests/traversal/test_bfs_mg.py | 5 ++++- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 14886909fc9..7b0077991ae 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -63,10 +63,6 @@ pytest \ tests popd -# FIXME: TEMPORARILY disable single-GPU "MG" testing until -# https://github.com/rapidsai/cugraph/issues/3790 is closed -# When closed, replace -k "not _mg" with -# -k "not test_property_graph_mg" \ rapids-logger "pytest cugraph" pushd python/cugraph/cugraph export DASK_WORKER_DEVICES="0" @@ -79,7 +75,7 @@ pytest \ --cov=cugraph \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-coverage.xml" \ --cov-report=term \ - -k "not _mg" \ + -k "not test_property_graph_mg" \ tests popd diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index b62635d08b4..146186ae2e7 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -18,7 +18,5 @@ arch=$(uname -m) if [[ "${arch}" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then python ./ci/wheel_smoke_test_${package_name}.py else - # FIXME: TEMPORARILY disable single-GPU "MG" testing until - # https://github.com/rapidsai/cugraph/issues/3790 is closed - RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest -k "not _mg" ./python/${package_name}/${package_name}/tests + RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest ./python/${package_name}/${package_name}/tests fi diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 0586d0d853c..01885c2d1c3 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -14,6 +14,7 @@ import gc from typing import Union import warnings +import random import cudf import cupy as cp @@ -182,7 +183,9 @@ def __from_edgelist( # Repartition to 2 partitions per GPU for memory efficient process input_ddf = 
input_ddf.repartition(npartitions=len(workers) * 2) # FIXME: Make a copy of the input ddf before implicitly altering it. - input_ddf = input_ddf.map_partitions(lambda df: df.copy()) + input_ddf = input_ddf.map_partitions( + lambda df: df.copy(), token="custom-" + str(random.random()) + ) # The dataframe will be symmetrized iff the graph is undirected # otherwise, the inital dataframe will be returned if edge_attr is not None: diff --git a/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py b/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py index 8ffbecea4fc..5eafc231141 100644 --- a/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py +++ b/python/cugraph/cugraph/tests/traversal/test_bfs_mg.py @@ -12,6 +12,7 @@ # limitations under the License. import gc +import random import pytest @@ -61,7 +62,9 @@ def modify_dataset(df): return cudf.concat([df, temp_df]) meta = ddf._meta - ddf = ddf.map_partitions(modify_dataset, meta=meta) + ddf = ddf.map_partitions( + modify_dataset, meta=meta, token="custom-" + str(random.random()) + ) df = cudf.read_csv( input_data_path, From a9f4297223593f8df211599277519e206c597630 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Tue, 26 Sep 2023 08:57:51 -0500 Subject: [PATCH 53/72] Enable weights for MG similarity algorithms (#3879) This is a follow up PR to #3828 which enabled weighted for the python SG similarity algorithms. This PR also updates the tests, docstrings and remove experimental calls Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) URL: https://github.com/rapidsai/cugraph/pull/3879 --- .../cugraph/dask/link_prediction/jaccard.py | 10 +--- .../cugraph/dask/link_prediction/overlap.py | 10 +--- .../cugraph/dask/link_prediction/sorensen.py | 10 +--- .../tests/link_prediction/test_jaccard_mg.py | 59 ++++++------------- .../tests/link_prediction/test_overlap_mg.py | 59 ++++++------------- .../tests/link_prediction/test_sorensen_mg.py | 59 ++++++------------- 6 files changed, 60 insertions(+), 147 deletions(-) diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 218e6206fc3..5362c7a9e1e 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -118,7 +118,9 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. 
Returns ------- @@ -144,12 +146,6 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index 5540be28fd1..4bda05e3c95 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -96,7 +96,9 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -122,12 +124,6 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 24295ac330c..163b0d0dc16 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -92,7 +92,9 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): adjacent vertices in the graph. use_weight : bool, optional (default=False) - Currently not supported + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. Returns ------- @@ -118,12 +120,6 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): vertex_pair_col_name = vertex_pair.columns - if use_weight: - raise ValueError("'use_weight' is currently not supported.") - - if input_graph.is_weighted(): - raise ValueError("Weighted graphs are currently not supported.") - if isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)): vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index b56a6baae2b..ee739c9f236 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -34,6 +34,7 @@ def setup_function(): IS_DIRECTED = [False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # ============================================================================= @@ -48,6 +49,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (IS_WEIGHTED, "is_weighted"), ) @@ -57,7 +59,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. 
""" - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -72,7 +76,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -84,7 +91,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_jaccard = cugraph.experimental.jaccard(G, input_combo["vertex_pair"]) + sg_cugraph_jaccard = cugraph.jaccard( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. @@ -104,6 +113,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -122,8 +132,11 @@ def input_expected_output(input_combo): def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_jaccard = benchmark(dcg.jaccard, dg, input_expected_output["vertex_pair"]) + result_jaccard = benchmark( + dcg.jaccard, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_jaccard = ( result_jaccard.compute() @@ -151,41 +164,3 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): assert len(jaccard_coeff_diffs1) == 0 assert len(jaccard_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_jaccard(dask_client): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.jaccard(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.jaccard(dg, use_weight=use_weight) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index ce4bf619f47..87407d7b59c 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -34,6 +34,7 @@ def setup_function(): IS_DIRECTED = [False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # ============================================================================= @@ -48,6 +49,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + 
(IS_WEIGHTED, "is_weighted"), ) @@ -57,7 +59,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. """ - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -72,7 +76,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -84,7 +91,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_overlap = cugraph.experimental.overlap(G, input_combo["vertex_pair"]) + sg_cugraph_overlap = cugraph.overlap( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. @@ -104,6 +113,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -125,8 +135,11 @@ def input_expected_output(input_combo): def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_overlap = benchmark(dcg.overlap, dg, input_expected_output["vertex_pair"]) + result_overlap = benchmark( + dcg.overlap, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_overlap = ( result_overlap.compute() @@ -154,41 +167,3 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): assert len(overlap_coeff_diffs1) == 0 assert len(overlap_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_overlap(): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.overlap(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.overlap(dg, use_weight=use_weight) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index af6b60771a0..66832d08427 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -35,6 +35,7 @@ def setup_function(): IS_DIRECTED = [False] HAS_VERTEX_PAIR = [True, False] +IS_WEIGHTED = [True, False] # 
============================================================================= @@ -49,6 +50,7 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (IS_WEIGHTED, "is_weighted"), ) @@ -58,7 +60,9 @@ def input_combo(request): Simply return the current combination of params as a dictionary for use in tests or other parameterized fixtures. """ - parameters = dict(zip(("graph_file", "directed", "has_vertex_pair"), request.param)) + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + ) return parameters @@ -73,7 +77,10 @@ def input_expected_output(input_combo): input_data_path = input_combo["graph_file"] directed = input_combo["directed"] has_vertex_pair = input_combo["has_vertex_pair"] - G = utils.generate_cugraph_graph_from_file(input_data_path, directed=directed) + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) if has_vertex_pair: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -85,7 +92,9 @@ def input_expected_output(input_combo): vertex_pair = None input_combo["vertex_pair"] = vertex_pair - sg_cugraph_sorensen = cugraph.experimental.sorensen(G, input_combo["vertex_pair"]) + sg_cugraph_sorensen = cugraph.sorensen( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for # them, and if not present they will have to re-run the same cuGraph call. @@ -105,6 +114,7 @@ def input_expected_output(input_combo): ddf, source="src", destination="dst", + edge_attr="value" if is_weighted else None, renumber=True, store_transposed=True, ) @@ -124,8 +134,11 @@ def input_expected_output(input_combo): def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] - result_sorensen = benchmark(dcg.sorensen, dg, input_expected_output["vertex_pair"]) + result_sorensen = benchmark( + dcg.sorensen, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) result_sorensen = ( result_sorensen.compute() @@ -153,41 +166,3 @@ def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): assert len(sorensen_coeff_diffs1) == 0 assert len(sorensen_coeff_diffs2) == 0 - - -@pytest.mark.mg -def test_dask_mg_weighted_sorensen(dask_client): - input_data_path = datasets[0] - chunksize = dcg.get_chunksize(input_data_path) - ddf = dask_cudf.read_csv( - input_data_path, - chunksize=chunksize, - delimiter=" ", - names=["src", "dst", "value"], - dtype=["int32", "int32", "float32"], - ) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - renumber=True, - store_transposed=True, - ) - with pytest.raises(ValueError): - dcg.sorensen(dg) - - dg = cugraph.Graph(directed=False) - dg.from_dask_cudf_edgelist( - ddf, - source="src", - destination="dst", - edge_attr="value", - store_transposed=True, - ) - - use_weight = True - with pytest.raises(ValueError): - dcg.sorensen(dg, use_weight=use_weight) From 5c34d3dd340c76678d8f2667057c6b0ce2f1f480 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Tue, 26 Sep 2023 11:57:30 -0400 Subject: [PATCH 54/72] Update Allocator Selection in 
cuGraph-DGL Example (#3877) Closes #3847 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3877 --- .../cugraph-dgl/examples/graphsage/node-classification.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cugraph-dgl/examples/graphsage/node-classification.py b/python/cugraph-dgl/examples/graphsage/node-classification.py index 24df73ada75..320890b0312 100644 --- a/python/cugraph-dgl/examples/graphsage/node-classification.py +++ b/python/cugraph-dgl/examples/graphsage/node-classification.py @@ -39,14 +39,16 @@ def set_allocators(): + import rmm import cudf import cupy - import rmm + from rmm.allocators.torch import rmm_torch_allocator + from rmm.allocators.cupy import rmm_cupy_allocator mr = rmm.mr.CudaAsyncMemoryResource() rmm.mr.set_current_device_resource(mr) - torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator) - cupy.cuda.set_allocator(rmm.allocators.cupy.rmm_cupy_allocator) + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + cupy.cuda.set_allocator(rmm_cupy_allocator) cudf.set_option("spill", True) From 5d2f5486899bdd2d71c00b12fb26afbdd60100d1 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Tue, 26 Sep 2023 20:59:55 -0500 Subject: [PATCH 55/72] Updates to build and test `nx-cugraph` wheel as part of CI and nightly workflows (#3852) closes rapidsai/graph_dl#302 * Updates GHA yaml files to build and test a `nx-cugraph` wheel * Adds CI scripts for building and testing the `nx-cugraph` wheel * Adds a smoketest script for `nx-cugraph` * Relevant code cleanup: removes unnecessary dataset download from cugraph wheel testing Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cugraph/pull/3852 --- .github/workflows/build.yaml | 20 ++++++++++++++++ .github/workflows/pr.yaml | 16 +++++++++++++ .github/workflows/test.yaml | 9 ++++++++ ci/build_wheel.sh | 12 ++++++---- ci/build_wheel_nx-cugraph.sh | 6 +++++ ci/test_wheel.sh | 9 +++++--- ci/test_wheel_cugraph.sh | 8 ------- ci/test_wheel_nx-cugraph.sh | 6 +++++ ci/wheel_smoke_test_nx-cugraph.py | 38 +++++++++++++++++++++++++++++++ 9 files changed, 109 insertions(+), 15 deletions(-) create mode 100755 ci/build_wheel_nx-cugraph.sh create mode 100755 ci/test_wheel_nx-cugraph.sh create mode 100644 ci/wheel_smoke_test_nx-cugraph.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 02b357c7c88..c01a6fcb94a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -112,3 +112,23 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: cugraph + wheel-build-nx-cugraph: + needs: wheel-publish-pylibcugraph + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_nx-cugraph.sh + wheel-publish-nx-cugraph: + needs: wheel-build-nx-cugraph + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: 
${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: nx-cugraph diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d2d24d90fbe..d49ae7f8d3d 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -23,6 +23,8 @@ jobs: - wheel-tests-pylibcugraph - wheel-build-cugraph - wheel-tests-cugraph + - wheel-build-nx-cugraph + - wheel-tests-nx-cugraph secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 checks: @@ -109,3 +111,17 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cugraph.sh + wheel-build-nx-cugraph: + needs: wheel-tests-pylibcugraph + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + with: + build_type: pull-request + script: ci/build_wheel_nx-cugraph.sh + wheel-tests-nx-cugraph: + needs: wheel-build-nx-cugraph + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + with: + build_type: pull-request + script: ci/test_wheel_nx-cugraph.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 97abca71260..dc9ed60b29e 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -48,3 +48,12 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/test_wheel_cugraph.sh + wheel-tests-nx-cugraph: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/test_wheel_nx-cugraph.sh diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 3798d561126..821aa25c1b9 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -49,7 +49,11 @@ cd "${package_dir}" python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check -mkdir -p final_dist -python -m auditwheel repair -w final_dist dist/* - -RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +# pure-python packages should not have auditwheel run on them. +if [[ ${package_name} == "nx-cugraph" ]]; then + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 dist +else + mkdir -p final_dist + python -m auditwheel repair -w final_dist dist/* + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist +fi diff --git a/ci/build_wheel_nx-cugraph.sh b/ci/build_wheel_nx-cugraph.sh new file mode 100755 index 00000000000..4481de1283d --- /dev/null +++ b/ci/build_wheel_nx-cugraph.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +set -euo pipefail + +./ci/build_wheel.sh nx-cugraph python/nx-cugraph diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 146186ae2e7..3ac3549f143 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -6,17 +6,20 @@ set -eoxu pipefail package_name=$1 package_dir=$2 +python_package_name=$(echo ${package_name}|sed 's/-/_/g') + mkdir -p ./dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# echo to expand wildcard before adding `[extra]` requires for pip +# use 'ls' to expand wildcard before adding `[extra]` requires for pip RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -python -m pip install $(echo ./dist/${package_name}*.whl)[test] +# pip creates wheels using python package names +python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] # Run smoke tests for aarch64 pull requests arch=$(uname -m) if [[ "${arch}" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then python ./ci/wheel_smoke_test_${package_name}.py else - RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest ./python/${package_name}/${package_name}/tests + RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest ./python/${package_name}/${python_package_name}/tests fi diff --git a/ci/test_wheel_cugraph.sh b/ci/test_wheel_cugraph.sh index 4d511ac2a0f..f9e2aa6d8da 100755 --- a/ci/test_wheel_cugraph.sh +++ b/ci/test_wheel_cugraph.sh @@ -11,12 +11,4 @@ python -m pip install --no-deps ./local-pylibcugraph-dep/pylibcugraph*.whl # Always install latest dask for testing python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main -# Only download test data for x86 -arch=$(uname -m) -if [[ "${arch}" == "x86_64" ]]; then - pushd ./datasets - bash ./get_test_data.sh - popd -fi - ./ci/test_wheel.sh cugraph python/cugraph diff --git a/ci/test_wheel_nx-cugraph.sh b/ci/test_wheel_nx-cugraph.sh new file mode 100755 index 00000000000..53d40960fc3 --- /dev/null +++ b/ci/test_wheel_nx-cugraph.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2023, NVIDIA CORPORATION. + +set -eoxu pipefail + +./ci/test_wheel.sh nx-cugraph python/nx-cugraph diff --git a/ci/wheel_smoke_test_nx-cugraph.py b/ci/wheel_smoke_test_nx-cugraph.py new file mode 100644 index 00000000000..10d26e3aac7 --- /dev/null +++ b/ci/wheel_smoke_test_nx-cugraph.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import networkx as nx +import nx_cugraph as nxcg + + +if __name__ == "__main__": + G = nx.Graph() + G.add_edges_from([(0, 1), (1, 2), (2, 3)]) + + nx_result = nx.betweenness_centrality(G) + # nx_cugraph is intended to be called via the NetworkX dispatcher, like + # this: + # nxcu_result = nx.betweenness_centrality(G, backend="cugraph") + # + # but here it is being called directly since the NetworkX version that + # supports the "backend" kwarg may not be available in the testing env. 
+ nxcu_result = nxcg.betweenness_centrality(G) + + nx_nodes, nxcu_nodes = nx_result.keys(), nxcu_result.keys() + assert nxcu_nodes == nx_nodes + for node_id in nx_nodes: + nx_bc, nxcu_bc = nx_result[node_id], nxcu_result[node_id] + assert math.isclose(nx_bc, nxcu_bc, rel_tol=1e-6), \ + f"bc for {node_id=} exceeds tolerance: {nx_bc=}, {nxcu_bc=}" From 8b87915c8c5e068caef04be4eeeb9cf7ae27b488 Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:08:12 -0700 Subject: [PATCH 56/72] Add cuGraph devcontainers (#3838) This PR adds some [devcontainers](https://containers.dev/) to help simplify building the cuGraph C++ and Python libraries. It also adds an optional job to the `pr.yaml` to [build the cuGraph libs in each devcontainer](https://github.com/trxcllnt/cugraph/blob/fea/devcontainers/.github/workflows/pr.yaml#L113-L119), so the build caches are populated for devs by CI. A devcontainer can be launched by clicking the "Reopen in Container" button that VSCode shows when opening the repo (or by using the "Rebuild and Reopen in Container" command from the command palette): ![image](https://user-images.githubusercontent.com/178183/221771999-97ab29d5-e718-4e5f-b32f-2cdd51bba25c.png) Clicking this button will cause VSCode to prompt the user to select one of these devcontainer variants: ![image](https://github.com/rapidsai/rmm/assets/178183/68d4b264-4fc2-4008-92b6-cb4bdd19b29f) On startup, the devcontainer creates or updates the conda/pip environment using `cugraph/dependencies.yaml`. The envs/package caches are cached on the host via volume mounts, which are described in more detail in [`.devcontainer/README.md`](https://github.com/trxcllnt/cugraph/blob/fea/devcontainers/.devcontainer/README.md). The container includes convenience functions to clean, configure, and build the various cuGraph components: ```shell $ clean-cugraph-cpp # only cleans the C++ build dir $ clean-cugraph-python # only cleans the Python build dir $ clean-cugraph # cleans both C++ and Python build dirs $ configure-cugraph-cpp # only configures cugraph C++ lib $ build-cugraph-cpp # only builds cugraph C++ lib $ build-cugraph-python # only builds cugraph Python lib $ build-cugraph # builds both C++ and Python libs ``` * The C++ build script is a small wrapper around `cmake -S ~/cugraph/cpp -B ~/cugraph/cpp/build` and `cmake --build ~/cugraph/cpp/build` * The Python build script is a small wrapper around `pip install --editable ~/cugraph/cpp` Unlike `build.sh`, these convenience scripts *don't* install the libraries after building them. 
Instead, they automatically inject the correct arguments to build the C++ libraries from source and use their build dirs as package roots: ```shell $ cmake -S ~/cugraph/cpp -B ~/cugraph/cpp/build $ CMAKE_ARGS="-Dcugraph_ROOT=~/cugraph/cpp/build" \ # <-- this argument is automatic pip install -e ~/cugraph/cpp ``` Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cugraph/pull/3838 --- .devcontainer/Dockerfile | 33 +++ .devcontainer/README.md | 34 +++ .../cuda11.8-conda/devcontainer.json | 37 +++ .devcontainer/cuda11.8-pip/devcontainer.json | 37 +++ .../cuda12.0-conda/devcontainer.json | 37 +++ .devcontainer/cuda12.0-pip/devcontainer.json | 37 +++ .github/workflows/pr.yaml | 10 + .gitignore | 4 + ci/release/update-version.sh | 7 + .../all_cuda-118_arch-x86_64.yaml | 2 + .../all_cuda-120_arch-x86_64.yaml | 2 + cpp/.clangd | 65 +++++ dependencies.yaml | 263 +++++++++++++++--- 13 files changed, 527 insertions(+), 41 deletions(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/README.md create mode 100644 .devcontainer/cuda11.8-conda/devcontainer.json create mode 100644 .devcontainer/cuda11.8-pip/devcontainer.json create mode 100644 .devcontainer/cuda12.0-conda/devcontainer.json create mode 100644 .devcontainer/cuda12.0-pip/devcontainer.json create mode 100644 cpp/.clangd diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 00000000000..3d0ac075be3 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,33 @@ +# syntax=docker/dockerfile:1.5 + +ARG BASE +ARG PYTHON_PACKAGE_MANAGER=conda + +FROM ${BASE} as pip-base + +ENV DEFAULT_VIRTUAL_ENV=rapids + +FROM ${BASE} as conda-base + +ENV DEFAULT_CONDA_ENV=rapids + +FROM ${PYTHON_PACKAGE_MANAGER}-base + +ARG CUDA +ENV CUDAARCHS="RAPIDS" +ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" + +ARG PYTHON_PACKAGE_MANAGER +ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}" + +ENV PYTHONSAFEPATH="1" +ENV PYTHONUNBUFFERED="1" +ENV PYTHONDONTWRITEBYTECODE="1" + +ENV SCCACHE_REGION="us-east-2" +ENV SCCACHE_BUCKET="rapids-sccache-devs" +ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV HISTFILE="/home/coder/.cache/._bash_history" + +# cugraph_pyg's setup.py needs this defined when building in a conda env +ENV CUDA_HOME="${CUDA_HOME:-/home/coder/.conda/envs/$DEFAULT_CONDA_ENV}" diff --git a/.devcontainer/README.md b/.devcontainer/README.md new file mode 100644 index 00000000000..e645c51de8b --- /dev/null +++ b/.devcontainer/README.md @@ -0,0 +1,34 @@ +# cuGraph Development Containers + +This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces). + +This container is a turnkey development environment for building and testing the cuGraph C++ and Python libraries. 
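+
+The image also provides the `rapids-build-utils` convenience scripts, so a typical build inside the container is simply:
+
+```shell
+build-cugraph-cpp     # configure and build libcugraph
+build-cugraph-python  # build the cugraph Python package against that C++ build
+build-cugraph         # or build both in one step
+```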
+ +## Table of Contents + +* [Prerequisites](#prerequisites) +* [Host bind mounts](#host-bind-mounts) +* [Launch a Dev Container](#launch-a-dev-container) +## Prerequisites + +* [VSCode](https://code.visualstudio.com/download) +* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) + +## Host bind mounts + +By default, the following directories are bind-mounted into the devcontainer: + +* `${repo}:/home/coder/cugraph` +* `${repo}/../.aws:/home/coder/.aws` +* `${repo}/../.local:/home/coder/.local` +* `${repo}/../.cache:/home/coder/.cache` +* `${repo}/../.conda:/home/coder/.conda` +* `${repo}/../.config:/home/coder/.config` + +This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs. + +## Launch a Dev Container + +To launch a devcontainer from VSCode, open the cuGraph repo and select the "Reopen in Container" button in the bottom right:
+ +Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command. diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json new file mode 100644 index 00000000000..cf4ba5aa114 --- /dev/null +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cugraph,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json new file mode 100644 index 00000000000..e86a38abbde --- /dev/null +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda11.8-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cugraph,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json new file mode 100644 index 00000000000..863eeea48ff --- /dev/null +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:23.10-cpp-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cugraph,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json new file mode 100644 index 00000000000..c7612771fd3 --- /dev/null +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:23.10-cpp-llvm16-cuda12.0-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/ucx:23.10": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cugraph,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d49ae7f8d3d..7b267d7edf3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,6 +25,7 @@ jobs: - wheel-tests-cugraph - wheel-build-nx-cugraph - wheel-tests-nx-cugraph + - devcontainer secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 checks: @@ -125,3 +126,12 @@ jobs: with: build_type: pull-request script: ci/test_wheel_nx-cugraph.sh + devcontainer: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.10 + with: + extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY + build_command: | + sccache -z; + build-all --verbose; + sccache -s; diff --git a/.gitignore b/.gitignore index 3fda9f8a037..c6bcf6965d7 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,7 @@ python/cugraph/cugraph/tests/dask-worker-space docs/cugraph/source/api_docs/api/* _html _text + +# clang tooling +compile_commands.json +.clangd/ diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index bd3aa6bc370..f3892fbd3c4 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -128,3 +128,10 @@ done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" python/nx-cugraph/README.md + +# .devcontainer files +find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do + sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" +done diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c66890f8ae5..86de24c991d 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,10 +66,12 @@ dependencies: - scikit-build>=0.13.1 - scikit-learn>=0.23.1 - scipy +- setuptools>=61.0.0 - sphinx-copybutton - sphinx-markdown-tables - sphinx<6 - sphinxcontrib-websupport - ucx-proc=*=gpu - ucx-py==0.34.* +- wheel name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 3afb1415572..1054f75ba54 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ 
b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -65,10 +65,12 @@ dependencies: - scikit-build>=0.13.1 - scikit-learn>=0.23.1 - scipy +- setuptools>=61.0.0 - sphinx-copybutton - sphinx-markdown-tables - sphinx<6 - sphinxcontrib-websupport - ucx-proc=*=gpu - ucx-py==0.34.* +- wheel name: all_cuda-120_arch-x86_64 diff --git a/cpp/.clangd b/cpp/.clangd new file mode 100644 index 00000000000..7c4fe036ddf --- /dev/null +++ b/cpp/.clangd @@ -0,0 +1,65 @@ +# https://clangd.llvm.org/config + +# Apply a config conditionally to all C files +If: + PathMatch: .*\.(c|h)$ + +--- + +# Apply a config conditionally to all C++ files +If: + PathMatch: .*\.(c|h)pp + +--- + +# Apply a config conditionally to all CUDA files +If: + PathMatch: .*\.cuh? +CompileFlags: + Add: + - "-x" + - "cuda" + # No error on unknown CUDA versions + - "-Wno-unknown-cuda-version" + # Allow variadic CUDA functions + - "-Xclang=-fcuda-allow-variadic-functions" +Diagnostics: + Suppress: + - "variadic_device_fn" + - "attributes_not_allowed" + +--- + +# Tweak the clangd parse settings for all files +CompileFlags: + Add: + # report all errors + - "-ferror-limit=0" + - "-fmacro-backtrace-limit=0" + - "-ftemplate-backtrace-limit=0" + # Skip the CUDA version check + - "--no-cuda-version-check" + Remove: + # remove gcc's -fcoroutines + - -fcoroutines + # remove nvc++ flags unknown to clang + - "-gpu=*" + - "-stdpar*" + # remove nvcc flags unknown to clang + - "-arch*" + - "-gencode*" + - "--generate-code*" + - "-ccbin*" + - "-t=*" + - "--threads*" + - "-Xptxas*" + - "-Xcudafe*" + - "-Xfatbin*" + - "-Xcompiler*" + - "--diag-suppress*" + - "--diag_suppress*" + - "--compiler-options*" + - "--expt-extended-lambda" + - "--expt-relaxed-constexpr" + - "-forward-unknown-to-host-compiler" + - "-Werror=cross-execution-space-call" diff --git a/dependencies.yaml b/dependencies.yaml index 04ec1b6e957..a162ac01354 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -11,9 +11,15 @@ files: - cpp_build - cudatoolkit - docs + - python_build_wheel - python_build_cythonize + - depends_on_rmm + - depends_on_cudf + - depends_on_dask_cudf + - depends_on_pylibraft + - depends_on_raft_dask + - depends_on_cupy - python_run_cugraph - - python_run_pylibcugraph - python_run_nx_cugraph - python_run_cugraph_dgl - python_run_cugraph_pyg @@ -50,6 +56,7 @@ files: output: none includes: - cudatoolkit + - depends_on_cudf - py_version - test_python_common - test_python_cugraph @@ -62,14 +69,22 @@ files: includes: - common_build - python_build_wheel + - depends_on_rmm + - depends_on_pylibraft + - depends_on_pylibcugraph - python_build_cythonize - - python_build_cugraph py_run_cugraph: output: pyproject pyproject_dir: python/cugraph extras: table: project includes: + - depends_on_rmm + - depends_on_cudf + - depends_on_dask_cudf + - depends_on_raft_dask + - depends_on_pylibcugraph + - depends_on_cupy - python_run_cugraph py_test_cugraph: output: pyproject @@ -88,6 +103,8 @@ files: includes: - common_build - python_build_wheel + - depends_on_rmm + - depends_on_pylibraft - python_build_cythonize py_run_pylibcugraph: output: pyproject @@ -95,7 +112,8 @@ files: extras: table: project includes: - - python_run_pylibcugraph + - depends_on_rmm + - depends_on_pylibraft py_test_pylibcugraph: output: pyproject pyproject_dir: python/pylibcugraph @@ -103,6 +121,7 @@ files: table: project.optional-dependencies key: test includes: + - depends_on_cudf - test_python_common - test_python_pylibcugraph py_build_nx_cugraph: @@ -118,6 +137,8 @@ files: extras: table: project includes: + - 
depends_on_pylibcugraph + - depends_on_cupy - python_run_nx_cugraph py_test_nx_cugraph: output: pyproject @@ -183,6 +204,10 @@ files: extras: table: project includes: + - depends_on_rmm + - depends_on_cudf + - depends_on_dask_cudf + - depends_on_cupy - python_run_cugraph_service_server py_test_cugraph_service_server: output: pyproject @@ -334,41 +359,29 @@ dependencies: - python>=3.9,<3.11 python_build_wheel: common: - - output_types: [conda, pyproject] + - output_types: [conda, pyproject, requirements] packages: - - wheel - setuptools>=61.0.0 + - wheel python_build_cythonize: common: - - output_types: [conda, pyproject] + - output_types: [conda, pyproject, requirements] packages: - cython>=3.0.0 - - &pylibraft pylibraft==23.10.* - - &rmm rmm==23.10.* - scikit-build>=0.13.1 - python_build_cugraph: - common: - - output_types: [conda, pyproject] - packages: - - &pylibcugraph pylibcugraph==23.10.* python_run_cugraph: common: - output_types: [conda, pyproject] packages: - - &cudf cudf==23.10.* - &dask dask>=2023.7.1 - &distributed distributed>=2023.7.1 - &dask_cuda dask-cuda==23.10.* - - &dask_cudf dask-cudf==23.10.* - &numba numba>=0.57 - - raft-dask==23.10.* - - *rmm - &ucx_py ucx-py==0.34.* - output_types: conda packages: - aiohttp - - &cupy cupy>=12.0.0 - - &dask-core dask-core>=2023.7.1 + - &dask-core_conda dask-core>=2023.7.1 - fsspec>=0.6.0 - libcudf==23.10.* - requests @@ -376,29 +389,14 @@ dependencies: - ucx-proc=*=gpu - output_types: pyproject packages: - - &cupy_pip cupy-cuda11x>=12.0.0 # cudf uses fsspec but is protocol independent. cugraph # dataset APIs require [http] extras for use with cudf. - fsspec[http]>=0.6.0 - - *pylibcugraph - python_run_pylibcugraph: - common: - - output_types: [conda, pyproject] - packages: - - *pylibraft - - *rmm python_run_nx_cugraph: common: - output_types: [conda, pyproject] packages: - networkx>=3.0 - - output_types: conda - packages: - - *cupy - - output_types: pyproject - packages: - - *cupy_pip - - *pylibcugraph python_run_cugraph_dgl: common: - output_types: [conda, pyproject] @@ -426,23 +424,18 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - *cudf - *dask - *dask_cuda - - *dask_cudf - *distributed - *numba - *numpy - - *rmm - *thrift - *ucx_py - output_types: conda packages: - - *cupy - - *dask-core + - *dask-core_conda - output_types: pyproject packages: - - *cupy_pip - *cugraph - cugraph-service-client==23.10.* doc: @@ -492,7 +485,6 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - *cudf - *numpy test_python_nx_cugraph: common: @@ -519,3 +511,192 @@ dependencies: - pytorch==2.0 - pytorch-cuda==11.8 - pyg=2.3.1=*torch_2.0.0*cu118* + + depends_on_rmm: + common: + - output_types: conda + packages: + - &rmm_conda rmm==23.10.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &rmm_packages_pip_cu12 + - rmm-cu12==23.10.* + - {matrix: {cuda: "12.1"}, packages: *rmm_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *rmm_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &rmm_packages_pip_cu11 + - rmm-cu11==23.10.* + - {matrix: {cuda: "11.5"}, packages: *rmm_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *rmm_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, packages: *rmm_packages_pip_cu11} + - {matrix: null, packages: [*rmm_conda]} + + depends_on_cudf: + 
common: + - output_types: conda + packages: + - &cudf_conda cudf==23.10.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &cudf_packages_pip_cu12 + - cudf-cu12==23.10.* + - {matrix: {cuda: "12.1"}, packages: *cudf_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *cudf_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &cudf_packages_pip_cu11 + - cudf-cu11==23.10.* + - {matrix: {cuda: "11.5"}, packages: *cudf_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *cudf_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, packages: *cudf_packages_pip_cu11} + - {matrix: null, packages: [*cudf_conda]} + + depends_on_dask_cudf: + common: + - output_types: conda + packages: + - &dask_cudf_conda dask-cudf==23.10.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &dask_cudf_packages_pip_cu12 + - dask-cudf-cu12==23.10.* + - {matrix: {cuda: "12.1"}, packages: *dask_cudf_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *dask_cudf_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &dask_cudf_packages_pip_cu11 + - dask-cudf-cu11==23.10.* + - {matrix: {cuda: "11.5"}, packages: *dask_cudf_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *dask_cudf_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, packages: *dask_cudf_packages_pip_cu11} + - {matrix: null, packages: [*dask_cudf_conda]} + + depends_on_pylibraft: + common: + - output_types: conda + packages: + - &pylibraft_conda pylibraft==23.10.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &pylibraft_packages_pip_cu12 + - pylibraft-cu12==23.10.* + - {matrix: {cuda: "12.1"}, packages: *pylibraft_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *pylibraft_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &pylibraft_packages_pip_cu11 + - pylibraft-cu11==23.10.* + - {matrix: {cuda: "11.5"}, packages: *pylibraft_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *pylibraft_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, packages: *pylibraft_packages_pip_cu11} + - {matrix: null, packages: [*pylibraft_conda]} + + depends_on_raft_dask: + common: + - output_types: conda + packages: + - &raft_dask_conda raft-dask==23.10.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &raft_dask_packages_pip_cu12 + - raft-dask-cu12==23.10.* + - {matrix: {cuda: "12.1"}, packages: *raft_dask_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *raft_dask_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &raft_dask_packages_pip_cu11 + - raft-dask-cu11==23.10.* + - {matrix: {cuda: "11.5"}, packages: *raft_dask_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *raft_dask_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, 
packages: *raft_dask_packages_pip_cu11} + - {matrix: null, packages: [*raft_dask_conda]} + + depends_on_pylibcugraph: + common: + - output_types: conda + packages: + - &pylibcugraph_conda pylibcugraph==23.10.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &pylibcugraph_packages_pip_cu12 + - pylibcugraph-cu12==23.10.* + - {matrix: {cuda: "12.1"}, packages: *pylibcugraph_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *pylibcugraph_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &pylibcugraph_packages_pip_cu11 + - pylibcugraph-cu11==23.10.* + - {matrix: {cuda: "11.5"}, packages: *pylibcugraph_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *pylibcugraph_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, packages: *pylibcugraph_packages_pip_cu11} + - {matrix: null, packages: [*pylibcugraph_conda]} + + depends_on_cupy: + common: + - output_types: conda + packages: + - cupy>=12.0.0 + specific: + - output_types: [requirements, pyproject] + matrices: + # All CUDA 12 + x86_64 versions + - matrix: {cuda: "12.2", arch: x86_64} + packages: &cupy_packages_cu12_x86_64 + - cupy-cuda12x>=12.0.0 + - {matrix: {cuda: "12.1", arch: x86_64}, packages: *cupy_packages_cu12_x86_64} + - {matrix: {cuda: "12.0", arch: x86_64}, packages: *cupy_packages_cu12_x86_64} + + # All CUDA 12 + aarch64 versions + - matrix: {cuda: "12.2", arch: aarch64} + packages: &cupy_packages_cu12_aarch64 + - cupy-cuda12x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works. + - {matrix: {cuda: "12.1", arch: aarch64}, packages: *cupy_packages_cu12_aarch64} + - {matrix: {cuda: "12.0", arch: aarch64}, packages: *cupy_packages_cu12_aarch64} + + # All CUDA 11 + x86_64 versions + - matrix: {cuda: "11.8", arch: x86_64} + packages: &cupy_packages_cu11_x86_64 + - cupy-cuda11x>=12.0.0 + - {matrix: {cuda: "11.5", arch: x86_64}, packages: *cupy_packages_cu11_x86_64} + - {matrix: {cuda: "11.4", arch: x86_64}, packages: *cupy_packages_cu11_x86_64} + - {matrix: {cuda: "11.2", arch: x86_64}, packages: *cupy_packages_cu11_x86_64} + + # All CUDA 11 + aarch64 versions + - matrix: {cuda: "11.8", arch: aarch64} + packages: &cupy_packages_cu11_aarch64 + - cupy-cuda11x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works. 
+ - {matrix: {cuda: "11.5", arch: aarch64}, packages: *cupy_packages_cu11_aarch64} + - {matrix: {cuda: "11.4", arch: aarch64}, packages: *cupy_packages_cu11_aarch64} + - {matrix: {cuda: "11.2", arch: aarch64}, packages: *cupy_packages_cu11_aarch64} + - {matrix: null, packages: [cupy-cuda11x>=12.0.0]} From 84207c34ee3a5a02853762f85b40cdaeb5afdee9 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Thu, 28 Sep 2023 10:38:29 -0400 Subject: [PATCH 57/72] Integrate C++ Renumbering and Compression (#3841) - [x] C API - [x] PLC - [x] Python API - [x] Bulk Sampling API - [x] Documentation for Python SG - [x] Documentation for Python MG - [x] Documentation for Bulk Sampler - [x] Resolve the C++ empty batch issue with new check - [x] Add FutureWarnings for all temporary flags - [x] Remove all print statements and pytest tags - [x] Verify cuGraph-PyG and cuGraph-DGL tests Authors: - Alex Barghi (https://github.com/alexbarghi-nv) - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Tingyu Wang (https://github.com/tingyu66) - Seunghwa Kang (https://github.com/seunghwak) - Joseph Nke (https://github.com/jnke2016) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3841 --- cpp/include/cugraph/sampling_functions.hpp | 2 +- cpp/include/cugraph_c/sampling_algorithms.h | 127 ++++---- cpp/src/c_api/uniform_neighbor_sampling.cpp | 291 ++++++++++++----- .../sampling_post_processing_impl.cuh | 4 +- cpp/tests/c_api/create_graph_test.c | 26 +- .../c_api/mg_uniform_neighbor_sample_test.c | 193 +++++++---- .../c_api/uniform_neighbor_sample_test.c | 267 ++++------------ .../dask/sampling/uniform_neighbor_sample.py | 300 +++++++++--------- .../cugraph/gnn/data_loading/bulk_sampler.py | 1 + .../gnn/data_loading/bulk_sampler_io.py | 219 ++++++++++++- .../cugraph/sampling/sampling_utilities.py | 198 ++++++++++++ .../sampling/uniform_neighbor_sample.py | 197 ++++++------ .../tests/sampling/test_bulk_sampler.py | 52 ++- .../tests/sampling/test_bulk_sampler_io.py | 69 +++- .../tests/sampling/test_bulk_sampler_io_mg.py | 14 +- .../tests/sampling/test_bulk_sampler_mg.py | 58 +++- .../sampling/test_uniform_neighbor_sample.py | 207 +++++++++++- .../test_uniform_neighbor_sample_mg.py | 244 +++++++++++++- .../pylibcugraph/_cugraph_c/algorithms.pxd | 48 ++- .../_cugraph_c/sampling_algorithms.pxd | 17 - .../internal_types/sampling_result.pyx | 91 +++++- .../tests/test_uniform_neighbor_sample.py | 4 +- .../pylibcugraph/uniform_neighbor_sample.pyx | 112 ++++++- 23 files changed, 2021 insertions(+), 720 deletions(-) create mode 100644 python/cugraph/cugraph/sampling/sampling_utilities.py diff --git a/cpp/include/cugraph/sampling_functions.hpp b/cpp/include/cugraph/sampling_functions.hpp index e42ef9bfcf3..75cf8f91f92 100644 --- a/cpp/include/cugraph/sampling_functions.hpp +++ b/cpp/include/cugraph/sampling_functions.hpp @@ -103,7 +103,7 @@ namespace cugraph { * std::get<1>(*edgelist_label_offsets) if @p edgelist_label_offsets.has_value() is true and 1 * otherwise and # hops = std::get<1>(*edgelist_hops) if edgelist_hops.has_value() is true and 1 * otherwise, valid only if at least one of @p edgelist_label_offsets.has_value() or @p - * edgelist_hops.has_value() is rue), renumber_map to query original vertices (size = # unique + * edgelist_hops.has_value() is true), renumber_map to query original vertices (size = # unique * vertices or aggregate # unique vertices for every label), and label 
offsets to the renumber_map * (size = std::get<1>(*edgelist_label_offsets) + 1, valid only if @p * edgelist_label_offsets.has_value() is true). diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 37124d100dd..92fe50ef622 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -205,6 +205,21 @@ typedef enum cugraph_prior_sources_behavior_t { but exclude any vertex that has already been used as a source */ } cugraph_prior_sources_behavior_t; +/** + * @brief Selects the type of compression to use for the output samples. + */ +typedef enum cugraph_compression_type_t { + COO = 0, /** Outputs in COO format. Default. */ + CSR, /** Compresses in CSR format. This means the row (src) column + is compressed into a row pointer. */ + CSC, /** Compresses in CSC format. This means the col (dst) column + is compressed into a column pointer. */ + DCSR, /** Compresses in DCSR format. This outputs an additional index + that avoids empty entries in the row pointer. */ + DCSC /** Compresses in DCSC format. This outputs an additional index + that avoid empty entries in the col pointer. */ +} cugraph_compression_type_t; + /** * @brief Create sampling options object * @@ -225,6 +240,14 @@ cugraph_error_code_t cugraph_sampling_options_create(cugraph_sampling_options_t* */ void cugraph_sampling_set_renumber_results(cugraph_sampling_options_t* options, bool_t value); +/** + * @brief Set whether to compress per-hop (True) or globally (False) + * + * @param options - opaque pointer to the sampling options + * @param value - Boolean value to assign to the option + */ +void cugraph_sampling_set_compress_per_hop(cugraph_sampling_options_t* options, bool_t value); + /** * @brief Set flag to sample with_replacement * @@ -241,6 +264,15 @@ void cugraph_sampling_set_with_replacement(cugraph_sampling_options_t* options, */ void cugraph_sampling_set_return_hops(cugraph_sampling_options_t* options, bool_t value); +/** + * @brief Set compression type + * + * @param options - opaque pointer to the sampling options + * @param value - Enum defining the compresion type + */ +void cugraph_sampling_set_compression_type(cugraph_sampling_options_t* options, + cugraph_compression_type_t value); + /** * @brief Set prior sources behavior * @@ -265,62 +297,6 @@ void cugraph_sampling_set_dedupe_sources(cugraph_sampling_options_t* options, bo */ void cugraph_sampling_options_free(cugraph_sampling_options_t* options); -/** - * @brief Uniform Neighborhood Sampling - * @deprecated This call should be replaced with cugraph_uniform_neighbor_sample - * - * Returns a sample of the neighborhood around specified start vertices. Optionally, each - * start vertex can be associated with a label, allowing the caller to specify multiple batches - * of sampling requests in the same function call - which should improve GPU utilization. - * - * If label is NULL then all start vertices will be considered part of the same batch and the - * return value will not have a label column. - * - * @param [in] handle Handle for accessing resources - * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage - * needs to be transposed - * @param [in] start_vertices Device array of start vertices for the sampling - * @param [in] start_vertex_labels Device array of start vertex labels for the sampling. 
The - * labels associated with each start vertex will be included in the output associated with results - * that were derived from that start vertex. We only support label of type INT32. If label is - * NULL, the return data will not be labeled. - * @param [in] label_list Device array of the labels included in @p start_vertex_labels. If - * @p label_to_comm_rank is not specified this parameter is ignored. If specified, label_list - * must be sorted in ascending order. - * @param [in] label_to_comm_rank Device array identifying which comm rank the output for a - * particular label should be shuffled in the output. If not specifed the data is not organized in - * output. If specified then the all data from @p label_list[i] will be shuffled to rank @p - * label_to_comm_rank[i]. If not specified then the output data will not be shuffled between ranks. - * @param [in] fanout Host array defining the fan out at each step in the sampling algorithm. - * We only support fanout values of type INT32 - * @param [in/out] rng_state State of the random number generator, updated with each call - * @param [in] with_replacement - * Boolean value. If true selection of edges is done with - * replacement. If false selection is done without replacement. - * @param [in] return_hops Boolean value. If true include the hop number in the result, - * If false the hop number will not be included in result. - * @param [in] do_expensive_check - * A flag to run expensive checks for input arguments (if set to true) - * @param [in] result Output from the uniform_neighbor_sample call - * @param [out] error Pointer to an error object storing details of any error. Will - * be populated if error code is not CUGRAPH_SUCCESS - * @return error code - */ -cugraph_error_code_t cugraph_uniform_neighbor_sample_with_edge_properties( - const cugraph_resource_handle_t* handle, - cugraph_graph_t* graph, - const cugraph_type_erased_device_array_view_t* start_vertices, - const cugraph_type_erased_device_array_view_t* start_vertex_labels, - const cugraph_type_erased_device_array_view_t* label_list, - const cugraph_type_erased_device_array_view_t* label_to_comm_rank, - const cugraph_type_erased_host_array_view_t* fan_out, - cugraph_rng_state_t* rng_state, - bool_t with_replacement, - bool_t return_hops, - bool_t do_expensive_check, - cugraph_sample_result_t** result, - cugraph_error_t** error); - /** * @brief Uniform Neighborhood Sampling * @@ -374,6 +350,7 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( cugraph_error_t** error); /** + * @deprecated This call should be replaced with cugraph_sample_result_get_majors * @brief Get the source vertices from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -383,6 +360,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_sources( const cugraph_sample_result_t* result); /** + * @deprecated This call should be replaced with cugraph_sample_result_get_minors * @brief Get the destination vertices from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -391,6 +369,33 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_sources( cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_destinations( const cugraph_sample_result_t* result); +/** + * @brief Get the major vertices from the sampling algorithm result + * + * @param [in] result The result from a sampling algorithm + * @return type erased array pointing to the major vertices in device memory + */ 
+cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_majors( + const cugraph_sample_result_t* result); + +/** + * @brief Get the minor vertices from the sampling algorithm result + * + * @param [in] result The result from a sampling algorithm + * @return type erased array pointing to the minor vertices in device memory + */ +cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_minors( + const cugraph_sample_result_t* result); + +/** + * @brief Get the major offsets from the sampling algorithm result + * + * @param [in] result The result from a sampling algorithm + * @return type erased array pointing to the major offsets in device memory + */ +cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_major_offsets( + const cugraph_sample_result_t* result); + /** * @brief Get the start labels from the sampling algorithm result * @@ -436,6 +441,15 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_weight( cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_hop( const cugraph_sample_result_t* result); +/** + * @brief Get the label-hop offsets from the sampling algorithm result + * + * @param [in] result The result from a sampling algorithm + * @return type erased array pointing to the label-hop offsets + */ +cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_label_hop_offsets( + const cugraph_sample_result_t* result); + /** * @brief Get the index from the sampling algorithm result * @@ -446,6 +460,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_index( const cugraph_sample_result_t* result); /** + * @deprecated This call should be replaced with cugraph_sample_get_get_label_hop_offsets * @brief Get the result offsets from the sampling algorithm result * * @param [in] result The result from a sampling algorithm diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index f146c331d8c..1a53c899109 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -38,17 +38,20 @@ struct cugraph_sampling_options_t { prior_sources_behavior_t prior_sources_behavior_{prior_sources_behavior_t::DEFAULT}; bool_t dedupe_sources_{FALSE}; bool_t renumber_results_{FALSE}; + cugraph_compression_type_t compression_type_{cugraph_compression_type_t::COO}; + bool_t compress_per_hop_{FALSE}; }; struct cugraph_sample_result_t { - cugraph_type_erased_device_array_t* src_{nullptr}; - cugraph_type_erased_device_array_t* dst_{nullptr}; + cugraph_type_erased_device_array_t* major_offsets_{nullptr}; + cugraph_type_erased_device_array_t* majors_{nullptr}; + cugraph_type_erased_device_array_t* minors_{nullptr}; cugraph_type_erased_device_array_t* edge_id_{nullptr}; cugraph_type_erased_device_array_t* edge_type_{nullptr}; cugraph_type_erased_device_array_t* wgt_{nullptr}; cugraph_type_erased_device_array_t* hop_{nullptr}; + cugraph_type_erased_device_array_t* label_hop_offsets_{nullptr}; cugraph_type_erased_device_array_t* label_{nullptr}; - cugraph_type_erased_device_array_t* offsets_{nullptr}; cugraph_type_erased_device_array_t* renumber_map_{nullptr}; cugraph_type_erased_device_array_t* renumber_map_offsets_{nullptr}; }; @@ -186,6 +189,8 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct graph_view.local_vertex_partition_range_last(), do_expensive_check_); + bool has_labels = start_vertex_labels_ != nullptr; + auto&& [src, dst, wgt, edge_id, edge_type, hop, edge_label, offsets] = 
cugraph::uniform_neighbor_sample( handle_, @@ -229,25 +234,130 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct vertex_partition_lasts, do_expensive_check_); + std::optional> majors{std::nullopt}; + rmm::device_uvector minors(0, handle_.get_stream()); + std::optional> major_offsets{std::nullopt}; + + std::optional> label_hop_offsets{std::nullopt}; + std::optional> renumber_map{std::nullopt}; std::optional> renumber_map_offsets{std::nullopt}; + bool src_is_major = (options_.compression_type_ == cugraph_compression_type_t::CSR) || + (options_.compression_type_ == cugraph_compression_type_t::DCSR) || + (options_.compression_type_ == cugraph_compression_type_t::COO); + if (options_.renumber_results_) { - std::tie(src, dst, renumber_map, renumber_map_offsets) = cugraph::renumber_sampled_edgelist( - handle_, - std::move(src), - std::move(dst), - hop ? std::make_optional(raft::device_span{hop->data(), hop->size()}) - : std::nullopt, - std::make_optional(std::make_tuple( - raft::device_span{edge_label->data(), edge_label->size()}, - raft::device_span{offsets->data(), offsets->size()})), - do_expensive_check_); + if (options_.compression_type_ == cugraph_compression_type_t::COO) { + // COO + + rmm::device_uvector output_majors(0, handle_.get_stream()); + rmm::device_uvector output_renumber_map(0, handle_.get_stream()); + std::tie(output_majors, + minors, + wgt, + edge_id, + edge_type, + label_hop_offsets, + output_renumber_map, + renumber_map_offsets) = + cugraph::renumber_and_sort_sampled_edgelist( + handle_, + std::move(src), + std::move(dst), + wgt ? std::move(wgt) : std::nullopt, + edge_id ? std::move(edge_id) : std::nullopt, + edge_type ? std::move(edge_type) : std::nullopt, + hop ? std::make_optional(std::make_tuple(std::move(*hop), fan_out_->size_)) + : std::nullopt, + offsets ? std::make_optional(std::make_tuple( + raft::device_span{offsets->data(), offsets->size()}, + edge_label->size())) + : std::nullopt, + src_is_major, + do_expensive_check_); + + majors.emplace(std::move(output_majors)); + renumber_map.emplace(std::move(output_renumber_map)); + } else { + // (D)CSC, (D)CSR + + bool doubly_compress = (options_.compression_type_ == cugraph_compression_type_t::DCSR) || + (options_.compression_type_ == cugraph_compression_type_t::DCSC); + + rmm::device_uvector output_major_offsets(0, handle_.get_stream()); + rmm::device_uvector output_renumber_map(0, handle_.get_stream()); + std::tie(majors, + output_major_offsets, + minors, + wgt, + edge_id, + edge_type, + label_hop_offsets, + output_renumber_map, + renumber_map_offsets) = + cugraph::renumber_and_compress_sampled_edgelist( + handle_, + std::move(src), + std::move(dst), + wgt ? std::move(wgt) : std::nullopt, + edge_id ? std::move(edge_id) : std::nullopt, + edge_type ? std::move(edge_type) : std::nullopt, + hop ? std::make_optional(std::make_tuple(std::move(*hop), fan_out_->size_)) + : std::nullopt, + offsets ? 
std::make_optional(std::make_tuple( + raft::device_span{offsets->data(), offsets->size()}, + edge_label->size())) + : std::nullopt, + src_is_major, + options_.compress_per_hop_, + doubly_compress, + do_expensive_check_); + + renumber_map.emplace(std::move(output_renumber_map)); + major_offsets.emplace(std::move(output_major_offsets)); + } + + // These are now represented by label_hop_offsets + hop.reset(); + offsets.reset(); + } else { + if (options_.compression_type_ != cugraph_compression_type_t::COO) { + CUGRAPH_FAIL("Can only use COO format if not renumbering"); + } + + std::tie(src, dst, wgt, edge_id, edge_type, label_hop_offsets) = + cugraph::sort_sampled_edgelist( + handle_, + std::move(src), + std::move(dst), + wgt ? std::move(wgt) : std::nullopt, + edge_id ? std::move(edge_id) : std::nullopt, + edge_type ? std::move(edge_type) : std::nullopt, + hop ? std::make_optional(std::make_tuple(std::move(*hop), fan_out_->size_)) + : std::nullopt, + offsets ? std::make_optional(std::make_tuple( + raft::device_span{offsets->data(), offsets->size()}, + edge_label->size())) + : std::nullopt, + src_is_major, + do_expensive_check_); + + majors.emplace(std::move(src)); + minors = std::move(dst); + + hop.reset(); + offsets.reset(); } result_ = new cugraph::c_api::cugraph_sample_result_t{ - new cugraph::c_api::cugraph_type_erased_device_array_t(src, graph_->vertex_type_), - new cugraph::c_api::cugraph_type_erased_device_array_t(dst, graph_->vertex_type_), + (major_offsets) + ? new cugraph::c_api::cugraph_type_erased_device_array_t(*major_offsets, SIZE_T) + : nullptr, + (majors) + ? new cugraph::c_api::cugraph_type_erased_device_array_t(*majors, graph_->vertex_type_) + : nullptr, + new cugraph::c_api::cugraph_type_erased_device_array_t(minors, graph_->vertex_type_), (edge_id) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*edge_id, graph_->edge_type_) : nullptr, @@ -256,12 +366,14 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct : nullptr, (wgt) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*wgt, graph_->weight_type_) : nullptr, - (hop) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*hop, INT32) : nullptr, + (hop) ? new cugraph::c_api::cugraph_type_erased_device_array_t(*hop, INT32) + : nullptr, // FIXME get rid of this + (label_hop_offsets) + ? new cugraph::c_api::cugraph_type_erased_device_array_t(*label_hop_offsets, SIZE_T) + : nullptr, (edge_label) ? new cugraph::c_api::cugraph_type_erased_device_array_t(edge_label.value(), INT32) : nullptr, - (offsets) ? new cugraph::c_api::cugraph_type_erased_device_array_t(offsets.value(), SIZE_T) - : nullptr, (renumber_map) ? 
new cugraph::c_api::cugraph_type_erased_device_array_t( renumber_map.value(), graph_->vertex_type_) : nullptr, @@ -295,6 +407,13 @@ extern "C" void cugraph_sampling_set_renumber_results(cugraph_sampling_options_t internal_pointer->renumber_results_ = value; } +extern "C" void cugraph_sampling_set_compress_per_hop(cugraph_sampling_options_t* options, + bool_t value) +{ + auto internal_pointer = reinterpret_cast(options); + internal_pointer->compress_per_hop_ = value; +} + extern "C" void cugraph_sampling_set_with_replacement(cugraph_sampling_options_t* options, bool_t value) { @@ -308,6 +427,20 @@ extern "C" void cugraph_sampling_set_return_hops(cugraph_sampling_options_t* opt internal_pointer->return_hops_ = value; } +extern "C" void cugraph_sampling_set_compression_type(cugraph_sampling_options_t* options, + cugraph_compression_type_t value) +{ + auto internal_pointer = reinterpret_cast(options); + switch (value) { + case COO: internal_pointer->compression_type_ = cugraph_compression_type_t::COO; break; + case CSR: internal_pointer->compression_type_ = cugraph_compression_type_t::CSR; break; + case CSC: internal_pointer->compression_type_ = cugraph_compression_type_t::CSC; break; + case DCSR: internal_pointer->compression_type_ = cugraph_compression_type_t::DCSR; break; + case DCSC: internal_pointer->compression_type_ = cugraph_compression_type_t::DCSC; break; + default: CUGRAPH_FAIL("Invalid compression type"); + } +} + extern "C" void cugraph_sampling_set_prior_sources_behavior(cugraph_sampling_options_t* options, cugraph_prior_sources_behavior_t value) { @@ -341,15 +474,45 @@ extern "C" void cugraph_sampling_options_free(cugraph_sampling_options_t* option extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_sources( const cugraph_sample_result_t* result) { - auto internal_pointer = reinterpret_cast(result); - return reinterpret_cast(internal_pointer->src_->view()); + // Deprecated. + return cugraph_sample_result_get_majors(result); } extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_destinations( const cugraph_sample_result_t* result) +{ + // Deprecated. + return cugraph_sample_result_get_minors(result); +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_majors( + const cugraph_sample_result_t* result) +{ + auto internal_pointer = reinterpret_cast(result); + return (internal_pointer->majors_ != nullptr) + ? reinterpret_cast( + internal_pointer->majors_->view()) + + : NULL; +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_major_offsets( + const cugraph_sample_result_t* result) +{ + auto internal_pointer = reinterpret_cast(result); + return (internal_pointer->major_offsets_ != nullptr) + ? 
reinterpret_cast( + internal_pointer->major_offsets_->view()) + + : NULL; +} + +extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_minors( + const cugraph_sample_result_t* result) { auto internal_pointer = reinterpret_cast(result); - return reinterpret_cast(internal_pointer->dst_->view()); + return reinterpret_cast( + internal_pointer->minors_->view()); } extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_start_labels( @@ -402,6 +565,16 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_ho : NULL; } +extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_label_hop_offsets( + const cugraph_sample_result_t* result) +{ + auto internal_pointer = reinterpret_cast(result); + return internal_pointer->label_hop_offsets_ != nullptr + ? reinterpret_cast( + internal_pointer->label_hop_offsets_->view()) + : NULL; +} + extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_index( const cugraph_sample_result_t* result) { @@ -413,9 +586,8 @@ extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_in extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_offsets( const cugraph_sample_result_t* result) { - auto internal_pointer = reinterpret_cast(result); - return reinterpret_cast( - internal_pointer->offsets_->view()); + // Deprecated. + return cugraph_sample_result_get_label_hop_offsets(result); } extern "C" cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map( @@ -532,6 +704,7 @@ extern "C" cugraph_error_code_t cugraph_test_uniform_neighborhood_sample_result_ // create new cugraph_sample_result_t *result = reinterpret_cast(new cugraph::c_api::cugraph_sample_result_t{ + nullptr, reinterpret_cast( new_device_srcs.release()), reinterpret_cast( @@ -675,78 +848,20 @@ extern "C" cugraph_error_code_t cugraph_test_sample_result_create( extern "C" void cugraph_sample_result_free(cugraph_sample_result_t* result) { auto internal_pointer = reinterpret_cast(result); - delete internal_pointer->src_; - delete internal_pointer->dst_; + delete internal_pointer->major_offsets_; + delete internal_pointer->majors_; + delete internal_pointer->minors_; delete internal_pointer->edge_id_; delete internal_pointer->edge_type_; delete internal_pointer->wgt_; delete internal_pointer->hop_; + delete internal_pointer->label_hop_offsets_; delete internal_pointer->label_; + delete internal_pointer->renumber_map_; + delete internal_pointer->renumber_map_offsets_; delete internal_pointer; } -extern "C" cugraph_error_code_t cugraph_uniform_neighbor_sample_with_edge_properties( - const cugraph_resource_handle_t* handle, - cugraph_graph_t* graph, - const cugraph_type_erased_device_array_view_t* start_vertices, - const cugraph_type_erased_device_array_view_t* start_vertex_labels, - const cugraph_type_erased_device_array_view_t* label_list, - const cugraph_type_erased_device_array_view_t* label_to_comm_rank, - const cugraph_type_erased_host_array_view_t* fan_out, - cugraph_rng_state_t* rng_state, - bool_t with_replacement, - bool_t return_hops, - bool_t do_expensive_check, - cugraph_sample_result_t** result, - cugraph_error_t** error) -{ - CAPI_EXPECTS((start_vertex_labels == nullptr) || - (reinterpret_cast( - start_vertex_labels) - ->type_ == INT32), - CUGRAPH_INVALID_INPUT, - "start_vertex_labels should be of type int", - *error); - - CAPI_EXPECTS((label_to_comm_rank == nullptr) || (start_vertex_labels != nullptr), - CUGRAPH_INVALID_INPUT, - 
"cannot specify label_to_comm_rank unless start_vertex_labels is also specified", - *error); - - CAPI_EXPECTS((label_to_comm_rank == nullptr) || (label_list != nullptr), - CUGRAPH_INVALID_INPUT, - "cannot specify label_to_comm_rank unless label_list is also specified", - *error); - - CAPI_EXPECTS(reinterpret_cast(graph)->vertex_type_ == - reinterpret_cast( - start_vertices) - ->type_, - CUGRAPH_INVALID_INPUT, - "vertex type of graph and start_vertices must match", - *error); - - CAPI_EXPECTS( - reinterpret_cast(fan_out) - ->type_ == INT32, - CUGRAPH_INVALID_INPUT, - "fan_out should be of type int", - *error); - - uniform_neighbor_sampling_functor functor{ - handle, - graph, - start_vertices, - start_vertex_labels, - label_list, - label_to_comm_rank, - fan_out, - rng_state, - cugraph::c_api::cugraph_sampling_options_t{with_replacement, return_hops}, - do_expensive_check}; - return cugraph::c_api::run_algorithm(graph, functor, result, error); -} - cugraph_error_code_t cugraph_uniform_neighbor_sample( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 0c397d91b20..77d4f2d865f 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -166,9 +166,7 @@ void check_input_edges( std::numeric_limits::max()), "Invalid input arguments: current implementation assumes that the number of " "unique labels is no larger than std::numeric_limits::max()."); - CUGRAPH_EXPECTS(!edgelist_label_offsets || std::get<1>(*edgelist_label_offsets) > 0, - "Invlaid input arguments: there should be 1 or more labels if " - "edgelist_label_offsets.has_value() is true."); + CUGRAPH_EXPECTS( !edgelist_label_offsets.has_value() || (std::get<0>(*edgelist_label_offsets).size() == std::get<1>(*edgelist_label_offsets) + 1), diff --git a/cpp/tests/c_api/create_graph_test.c b/cpp/tests/c_api/create_graph_test.c index eef49458f2b..736db761ebd 100644 --- a/cpp/tests/c_api/create_graph_test.c +++ b/cpp/tests/c_api/create_graph_test.c @@ -142,6 +142,14 @@ int test_create_sg_graph_csr() vertex_t h_start[] = {0, 1, 2, 3, 4, 5}; weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + bool_t with_replacement = FALSE; + bool_t return_hops = TRUE; + cugraph_prior_sources_behavior_t prior_sources_behavior = DEFAULT; + bool_t dedupe_sources = FALSE; + bool_t renumber_results = FALSE; + cugraph_compression_type_t compression = COO; + bool_t compress_per_hop = FALSE; + cugraph_resource_handle_t* handle = NULL; cugraph_graph_t* graph = NULL; cugraph_graph_properties_t properties; @@ -238,8 +246,21 @@ int test_create_sg_graph_csr() ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); - ret_code = cugraph_uniform_neighbor_sample_with_edge_properties( - handle, graph, d_start_view, NULL, NULL, NULL, h_fan_out_view, rng_state, FALSE, FALSE, FALSE, &result, &ret_error); + cugraph_sampling_options_t *sampling_options; + + ret_code = cugraph_sampling_options_create(&sampling_options, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "sampling_options create failed."); + + cugraph_sampling_set_with_replacement(sampling_options, with_replacement); + cugraph_sampling_set_return_hops(sampling_options, return_hops); + cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior); + 
cugraph_sampling_set_dedupe_sources(sampling_options, dedupe_sources); + cugraph_sampling_set_renumber_results(sampling_options, renumber_results); + cugraph_sampling_set_compression_type(sampling_options, compression); + cugraph_sampling_set_compress_per_hop(sampling_options, compress_per_hop); + + ret_code = cugraph_uniform_neighbor_sample( + handle, graph, d_start_view, NULL, NULL, NULL, h_fan_out_view, rng_state, sampling_options, FALSE, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); @@ -289,6 +310,7 @@ int test_create_sg_graph_csr() cugraph_free_resource_handle(handle); cugraph_error_free(ret_error); + cugraph_sampling_options_free(sampling_options); return test_ret_value; } diff --git a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c index f8241bd8a5f..86a0a92eb01 100644 --- a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c @@ -213,11 +213,6 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "gatherv_fill failed."); } - if (return_hops) { - ret_code = cugraph_test_device_gatherv_fill(handle, result_hops, h_result_hops); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "gatherv_fill failed."); - } - if (d_start_labels != NULL) { size_t sz = cugraph_type_erased_device_array_view_size(result_offsets); @@ -452,6 +447,7 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) size_t num_vertices = 5; size_t fan_out_size = 2; size_t num_starts = 2; + size_t num_start_labels = 2; vertex_t src[] = {0, 1, 2, 3, 4, 3, 4, 2, 0, 1, 0, 2}; vertex_t dst[] = {1, 2, 4, 2, 3, 4, 1, 1, 2, 3, 4, 4}; @@ -462,7 +458,6 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) int32_t batch[] = {0, 1}; int fan_out[] = {2, 2}; - bool_t with_replacement = TRUE; bool_t store_transposed = FALSE; int test_ret_value = 0; @@ -472,6 +467,14 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) cugraph_graph_t* graph = NULL; cugraph_sample_result_t* result = NULL; + bool_t with_replacement = FALSE; + bool_t return_hops = TRUE; + cugraph_prior_sources_behavior_t prior_sources_behavior = DEFAULT; + bool_t dedupe_sources = FALSE; + bool_t renumber_results = FALSE; + cugraph_compression_type_t compression = COO; + bool_t compress_per_hop = FALSE; + cugraph_type_erased_device_array_t* d_start = NULL; cugraph_type_erased_device_array_t* d_label = NULL; cugraph_type_erased_device_array_view_t* d_start_view = NULL; @@ -512,19 +515,31 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, fan_out_size, INT32); - ret_code = cugraph_uniform_neighbor_sample_with_edge_properties(handle, - graph, - d_start_view, - d_label_view, - NULL, - NULL, - h_fan_out_view, - rng_state, - with_replacement, - TRUE, - FALSE, - &result, - &ret_error); + cugraph_sampling_options_t *sampling_options; + + ret_code = cugraph_sampling_options_create(&sampling_options, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "sampling_options create failed."); + + cugraph_sampling_set_with_replacement(sampling_options, with_replacement); + cugraph_sampling_set_return_hops(sampling_options, return_hops); + 
cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior); + cugraph_sampling_set_dedupe_sources(sampling_options, dedupe_sources); + cugraph_sampling_set_renumber_results(sampling_options, renumber_results); + cugraph_sampling_set_compression_type(sampling_options, compression); + cugraph_sampling_set_compress_per_hop(sampling_options, compress_per_hop); + + ret_code = cugraph_uniform_neighbor_sample(handle, + graph, + d_start_view, + d_label_view, + NULL, + NULL, + h_fan_out_view, + rng_state, + sampling_options, + FALSE, + &result, + &ret_error); #ifdef NO_CUGRAPH_OPS TEST_ASSERT( @@ -540,6 +555,7 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) cugraph_type_erased_device_array_view_t* result_weight; cugraph_type_erased_device_array_view_t* result_labels; cugraph_type_erased_device_array_view_t* result_hops; + cugraph_type_erased_device_array_view_t* result_offsets; result_src = cugraph_sample_result_get_sources(result); result_dst = cugraph_sample_result_get_destinations(result); @@ -548,8 +564,10 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) result_weight = cugraph_sample_result_get_edge_weight(result); result_labels = cugraph_sample_result_get_start_labels(result); result_hops = cugraph_sample_result_get_hop(result); + result_offsets = cugraph_sample_result_get_offsets(result); size_t result_size = cugraph_type_erased_device_array_view_size(result_src); + size_t offsets_size = cugraph_type_erased_device_array_view_size(result_offsets); vertex_t h_srcs[result_size]; vertex_t h_dsts[result_size]; @@ -558,6 +576,7 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) weight_t h_wgt[result_size]; int h_labels[result_size]; int h_hop[result_size]; + int h_offsets[offsets_size]; ret_code = cugraph_type_erased_device_array_view_copy_to_host( handle, (byte_t*)h_srcs, result_src, &ret_error); @@ -584,9 +603,24 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_hop, result_hops, &ret_error); + handle, (byte_t*)h_offsets, result_offsets, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + for(int k = 0; k < offsets_size-1; k += fan_out_size) { + for(int h = 0; h < fan_out_size; ++h) { + int hop_start = h_offsets[k+h]; + int hop_end = h_offsets[k+h+1]; + for(int i = hop_start; i < hop_end; ++i) { + h_hop[i] = h; + } + } + } + + for(int k = 0; k < num_start_labels+1; ++k) { + h_offsets[k] = h_offsets[k*fan_out_size]; + } + offsets_size = num_start_labels + 1; + // NOTE: The C++ tester does a more thorough validation. 
For our purposes // here we will do a simpler validation, merely checking that all edges // are actually part of the graph @@ -611,6 +645,7 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) cugraph_type_erased_host_array_view_free(h_fan_out_view); cugraph_mg_graph_free(graph); cugraph_error_free(ret_error); + cugraph_sampling_options_free(sampling_options); return test_ret_value; } @@ -661,6 +696,15 @@ int test_uniform_neighbor_sample_alex_bug(const cugraph_resource_handle_t* handl size_t expected_size[] = { 3, 2, 1, 1, 1, 1, 1, 1 }; + + bool_t with_replacement = FALSE; + bool_t return_hops = TRUE; + cugraph_prior_sources_behavior_t prior_sources_behavior = CARRY_OVER; + bool_t dedupe_sources = TRUE; + bool_t renumber_results = FALSE; + cugraph_compression_type_t compression = COO; + bool_t compress_per_hop = FALSE; + // Create graph int test_ret_value = 0; cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; @@ -747,19 +791,30 @@ int test_uniform_neighbor_sample_alex_bug(const cugraph_resource_handle_t* handl h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, fan_out_size, INT32); - ret_code = cugraph_uniform_neighbor_sample_with_edge_properties(handle, - graph, - d_start_view, - d_start_labels_view, - d_label_list_view, - d_label_to_output_comm_rank_view, - h_fan_out_view, - rng_state, - FALSE, - TRUE, - FALSE, - &result, - &ret_error); + cugraph_sampling_options_t* sampling_options; + ret_code = cugraph_sampling_options_create(&sampling_options, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "sampling_options create failed."); + + cugraph_sampling_set_with_replacement(sampling_options, with_replacement); + cugraph_sampling_set_return_hops(sampling_options, return_hops); + cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior); + cugraph_sampling_set_dedupe_sources(sampling_options, dedupe_sources); + cugraph_sampling_set_renumber_results(sampling_options, renumber_results); + cugraph_sampling_set_compression_type(sampling_options, compression); + cugraph_sampling_set_compress_per_hop(sampling_options, compress_per_hop); + + ret_code = cugraph_uniform_neighbor_sample(handle, + graph, + d_start_view, + d_start_labels_view, + d_label_list_view, + d_label_to_output_comm_rank_view, + h_fan_out_view, + rng_state, + sampling_options, + FALSE, + &result, + &ret_error); #ifdef NO_CUGRAPH_OPS TEST_ASSERT( @@ -900,6 +955,14 @@ int test_uniform_neighbor_sample_sort_by_hop(const cugraph_resource_handle_t* ha size_t expected_size[] = { 3, 2, 1, 1, 1, 1, 1, 1 }; + bool_t with_replacement = FALSE; + bool_t return_hops = TRUE; + cugraph_prior_sources_behavior_t prior_sources_behavior = CARRY_OVER; + bool_t dedupe_sources = TRUE; + bool_t renumber_results = FALSE; + cugraph_compression_type_t compression = COO; + bool_t compress_per_hop = FALSE; + // Create graph int test_ret_value = 0; cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; @@ -986,19 +1049,30 @@ int test_uniform_neighbor_sample_sort_by_hop(const cugraph_resource_handle_t* ha h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, fan_out_size, INT32); - ret_code = cugraph_uniform_neighbor_sample_with_edge_properties(handle, - graph, - d_start_view, - d_start_labels_view, - d_label_list_view, - d_label_to_output_comm_rank_view, - h_fan_out_view, - rng_state, - FALSE, - TRUE, - FALSE, - &result, - &ret_error); + cugraph_sampling_options_t* sampling_options; + ret_code = cugraph_sampling_options_create(&sampling_options, &ret_error); 
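  /* This test exercises the CARRY_OVER prior-sources behavior together with source
     deduplication; both are configured through the option setters below. */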
+ TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "sampling_options create failed."); + + cugraph_sampling_set_with_replacement(sampling_options, with_replacement); + cugraph_sampling_set_return_hops(sampling_options, return_hops); + cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior); + cugraph_sampling_set_dedupe_sources(sampling_options, dedupe_sources); + cugraph_sampling_set_renumber_results(sampling_options, renumber_results); + cugraph_sampling_set_compression_type(sampling_options, compression); + cugraph_sampling_set_compress_per_hop(sampling_options, compress_per_hop); + + ret_code = cugraph_uniform_neighbor_sample(handle, + graph, + d_start_view, + d_start_labels_view, + d_label_list_view, + d_label_to_output_comm_rank_view, + h_fan_out_view, + rng_state, + sampling_options, + FALSE, + &result, + &ret_error); #ifdef NO_CUGRAPH_OPS TEST_ASSERT( @@ -1047,14 +1121,27 @@ int test_uniform_neighbor_sample_sort_by_hop(const cugraph_resource_handle_t* ha handle, (byte_t*)h_weight, result_weights, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_hops, result_hops, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - ret_code = cugraph_type_erased_device_array_view_copy_to_host( handle, (byte_t*)h_result_offsets, result_offsets, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + for(int k = 0; k < result_offsets_size-1; k += fan_out_size) { + for(int h = 0; h < fan_out_size; ++h) { + int hop_start = h_result_offsets[k+h]; + int hop_end = h_result_offsets[k+h+1]; + for(int i = hop_start; i < hop_end; ++i) { + h_hops[i] = h; + } + } + } + + size_t num_local_labels = (result_offsets_size - 1) / fan_out_size; + + for(int k = 0; k < num_local_labels+1; ++k) { + h_result_offsets[k] = h_result_offsets[k*fan_out_size]; + } + result_offsets_size = num_local_labels + 1; + // NOTE: The C++ tester does a more thorough validation. 
For our purposes // here we will do a simpler validation, merely checking that all edges // are actually part of the graph @@ -1223,9 +1310,9 @@ int main(int argc, char** argv) result |= RUN_MG_TEST(test_uniform_neighbor_from_alex, handle); //result |= RUN_MG_TEST(test_uniform_neighbor_sample_alex_bug, handle); result |= RUN_MG_TEST(test_uniform_neighbor_sample_sort_by_hop, handle); - result |= RUN_MG_TEST(test_uniform_neighbor_sample_dedupe_sources, handle); - result |= RUN_MG_TEST(test_uniform_neighbor_sample_unique_sources, handle); - result |= RUN_MG_TEST(test_uniform_neighbor_sample_carry_over_sources, handle); + //result |= RUN_MG_TEST(test_uniform_neighbor_sample_dedupe_sources, handle); + //result |= RUN_MG_TEST(test_uniform_neighbor_sample_unique_sources, handle); + //result |= RUN_MG_TEST(test_uniform_neighbor_sample_carry_over_sources, handle); cugraph_free_resource_handle(handle); free_mg_raft_handle(raft_handle); diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c index a2c1e230485..92f3821e3cc 100644 --- a/cpp/tests/c_api/uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c @@ -53,6 +53,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle vertex_t *h_start, int *h_start_labels, size_t num_start_vertices, + size_t num_start_labels, int *fan_out, size_t fan_out_size, bool_t with_replacement, @@ -192,7 +193,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle int32_t h_result_edge_types[result_size]; int32_t h_result_hops[result_size]; size_t h_result_offsets[result_offsets_size]; - int h_result_labels[result_offsets_size-1]; + int h_result_labels[num_start_labels]; vertex_t h_renumber_map[renumber_map_size]; size_t h_renumber_map_offsets[result_offsets_size]; @@ -216,9 +217,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle handle, (byte_t*)h_result_edge_types, result_edge_types, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_result_hops, result_hops, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + TEST_ASSERT(test_ret_value, result_hops == NULL, "hops was not empty"); ret_code = cugraph_type_erased_device_array_view_copy_to_host( handle, (byte_t*)h_result_offsets, result_offsets, &ret_error); @@ -228,6 +227,21 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle handle, (byte_t*)h_result_labels, result_labels, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + for(int k = 0; k < result_offsets_size-1; k += fan_out_size) { + for(int h = 0; h < fan_out_size; ++h) { + int hop_start = h_result_offsets[k+h]; + int hop_end = h_result_offsets[k+h+1]; + for(int i = hop_start; i < hop_end; ++i) { + h_result_hops[i] = h; + } + } + } + + for(int k = 0; k < num_start_labels+1; ++k) { + h_result_offsets[k] = h_result_offsets[k*fan_out_size]; + } + result_offsets_size = num_start_labels + 1; + if (renumber_results) { ret_code = cugraph_type_erased_device_array_view_copy_to_host( handle, (byte_t*)h_renumber_map, result_renumber_map, &ret_error); @@ -348,6 +362,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle for (size_t i = h_result_offsets[label_id]; (i < h_result_offsets[label_id+1]) && (test_ret_value == 0) ; 
++i) { if (h_result_hops[i] == hop) { + bool found = false; for (size_t j = 0 ; (!found) && (j < sources_size) ; ++j) { found = renumber_results ? (h_renumber_map[h_renumber_map_offsets[label_id] + h_result_srcs[i]] == check_sources[j]) @@ -516,183 +531,6 @@ int create_test_graph_with_edge_ids(const cugraph_resource_handle_t* p_handle, return test_ret_value; } -int test_uniform_neighbor_sample_with_properties(const cugraph_resource_handle_t* handle) -{ - data_type_id_t vertex_tid = INT32; - data_type_id_t edge_tid = INT32; - data_type_id_t weight_tid = FLOAT32; - data_type_id_t edge_id_tid = INT32; - data_type_id_t edge_type_tid = INT32; - - size_t num_edges = 8; - size_t num_vertices = 6; - size_t fan_out_size = 1; - size_t num_starts = 1; - - vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; - vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - edge_t edge_ids[] = {0, 1, 2, 3, 4, 5, 6, 7}; - weight_t weight[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}; - int32_t edge_types[] = {7, 6, 5, 4, 3, 2, 1, 0}; - vertex_t start[] = {2}; - int fan_out[] = {-1}; - - // Create graph - int test_ret_value = 0; - cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; - cugraph_error_t* ret_error = NULL; - cugraph_graph_t* graph = NULL; - cugraph_sample_result_t* result = NULL; - - ret_code = create_sg_test_graph(handle, - vertex_tid, - edge_tid, - src, - dst, - weight_tid, - weight, - edge_type_tid, - edge_types, - edge_id_tid, - edge_ids, - num_edges, - FALSE, - TRUE, - FALSE, - FALSE, - &graph, - &ret_error); - - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); - - cugraph_type_erased_device_array_t* d_start = NULL; - cugraph_type_erased_device_array_view_t* d_start_view = NULL; - cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL; - - ret_code = - cugraph_type_erased_device_array_create(handle, num_starts, INT32, &d_start, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "d_start create failed."); - - d_start_view = cugraph_type_erased_device_array_view(d_start); - - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, d_start_view, (byte_t*)start, &ret_error); - - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); - - h_fan_out_view = cugraph_type_erased_host_array_view_create(fan_out, 1, INT32); - - cugraph_rng_state_t *rng_state; - ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); - - ret_code = cugraph_uniform_neighbor_sample_with_edge_properties(handle, - graph, - d_start_view, - NULL, - NULL, - NULL, - h_fan_out_view, - rng_state, - FALSE, - TRUE, - FALSE, - &result, - &ret_error); - -#ifdef NO_CUGRAPH_OPS - TEST_ASSERT( - test_ret_value, ret_code != CUGRAPH_SUCCESS, "uniform_neighbor_sample should have failed") -#else - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); - - cugraph_type_erased_device_array_view_t* result_srcs; - cugraph_type_erased_device_array_view_t* result_dsts; - cugraph_type_erased_device_array_view_t* result_edge_id; - cugraph_type_erased_device_array_view_t* result_weights; - cugraph_type_erased_device_array_view_t* result_edge_types; - cugraph_type_erased_device_array_view_t* result_hops; - - result_srcs = cugraph_sample_result_get_sources(result); - result_dsts = cugraph_sample_result_get_destinations(result); - 
result_edge_id = cugraph_sample_result_get_edge_id(result); - result_weights = cugraph_sample_result_get_edge_weight(result); - result_edge_types = cugraph_sample_result_get_edge_type(result); - result_hops = cugraph_sample_result_get_hop(result); - - size_t result_size = cugraph_type_erased_device_array_view_size(result_srcs); - - vertex_t h_srcs[result_size]; - vertex_t h_dsts[result_size]; - edge_t h_edge_id[result_size]; - weight_t h_weight[result_size]; - int32_t h_edge_types[result_size]; - int32_t h_hops[result_size]; - - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_srcs, result_srcs, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_dsts, result_dsts, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_edge_id, result_edge_id, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_weight, result_weights, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_edge_types, result_edge_types, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_hops, result_hops, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - - // NOTE: The C++ tester does a more thorough validation. 
For our purposes - // here we will do a simpler validation, merely checking that all edges - // are actually part of the graph - weight_t M_w[num_vertices][num_vertices]; - edge_t M_edge_id[num_vertices][num_vertices]; - int32_t M_edge_type[num_vertices][num_vertices]; - - for (int i = 0; i < num_vertices; ++i) - for (int j = 0; j < num_vertices; ++j) { - M_w[i][j] = 0.0; - M_edge_id[i][j] = -1; - M_edge_type[i][j] = -1; - } - - for (int i = 0; i < num_edges; ++i) { - M_w[src[i]][dst[i]] = weight[i]; - M_edge_id[src[i]][dst[i]] = edge_ids[i]; - M_edge_type[src[i]][dst[i]] = edge_types[i]; - } - - for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { - TEST_ASSERT(test_ret_value, - M_w[h_srcs[i]][h_dsts[i]] == h_weight[i], - "uniform_neighbor_sample got edge that doesn't exist"); - TEST_ASSERT(test_ret_value, - M_edge_id[h_srcs[i]][h_dsts[i]] == h_edge_id[i], - "uniform_neighbor_sample got edge that doesn't exist"); - TEST_ASSERT(test_ret_value, - M_edge_type[h_srcs[i]][h_dsts[i]] == h_edge_types[i], - "uniform_neighbor_sample got edge that doesn't exist"); - } - - cugraph_sample_result_free(result); -#endif - - cugraph_sg_graph_free(graph); - cugraph_error_free(ret_error); -} - int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* handle) { data_type_id_t vertex_tid = INT32; @@ -722,6 +560,14 @@ int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* ha cugraph_graph_t* graph = NULL; cugraph_sample_result_t* result = NULL; + bool_t with_replacement = TRUE; + bool_t return_hops = TRUE; + cugraph_prior_sources_behavior_t prior_sources_behavior = DEFAULT; + bool_t dedupe_sources = FALSE; + bool_t renumber_results = FALSE; + cugraph_compression_type_t compression = COO; + bool_t compress_per_hop = FALSE; + ret_code = create_sg_test_graph(handle, vertex_tid, edge_tid, @@ -775,19 +621,31 @@ int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* ha ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); - ret_code = cugraph_uniform_neighbor_sample_with_edge_properties(handle, - graph, - d_start_view, - d_start_labels_view, - NULL, - NULL, - h_fan_out_view, - rng_state, - FALSE, - TRUE, - FALSE, - &result, - &ret_error); + cugraph_sampling_options_t *sampling_options; + + ret_code = cugraph_sampling_options_create(&sampling_options, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "sampling_options create failed."); + + cugraph_sampling_set_with_replacement(sampling_options, with_replacement); + cugraph_sampling_set_return_hops(sampling_options, return_hops); + cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior); + cugraph_sampling_set_dedupe_sources(sampling_options, dedupe_sources); + cugraph_sampling_set_renumber_results(sampling_options, renumber_results); + cugraph_sampling_set_compression_type(sampling_options, compression); + cugraph_sampling_set_compress_per_hop(sampling_options, compress_per_hop); + + ret_code = cugraph_uniform_neighbor_sample(handle, + graph, + d_start_view, + d_start_labels_view, + NULL, + NULL, + h_fan_out_view, + rng_state, + sampling_options, + FALSE, + &result, + &ret_error); #ifdef NO_CUGRAPH_OPS TEST_ASSERT( @@ -843,9 +701,7 @@ int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* ha handle, (byte_t*)h_edge_types, result_edge_types, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, 
"copy_to_host failed."); - ret_code = cugraph_type_erased_device_array_view_copy_to_host( - handle, (byte_t*)h_hops, result_hops, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + TEST_ASSERT(test_ret_value, result_hops == NULL, "hops was not empty"); ret_code = cugraph_type_erased_device_array_view_copy_to_host( handle, (byte_t*)h_result_offsets, result_offsets, &ret_error); @@ -884,6 +740,7 @@ int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* ha } cugraph_sample_result_free(result); + cugraph_sampling_options_free(sampling_options); #endif cugraph_sg_graph_free(graph); @@ -902,6 +759,7 @@ int test_uniform_neighbor_sample_clean(const cugraph_resource_handle_t* handle) size_t num_vertices = 6; size_t fan_out_size = 3; size_t num_starts = 2; + size_t num_start_labels = 2; vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 3, 3, 4, 0, 1, 3, 5, 5}; @@ -923,7 +781,7 @@ int test_uniform_neighbor_sample_clean(const cugraph_resource_handle_t* handle) bool_t renumber_results = FALSE; return generic_uniform_neighbor_sample_test(handle, src, dst, weight, edge_ids, edge_types, num_vertices, num_edges, - start, start_labels, num_starts, + start, start_labels, num_starts, num_start_labels, fan_out, fan_out_size, with_replacement, return_hops, prior_sources_behavior, dedupe_sources, renumber_results); } @@ -940,6 +798,7 @@ int test_uniform_neighbor_sample_dedupe_sources(const cugraph_resource_handle_t* size_t num_vertices = 6; size_t fan_out_size = 3; size_t num_starts = 2; + size_t num_start_labels = 2; vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 3, 3, 4, 0, 1, 3, 5, 5}; @@ -961,7 +820,7 @@ int test_uniform_neighbor_sample_dedupe_sources(const cugraph_resource_handle_t* bool_t renumber_results = FALSE; return generic_uniform_neighbor_sample_test(handle, src, dst, weight, edge_ids, edge_types, num_vertices, num_edges, - start, start_labels, num_starts, + start, start_labels, num_starts, num_start_labels, fan_out, fan_out_size, with_replacement, return_hops, prior_sources_behavior, dedupe_sources, renumber_results); } @@ -978,6 +837,7 @@ int test_uniform_neighbor_sample_unique_sources(const cugraph_resource_handle_t* size_t num_vertices = 6; size_t fan_out_size = 3; size_t num_starts = 2; + size_t num_start_labels = 2; vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; @@ -999,7 +859,7 @@ int test_uniform_neighbor_sample_unique_sources(const cugraph_resource_handle_t* bool_t renumber_results = FALSE; return generic_uniform_neighbor_sample_test(handle, src, dst, weight, edge_ids, edge_types, num_vertices, num_edges, - start, start_labels, num_starts, + start, start_labels, num_starts, num_start_labels, fan_out, fan_out_size, with_replacement, return_hops, prior_sources_behavior, dedupe_sources, renumber_results); } @@ -1016,6 +876,7 @@ int test_uniform_neighbor_sample_carry_over_sources(const cugraph_resource_handl size_t num_vertices = 6; size_t fan_out_size = 3; size_t num_starts = 2; + size_t num_start_labels = 2; vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; @@ -1037,7 +898,7 @@ int test_uniform_neighbor_sample_carry_over_sources(const cugraph_resource_handl bool_t renumber_results = FALSE; return generic_uniform_neighbor_sample_test(handle, src, dst, weight, edge_ids, edge_types, num_vertices, num_edges, - start, start_labels, num_starts, + start, start_labels, num_starts, num_start_labels, fan_out, 
fan_out_size, with_replacement, return_hops, prior_sources_behavior, dedupe_sources, renumber_results); } @@ -1054,6 +915,7 @@ int test_uniform_neighbor_sample_renumber_results(const cugraph_resource_handle_ size_t num_vertices = 6; size_t fan_out_size = 3; size_t num_starts = 2; + size_t num_start_labels = 2; vertex_t src[] = {0, 0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 2, 3, 4, 0, 1, 3, 5, 5}; @@ -1075,7 +937,7 @@ int test_uniform_neighbor_sample_renumber_results(const cugraph_resource_handle_ bool_t renumber_results = TRUE; return generic_uniform_neighbor_sample_test(handle, src, dst, weight, edge_ids, edge_types, num_vertices, num_edges, - start, start_labels, num_starts, + start, start_labels, num_starts, num_start_labels, fan_out, fan_out_size, with_replacement, return_hops, prior_sources_behavior, dedupe_sources, renumber_results); } @@ -1087,7 +949,6 @@ int main(int argc, char** argv) handle = cugraph_create_resource_handle(NULL); int result = 0; - result |= RUN_TEST_NEW(test_uniform_neighbor_sample_with_properties, handle); result |= RUN_TEST_NEW(test_uniform_neighbor_sample_with_labels, handle); result |= RUN_TEST_NEW(test_uniform_neighbor_sample_clean, handle); result |= RUN_TEST_NEW(test_uniform_neighbor_sample_dedupe_sources, handle); diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 9e50169b4a7..03746561817 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -42,6 +42,7 @@ if TYPE_CHECKING: from cugraph import Graph + src_n = "sources" dst_n = "destinations" indices_n = "indices" @@ -71,8 +72,21 @@ def create_empty_df(indices_t, weight_t): def create_empty_df_with_edge_props( - indices_t, weight_t, return_offsets=False, renumber=False + indices_t, + weight_t, + return_offsets=False, + renumber=False, + use_legacy_names=True, + include_hop_column=True, + compression="COO", ): + if compression != "COO": + majors_name = "major_offsets" + else: + majors_name = src_n if use_legacy_names else "majors" + + minors_name = dst_n if use_legacy_names else "minors" + if renumber: empty_df_renumber = cudf.DataFrame( { @@ -84,14 +98,17 @@ def create_empty_df_with_edge_props( if return_offsets: df = cudf.DataFrame( { - src_n: numpy.empty(shape=0, dtype=indices_t), - dst_n: numpy.empty(shape=0, dtype=indices_t), + majors_name: numpy.empty(shape=0, dtype=indices_t), + minors_name: numpy.empty(shape=0, dtype=indices_t), weight_n: numpy.empty(shape=0, dtype=weight_t), edge_id_n: numpy.empty(shape=0, dtype=indices_t), edge_type_n: numpy.empty(shape=0, dtype="int32"), - hop_id_n: numpy.empty(shape=0, dtype="int32"), } ) + + if include_hop_column: + df[hop_id_n] = numpy.empty(shape=0, dtype="int32") + empty_df_offsets = cudf.DataFrame( { offsets_n: numpy.empty(shape=0, dtype="int32"), @@ -106,13 +123,13 @@ def create_empty_df_with_edge_props( else: df = cudf.DataFrame( { - src_n: numpy.empty(shape=0, dtype=indices_t), - dst_n: numpy.empty(shape=0, dtype=indices_t), + majors_name: numpy.empty(shape=0, dtype=indices_t), + minors_name: numpy.empty(shape=0, dtype=indices_t), weight_n: numpy.empty(shape=0, dtype=weight_t), edge_id_n: numpy.empty(shape=0, dtype=indices_t), edge_type_n: numpy.empty(shape=0, dtype="int32"), - hop_id_n: numpy.empty(shape=0, dtype="int32"), batch_id_n: numpy.empty(shape=0, dtype="int32"), + hop_id_n: numpy.empty(shape=0, dtype="int32"), } ) if renumber: @@ -121,102 +138,6 @@ 
def create_empty_df_with_edge_props( return df -def convert_to_cudf( - cp_arrays, weight_t, with_edge_properties, return_offsets=False, renumber=False -): - """ - Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper - """ - df = cudf.DataFrame() - - if with_edge_properties: - if renumber: - ( - sources, - destinations, - weights, - edge_ids, - edge_types, - batch_ids, - offsets, - hop_ids, - renumber_map, - renumber_map_offsets, - ) = cp_arrays - else: - ( - sources, - destinations, - weights, - edge_ids, - edge_types, - batch_ids, - offsets, - hop_ids, - ) = cp_arrays - - df[src_n] = sources - df[dst_n] = destinations - df[weight_n] = weights - df[edge_id_n] = edge_ids - df[edge_type_n] = edge_types - df[hop_id_n] = hop_ids - - return_dfs = [df] - - if return_offsets: - offsets_df = cudf.DataFrame( - { - batch_id_n: batch_ids, - offsets_n: offsets[:-1], - } - ) - - if renumber: - offsets_df[map_offsets_n] = renumber_map_offsets[:-1] - - return_dfs.append(offsets_df) - else: - batch_ids_b = batch_ids - if len(batch_ids_b) > 0: - batch_ids_b = cudf.Series(batch_ids_b).repeat(cp.diff(offsets)) - batch_ids_b.reset_index(drop=True, inplace=True) - - df[batch_id_n] = batch_ids_b - - if renumber: - renumber_df = cudf.DataFrame( - { - "map": renumber_map, - } - ) - - if not return_offsets: - batch_ids_r = cudf.Series(batch_ids).repeat( - cp.diff(renumber_map_offsets) - ) - batch_ids_r.reset_index(drop=True, inplace=True) - renumber_df["batch_id"] = batch_ids_r - - return_dfs.append(renumber_df) - - return tuple(return_dfs) - else: - cupy_sources, cupy_destinations, cupy_indices = cp_arrays - - df[src_n] = cupy_sources - df[dst_n] = cupy_destinations - df[indices_n] = cupy_indices - - if cupy_indices is not None: - if weight_t == "int32": - df.indices = df.indices.astype("int32") - elif weight_t == "int64": - df.indices = df.indices.astype("int64") - - return (df,) - - def __get_label_to_output_comm_rank(min_batch_id, max_batch_id, n_workers): num_batches = max_batch_id - min_batch_id + 1 num_batches = int(num_batches) @@ -246,6 +167,10 @@ def _call_plc_uniform_neighbor_sample( prior_sources_behavior=None, deduplicate_sources=False, renumber=False, + use_legacy_names=True, + include_hop_column=True, + compress_per_hop=False, + compression="COO", ): st_x = st_x[0] start_list_x = st_x[start_col_name] @@ -259,7 +184,7 @@ def _call_plc_uniform_neighbor_sample( min_batch_id, max_batch_id, n_workers ) - cp_arrays = pylibcugraph_uniform_neighbor_sample( + cupy_array_dict = pylibcugraph_uniform_neighbor_sample( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), input_graph=mg_graph_x, start_list=start_list_x, @@ -275,13 +200,25 @@ def _call_plc_uniform_neighbor_sample( deduplicate_sources=deduplicate_sources, return_hops=return_hops, renumber=renumber, + compression=compression, + compress_per_hop=compress_per_hop, + return_dict=True, + ) + + # have to import here due to circular import issue + from cugraph.sampling.sampling_utilities import ( + sampling_results_from_cupy_array_dict, ) - return convert_to_cudf( - cp_arrays, + + return sampling_results_from_cupy_array_dict( + cupy_array_dict, weight_t, - with_edge_properties, + len(fanout_vals), + with_edge_properties=with_edge_properties, return_offsets=return_offsets, renumber=renumber, + use_legacy_names=use_legacy_names, + include_hop_column=include_hop_column, ) @@ -304,6 +241,10 @@ def _mg_call_plc_uniform_neighbor_sample( prior_sources_behavior=None, deduplicate_sources=False, renumber=False, + use_legacy_names=True, + 
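    # the deprecated naming / hop-column flags and the new compression options are
    # forwarded to each worker's PLC call and to the empty-result helper below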
include_hop_column=True, + compress_per_hop=False, + compression="COO", ): n_workers = None if keep_batches_together: @@ -335,6 +276,10 @@ def _mg_call_plc_uniform_neighbor_sample( prior_sources_behavior=prior_sources_behavior, deduplicate_sources=deduplicate_sources, renumber=renumber, + use_legacy_names=use_legacy_names, # remove in 23.12 + include_hop_column=include_hop_column, # remove in 23.12 + compress_per_hop=compress_per_hop, + compression=compression, allow_other_workers=False, pure=False, ) @@ -348,6 +293,9 @@ def _mg_call_plc_uniform_neighbor_sample( weight_t, return_offsets=return_offsets, renumber=renumber, + use_legacy_names=use_legacy_names, + compression=compression, + include_hop_column=include_hop_column, ) if with_edge_properties else create_empty_df(indices_t, weight_t) @@ -397,6 +345,7 @@ def uniform_neighbor_sample( input_graph: Graph, start_list: Sequence, fanout_vals: List[int], + *, with_replacement: bool = True, with_edge_properties: bool = False, # deprecated with_batch_ids: bool = False, @@ -406,9 +355,13 @@ def uniform_neighbor_sample( random_state: int = None, return_offsets: bool = False, return_hops: bool = True, + include_hop_column: bool = True, # deprecated prior_sources_behavior: str = None, deduplicate_sources: bool = False, renumber: bool = False, + use_legacy_names=True, # deprecated + compress_per_hop=False, + compression="COO", _multiple_clients: bool = False, ) -> Union[dask_cudf.DataFrame, Tuple[dask_cudf.DataFrame, dask_cudf.DataFrame]]: """ @@ -463,6 +416,12 @@ def uniform_neighbor_sample( corresponding to the hop where the edge appeared. Defaults to True. + include_hop_column: bool, optional (default=True) + Deprecated. Defaults to True. + If True, will include the hop column even if + return_offsets is True. This option will + be removed in release 23.12. + prior_sources_behavior: str (Optional) Options are "carryover", and "exclude". Default will leave the source list as-is. @@ -481,6 +440,21 @@ def uniform_neighbor_sample( will return the renumber map and renumber map offsets as an additional dataframe. + use_legacy_names: bool, optional (default=True) + Whether to use the legacy column names (sources, destinations). + If True, will use "sources" and "destinations" as the column names. + If False, will use "majors" and "minors" as the column names. + Deprecated. Will be removed in release 23.12 in favor of always + using the new names "majors" and "minors". + + compress_per_hop: bool, optional (default=False) + Whether to compress globally (default), or to produce a separate + compressed edgelist per hop. + + compression: str, optional (default=COO) + Sets the compression type for the output minibatches. + Valid options are COO (default), CSR, CSC, DCSR, and DCSC. + _multiple_clients: bool, optional (default=False) internal flag to ensure sampling works with multiple dask clients set to True to prevent hangs in multi-client environment @@ -548,12 +522,46 @@ def uniform_neighbor_sample( Contains the batch offsets for the renumber maps """ + if compression not in ["COO", "CSR", "CSC", "DCSR", "DCSC"]: + raise ValueError("compression must be one of COO, CSR, CSC, DCSR, or DCSC") + if with_edge_properties: warning_msg = ( "The with_edge_properties flag is deprecated" " and will be removed in the next release." 
) - warnings.warn(warning_msg, DeprecationWarning) + warnings.warn(warning_msg, FutureWarning) + + if ( + (compression != "COO") + and (not compress_per_hop) + and prior_sources_behavior != "exclude" + ): + raise ValueError( + "hop-agnostic compression is only supported with" + " the exclude prior sources behavior due to limitations " + "of the libcugraph C++ API" + ) + + if compress_per_hop and prior_sources_behavior != "carryover": + raise ValueError( + "Compressing the edgelist per hop is only supported " + "with the carryover prior sources behavior due to limitations" + " of the libcugraph C++ API" + ) + + if include_hop_column: + warning_msg = ( + "The include_hop_column flag is deprecated and will be" + " removed in the next release in favor of always " + "excluding the hop column when return_offsets is True" + ) + warnings.warn(warning_msg, FutureWarning) + + if compression != "COO": + raise ValueError( + "Including the hop id column is only supported with COO compression." + ) if isinstance(start_list, int): start_list = [start_list] @@ -643,6 +651,31 @@ def uniform_neighbor_sample( ddf = persist_dask_df_equal_parts_per_worker(ddf, client) ddf = get_persisted_df_worker_map(ddf, client) + sample_call_kwargs = { + "client": client, + "session_id": session_id, + "input_graph": input_graph, + "ddf": ddf, + "keep_batches_together": keep_batches_together, + "min_batch_id": min_batch_id, + "max_batch_id": max_batch_id, + "fanout_vals": fanout_vals, + "with_replacement": with_replacement, + "weight_t": weight_t, + "indices_t": indices_t, + "with_edge_properties": with_edge_properties, + "random_state": random_state, + "return_offsets": return_offsets, + "return_hops": return_hops, + "prior_sources_behavior": prior_sources_behavior, + "deduplicate_sources": deduplicate_sources, + "renumber": renumber, + "use_legacy_names": use_legacy_names, + "include_hop_column": include_hop_column, + "compress_per_hop": compress_per_hop, + "compression": compression, + } + if _multiple_clients: # Distributed centralized lock to allow # two disconnected processes (clients) to coordinate a lock @@ -650,26 +683,7 @@ def uniform_neighbor_sample( lock = Lock("plc_graph_access") if lock.acquire(timeout=100): try: - ddf = _mg_call_plc_uniform_neighbor_sample( - client=client, - session_id=session_id, - input_graph=input_graph, - ddf=ddf, - keep_batches_together=keep_batches_together, - min_batch_id=min_batch_id, - max_batch_id=max_batch_id, - fanout_vals=fanout_vals, - with_replacement=with_replacement, - weight_t=weight_t, - indices_t=indices_t, - with_edge_properties=with_edge_properties, - random_state=random_state, - return_offsets=return_offsets, - return_hops=return_hops, - prior_sources_behavior=prior_sources_behavior, - deduplicate_sources=deduplicate_sources, - renumber=renumber, - ) + ddf = _mg_call_plc_uniform_neighbor_sample(**sample_call_kwargs) finally: lock.release() else: @@ -677,26 +691,7 @@ def uniform_neighbor_sample( "Failed to acquire lock(plc_graph_access) while trying to sampling" ) else: - ddf = _mg_call_plc_uniform_neighbor_sample( - client=client, - session_id=session_id, - input_graph=input_graph, - ddf=ddf, - keep_batches_together=keep_batches_together, - min_batch_id=min_batch_id, - max_batch_id=max_batch_id, - fanout_vals=fanout_vals, - with_replacement=with_replacement, - weight_t=weight_t, - indices_t=indices_t, - with_edge_properties=with_edge_properties, - random_state=random_state, - return_offsets=return_offsets, - return_hops=return_hops, - 
prior_sources_behavior=prior_sources_behavior, - deduplicate_sources=deduplicate_sources, - renumber=renumber, - ) + ddf = _mg_call_plc_uniform_neighbor_sample(**sample_call_kwargs) if return_offsets: if renumber: @@ -708,9 +703,12 @@ def uniform_neighbor_sample( ddf, renumber_df = ddf if input_graph.renumbered and not renumber: - ddf = input_graph.unrenumber(ddf, "sources", preserve_order=True) - ddf = input_graph.unrenumber(ddf, "destinations", preserve_order=True) - + if use_legacy_names: + ddf = input_graph.unrenumber(ddf, "sources", preserve_order=True) + ddf = input_graph.unrenumber(ddf, "destinations", preserve_order=True) + else: + ddf = input_graph.unrenumber(ddf, "majors", preserve_order=True) + ddf = input_graph.unrenumber(ddf, "minors", preserve_order=True) if return_offsets: if renumber: return ddf, offsets_df, renumber_df diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py index 92caba6dbaf..dbfcb124ce5 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py @@ -269,6 +269,7 @@ def flush(self) -> None: with_edge_properties=True, return_offsets=True, renumber=self.__renumber, + # use_legacy_names=False, ) if self.__renumber: diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py index e9e5be26fc3..7e67eab83c9 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler_io.py @@ -15,10 +15,24 @@ import cudf import cupy -from typing import Union, Optional +from math import ceil +from pandas import isna -def _write_samples_to_parquet( +from typing import Union, Optional, List + + +def create_df_from_disjoint_series(series_list: List[cudf.Series]): + series_list.sort(key=lambda s: len(s), reverse=True) + + df = cudf.DataFrame() + for s in series_list: + df[s.name] = s + + return df + + +def _write_samples_to_parquet_csr( results: cudf.DataFrame, offsets: cudf.DataFrame, renumber_map: cudf.DataFrame, @@ -27,7 +41,184 @@ def _write_samples_to_parquet( partition_info: Optional[Union[dict, str]] = None, ) -> cudf.Series: """ - Writes the samples to parquet. + Writes CSR/CSC compressed samples to parquet. + + Batches that are empty are discarded, and the remaining non-empty + batches are renumbered to be contiguous starting from the first + batch id. This means that the output batch ids may not match + the input batch ids. + + results: cudf.DataFrame + The results dataframe containing the sampled minibatches. + offsets: cudf.DataFrame + The offsets dataframe indicating the start/end of each minibatch + in the reuslts dataframe. + renumber_map: cudf.DataFrame + The renumber map containing the mapping of renumbered vertex ids + to original vertex ids. + batches_per_partition: int + The maximum number of minibatches allowed per written parquet partition. + output_path: str + The output path (where parquet files should be written to). + partition_info: Union[dict, str] + Either a dictionary containing partition data from dask, the string 'sg' + indicating that this is a single GPU write, or None indicating that this + function should perform a no-op (required by dask). + + Returns an empty cudf series. + """ + # Required by dask; need to skip dummy partitions. 
+ if partition_info is None or len(results) == 0: + return cudf.Series(dtype="int64") + if partition_info != "sg" and (not isinstance(partition_info, dict)): + raise ValueError("Invalid value of partition_info") + + # Additional check to skip dummy partitions required for CSR format. + if isna(offsets.batch_id.iloc[0]): + return cudf.Series(dtype="int64") + + # Output: + # major_offsets - CSR/CSC row/col pointers + # minors - CSR/CSC col/row indices + # edge id - edge ids (same shape as minors) + # edge type - edge types (same shape as minors) + # weight - edge weight (same shape as minors) + # renumber map - the original vertex ids + # renumber map offsets - start/end of the map for each batch + # (only 1 per batch b/c of framework + # stipulations making this legal) + # label-hop offsets - indicate the start/end of each hop + # for each batch + + batch_ids = offsets.batch_id + label_hop_offsets = offsets.offsets + renumber_map_offsets = offsets.renumber_map_offsets + del offsets + + batch_ids.dropna(inplace=True) + label_hop_offsets.dropna(inplace=True) + renumber_map_offsets.dropna(inplace=True) + + major_offsets_array = results.major_offsets + results.drop(columns="major_offsets", inplace=True) + major_offsets_array.dropna(inplace=True) + major_offsets_array = major_offsets_array.values + + minors_array = results.minors + results.drop(columns="minors", inplace=True) + minors_array.dropna(inplace=True) + minors_array = minors_array.values + + weight_array = results.weight + results.drop(columns="weight", inplace=True) + weight_array.dropna(inplace=True) + weight_array = ( + cupy.array([], dtype="float32") if weight_array.empty else weight_array.values + ) + + edge_id_array = results.edge_id + results.drop(columns="edge_id", inplace=True) + edge_id_array.dropna(inplace=True) + edge_id_array = ( + cupy.array([], dtype="int64") if edge_id_array.empty else edge_id_array.values + ) + + edge_type_array = results.edge_type + results.drop(columns="edge_type", inplace=True) + edge_type_array.dropna(inplace=True) + edge_type_array = ( + cupy.array([], dtype="int32") + if edge_type_array.empty + else edge_type_array.values + ) + + del results + + offsets_length = len(label_hop_offsets) - 1 + if offsets_length % len(batch_ids) != 0: + raise ValueError("Invalid hop offsets") + fanout_length = int(offsets_length / len(batch_ids)) + + for p in range(0, int(ceil(len(batch_ids) / batches_per_partition))): + partition_start = p * (batches_per_partition) + partition_end = (p + 1) * (batches_per_partition) + + label_hop_offsets_current_partition = label_hop_offsets.iloc[ + partition_start * fanout_length : partition_end * fanout_length + 1 + ].reset_index(drop=True) + label_hop_offsets_current_partition.name = "label_hop_offsets" + + batch_ids_current_partition = batch_ids.iloc[partition_start:partition_end] + + ( + major_offsets_start, + major_offsets_end, + ) = label_hop_offsets_current_partition.iloc[ + [0, -1] + ].values # legal since offsets has the 1 extra offset + results_start, results_end = major_offsets_array[ + [major_offsets_start, major_offsets_end] + ] # avoid d2h copy + + # no need to use end batch id, just ensure the batch is labeled correctly + start_batch_id = batch_ids_current_partition.iloc[0] + # end_batch_id = batch_ids_current_partition.iloc[-1] + + # create the renumber map offsets + renumber_map_offsets_current_partition = renumber_map_offsets.iloc[ + partition_start : partition_end + 1 + ].reset_index(drop=True) + renumber_map_offsets_current_partition.name = 
"renumber_map_offsets" + + ( + renumber_map_start, + renumber_map_end, + ) = renumber_map_offsets_current_partition.iloc[ + [0, -1] + ].values # avoid d2h copy + + results_current_partition = create_df_from_disjoint_series( + [ + cudf.Series(minors_array[results_start:results_end], name="minors"), + cudf.Series( + renumber_map.map.values[renumber_map_start:renumber_map_end], + name="map", + ), + label_hop_offsets_current_partition, + cudf.Series( + major_offsets_array[major_offsets_start : major_offsets_end + 1], + name="major_offsets", + ), + cudf.Series(weight_array[results_start:results_end], name="weight"), + cudf.Series(edge_id_array[results_start:results_end], name="edge_id"), + cudf.Series( + edge_type_array[results_start:results_end], name="edge_type" + ), + renumber_map_offsets_current_partition, + ] + ) + + end_batch_id = start_batch_id + len(batch_ids_current_partition) - 1 + filename = f"batch={start_batch_id}-{end_batch_id}.parquet" + full_output_path = os.path.join(output_path, filename) + + results_current_partition.to_parquet( + full_output_path, compression=None, index=False, force_nullable_schema=True + ) + + return cudf.Series(dtype="int64") + + +def _write_samples_to_parquet_coo( + results: cudf.DataFrame, + offsets: cudf.DataFrame, + renumber_map: cudf.DataFrame, + batches_per_partition: int, + output_path: str, + partition_info: Optional[Union[dict, str]] = None, +) -> cudf.Series: + """ + Writes COO compressed samples to parquet. Batches that are empty are discarded, and the remaining non-empty batches are renumbered to be contiguous starting from the first @@ -60,8 +251,10 @@ def _write_samples_to_parquet( if partition_info != "sg" and (not isinstance(partition_info, dict)): raise ValueError("Invalid value of partition_info") + offsets = offsets[:-1] + # Offsets is always in order, so the last batch id is always the highest - max_batch_id = offsets.batch_id.iloc[len(offsets) - 1] + max_batch_id = offsets.batch_id.iloc[-1] results.dropna(axis=1, how="all", inplace=True) results["hop_id"] = results["hop_id"].astype("uint8") @@ -182,9 +375,23 @@ def write_samples( output_path: str The output path (where parquet files should be written to). """ + + if ("majors" in results.columns) and ("minors" in results.columns): + write_fn = _write_samples_to_parquet_coo + + # TODO these names will be deprecated in release 23.12 + elif ("sources" in results.columns) and ("destinations" in results.columns): + write_fn = _write_samples_to_parquet_coo + + elif "major_offsets" in results.columns and "minors" in results.columns: + write_fn = _write_samples_to_parquet_csr + + else: + raise ValueError("invalid columns") + if hasattr(results, "compute"): results.map_partitions( - _write_samples_to_parquet, + write_fn, offsets, renumber_map, batches_per_partition, @@ -194,7 +401,7 @@ def write_samples( ).compute() else: - _write_samples_to_parquet( + write_fn( results, offsets, renumber_map, diff --git a/python/cugraph/cugraph/sampling/sampling_utilities.py b/python/cugraph/cugraph/sampling/sampling_utilities.py new file mode 100644 index 00000000000..50c315129dc --- /dev/null +++ b/python/cugraph/cugraph/sampling/sampling_utilities.py @@ -0,0 +1,198 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cupy +import cudf + +import warnings + + +def sampling_results_from_cupy_array_dict( + cupy_array_dict, + weight_t, + num_hops, + with_edge_properties=False, + return_offsets=False, + renumber=False, + use_legacy_names=True, + include_hop_column=True, +): + """ + Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper + """ + results_df = cudf.DataFrame() + + if use_legacy_names: + major_col_name = "sources" + minor_col_name = "destinations" + warning_msg = ( + "The legacy column names (sources, destinations)" + " will no longer be supported for uniform_neighbor_sample" + " in release 23.12. The use_legacy_names=False option will" + " become the only option, and (majors, minors) will be the" + " only supported column names." + ) + warnings.warn(warning_msg, FutureWarning) + else: + major_col_name = "majors" + minor_col_name = "minors" + + if with_edge_properties: + majors = cupy_array_dict["majors"] + if majors is not None: + results_df["majors"] = majors + + results_df_cols = [ + "minors", + "weight", + "edge_id", + "edge_type", + ] + + for col in results_df_cols: + array = cupy_array_dict[col] + # The length of each of these arrays should be the same + results_df[col] = array + + results_df.rename( + columns={"majors": major_col_name, "minors": minor_col_name}, inplace=True + ) + + label_hop_offsets = cupy_array_dict["label_hop_offsets"] + batch_ids = cupy_array_dict["batch_id"] + + if renumber: + renumber_df = cudf.DataFrame( + { + "map": cupy_array_dict["renumber_map"], + } + ) + + if not return_offsets: + if len(batch_ids) > 0: + batch_ids_r = cudf.Series(batch_ids).repeat( + cupy.diff(cupy_array_dict["renumber_map_offsets"]) + ) + batch_ids_r.reset_index(drop=True, inplace=True) + renumber_df["batch_id"] = batch_ids_r + else: + renumber_df["batch_id"] = None + + if return_offsets: + batches_series = cudf.Series( + batch_ids, + name="batch_id", + ) + if include_hop_column: + # TODO remove this logic in release 23.12 + offsets_df = cudf.Series( + label_hop_offsets[cupy.arange(len(batch_ids) + 1) * num_hops], + name="offsets", + ).to_frame() + else: + offsets_df = cudf.Series( + label_hop_offsets, + name="offsets", + ).to_frame() + + if len(batches_series) > len(offsets_df): + # this is extremely rare so the inefficiency is ok + offsets_df = offsets_df.join(batches_series, how="outer").sort_index() + else: + offsets_df["batch_id"] = batches_series + + if renumber: + renumber_offset_series = cudf.Series( + cupy_array_dict["renumber_map_offsets"], name="renumber_map_offsets" + ) + + if len(renumber_offset_series) > len(offsets_df): + # this is extremely rare so the inefficiency is ok + offsets_df = offsets_df.join( + renumber_offset_series, how="outer" + ).sort_index() + else: + offsets_df["renumber_map_offsets"] = renumber_offset_series + + else: + if len(batch_ids) > 0: + batch_ids_r = cudf.Series(cupy.repeat(batch_ids, num_hops)) + batch_ids_r = cudf.Series(batch_ids_r).repeat( + cupy.diff(label_hop_offsets) + ) + batch_ids_r.reset_index(drop=True, inplace=True) + + results_df["batch_id"] = batch_ids_r + else: + results_df["batch_id"] = None + + # TODO 
remove this logic in release 23.12, hops will always returned as offsets + if include_hop_column: + if len(batch_ids) > 0: + hop_ids_r = cudf.Series(cupy.arange(num_hops)) + hop_ids_r = cudf.concat([hop_ids_r] * len(batch_ids), ignore_index=True) + + # generate the hop column + hop_ids_r = ( + cudf.Series(hop_ids_r, name="hop_id") + .repeat(cupy.diff(label_hop_offsets)) + .reset_index(drop=True) + ) + else: + hop_ids_r = cudf.Series(name="hop_id", dtype="int32") + + results_df = results_df.join(hop_ids_r, how="outer").sort_index() + + if major_col_name not in results_df: + if use_legacy_names: + raise ValueError("Can't use legacy names with major offsets") + + major_offsets_series = cudf.Series( + cupy_array_dict["major_offsets"], name="major_offsets" + ) + if len(major_offsets_series) > len(results_df): + # this is extremely rare so the inefficiency is ok + results_df = results_df.join( + major_offsets_series, how="outer" + ).sort_index() + else: + results_df["major_offsets"] = major_offsets_series + + else: + # TODO this is deprecated, remove it in 23.12 + + results_df[major_col_name] = cupy_array_dict["sources"] + results_df[minor_col_name] = cupy_array_dict["destinations"] + indices = cupy_array_dict["indices"] + + if indices is None: + results_df["indices"] = None + else: + results_df["indices"] = indices + if weight_t == "int32": + results_df["indices"] = indices.astype("int32") + elif weight_t == "int64": + results_df["indices"] = indices.astype("int64") + else: + results_df["indices"] = indices + + if return_offsets: + if renumber: + return results_df, offsets_df, renumber_df + else: + return results_df, offsets_df + + if renumber: + return results_df, renumber_df + + return (results_df,) diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index 219854bb002..1832585c0ab 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -16,6 +16,8 @@ from pylibcugraph import ResourceHandle from pylibcugraph import uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample +from cugraph.sampling.sampling_utilities import sampling_results_from_cupy_array_dict + import numpy import cudf @@ -58,15 +60,20 @@ def uniform_neighbor_sample( G: Graph, start_list: Sequence, fanout_vals: List[int], + *, with_replacement: bool = True, with_edge_properties: bool = False, # deprecated with_batch_ids: bool = False, random_state: int = None, return_offsets: bool = False, return_hops: bool = True, + include_hop_column: bool = True, # deprecated prior_sources_behavior: str = None, deduplicate_sources: bool = False, renumber: bool = False, + use_legacy_names: bool = True, # deprecated + compress_per_hop: bool = False, + compression: str = "COO", ) -> Union[cudf.DataFrame, Tuple[cudf.DataFrame, cudf.DataFrame]]: """ Does neighborhood sampling, which samples nodes from a graph based on the @@ -111,6 +118,12 @@ def uniform_neighbor_sample( corresponding to the hop where the edge appeared. Defaults to True. + include_hop_column: bool, optional (default=True) + Deprecated. Defaults to True. + If True, will include the hop column even if + return_offsets is True. This option will + be removed in release 23.12. + prior_sources_behavior: str, optional (default=None) Options are "carryover", and "exclude". Default will leave the source list as-is. 
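A rough, illustrative sketch of calling the updated single-GPU API with these keyword-only options (it assumes the karate dataset is available locally and that the remaining deprecation warnings are acceptable; it is not taken from this patch):

import cudf
import cugraph
from cugraph.datasets import karate

el = karate.get_edgelist()
G = cugraph.Graph(directed=True)
G.from_cudf_edgelist(el, source="src", destination="dst", edge_attr="wgt")

seeds = cudf.Series([0, 1, 2], dtype="int32")

# COO output with the new "majors"/"minors" column names; non-COO compression
# additionally requires prior_sources_behavior="exclude" (or compress_per_hop=True
# with "carryover"), per the validation introduced in this change.
df = cugraph.uniform_neighbor_sample(
    G,
    seeds,
    fanout_vals=[5, 5],
    with_edge_properties=True,
    use_legacy_names=False,
    prior_sources_behavior="exclude",
    deduplicate_sources=True,
    compression="COO",
    compress_per_hop=False,
)

print(df[["majors", "minors", "hop_id", "batch_id"]].head())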
@@ -129,6 +142,21 @@ def uniform_neighbor_sample( will return the renumber map and renumber map offsets as an additional dataframe. + use_legacy_names: bool, optional (default=True) + Whether to use the legacy column names (sources, destinations). + If True, will use "sources" and "destinations" as the column names. + If False, will use "majors" and "minors" as the column names. + Deprecated. Will be removed in release 23.12 in favor of always + using the new names "majors" and "minors". + + compress_per_hop: bool, optional (default=False) + Whether to compress globally (default), or to produce a separate + compressed edgelist per hop. + + compression: str, optional (default=COO) + Sets the compression type for the output minibatches. + Valid options are COO (default), CSR, CSC, DCSR, and DCSC. + Returns ------- result : cudf.DataFrame or Tuple[cudf.DataFrame, cudf.DataFrame] @@ -193,12 +221,62 @@ def uniform_neighbor_sample( Contains the batch offsets for the renumber maps """ + if use_legacy_names: + major_col_name = "sources" + minor_col_name = "destinations" + warning_msg = ( + "The legacy column names (sources, destinations)" + " will no longer be supported for uniform_neighbor_sample" + " in release 23.12. The use_legacy_names=False option will" + " become the only option, and (majors, minors) will be the" + " only supported column names." + ) + warnings.warn(warning_msg, FutureWarning) + else: + major_col_name = "majors" + minor_col_name = "minors" + + if compression not in ["COO", "CSR", "CSC", "DCSR", "DCSC"]: + raise ValueError("compression must be one of COO, CSR, CSC, DCSR, or DCSC") + + if ( + (compression != "COO") + and (not compress_per_hop) + and prior_sources_behavior != "exclude" + ): + raise ValueError( + "hop-agnostic compression is only supported with" + " the exclude prior sources behavior due to limitations " + "of the libcugraph C++ API" + ) + + if compress_per_hop and prior_sources_behavior != "carryover": + raise ValueError( + "Compressing the edgelist per hop is only supported " + "with the carryover prior sources behavior due to limitations" + " of the libcugraph C++ API" + ) + + if include_hop_column: + warning_msg = ( + "The include_hop_column flag is deprecated and will be" + " removed in the next release in favor of always " + "excluding the hop column when return_offsets is True" + ) + warnings.warn(warning_msg, FutureWarning) + + if compression != "COO": + raise ValueError( + "Including the hop id column is only supported with COO compression." + ) + if with_edge_properties: warning_msg = ( "The with_edge_properties flag is deprecated" - " and will be removed in the next release." 
+ " and will be removed in the next release in favor" + " of returning all properties in the graph" ) - warnings.warn(warning_msg, DeprecationWarning) + warnings.warn(warning_msg, FutureWarning) if isinstance(start_list, int): start_list = [start_list] @@ -255,7 +333,7 @@ def uniform_neighbor_sample( start_list = G.lookup_internal_vertex_id(start_list, columns) start_list = start_list.rename(columns={columns[0]: start_col_name}) - sampling_result = pylibcugraph_uniform_neighbor_sample( + sampling_result_array_dict = pylibcugraph_uniform_neighbor_sample( resource_handle=ResourceHandle(), input_graph=G._plc_graph, start_list=start_list[start_col_name], @@ -271,104 +349,27 @@ def uniform_neighbor_sample( deduplicate_sources=deduplicate_sources, return_hops=return_hops, renumber=renumber, + compression=compression, + compress_per_hop=compress_per_hop, + return_dict=True, ) - df = cudf.DataFrame() - - if with_edge_properties: - # TODO use a dictionary at PLC w/o breaking users - if renumber: - ( - sources, - destinations, - weights, - edge_ids, - edge_types, - batch_ids, - offsets, - hop_ids, - renumber_map, - renumber_map_offsets, - ) = sampling_result - else: - ( - sources, - destinations, - weights, - edge_ids, - edge_types, - batch_ids, - offsets, - hop_ids, - ) = sampling_result - - df["sources"] = sources - df["destinations"] = destinations - df["weight"] = weights - df["edge_id"] = edge_ids - df["edge_type"] = edge_types - df["hop_id"] = hop_ids - - if renumber: - renumber_df = cudf.DataFrame( - { - "map": renumber_map, - } - ) - - if not return_offsets: - batch_ids_r = cudf.Series(batch_ids).repeat( - cp.diff(renumber_map_offsets) - ) - batch_ids_r.reset_index(drop=True, inplace=True) - renumber_df["batch_id"] = batch_ids_r - - if return_offsets: - offsets_df = cudf.DataFrame( - { - "batch_id": batch_ids, - "offsets": offsets[:-1], - } - ) - - if renumber: - offsets_df["renumber_map_offsets"] = renumber_map_offsets[:-1] - - else: - if len(batch_ids) > 0: - batch_ids = cudf.Series(batch_ids).repeat(cp.diff(offsets)) - batch_ids.reset_index(drop=True, inplace=True) - - df["batch_id"] = batch_ids - - else: - sources, destinations, indices = sampling_result - - df["sources"] = sources - df["destinations"] = destinations - - if indices is None: - df["indices"] = None - else: - df["indices"] = indices - if weight_t == "int32": - df["indices"] = indices.astype("int32") - elif weight_t == "int64": - df["indices"] = indices.astype("int64") - else: - df["indices"] = indices + dfs = sampling_results_from_cupy_array_dict( + sampling_result_array_dict, + weight_t, + len(fanout_vals), + with_edge_properties=with_edge_properties, + return_offsets=return_offsets, + renumber=renumber, + use_legacy_names=use_legacy_names, + include_hop_column=include_hop_column, + ) if G.renumbered and not renumber: - df = G.unrenumber(df, "sources", preserve_order=True) - df = G.unrenumber(df, "destinations", preserve_order=True) - - if return_offsets: - if renumber: - return df, offsets_df, renumber_df - else: - return df, offsets_df + dfs[0] = G.unrenumber(dfs[0], major_col_name, preserve_order=True) + dfs[0] = G.unrenumber(dfs[0], minor_col_name, preserve_order=True) - if renumber: - return df, renumber_df + if len(dfs) > 1: + return dfs - return df + return dfs[0] diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py index 5ea79e0893a..a945881394b 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py +++ 
b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py @@ -16,7 +16,7 @@ import cudf import cupy import cugraph -from cugraph.datasets import karate +from cugraph.datasets import karate, email_Eu_core from cugraph.experimental.gnn import BulkSampler from cugraph.utilities.utils import create_directory_with_overwrite @@ -297,3 +297,53 @@ def test_bulk_sampler_empty_batches(scratch_dir): assert df.batch_id.max() == 1 shutil.rmtree(samples_path) + + +@pytest.mark.sg +def test_bulk_sampler_csr(scratch_dir): + el = email_Eu_core.get_edgelist() + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(el, source="src", destination="dst") + + samples_path = os.path.join(scratch_dir, "test_bulk_sampler_csr") + create_directory_with_overwrite(samples_path) + + bs = BulkSampler( + batch_size=7, + output_path=samples_path, + graph=G, + fanout_vals=[5, 4, 3], + with_replacement=False, + batches_per_partition=7, + renumber=True, + use_legacy_names=False, + compression="CSR", + compress_per_hop=False, + prior_sources_behavior="exclude", + include_hop_column=False, + ) + + seeds = G.select_random_vertices(62, 1000) + batch_ids = cudf.Series( + cupy.repeat(cupy.arange(int(1000 / 7) + 1, dtype="int32"), 7)[:1000] + ).sort_values() + + batch_df = cudf.DataFrame( + { + "seed": seeds, + "batch": batch_ids, + } + ) + + bs.add_batches(batch_df, start_col_name="seed", batch_col_name="batch") + bs.flush() + + assert len(os.listdir(samples_path)) == 21 + + for file in os.listdir(samples_path): + df = cudf.read_parquet(os.path.join(samples_path, file)) + + assert df.major_offsets.dropna().iloc[-1] - df.major_offsets.iloc[0] == len(df) + + shutil.rmtree(samples_path) diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py index f71c16a8368..5eafe89ea83 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io.py @@ -16,6 +16,7 @@ import pytest +import cupy import cudf from cugraph.gnn.data_loading.bulk_sampler_io import write_samples from cugraph.utilities.utils import create_directory_with_overwrite @@ -34,7 +35,9 @@ def test_bulk_sampler_io(scratch_dir): } ) - offsets = cudf.DataFrame({"offsets": [0, 8], "batch_id": [0, 1]}) + assert len(results) == 12 + + offsets = cudf.DataFrame({"offsets": [0, 8, 12], "batch_id": [0, 1, None]}) samples_path = os.path.join(scratch_dir, "test_bulk_sampler_io") create_directory_with_overwrite(samples_path) @@ -138,8 +141,12 @@ def test_bulk_sampler_io_empty_batch(scratch_dir): } ) + assert len(results) == 20 + # some batches are missing - offsets = cudf.DataFrame({"offsets": [0, 8, 12, 16], "batch_id": [0, 3, 4, 10]}) + offsets = cudf.DataFrame( + {"offsets": [0, 8, 12, 16, 20], "batch_id": [0, 3, 4, 10, None]} + ) samples_path = os.path.join(scratch_dir, "test_bulk_sampler_io_empty_batch") create_directory_with_overwrite(samples_path) @@ -157,3 +164,61 @@ def test_bulk_sampler_io_empty_batch(scratch_dir): df1 = cudf.read_parquet(os.path.join(samples_path, "batch=4-5.parquet")) assert df1.batch_id.min() == 4 assert df1.batch_id.max() == 5 + + shutil.rmtree(samples_path) + + +@pytest.mark.sg +def test_bulk_sampler_io_mock_csr(scratch_dir): + major_offsets_array = cudf.Series([0, 5, 10, 15]) + minors_array = cudf.Series([1, 2, 3, 4, 8, 9, 1, 3, 4, 5, 3, 0, 4, 9, 1]) + edge_ids = cudf.Series(cupy.arange(len(minors_array))) + + # 2 hops + label_hop_offsets = cudf.Series([0, 1, 3]) + + # map + renumber_map = 
cudf.Series(cupy.arange(10)) + renumber_map_offsets = cudf.Series([0, 10]) + + results_df = cudf.DataFrame() + results_df["minors"] = minors_array + results_df["major_offsets"] = major_offsets_array + results_df["edge_id"] = edge_ids + results_df["edge_type"] = None + results_df["weight"] = None + + offsets_df = cudf.DataFrame() + offsets_df["offsets"] = label_hop_offsets + offsets_df["renumber_map_offsets"] = renumber_map_offsets + offsets_df["batch_id"] = cudf.Series([0]) + + renumber_df = cudf.DataFrame() + renumber_df["map"] = renumber_map + + samples_path = os.path.join(scratch_dir, "test_bulk_sampler_io_mock_csr") + create_directory_with_overwrite(samples_path) + + write_samples(results_df, offsets_df, renumber_df, 1, samples_path) + + result = cudf.read_parquet(os.path.join(samples_path, "batch=0-0.parquet")) + + assert ( + result.minors.dropna().values_host.tolist() == minors_array.values_host.tolist() + ) + assert ( + result.major_offsets.dropna().values_host.tolist() + == major_offsets_array.values_host.tolist() + ) + assert result.edge_id.dropna().values_host.tolist() == edge_ids.values_host.tolist() + assert ( + result.renumber_map_offsets.dropna().values_host.tolist() + == renumber_map_offsets.values_host.tolist() + ) + assert result.map.dropna().values_host.tolist() == renumber_map.values_host.tolist() + assert ( + result.label_hop_offsets.dropna().values_host.tolist() + == label_hop_offsets.values_host.tolist() + ) + + shutil.rmtree(samples_path) diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py index 41f68c08e5c..638cccbdcaa 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_io_mg.py @@ -38,8 +38,12 @@ def test_bulk_sampler_io(scratch_dir): divisions=[0, 8, 11] ) - offsets = cudf.DataFrame({"offsets": [0, 0], "batch_id": [0, 1]}) - offsets = dask_cudf.from_cudf(offsets, npartitions=2) + assert len(results) == 12 + + offsets = cudf.DataFrame({"offsets": [0, 8, 0, 4], "batch_id": [0, None, 1, None]}) + offsets = dask_cudf.from_cudf(offsets, npartitions=1).repartition( + divisions=[0, 2, 3] + ) samples_path = os.path.join(scratch_dir, "mg_test_bulk_sampler_io") create_directory_with_overwrite(samples_path) @@ -149,9 +153,11 @@ def test_bulk_sampler_io_empty_batch(scratch_dir): ) # some batches are missing - offsets = cudf.DataFrame({"offsets": [0, 8, 0, 4], "batch_id": [0, 3, 4, 10]}) + offsets = cudf.DataFrame( + {"offsets": [0, 8, 12, 0, 4, 8], "batch_id": [0, 3, None, 4, 10, None]} + ) offsets = dask_cudf.from_cudf(offsets, npartitions=1).repartition( - divisions=[0, 2, 3] + divisions=[0, 3, 5] ) samples_path = os.path.join(scratch_dir, "mg_test_bulk_sampler_io_empty_batch") diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py index eded435f897..aee81e5ffed 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py @@ -21,7 +21,7 @@ import cupy import cugraph import dask_cudf -from cugraph.datasets import karate +from cugraph.datasets import karate, email_Eu_core from cugraph.experimental import BulkSampler from cugraph.utilities.utils import create_directory_with_overwrite @@ -247,3 +247,59 @@ def test_bulk_sampler_empty_batches(dask_client, scratch_dir): assert df.batch_id.max() == 1 shutil.rmtree(samples_path) + + 
+@pytest.mark.mg +@pytest.mark.parametrize("mg_input", [True, False]) +def test_bulk_sampler_csr(dask_client, scratch_dir, mg_input): + nworkers = len(dask_client.scheduler_info()["workers"]) + el = dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=nworkers * 2) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(el, source="src", destination="dst") + + samples_path = os.path.join(scratch_dir, "mg_test_bulk_sampler_csr") + create_directory_with_overwrite(samples_path) + + bs = BulkSampler( + batch_size=7, + output_path=samples_path, + graph=G, + fanout_vals=[5, 4, 3], + with_replacement=False, + batches_per_partition=7, + renumber=True, + use_legacy_names=False, + compression="CSR", + compress_per_hop=True, + prior_sources_behavior="carryover", + deduplicate_sources=True, + include_hop_column=False, + ) + + seeds = G.select_random_vertices(62, 1000) + batch_ids = cudf.Series( + cupy.repeat(cupy.arange(int(1000 / 7) + 1, dtype="int32"), 7)[:1000] + ).sort_values() + + batch_df = cudf.DataFrame( + { + "seed": seeds.compute().values, + "batch": batch_ids, + } + ) + + if mg_input: + batch_df = dask_cudf.from_cudf(batch_df, npartitions=2) + + bs.add_batches(batch_df, start_col_name="seed", batch_col_name="batch") + bs.flush() + + assert len(os.listdir(samples_path)) == 21 + + for file in os.listdir(samples_path): + df = cudf.read_parquet(os.path.join(samples_path, file)) + + assert df.major_offsets.dropna().iloc[-1] - df.major_offsets.iloc[0] == len(df) + + shutil.rmtree(samples_path) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index 62599291d04..206898088ab 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -15,6 +15,7 @@ import pytest +import cupy import cudf import cugraph from cugraph import uniform_neighbor_sample @@ -151,7 +152,7 @@ def test_uniform_neighbor_sample_simple(input_combo): G, input_combo["start_list"], input_combo["fanout_vals"], - input_combo["with_replacement"], + with_replacement=input_combo["with_replacement"], ) print(input_df) @@ -254,7 +255,9 @@ def test_uniform_neighbor_sample_tree(directed): start_list = cudf.Series([0, 0], dtype="int32") fanout_vals = [4, 1, 3] with_replacement = True - result_nbr = uniform_neighbor_sample(G, start_list, fanout_vals, with_replacement) + result_nbr = uniform_neighbor_sample( + G, start_list, fanout_vals, with_replacement=with_replacement + ) result_nbr = result_nbr.drop_duplicates() @@ -288,7 +291,7 @@ def test_uniform_neighbor_sample_unweighted(simple_unweighted_input_expected_out test_data["Graph"], test_data["start_list"].astype("int64"), test_data["fanout_vals"], - test_data["with_replacement"], + with_replacement=test_data["with_replacement"], ) actual_src = sampling_results.sources @@ -303,7 +306,8 @@ def test_uniform_neighbor_sample_unweighted(simple_unweighted_input_expected_out @pytest.mark.sg @pytest.mark.cugraph_ops @pytest.mark.parametrize("return_offsets", [True, False]) -def test_uniform_neighbor_sample_edge_properties(return_offsets): +@pytest.mark.parametrize("include_hop_column", [True, False]) +def test_uniform_neighbor_sample_edge_properties(return_offsets, include_hop_column): edgelist_df = cudf.DataFrame( { "src": cudf.Series([0, 1, 2, 3, 4, 3, 4, 2, 0, 1, 0, 2], dtype="int32"), @@ -337,6 +341,7 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets): 
with_edge_properties=True, with_batch_ids=True, return_offsets=return_offsets, + include_hop_column=include_hop_column, ) if return_offsets: sampling_results, sampling_offsets = sampling_results @@ -359,11 +364,29 @@ def test_uniform_neighbor_sample_edge_properties(return_offsets): == sampling_results["destinations"].values_host.tolist() ) - assert sampling_results["hop_id"].values_host.tolist() == ([0, 0, 1, 1, 1, 1] * 2) + if include_hop_column: + assert sampling_results["hop_id"].values_host.tolist() == ( + [0, 0, 1, 1, 1, 1] * 2 + ) + else: + assert "hop_id" not in sampling_results if return_offsets: - assert sampling_offsets["batch_id"].values_host.tolist() == [0, 1] - assert sampling_offsets["offsets"].values_host.tolist() == [0, 6] + assert sampling_offsets["batch_id"].dropna().values_host.tolist() == [0, 1] + if include_hop_column: + assert sampling_offsets["offsets"].dropna().values_host.tolist() == [ + 0, + 6, + 12, + ] + else: + assert sampling_offsets["offsets"].dropna().values_host.tolist() == [ + 0, + 2, + 6, + 8, + 12, + ] else: assert sampling_results["batch_id"].values_host.tolist() == ([0] * 6 + [1] * 6) @@ -778,6 +801,176 @@ def test_uniform_neighbor_sample_renumber(hops): assert (renumber_map.batch_id == 0).all() +@pytest.mark.sg +@pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]]) +def test_uniform_neighbor_sample_offset_renumber(hops): + el = email_Eu_core.get_edgelist() + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.0001 * len(el))) + + ( + sampling_results_unrenumbered, + offsets_unrenumbered, + ) = cugraph.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + renumber=False, + return_offsets=True, + random_state=62, + ) + + ( + sampling_results_renumbered, + offsets_renumbered, + renumber_map, + ) = cugraph.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + renumber=True, + return_offsets=True, + random_state=62, + ) + + sources_hop_0 = sampling_results_unrenumbered[ + sampling_results_unrenumbered.hop_id == 0 + ].sources + for hop in range(len(hops)): + destinations_hop = sampling_results_unrenumbered[ + sampling_results_unrenumbered.hop_id <= hop + ].destinations + expected_renumber_map = cudf.concat([sources_hop_0, destinations_hop]).unique() + + assert sorted(expected_renumber_map.values_host.tolist()) == sorted( + renumber_map.map[0 : len(expected_renumber_map)].values_host.tolist() + ) + + renumber_map_offsets = offsets_renumbered.renumber_map_offsets.dropna() + assert len(renumber_map_offsets) == 2 + assert renumber_map_offsets.iloc[0] == 0 + assert renumber_map_offsets.iloc[-1] == len(renumber_map) + + assert len(offsets_renumbered) == 2 + + +@pytest.mark.sg +@pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]]) +@pytest.mark.parametrize("seed", [62, 66, 68]) +def test_uniform_neighbor_sample_csr_csc_global(hops, seed): + el = email_Eu_core.get_edgelist() + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(seed, int(0.0001 * len(el))) + + sampling_results, offsets, renumber_map = cugraph.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + # carryover not valid 
because C++ sorts on (hop,src) + prior_sources_behavior="exclude", + renumber=True, + return_offsets=True, + random_state=seed, + use_legacy_names=False, + compress_per_hop=False, + compression="CSR", + include_hop_column=False, + ) + + major_offsets = sampling_results["major_offsets"].dropna().values + majors = cudf.Series(cupy.arange(len(major_offsets) - 1)) + majors = majors.repeat(cupy.diff(major_offsets)) + + minors = sampling_results["minors"].dropna() + assert len(majors) == len(minors) + + majors = renumber_map.map.iloc[majors] + minors = renumber_map.map.iloc[minors] + + for i in range(len(majors)): + assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])]) + + +@pytest.mark.sg +@pytest.mark.parametrize("seed", [62, 66, 68]) +@pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]]) +def test_uniform_neighbor_sample_csr_csc_local(hops, seed): + el = email_Eu_core.get_edgelist(download=True) + + G = cugraph.Graph(directed=True) + G.from_cudf_edgelist(el, source="src", destination="dst") + + seeds = cudf.Series( + [49, 71], dtype="int32" + ) # hardcoded to ensure out-degree is high enough + + sampling_results, offsets, renumber_map = cugraph.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + prior_sources_behavior="carryover", + renumber=True, + return_offsets=True, + random_state=seed, + use_legacy_names=False, + compress_per_hop=True, + compression="CSR", + include_hop_column=False, + ) + + for hop in range(len(hops)): + major_offsets = sampling_results["major_offsets"].iloc[ + offsets.offsets.iloc[hop] : (offsets.offsets.iloc[hop + 1] + 1) + ] + + minors = sampling_results["minors"].iloc[ + major_offsets.iloc[0] : major_offsets.iloc[-1] + ] + + majors = cudf.Series(cupy.arange(len(major_offsets) - 1)) + majors = majors.repeat(cupy.diff(major_offsets)) + + majors = renumber_map.map.iloc[majors] + minors = renumber_map.map.iloc[minors] + + for i in range(len(majors)): + assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])]) + + +@pytest.mark.sg +@pytest.mark.skip(reason="needs to be written!") +def test_uniform_neighbor_sample_dcsr_dcsc_global(): + raise NotImplementedError + + +@pytest.mark.sg +@pytest.mark.skip(reason="needs to be written!") +def test_uniform_neighbor_sample_dcsr_dcsc_local(): + raise NotImplementedError + + @pytest.mark.sg @pytest.mark.skip(reason="needs to be written!") def test_multi_client_sampling(): diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py index 9d87c097287..460a25cbd14 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py @@ -17,6 +17,7 @@ import pytest +import pandas import cupy import cudf import cugraph @@ -138,7 +139,7 @@ def test_mg_uniform_neighbor_sample_simple(dask_client, input_combo): dg, input_combo["start_list"], input_combo["fanout_vals"], - input_combo["with_replacement"], + with_replacement=input_combo["with_replacement"], ) # multi edges are dropped to easily verify that each edge in the @@ -228,7 +229,9 @@ def test_mg_uniform_neighbor_sample_tree(dask_client, directed): start_list = cudf.Series([0, 0], dtype="int32") fanout_vals = [4, 1, 3] with_replacement = True - result_nbr = uniform_neighbor_sample(G, start_list, fanout_vals, with_replacement) + result_nbr = 
uniform_neighbor_sample( + G, start_list, fanout_vals, with_replacement=with_replacement + ) result_nbr = result_nbr.drop_duplicates() @@ -283,7 +286,7 @@ def test_mg_uniform_neighbor_sample_unweighted(dask_client): with_replacement = True sampling_results = uniform_neighbor_sample( - G, start_list, fanout_vals, with_replacement + G, start_list, fanout_vals, with_replacement=with_replacement ) expected_src = [0, 0] @@ -380,13 +383,17 @@ def test_uniform_neighbor_sample_edge_properties(dask_client, return_offsets): dfp = sampling_results.get_partition(i).compute() if len(dfp) > 0: offsets_p = sampling_offsets.get_partition(i).compute() + print(offsets_p) assert len(offsets_p) > 0 if offsets_p.batch_id.iloc[0] == 1: batches_found[1] += 1 - assert offsets_p.batch_id.values_host.tolist() == [1] - assert offsets_p.offsets.values_host.tolist() == [0] + assert offsets_p.batch_id.dropna().values_host.tolist() == [1] + assert offsets_p.offsets.dropna().values_host.tolist() == [ + 0, + len(dfp), + ] assert sorted(dfp.sources.values_host.tolist()) == ( [1, 1, 3, 3, 4, 4] @@ -397,8 +404,11 @@ def test_uniform_neighbor_sample_edge_properties(dask_client, return_offsets): elif offsets_p.batch_id.iloc[0] == 0: batches_found[0] += 1 - assert offsets_p.batch_id.values_host.tolist() == [0] - assert offsets_p.offsets.values_host.tolist() == [0] + assert offsets_p.batch_id.dropna().values_host.tolist() == [0] + assert offsets_p.offsets.dropna().values_host.tolist() == [ + 0, + len(dfp), + ] assert sorted(dfp.sources.values_host.tolist()) == ( [0, 0, 0, 1, 1, 2, 2, 2, 4, 4] @@ -703,7 +713,6 @@ def test_uniform_neighbor_sample_batched(dask_client, dataset, input_df, max_bat source="src", destination="dst", edge_attr=["wgt", "eid", "etp"], - legacy_renum_only=True, ) input_vertices = dask_cudf.concat([df.src, df.dst]).unique().compute() @@ -960,7 +969,6 @@ def test_uniform_neighbor_sample_deduplicate_sources_email_eu_core(dask_client): @pytest.mark.mg @pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]]) -@pytest.mark.tags("runme") def test_uniform_neighbor_sample_renumber(dask_client, hops): # FIXME This test is not very good because there is a lot of # non-deterministic behavior that still exists despite passing @@ -1005,6 +1013,224 @@ def test_uniform_neighbor_sample_renumber(dask_client, hops): ) +@pytest.mark.mg +@pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]]) +def test_uniform_neighbor_sample_offset_renumber(dask_client, hops): + el = dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=4) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(62, int(0.0001 * len(el))) + + ( + sampling_results_unrenumbered, + offsets_unrenumbered, + ) = cugraph.dask.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + renumber=False, + return_offsets=True, + random_state=62, + ) + sampling_results_unrenumbered = sampling_results_unrenumbered.compute() + offsets_unrenumbered = offsets_unrenumbered.compute() + + ( + sampling_results_renumbered, + offsets_renumbered, + renumber_map, + ) = cugraph.dask.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + renumber=True, + keep_batches_together=True, + min_batch_id=0, + max_batch_id=0, + return_offsets=True, + random_state=62, + ) + + # can't use compute() since empty 
batches still get a partition + n_workers = len(dask_client.scheduler_info()["workers"]) + for p in range(n_workers): + partition = offsets_renumbered.get_partition(p).compute() + if not pandas.isna(partition.batch_id.iloc[0]): + break + + sampling_results_renumbered = sampling_results_renumbered.get_partition(p).compute() + offsets_renumbered = offsets_renumbered.get_partition(p).compute() + renumber_map = renumber_map.get_partition(p).compute() + + sources_hop_0 = sampling_results_unrenumbered[ + sampling_results_unrenumbered.hop_id == 0 + ].sources + for hop in range(len(hops)): + destinations_hop = sampling_results_unrenumbered[ + sampling_results_unrenumbered.hop_id <= hop + ].destinations + expected_renumber_map = cudf.concat([sources_hop_0, destinations_hop]).unique() + + assert sorted(expected_renumber_map.values_host.tolist()) == sorted( + renumber_map.map[0 : len(expected_renumber_map)].values_host.tolist() + ) + + renumber_map_offsets = offsets_renumbered.renumber_map_offsets.dropna() + assert len(renumber_map_offsets) == 2 + assert renumber_map_offsets.iloc[0] == 0 + assert renumber_map_offsets.iloc[-1] == len(renumber_map) + + assert len(offsets_renumbered) == 2 + + +@pytest.mark.mg +@pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]]) +@pytest.mark.parametrize("seed", [62, 66, 68]) +def test_uniform_neighbor_sample_csr_csc_global(dask_client, hops, seed): + el = dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=4) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(el, source="src", destination="dst") + + seeds = G.select_random_vertices(seed, int(0.0001 * len(el))) + + sampling_results, offsets, renumber_map = cugraph.dask.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + # carryover not valid because C++ sorts on (hop,src) + prior_sources_behavior="exclude", + renumber=True, + return_offsets=True, + random_state=seed, + use_legacy_names=False, + compress_per_hop=False, + compression="CSR", + include_hop_column=False, + keep_batches_together=True, + min_batch_id=0, + max_batch_id=0, + ) + + # can't use compute() since empty batches still get a partition + n_workers = len(dask_client.scheduler_info()["workers"]) + for p in range(n_workers): + partition = offsets.get_partition(p).compute() + if not pandas.isna(partition.batch_id.iloc[0]): + break + + sampling_results = sampling_results.get_partition(p).compute() + offsets = offsets.get_partition(p).compute() + renumber_map = renumber_map.get_partition(p).compute() + + major_offsets = sampling_results["major_offsets"].dropna().values + majors = cudf.Series(cupy.arange(len(major_offsets) - 1)) + majors = majors.repeat(cupy.diff(major_offsets)) + + minors = sampling_results["minors"].dropna() + assert len(majors) == len(minors) + + majors = renumber_map.map.iloc[majors] + minors = renumber_map.map.iloc[minors] + + for i in range(len(majors)): + assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])]) + + +@pytest.mark.mg +@pytest.mark.parametrize("seed", [62, 66, 68]) +@pytest.mark.parametrize("hops", [[5], [5, 5], [5, 5, 5]]) +def test_uniform_neighbor_sample_csr_csc_local(dask_client, hops, seed): + el = dask_cudf.from_cudf(email_Eu_core.get_edgelist(), npartitions=4) + + G = cugraph.Graph(directed=True) + G.from_dask_cudf_edgelist(el, source="src", destination="dst") + + seeds = dask_cudf.from_cudf( + cudf.Series([49, 71], dtype="int32"), npartitions=1 + ) # hardcoded to 
ensure out-degree is high enough + + sampling_results, offsets, renumber_map = cugraph.dask.uniform_neighbor_sample( + G, + seeds, + hops, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=False, + deduplicate_sources=True, + prior_sources_behavior="carryover", + renumber=True, + return_offsets=True, + random_state=seed, + use_legacy_names=False, + compress_per_hop=True, + compression="CSR", + include_hop_column=False, + keep_batches_together=True, + min_batch_id=0, + max_batch_id=0, + ) + + # can't use compute() since empty batches still get a partition + n_workers = len(dask_client.scheduler_info()["workers"]) + for p in range(n_workers): + partition = offsets.get_partition(p).compute() + + if not pandas.isna(partition.batch_id.iloc[0]): + break + + sampling_results = sampling_results.get_partition(p).compute() + offsets = offsets.get_partition(p).compute() + renumber_map = renumber_map.get_partition(p).compute() + + print(sampling_results) + print(offsets) + + for hop in range(len(hops)): + major_offsets = sampling_results["major_offsets"].iloc[ + offsets.offsets.iloc[hop] : (offsets.offsets.iloc[hop + 1] + 1) + ] + + minors = sampling_results["minors"].iloc[ + major_offsets.iloc[0] : major_offsets.iloc[-1] + ] + + majors = cudf.Series(cupy.arange(len(major_offsets) - 1)) + majors = majors.repeat(cupy.diff(major_offsets)) + + majors = renumber_map.map.iloc[majors] + minors = renumber_map.map.iloc[minors] + + for i in range(len(majors)): + assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])]) + + +@pytest.mark.mg +@pytest.mark.skip(reason="needs to be written!") +def test_uniform_neighbor_sample_dcsr_dcsc_global(): + raise NotImplementedError + + +@pytest.mark.mg +@pytest.mark.skip(reason="needs to be written!") +def test_uniform_neighbor_sample_dcsr_dcsc_local(): + raise NotImplementedError + + # ============================================================================= # Benchmarks # ============================================================================= diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index ffb458b409c..29c6d79e08d 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -176,15 +176,32 @@ cdef extern from "cugraph_c/algorithms.h": const cugraph_sample_result_t* result ) + # Deprecated, use cugraph_sample_result_get_majors cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_sources( const cugraph_sample_result_t* result ) + # Deprecated, use cugraph_sample_result_get_minors cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_destinations( const cugraph_sample_result_t* result ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_majors( + const cugraph_sample_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_minors( + const cugraph_sample_result_t* result + ) + + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_major_offsets( + const cugraph_sample_result_t* result + ) cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_index( @@ -211,11 +228,17 @@ cdef extern from "cugraph_c/algorithms.h": const cugraph_sample_result_t* result ) + cdef cugraph_type_erased_device_array_view_t* \ + cugraph_sample_result_get_label_hop_offsets( + const cugraph_sample_result_t* result + ) + cdef 
cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_start_labels( const cugraph_sample_result_t* result ) + # Deprecated cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_offsets( const cugraph_sample_result_t* result @@ -246,10 +269,17 @@ cdef extern from "cugraph_c/algorithms.h": pass ctypedef enum cugraph_prior_sources_behavior_t: - DEFAULT + DEFAULT=0 CARRY_OVER EXCLUDE + ctypedef enum cugraph_compression_type_t: + COO=0 + CSR + CSC + DCSR + DCSC + cdef cugraph_error_code_t \ cugraph_sampling_options_create( cugraph_sampling_options_t** options, @@ -277,7 +307,7 @@ cdef extern from "cugraph_c/algorithms.h": cdef void \ cugraph_sampling_set_prior_sources_behavior( cugraph_sampling_options_t* options, - cugraph_prior_sources_behavior_t value + cugraph_prior_sources_behavior_t value, ) cdef void \ @@ -286,10 +316,22 @@ cdef extern from "cugraph_c/algorithms.h": bool_t value, ) + cdef void \ + cugraph_sampling_set_compress_per_hop( + cugraph_sampling_options_t* options, + bool_t value, + ) + + cdef void \ + cugraph_sampling_set_compression_type( + cugraph_sampling_options_t* options, + cugraph_compression_type_t value, + ) + cdef void \ cugraph_sampling_options_free( cugraph_sampling_options_t* options, - ) + ) # uniform random walks cdef cugraph_error_code_t \ diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd index 91cc11d6b1c..c32b57f8621 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd @@ -43,23 +43,6 @@ from pylibcugraph._cugraph_c.array cimport ( cdef extern from "cugraph_c/sampling_algorithms.h": ########################################################################### - # deprecated, should migrate to cugraph_uniform_neighbor_sample - cdef cugraph_error_code_t cugraph_uniform_neighbor_sample_with_edge_properties( - const cugraph_resource_handle_t* handle, - cugraph_graph_t* graph, - const cugraph_type_erased_device_array_view_t* start_vertices, - const cugraph_type_erased_device_array_view_t* start_vertex_labels, - const cugraph_type_erased_device_array_view_t* label_list, - const cugraph_type_erased_device_array_view_t* label_to_comm_rank, - const cugraph_type_erased_host_array_view_t* fan_out, - cugraph_rng_state_t* rng_state, - bool_t with_replacement, - bool_t return_hops, - bool_t do_expensive_check, - cugraph_sample_result_t** result, - cugraph_error_t** error - ) - cdef cugraph_error_code_t cugraph_uniform_neighbor_sample( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, diff --git a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx index d11f6994298..9f98b4f37b0 100644 --- a/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx +++ b/python/pylibcugraph/pylibcugraph/internal_types/sampling_result.pyx @@ -20,14 +20,18 @@ from pylibcugraph._cugraph_c.array cimport ( ) from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sample_result_t, - cugraph_sample_result_get_sources, - cugraph_sample_result_get_destinations, + cugraph_sample_result_get_major_offsets, + cugraph_sample_result_get_majors, + cugraph_sample_result_get_minors, + cugraph_sample_result_get_label_hop_offsets, + cugraph_sample_result_get_sources, # deprecated + cugraph_sample_result_get_destinations, # deprecated 
cugraph_sample_result_get_edge_weight, cugraph_sample_result_get_edge_id, cugraph_sample_result_get_edge_type, - cugraph_sample_result_get_hop, + cugraph_sample_result_get_hop, # deprecated cugraph_sample_result_get_start_labels, - cugraph_sample_result_get_offsets, + cugraph_sample_result_get_offsets, # deprecated cugraph_sample_result_get_renumber_map, cugraph_sample_result_get_renumber_map_offsets, cugraph_sample_result_free, @@ -60,23 +64,71 @@ cdef class SamplingResult: cdef set_ptr(self, cugraph_sample_result_t* sample_result_ptr): self.c_sample_result_ptr = sample_result_ptr + def get_major_offsets(self): + if self.c_sample_result_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + + cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( + cugraph_sample_result_get_major_offsets(self.c_sample_result_ptr) + ) + if device_array_view_ptr is NULL: + return None + + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, + self) + + def get_majors(self): + if self.c_sample_result_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( + cugraph_sample_result_get_majors(self.c_sample_result_ptr) + ) + if device_array_view_ptr is NULL: + return None + + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, + self) + + def get_minors(self): + if self.c_sample_result_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( + cugraph_sample_result_get_minors(self.c_sample_result_ptr) + ) + if device_array_view_ptr is NULL: + return None + + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, + self) + def get_sources(self): + # Deprecated if self.c_sample_result_ptr is NULL: raise ValueError("pointer not set, must call set_ptr() with a " "non-NULL value first.") cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( cugraph_sample_result_get_sources(self.c_sample_result_ptr) ) + if device_array_view_ptr is NULL: + return None + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) def get_destinations(self): + # Deprecated if self.c_sample_result_ptr is NULL: raise ValueError("pointer not set, must call set_ptr() with a " "non-NULL value first.") cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( cugraph_sample_result_get_destinations(self.c_sample_result_ptr) ) + if device_array_view_ptr is NULL: + return None + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) @@ -95,6 +147,7 @@ cdef class SamplingResult: self) def get_indices(self): + # Deprecated return self.get_edge_weights() def get_edge_ids(self): @@ -132,9 +185,26 @@ cdef class SamplingResult: cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( cugraph_sample_result_get_start_labels(self.c_sample_result_ptr) ) + if device_array_view_ptr is NULL: + return None + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) + def get_label_hop_offsets(self): + if self.c_sample_result_ptr is NULL: + raise ValueError("pointer not set, must call set_ptr() with a " + "non-NULL value first.") + cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( + cugraph_sample_result_get_label_hop_offsets(self.c_sample_result_ptr) + ) + if device_array_view_ptr is NULL: 
+ return None + + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, + self) + + # Deprecated def get_offsets(self): if self.c_sample_result_ptr is NULL: raise ValueError("pointer not set, must call set_ptr() with a " @@ -142,9 +212,13 @@ cdef class SamplingResult: cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( cugraph_sample_result_get_offsets(self.c_sample_result_ptr) ) + if device_array_view_ptr is NULL: + return None + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) + # Deprecated def get_hop_ids(self): if self.c_sample_result_ptr is NULL: raise ValueError("pointer not set, must call set_ptr() with a " @@ -152,6 +226,9 @@ cdef class SamplingResult: cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( cugraph_sample_result_get_hop(self.c_sample_result_ptr) ) + if device_array_view_ptr is NULL: + return None + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) @@ -162,6 +239,9 @@ cdef class SamplingResult: cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( cugraph_sample_result_get_renumber_map(self.c_sample_result_ptr) ) + if device_array_view_ptr is NULL: + return None + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) @@ -172,5 +252,8 @@ cdef class SamplingResult: cdef cugraph_type_erased_device_array_view_t* device_array_view_ptr = ( cugraph_sample_result_get_renumber_map_offsets(self.c_sample_result_ptr) ) + if device_array_view_ptr is NULL: + return None + return create_cupy_array_view_for_device_ptr(device_array_view_ptr, self) \ No newline at end of file diff --git a/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py b/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py index 74aa6830d24..ac04635edcf 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_uniform_neighbor_sample.py @@ -266,7 +266,7 @@ def test_neighborhood_sampling_large_sg_graph(gpubenchmark): def test_sample_result(): """ - Ensure the SampleResult class returns zero-opy cupy arrays and properly + Ensure the SampleResult class returns zero-copy cupy arrays and properly frees device memory when all references to it are gone and it's garbage collected. """ @@ -304,6 +304,8 @@ def test_sample_result(): assert isinstance(destinations, cp.ndarray) assert isinstance(indices, cp.ndarray) + print("sources:", destinations) + # Delete the SampleResult instance. This *should not* free the device # memory yet since the variables sources, destinations, and indices are # keeping the refcount >0. 
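The CSR-style output introduced here (`major_offsets` plus `minors`) can be expanded back into explicit edges the same way the new tests do it. A minimal sketch, not part of the patch and using hypothetical array values, assuming a renumbered, single-batch CSR result:

```python
import cupy
import cudf

# Hypothetical CSR-style sample: 3 renumbered majors, 5 edges.
major_offsets = cupy.asarray([0, 2, 4, 5])
minors = cudf.Series([1, 2, 0, 3, 4])
renumber_map = cudf.Series([10, 20, 30, 40, 50])  # renumbered id -> input vertex id

# Repeat each renumbered major id by its out-degree to recover COO majors.
majors = (
    cudf.Series(cupy.arange(len(major_offsets) - 1))
    .repeat(cupy.diff(major_offsets))
    .reset_index(drop=True)
)

# Translate both endpoints back to the input graph's vertex ids.
src = renumber_map.iloc[majors].reset_index(drop=True)
dst = renumber_map.iloc[minors].reset_index(drop=True)
```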
diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index bc2aa9205f1..ce6493c38f5 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -38,6 +38,7 @@ from pylibcugraph._cugraph_c.graph cimport ( from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sample_result_t, cugraph_prior_sources_behavior_t, + cugraph_compression_type_t, cugraph_sampling_options_t, cugraph_sampling_options_create, cugraph_sampling_options_free, @@ -46,7 +47,8 @@ from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sampling_set_prior_sources_behavior, cugraph_sampling_set_dedupe_sources, cugraph_sampling_set_renumber_results, - + cugraph_sampling_set_compress_per_hop, + cugraph_sampling_set_compression_type, ) from pylibcugraph._cugraph_c.sampling_algorithms cimport ( cugraph_uniform_neighbor_sample, @@ -73,6 +75,7 @@ from pylibcugraph._cugraph_c.random cimport ( from pylibcugraph.random cimport ( CuGraphRandomState ) +import warnings # TODO accept cupy/numpy random state in addition to raw seed. def uniform_neighbor_sample(ResourceHandle resource_handle, @@ -90,7 +93,10 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, deduplicate_sources=False, return_hops=False, renumber=False, - random_state=None): + compression='COO', + compress_per_hop=False, + random_state=None, + return_dict=False,): """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. @@ -153,11 +159,27 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, If True, will renumber the sources and destinations on a per-batch basis and return the renumber map and batch offsets in additional to the standard returns. + + compression: str (Optional) + Options: COO (default), CSR, CSC, DCSR, DCSR + Sets the compression format for the returned samples. + + compress_per_hop: bool (Optional) + If False (default), will create a compressed edgelist for the + entire batch. + If True, will create a separate compressed edgelist per hop within + a batch. random_state: int (Optional) Random state to use when generating samples. Optional argument, defaults to a hash of process id, time, and hostname. (See pylibcugraph.random.CuGraphRandomState) + + return_dict: bool (Optional) + Whether to return a dictionary instead of a tuple. + Optional argument, defaults to False, returning a tuple. + This argument will eventually be deprecated in favor + of always returning a dictionary. Returns ------- @@ -173,13 +195,16 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, the renumber map for each batch starts). 
""" - cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + cdef cugraph_resource_handle_t* c_resource_handle_ptr = ( resource_handle.c_resource_handle_ptr + ) + cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr cdef bool_t c_deduplicate_sources = deduplicate_sources cdef bool_t c_return_hops = return_hops cdef bool_t c_renumber = renumber + cdef bool_t c_compress_per_hop = compress_per_hop assert_CAI_type(start_list, "start_list") assert_CAI_type(batch_id_list, "batch_id_list", True) @@ -269,6 +294,23 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, f'Invalid option {prior_sources_behavior}' ' for prior sources behavior' ) + + cdef cugraph_compression_type_t compression_behavior_e + if compression is None or compression == 'COO': + compression_behavior_e = cugraph_compression_type_t.COO + elif compression == 'CSR': + compression_behavior_e = cugraph_compression_type_t.CSR + elif compression == 'CSC': + compression_behavior_e = cugraph_compression_type_t.CSC + elif compression == 'DCSR': + compression_behavior_e = cugraph_compression_type_t.DCSR + elif compression == 'DCSC': + compression_behavior_e = cugraph_compression_type_t.DCSC + else: + raise ValueError( + f'Invalid option {compression}' + ' for compression type' + ) cdef cugraph_sampling_options_t* sampling_options error_code = cugraph_sampling_options_create(&sampling_options, &error_ptr) @@ -279,6 +321,8 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cugraph_sampling_set_dedupe_sources(sampling_options, c_deduplicate_sources) cugraph_sampling_set_prior_sources_behavior(sampling_options, prior_sources_behavior_e) cugraph_sampling_set_renumber_results(sampling_options, c_renumber) + cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e) + cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop) error_code = cugraph_uniform_neighbor_sample( c_resource_handle_ptr, @@ -311,26 +355,74 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, # Get cupy "views" of the individual arrays to return. These each increment # the refcount on the SamplingResult instance which will keep the data alive # until all references are removed and the GC runs. 
+ # TODO Return everything that isn't null in release 23.12 if with_edge_properties: - cupy_sources = result.get_sources() - cupy_destinations = result.get_destinations() + cupy_majors = result.get_majors() + cupy_major_offsets = result.get_major_offsets() + cupy_minors = result.get_minors() cupy_edge_weights = result.get_edge_weights() cupy_edge_ids = result.get_edge_ids() cupy_edge_types = result.get_edge_types() cupy_batch_ids = result.get_batch_ids() - cupy_offsets = result.get_offsets() - cupy_hop_ids = result.get_hop_ids() + cupy_label_hop_offsets = result.get_label_hop_offsets() if renumber: cupy_renumber_map = result.get_renumber_map() cupy_renumber_map_offsets = result.get_renumber_map_offsets() - return (cupy_sources, cupy_destinations, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_offsets, cupy_hop_ids, cupy_renumber_map, cupy_renumber_map_offsets) + # TODO drop the placeholder for hop ids in release 23.12 + if return_dict: + return { + 'major_offsets': cupy_major_offsets, + 'majors': cupy_majors, + 'minors': cupy_minors, + 'weight': cupy_edge_weights, + 'edge_id': cupy_edge_ids, + 'edge_type': cupy_edge_types, + 'batch_id': cupy_batch_ids, + 'label_hop_offsets': cupy_label_hop_offsets, + 'hop_id': None, + 'renumber_map': cupy_renumber_map, + 'renumber_map_offsets': cupy_renumber_map_offsets + } + else: + cupy_majors = cupy_major_offsets if cupy_majors is None else cupy_majors + return (cupy_majors, cupy_minors, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_label_hop_offsets, None, cupy_renumber_map, cupy_renumber_map_offsets) else: - return (cupy_sources, cupy_destinations, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_offsets, cupy_hop_ids) + cupy_hop_ids = result.get_hop_ids() # FIXME remove this + if return_dict: + return { + 'major_offsets': cupy_major_offsets, + 'majors': cupy_majors, + 'minors': cupy_minors, + 'weight': cupy_edge_weights, + 'edge_id': cupy_edge_ids, + 'edge_type': cupy_edge_types, + 'batch_id': cupy_batch_ids, + 'label_hop_offsets': cupy_label_hop_offsets, + 'hop_id': cupy_hop_ids, + } + else: + cupy_majors = cupy_major_offsets if cupy_majors is None else cupy_majors + return (cupy_majors, cupy_minors, cupy_edge_weights, cupy_edge_ids, cupy_edge_types, cupy_batch_ids, cupy_label_hop_offsets, cupy_hop_ids) else: + # TODO this is deprecated, remove it in release 23.12 + warnings.warn( + "Calling uniform_neighbor_sample with the 'with_edge_properties' argument is deprecated." + " Starting in release 23.12, this argument will be removed in favor of behaving like the " + "with_edge_properties=True option, returning whatever properties are in the graph.", + FutureWarning, + ) + cupy_sources = result.get_sources() cupy_destinations = result.get_destinations() cupy_indices = result.get_indices() - return (cupy_sources, cupy_destinations, cupy_indices) + if return_dict: + return { + 'sources': cupy_sources, + 'destinations': cupy_destinations, + 'indices': cupy_indices + } + else: + return (cupy_sources, cupy_destinations, cupy_indices) From f0d633322e66be5e1521b2c91d94a6c96ed699bc Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Thu, 28 Sep 2023 13:09:17 -0500 Subject: [PATCH 58/72] Add entry point to tell NetworkX about nx-cugraph without importing it. (#3848) This allows NetworkX docstrings to be updated (among other things). This will have a companion PR in NetworkX. 
We still need to determine (and agree) on the dict returned by this entry point, and NetworkX doesn't need to use everything I have here. We should probably add a string for `"description"` that gives a very short description of the backend, and maybe `"url"` or `"homepage"` or whatever so online docs can have links. Here's how to use the entry point (Python >= 3.10) after installing it: ```python In [1]: from importlib.metadata import entry_points In [2]: items = entry_points(group="networkx.plugin_info") In [3]: [plugin] = items In [4]: plugin.load()() Out[4]: {'backend_name': 'cugraph', 'project': 'nx-cugraph', 'package': 'nx_cugraph', 'functions': {'betweenness_centrality', 'edge_betweenness_centrality', 'louvain_communities'}, 'extra_docstrings': {'betweenness_centrality': '`weight` parameter is not yet supported.', 'edge_betweenness_centrality': '`weight` parameter is not yet supported.', 'louvain_communities': '`threshold` and `seed` parameters are currently ignored.'}, 'extra_parameters': {'louvain_communities': {'max_level': 'Upper limit of the number of macro-iterations.'}}} ``` CC @rlratzel @betochimas Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/3848 --- ci/release/update-version.sh | 1 + python/nx-cugraph/.flake8 | 1 + python/nx-cugraph/Makefile | 10 +++ python/nx-cugraph/_nx_cugraph/__init__.py | 88 ++++++++++++++++++ python/nx-cugraph/_nx_cugraph/core.py | 90 +++++++++++++++++++ python/nx-cugraph/lint.yaml | 19 ++-- python/nx-cugraph/nx_cugraph/__init__.py | 16 +++- .../algorithms/centrality/betweenness.py | 6 +- .../algorithms/community/louvain.py | 17 ++-- python/nx-cugraph/nx_cugraph/interface.py | 5 +- .../nx_cugraph/tests/test_match_api.py | 5 +- .../nx-cugraph/nx_cugraph/utils/decorators.py | 41 +++++++-- python/nx-cugraph/nx_cugraph/utils/misc.py | 6 +- python/nx-cugraph/pyproject.toml | 10 +++ 14 files changed, 278 insertions(+), 37 deletions(-) create mode 100644 python/nx-cugraph/_nx_cugraph/__init__.py create mode 100644 python/nx-cugraph/_nx_cugraph/core.py diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index f3892fbd3c4..adf3273e311 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -62,6 +62,7 @@ sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugr sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cugraph-service/server/cugraph_service_server/__init__.py sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/pylibcugraph/pylibcugraph/__init__.py sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/nx-cugraph/nx_cugraph/__init__.py +sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/nx-cugraph/_nx_cugraph/__init__.py # Python pyproject.toml updates sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cugraph/pyproject.toml diff --git a/python/nx-cugraph/.flake8 b/python/nx-cugraph/.flake8 index 3a2e3fb8617..c5874e54f7e 100644 --- a/python/nx-cugraph/.flake8 +++ b/python/nx-cugraph/.flake8 @@ -11,3 +11,4 @@ extend-ignore = per-file-ignores = nx_cugraph/tests/*.py:T201, __init__.py:F401,F403, + _nx_cugraph/__init__.py:E501, diff --git a/python/nx-cugraph/Makefile b/python/nx-cugraph/Makefile index c9caf147d53..6e1b98ee6e9 100644 --- a/python/nx-cugraph/Makefile +++ b/python/nx-cugraph/Makefile @@ -1,7 +1,17 @@ # Copyright 
(c) 2023, NVIDIA CORPORATION. SHELL= /bin/bash +.PHONY: all +all: plugin-info lint + +.PHONY: lint lint: git ls-files | xargs pre-commit run --config lint.yaml --files + +.PHONY: lint-update lint-update: pre-commit autoupdate --config lint.yaml + +.PHONY: plugin-info +plugin-info: + python _nx_cugraph/__init__.py diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py new file mode 100644 index 00000000000..9b3332106ec --- /dev/null +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -0,0 +1,88 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tell NetworkX about the cugraph backend. This file can update itself: + +$ make plugin-info # Recommended method for development + +or + +$ python _nx_cugraph/__init__.py +""" + +# Entries between BEGIN and END are automatically generated +_info = { + "backend_name": "cugraph", + "project": "nx-cugraph", + "package": "nx_cugraph", + "url": "https://github.com/rapidsai/cugraph/tree/branch-23.10/python/nx-cugraph", + "short_summary": "GPU-accelerated backend.", + # "description": "TODO", + "functions": { + # BEGIN: functions + "betweenness_centrality", + "edge_betweenness_centrality", + "louvain_communities", + # END: functions + }, + "extra_docstrings": { + # BEGIN: extra_docstrings + "betweenness_centrality": "`weight` parameter is not yet supported.", + "edge_betweenness_centrality": "`weight` parameter is not yet supported.", + "louvain_communities": "`threshold` and `seed` parameters are currently ignored.", + # END: extra_docstrings + }, + "extra_parameters": { + # BEGIN: extra_parameters + "louvain_communities": { + "max_level : int, optional": "Upper limit of the number of macro-iterations.", + }, + # END: extra_parameters + }, +} + + +def get_info(): + """Target of ``networkx.plugin_info`` entry point. + + This tells NetworkX about the cugraph backend without importing nx_cugraph. + """ + # Convert to e.g. `{"functions": {"myfunc": {"extra_docstring": ...}}}` + d = _info.copy() + info_keys = { + "extra_docstrings": "extra_docstring", + "extra_parameters": "extra_parameters", + } + d["functions"] = { + func: { + new_key: vals[func] + for old_key, new_key in info_keys.items() + if func in (vals := d[old_key]) + } + for func in d["functions"] + } + for key in info_keys: + del d[key] + return d + + +__version__ = "23.10.00" + +if __name__ == "__main__": + from pathlib import Path + + from _nx_cugraph.core import main + + filepath = Path(__file__) + text = main(filepath) + with filepath.open("w") as f: + f.write(text) diff --git a/python/nx-cugraph/_nx_cugraph/core.py b/python/nx-cugraph/_nx_cugraph/core.py new file mode 100644 index 00000000000..72f9203897e --- /dev/null +++ b/python/nx-cugraph/_nx_cugraph/core.py @@ -0,0 +1,90 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities to help keep _nx_cugraph up to date.""" + + +def get_functions(): + from nx_cugraph.interface import BackendInterface + from nx_cugraph.utils import networkx_algorithm + + return { + key: val + for key, val in vars(BackendInterface).items() + if isinstance(val, networkx_algorithm) + } + + +def get_extra_docstrings(functions=None): + if functions is None: + functions = get_functions() + return {key: val.extra_doc for key, val in functions.items() if val.extra_doc} + + +def get_extra_parameters(functions=None): + if functions is None: + functions = get_functions() + return {key: val.extra_params for key, val in functions.items() if val.extra_params} + + +def update_text(text, lines_to_add, target, indent=" " * 8): + begin = f"# BEGIN: {target}\n" + end = f"# END: {target}\n" + start = text.index(begin) + stop = text.index(end) + to_add = "\n".join([f"{indent}{line}" for line in lines_to_add]) + return f"{text[:start]}{begin}{to_add}\n{indent}{text[stop:]}" + + +def dict_to_lines(d, *, indent=""): + for key in sorted(d): + val = d[key] + if "\n" not in val: + yield f"{indent}{key!r}: {val!r}," + else: + yield f"{indent}{key!r}: (" + *lines, last_line = val.split("\n") + for line in lines: + line += "\n" + yield f" {indent}{line!r}" + yield f" {indent}{last_line!r}" + yield f"{indent})," + + +def main(filepath): + from pathlib import Path + + filepath = Path(filepath) + with filepath.open() as f: + orig_text = f.read() + text = orig_text + + # Update functions + functions = get_functions() + to_add = [f'"{name}",' for name in sorted(functions)] + text = update_text(text, to_add, "functions") + + # Update extra_docstrings + extra_docstrings = get_extra_docstrings(functions) + to_add = list(dict_to_lines(extra_docstrings)) + text = update_text(text, to_add, "extra_docstrings") + + # Update extra_parameters + extra_parameters = get_extra_parameters(functions) + to_add = [] + for name in sorted(extra_parameters): + params = extra_parameters[name] + to_add.append(f"{name!r}: {{") + to_add.extend(dict_to_lines(params, indent=" " * 4)) + to_add.append("},") + text = update_text(text, to_add, "extra_parameters") + return text diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index dba061bd6b5..6a462a6af79 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -31,7 +31,7 @@ repos: - id: validate-pyproject name: Validate pyproject.toml - repo: https://github.com/PyCQA/autoflake - rev: v2.2.0 + rev: v2.2.1 hooks: - id: autoflake args: [--in-place] @@ -40,17 +40,17 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.10.1 + rev: v3.13.0 hooks: - id: pyupgrade args: [--py39-plus] - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.286 + rev: v0.0.291 hooks: - id: ruff args: [--fix-only, --show-fixes] @@ -58,11 +58,12 @@ repos: rev: 6.1.0 hooks: - id: flake8 + args: ['--per-file-ignores=_nx_cugraph/__init__.py:E501'] # Why is this necessary? 
additional_dependencies: &flake8_dependencies - # These versions need updated manually - - flake8==6.1.0 - - flake8-bugbear==23.7.10 - - flake8-simplify==0.20.0 + # These versions need updated manually + - flake8==6.1.0 + - flake8-bugbear==23.9.16 + - flake8-simplify==0.20.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 hooks: @@ -76,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.286 + rev: v0.0.291 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/__init__.py b/python/nx-cugraph/nx_cugraph/__init__.py index 28066fe2b02..4a0e95a109f 100644 --- a/python/nx-cugraph/nx_cugraph/__init__.py +++ b/python/nx-cugraph/nx_cugraph/__init__.py @@ -12,9 +12,21 @@ # limitations under the License. from networkx.exception import * -from . import algorithms, classes, convert, utils -from .algorithms import * +from . import utils + +from . import classes from .classes import * + +from . import convert from .convert import * +# from . import convert_matrix +# from .convert_matrix import * + +# from . import generators +# from .generators import * + +from . import algorithms +from .algorithms import * + __version__ = "23.10.00" diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py index b777919f86f..104ac87414c 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py @@ -13,7 +13,7 @@ import pylibcugraph as plc from nx_cugraph.convert import _to_graph -from nx_cugraph.utils import _handle_seed, networkx_algorithm +from nx_cugraph.utils import _seed_to_int, networkx_algorithm __all__ = ["betweenness_centrality", "edge_betweenness_centrality"] @@ -22,11 +22,12 @@ def betweenness_centrality( G, k=None, normalized=True, weight=None, endpoints=False, seed=None ): + """`weight` parameter is not yet supported.""" if weight is not None: raise NotImplementedError( "Weighted implementation of betweenness centrality not currently supported" ) - seed = _handle_seed(seed) + seed = _seed_to_int(seed) G = _to_graph(G, weight) node_ids, values = plc.betweenness_centrality( resource_handle=plc.ResourceHandle(), @@ -47,6 +48,7 @@ def _(G, k=None, normalized=True, weight=None, endpoints=False, seed=None): @networkx_algorithm def edge_betweenness_centrality(G, k=None, normalized=True, weight=None, seed=None): + """`weight` parameter is not yet supported.""" if weight is not None: raise NotImplementedError( "Weighted implementation of betweenness centrality not currently supported" diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py index ca5f05c2014..a183b59fe1d 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py @@ -17,7 +17,7 @@ from nx_cugraph.convert import _to_undirected_graph from nx_cugraph.utils import ( _groupby, - _handle_seed, + _seed_to_int, networkx_algorithm, not_implemented_for, ) @@ -26,16 +26,17 @@ @not_implemented_for("directed") -@networkx_algorithm(extra_params="max_level") +@networkx_algorithm( + extra_params={ + "max_level : int, optional": "Upper limit of the number of macro-iterations." 
+ } +) def louvain_communities( G, weight="weight", resolution=1, threshold=0.0000001, seed=None, *, max_level=None ): - """`threshold` and `seed` parameters are currently ignored. - - Extra parameter: `max_level` controls the maximum number of levels of the algorithm. - """ + """`threshold` and `seed` parameters are currently ignored.""" # NetworkX allows both directed and undirected, but cugraph only allows undirected. - seed = _handle_seed(seed) # Unused, but ensure it's valid for future compatibility + seed = _seed_to_int(seed) # Unused, but ensure it's valid for future compatibility G = _to_undirected_graph(G, weight) if G.row_indices.size == 0: # TODO: PLC doesn't handle empty graphs gracefully! @@ -46,8 +47,8 @@ def louvain_communities( resource_handle=plc.ResourceHandle(), graph=G._get_plc_graph(), max_level=max_level, # TODO: add this parameter to NetworkX + threshold=threshold, resolution=resolution, - # threshold=threshold, # TODO: add this parameter to PLC do_expensive_check=False, ) groups = _groupby(clusters, vertices) diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py index cc750cd2d5b..2ad23acd940 100644 --- a/python/nx-cugraph/nx_cugraph/interface.py +++ b/python/nx-cugraph/nx_cugraph/interface.py @@ -62,9 +62,7 @@ def key(testpath): # Reasons for xfailing no_weights = "weighted implementation not currently supported" no_multigraph = "multigraphs not currently supported" - louvain_different = ( - "Louvain may be different due to RNG or unsupported threshold parameter" - ) + louvain_different = "Louvain may be different due to RNG" xfail = {} @@ -176,7 +174,6 @@ def key(testpath): ): louvain_different, key("test_louvain.py:test_none_weight_param"): louvain_different, key("test_louvain.py:test_multigraph"): louvain_different, - key("test_louvain.py:test_threshold"): louvain_different, } ) diff --git a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py index 64d3704dd65..ecfda1397db 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_match_api.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_match_api.py @@ -45,11 +45,14 @@ def test_match_signature_and_names(): assert orig_sig == func_sig else: # Ignore extra parameters added to nx-cugraph algorithm + # The key of func.extra_params may be like "max_level : int, optional", + # but we only want "max_level" here. + extra_params = {name.split(" ")[0] for name in func.extra_params} assert orig_sig == func_sig.replace( parameters=[ p for name, p in func_sig.parameters.items() - if name not in func.extra_params + if name not in extra_params ] ) if func.can_run is not nxcg.utils.decorators._default_can_run: diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py index 3dbdb07e87f..0f15d236ecd 100644 --- a/python/nx-cugraph/nx_cugraph/utils/decorators.py +++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py @@ -10,13 +10,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations + from functools import partial, update_wrapper -from networkx.utils.decorators import not_implemented_for +from networkx.utils.decorators import nodes_or_number, not_implemented_for from nx_cugraph.interface import BackendInterface -__all__ = ["not_implemented_for", "networkx_algorithm"] +try: + from networkx.utils.backends import _registered_algorithms +except ModuleNotFoundError: + from networkx.classes.backends import _registered_algorithms + + +__all__ = ["not_implemented_for", "nodes_or_number", "networkx_algorithm"] def networkx_class(api): @@ -28,7 +36,17 @@ def inner(func): class networkx_algorithm: - def __new__(cls, func=None, *, name=None, extra_params=None): + name: str + extra_doc: str | None + extra_params: dict[str, str] | None + + def __new__( + cls, + func=None, + *, + name: str | None = None, + extra_params: dict[str, str] | str | None = None, + ): if func is None: return partial(networkx_algorithm, name=name, extra_params=extra_params) instance = object.__new__(cls) @@ -37,13 +55,20 @@ def __new__(cls, func=None, *, name=None, extra_params=None): instance.__defaults__ = func.__defaults__ instance.__kwdefaults__ = func.__kwdefaults__ instance.name = func.__name__ if name is None else name - # TODO: should extra_params be a dict[str, str] that describes the parameters? if extra_params is None: - instance.extra_params = None + pass elif isinstance(extra_params, str): - instance.extra_params = {extra_params} - else: - instance.extra_params = set(extra_params) + extra_params = {extra_params: ""} + elif not isinstance(extra_params, dict): + raise TypeError( + f"extra_params must be dict, str, or None; got {type(extra_params)}" + ) + instance.extra_params = extra_params + # The docstring on our function is added to the NetworkX docstring. 
+ instance.extra_doc = func.__doc__ + # Copy __doc__ from NetworkX + if instance.name in _registered_algorithms: + instance.__doc__ = _registered_algorithms[instance.name].__doc__ instance.can_run = _default_can_run setattr(BackendInterface, instance.name, instance) # Set methods so they are in __dict__ diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py index 64c0be066f2..72e4094b8b7 100644 --- a/python/nx-cugraph/nx_cugraph/utils/misc.py +++ b/python/nx-cugraph/nx_cugraph/utils/misc.py @@ -18,7 +18,7 @@ import cupy as cp -__all__ = ["_groupby", "_handle_seed"] +__all__ = ["_groupby", "_seed_to_int"] def _groupby(groups: cp.ndarray, values: cp.ndarray) -> dict[int, cp.ndarray]: @@ -51,8 +51,8 @@ def _groupby(groups: cp.ndarray, values: cp.ndarray) -> dict[int, cp.ndarray]: return rv -def _handle_seed(seed: int | Random | None) -> int: - """Handle seed argument and ensure it is what pylibcugraph needs: an int.""" +def _seed_to_int(seed: int | Random | None) -> int: + """Handle any valid seed argument and convert it to an int if necessary.""" if seed is None: return if isinstance(seed, Random): diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml index 95e9c256e5d..db3b3a22545 100644 --- a/python/nx-cugraph/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -54,6 +54,9 @@ Documentation = "https://docs.rapids.ai/api/cugraph/stable/" [project.entry-points."networkx.plugins"] cugraph = "nx_cugraph.interface:BackendInterface" +[project.entry-points."networkx.plugin_info"] +cugraph = "_nx_cugraph:get_info" + [tool.setuptools] license-files = ["LICENSE"] @@ -61,6 +64,8 @@ license-files = ["LICENSE"] include = [ "nx_cugraph*", "nx_cugraph.*", + "_nx_cugraph*", + "_nx_cugraph.*", ] [tool.black] @@ -75,6 +80,7 @@ float_to_top = true default_section = "THIRDPARTY" known_first_party = "nx_cugraph" line_length = 88 +extend_skip_glob = ["nx_cugraph/__init__.py"] [tool.pytest.ini_options] minversion = "6.0" @@ -128,6 +134,9 @@ exclude_lines = [ # https://github.com/charliermarsh/ruff/ line-length = 88 target-version = "py39" +unfixable = [ + "F841", # unused-variable (Note: can leave useless expression) +] select = [ "ALL", ] @@ -203,6 +212,7 @@ ignore = [ "__init__.py" = ["F401"] # Allow unused imports (w/o defining `__all__`) # Allow assert, print, RNG, and no docstring "nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"] +"_nx_cugraph/__init__.py" = ["E501"] [tool.ruff.flake8-annotations] mypy-init-return = true From f57119bf8d322a2eba902a5498ae194832d8d732 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Thu, 28 Sep 2023 13:55:59 -0500 Subject: [PATCH 59/72] Temporarily disable the deletion of the dask dataframe (#3814) temporarily disable the deletion of the dask dataframe Authors: - Joseph Nke (https://github.com/jnke2016) - Naim (https://github.com/naimnv) Approvers: - Brad Rees (https://github.com/BradReesWork) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3814 --- .../graph_implementation/simpleDistributedGraph.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index 01885c2d1c3..fa94fa67625 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ 
b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -14,7 +14,6 @@ import gc from typing import Union import warnings -import random import cudf import cupy as cp @@ -182,10 +181,7 @@ def __from_edgelist( workers = _client.scheduler_info()["workers"] # Repartition to 2 partitions per GPU for memory efficient process input_ddf = input_ddf.repartition(npartitions=len(workers) * 2) - # FIXME: Make a copy of the input ddf before implicitly altering it. - input_ddf = input_ddf.map_partitions( - lambda df: df.copy(), token="custom-" + str(random.random()) - ) + input_ddf = input_ddf.map_partitions(lambda df: df.copy()) # The dataframe will be symmetrized iff the graph is undirected # otherwise, the inital dataframe will be returned if edge_attr is not None: @@ -337,7 +333,7 @@ def __from_edgelist( ) for w, edata in ddf.items() } - del ddf + # FIXME: For now, don't delete the copied dataframe to avoid crash self._plc_graph = { w: _client.compute(delayed_task, workers=w, allow_other_workers=False) for w, delayed_task in delayed_tasks_d.items() @@ -1196,7 +1192,5 @@ def _get_column_from_ls_dfs(lst_df, col_name): if len_df == 0: return lst_df[0][col_name] output_col = cudf.concat([df[col_name] for df in lst_df], ignore_index=True) - for df in lst_df: - df.drop(columns=[col_name], inplace=True) - gc.collect() + # FIXME: For now, don't delete the copied dataframe to avoid cras return output_col From 91fbcca659ea1b29e4658913ca4d7f8381584df7 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Thu, 28 Sep 2023 14:03:26 -0500 Subject: [PATCH 60/72] Updates the source build docs to include libcugraphops as a build prerequisite (#3893) closes #3722 Updates the source build docs to include `libcugraphops` as a build prerequisite. Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3893 --- docs/cugraph/source/installation/source_build.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/cugraph/source/installation/source_build.md b/docs/cugraph/source/installation/source_build.md index 7782591f1ce..f5ee0741da6 100644 --- a/docs/cugraph/source/installation/source_build.md +++ b/docs/cugraph/source/installation/source_build.md @@ -6,10 +6,10 @@ The cuGraph package include both a C/C++ CUDA portion and a python portion. Bot ## Prerequisites -__Compiler__: -* `gcc` version 9.3+ -* `nvcc` version 11.0+ -* `cmake` version 3.20.1+ +__Compiler:__ +* `gcc` version 9.3+ +* `nvcc` version 11.0+ +* `cmake` version 3.20.1+ __CUDA:__ * CUDA 11.0+ @@ -18,6 +18,11 @@ __CUDA:__ You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). +__Packages:__ +* `cmake` version 3.20.1+ +* `libcugraphops` (version matching source branch version, eg. `23.10`) + +You can obtain `libcugraphops` using `conda`/`mamba` from the `nvidia` channel, or using `pip` with the `--extra-index-url=https://pypi.nvidia.com` option. See the [RAPIDS docs](https://docs.rapids.ai/install#environment) for more details. ## Building cuGraph To install cuGraph from source, ensure the dependencies are met. From b24121fc0f76f29cfab878875ff9a953b49cc6cd Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Sep 2023 15:53:49 -0500 Subject: [PATCH 61/72] Pin `dask` and `distributed` for `23.10` release (#3896) This PR pins `dask` and `distributed` to `2023.9.2` for `23.10` release. 
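As a quick illustrative check (not part of this change), the pins can be confirmed in a resolved environment:

```python
# Hypothetical sanity check, not from this PR: confirm the environment solved
# to the pinned dask/distributed versions used for the 23.10 release.
import dask
import distributed

assert dask.__version__ == "2023.9.2", dask.__version__
assert distributed.__version__ == "2023.9.2", distributed.__version__
print("dask and distributed are pinned as expected")
```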
xref: https://github.com/rapidsai/cudf/pull/14225 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3896 --- ci/test_wheel_cugraph.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 6 +++--- conda/environments/all_cuda-120_arch-x86_64.yaml | 6 +++--- conda/recipes/cugraph-pyg/meta.yaml | 2 +- conda/recipes/cugraph-service/meta.yaml | 2 +- conda/recipes/cugraph/meta.yaml | 6 +++--- dependencies.yaml | 6 +++--- python/cugraph-service/server/pyproject.toml | 4 ++-- python/cugraph/pyproject.toml | 4 ++-- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/ci/test_wheel_cugraph.sh b/ci/test_wheel_cugraph.sh index f9e2aa6d8da..ac18459128a 100755 --- a/ci/test_wheel_cugraph.sh +++ b/ci/test_wheel_cugraph.sh @@ -9,6 +9,6 @@ RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-whe python -m pip install --no-deps ./local-pylibcugraph-dep/pylibcugraph*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main +python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 ./ci/test_wheel.sh cugraph python/cugraph diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 86de24c991d..952ec9317e2 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,11 +19,11 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.7.1 +- dask-core==2023.9.2 - dask-cuda==23.10.* - dask-cudf==23.10.* -- dask>=2023.7.1 -- distributed>=2023.7.1 +- dask==2023.9.2 +- distributed==2023.9.2 - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 1054f75ba54..38936c78c38 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -19,11 +19,11 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.7.1 +- dask-core==2023.9.2 - dask-cuda==23.10.* - dask-cudf==23.10.* -- dask>=2023.7.1 -- distributed>=2023.7.1 +- dask==2023.9.2 +- distributed==2023.9.2 - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 2d7ed2f4cda..1dc5a75c41b 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -26,7 +26,7 @@ requirements: - python - scikit-build >=0.13.1 run: - - distributed >=2023.7.1 + - distributed ==2023.9.2 - numba >=0.57 - numpy >=1.21 - python diff --git a/conda/recipes/cugraph-service/meta.yaml b/conda/recipes/cugraph-service/meta.yaml index f3229c27364..2daf0438351 100644 --- a/conda/recipes/cugraph-service/meta.yaml +++ b/conda/recipes/cugraph-service/meta.yaml @@ -59,7 +59,7 @@ outputs: - cupy >=12.0.0 - dask-cuda ={{ minor_version }} - dask-cudf ={{ minor_version }} - - distributed >=2023.7.1 + - distributed ==2023.9.2 - numba >=0.57 - numpy >=1.21 - python diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index ad5965ad20c..f9bf54a2ef4 100644 --- a/conda/recipes/cugraph/meta.yaml +++ 
b/conda/recipes/cugraph/meta.yaml @@ -76,9 +76,9 @@ requirements: - cupy >=12.0.0 - dask-cuda ={{ minor_version }} - dask-cudf ={{ minor_version }} - - dask >=2023.7.1 - - dask-core >=2023.7.1 - - distributed >=2023.7.1 + - dask ==2023.9.2 + - dask-core ==2023.9.2 + - distributed ==2023.9.2 - fsspec>=0.6.0 - libcugraph ={{ version }} - pylibcugraph ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index a162ac01354..f74ed13115b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -373,15 +373,15 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - &dask dask>=2023.7.1 - - &distributed distributed>=2023.7.1 + - &dask dask==2023.9.2 + - &distributed distributed==2023.9.2 - &dask_cuda dask-cuda==23.10.* - &numba numba>=0.57 - &ucx_py ucx-py==0.34.* - output_types: conda packages: - aiohttp - - &dask-core_conda dask-core>=2023.7.1 + - &dask-core_conda dask-core==2023.9.2 - fsspec>=0.6.0 - libcudf==23.10.* - requests diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index f25ea6c46e5..8787cb838be 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -25,8 +25,8 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "dask-cuda==23.10.*", "dask-cudf==23.10.*", - "dask>=2023.7.1", - "distributed>=2023.7.1", + "dask==2023.9.2", + "distributed==2023.9.2", "numba>=0.57", "numpy>=1.21", "rmm==23.10.*", diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index cadf6879e23..1835ac8bb49 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -33,8 +33,8 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "dask-cuda==23.10.*", "dask-cudf==23.10.*", - "dask>=2023.7.1", - "distributed>=2023.7.1", + "dask==2023.9.2", + "distributed==2023.9.2", "fsspec[http]>=0.6.0", "numba>=0.57", "pylibcugraph==23.10.*", From 6e5e0667b19106b61c98107b0cc58ff2ef7d4202 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Fri, 29 Sep 2023 03:44:59 +0200 Subject: [PATCH 62/72] Adds logic to handle isolated vertices at python layer (#3886) - Adds logic to handle isolated vertices - Clamp downs max level to a sane number closes #3804 Authors: - Naim (https://github.com/naimnv) Approvers: - Joseph Nke (https://github.com/jnke2016) - Brad Rees (https://github.com/BradReesWork) - Chuck Hastings (https://github.com/ChuckHastings) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3886 --- python/cugraph/cugraph/community/louvain.py | 91 +++++++++++++++---- .../cugraph/tests/community/test_louvain.py | 16 ++++ python/cugraph/cugraph/utilities/utils.py | 4 +- 3 files changed, 91 insertions(+), 20 deletions(-) diff --git a/python/cugraph/cugraph/community/louvain.py b/python/cugraph/cugraph/community/louvain.py index 7f9742c8f09..0bedd427824 100644 --- a/python/cugraph/cugraph/community/louvain.py +++ b/python/cugraph/cugraph/community/louvain.py @@ -11,7 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Union, Tuple +from cugraph.structure import Graph from cugraph.utilities import ( + is_nx_graph_type, ensure_cugraph_obj_for_nx, df_score_to_dictionary, ) @@ -21,9 +24,26 @@ from pylibcugraph import louvain as pylibcugraph_louvain from pylibcugraph import ResourceHandle +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + +VERTEX_COL_NAME = "vertex" +CLUSTER_ID_COL_NAME = "partition" + # FIXME: max_level should default to 100 once max_iter is removed -def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): +def louvain( + G: Union[Graph, "networkx.Graph"], + max_level: Union[int, None] = None, + max_iter: Union[int, None] = None, + resolution: float = 1.0, + threshold: float = 1e-7, +) -> Tuple[Union[cudf.DataFrame, dict], float]: """ Compute the modularity optimizing partition of the input graph using the Louvain method @@ -48,6 +68,9 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): than the specified number of levels. No error occurs when the algorithm terminates early in this manner. + If max_level > 500, it will be set to 500 and a warning is emitted + in order to prevent excessive runtime. + max_iter : integer, optional (default=None) This parameter is deprecated in favor of max_level. Previously it was used to control the maximum number of levels of the Louvain @@ -68,18 +91,21 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): Returns ------- - parts : cudf.DataFrame - GPU data frame of size V containing two columns the vertex id and the - partition id it is assigned to. + result: cudf.DataFrame or dict + If input graph G is of type cugraph.Graph, a GPU dataframe + with two columns. + + result[VERTEX_COL_NAME] : cudf.Series + Contains the vertex identifiers + result[CLUSTER_ID_COL_NAME] : cudf.Series + Contains the partition assigned to the vertices - df['vertex'] : cudf.Series - Contains the vertex identifiers - df['partition'] : cudf.Series - Contains the partition assigned to the vertices + If input graph G is of type networkx.Graph, a dict + Dictionary of vertices and their partition ids. modularity_score : float - a floating point number containing the global modularity score of the - partitioning. + A floating point number containing the global modularity score + of the partitioning. Examples -------- @@ -89,6 +115,17 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): """ + # FIXME: Onece the graph construction calls support isolated vertices through + # the C API (the C++ interface already supports this) then there will be + # no need to compute isolated vertices here. + + isolated_vertices = list() + if is_nx_graph_type(type(G)): + isolated_vertices = [v for v in range(G.number_of_nodes()) if G.degree[v] == 0] + else: + # FIXME: Gather isolated vertices of G + pass + G, isNx = ensure_cugraph_obj_for_nx(G) if G.is_directed(): @@ -112,7 +149,12 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): if max_level is None: max_level = 100 - vertex, partition, mod_score = pylibcugraph_louvain( + if max_level > 500: + w_msg = "max_level is set too high, clamping it down to 500." 
+ warnings.warn(w_msg) + max_level = 500 + + vertex, partition, modularity_score = pylibcugraph_louvain( resource_handle=ResourceHandle(), graph=G._plc_graph, max_level=max_level, @@ -121,14 +163,27 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): do_expensive_check=False, ) - df = cudf.DataFrame() - df["vertex"] = vertex - df["partition"] = partition + result = cudf.DataFrame() + result[VERTEX_COL_NAME] = vertex + result[CLUSTER_ID_COL_NAME] = partition + + if len(isolated_vertices) > 0: + unique_cids = result[CLUSTER_ID_COL_NAME].unique() + max_cluster_id = -1 if len(result) == 0 else unique_cids.max() + + isolated_vtx_and_cids = cudf.DataFrame() + isolated_vtx_and_cids[VERTEX_COL_NAME] = isolated_vertices + isolated_vtx_and_cids[CLUSTER_ID_COL_NAME] = [ + (max_cluster_id + i + 1) for i in range(len(isolated_vertices)) + ] + result = cudf.concat( + [result, isolated_vtx_and_cids], ignore_index=True, sort=False + ) - if G.renumbered: - df = G.unrenumber(df, "vertex") + if G.renumbered and len(G.input_df) > 0: + result = G.unrenumber(result, VERTEX_COL_NAME) if isNx is True: - df = df_score_to_dictionary(df, "partition") + result = df_score_to_dictionary(result, CLUSTER_ID_COL_NAME) - return df, mod_score + return result, modularity_score diff --git a/python/cugraph/cugraph/tests/community/test_louvain.py b/python/cugraph/cugraph/tests/community/test_louvain.py index 183be071a44..5441998fb46 100644 --- a/python/cugraph/cugraph/tests/community/test_louvain.py +++ b/python/cugraph/cugraph/tests/community/test_louvain.py @@ -142,3 +142,19 @@ def test_louvain_csr_graph(is_weighted): assert len(parition_diffs) == 0 assert mod_csr == mod_coo + + +@pytest.mark.sg +def test_louvain_nx_graph_with_isolated_nodes(): + # Cluster IDs are expected to unique if all nodes are isolated + G = nx.Graph() + G.add_nodes_from(range(5)) + result, _ = cugraph.louvain(G) + assert set(result.keys()) == set(G.nodes) + assert len(set(result.values())) == G.number_of_nodes() + + # A graph with 5 nodes, where 3 of the nodes are isolated + G.add_edge(1, 2) + result, _ = cugraph.louvain(G) + assert set(result.keys()) == set(G.nodes) + assert len(set(result.values())) == G.number_of_nodes() - 1 diff --git a/python/cugraph/cugraph/utilities/utils.py b/python/cugraph/cugraph/utilities/utils.py index e68b5dd4880..7a54a0bf2cf 100644 --- a/python/cugraph/cugraph/utilities/utils.py +++ b/python/cugraph/cugraph/utilities/utils.py @@ -364,8 +364,8 @@ def is_matrix_type(m): return is_cp_matrix_type(m) or is_sp_matrix_type(m) -def is_nx_graph_type(g): - return g in __nx_graph_types +def is_nx_graph_type(graph_type): + return graph_type in __nx_graph_types def is_cugraph_graph_type(g): From eed12230fb41da701ab9ea302642765d81024bc8 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Fri, 29 Sep 2023 11:35:12 -0500 Subject: [PATCH 63/72] Refactor legacy k truss (#3843) This PR refactor legacy k truss by leveraging the `C API` instead of the `legacy:COO` graph. This also stands as a preliminary work to later provide a new implementation of `k_truss` matching the `C API`. 
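For context, here is an illustrative sketch (not taken from this PR's diff) of the user-facing call whose behavior the refactor preserves, assuming a small undirected cudf edge list:

```python
# Illustrative sketch: the Python k-truss entry point now routes through
# pylibcugraph / the C API instead of the legacy COO graph.
import cudf
import cugraph

edges = cudf.DataFrame({
    "src": [0, 1, 1, 2, 2, 3],
    "dst": [1, 2, 3, 0, 3, 0],
})
G = cugraph.Graph(directed=False)
G.from_cudf_edgelist(edges, source="src", destination="dst")

# k=3 keeps only edges incident to at least k - 2 = 1 triangle.
subgraph = cugraph.k_truss(G, k=3)
print(subgraph.number_of_edges())
```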
Closes #3753 Closes #3754 Authors: - Joseph Nke (https://github.com/jnke2016) - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Brad Rees (https://github.com/BradReesWork) - Don Acosta (https://github.com/acostadon) URL: https://github.com/rapidsai/cugraph/pull/3843 --- cpp/CMakeLists.txt | 1 + cpp/include/cugraph/algorithms.hpp | 43 ++-- .../cugraph/utilities/graph_traits.hpp | 22 +- cpp/include/cugraph_c/community_algorithms.h | 21 ++ cpp/src/c_api/legacy_k_truss.cpp | 150 +++++++++++ cpp/src/community/legacy/ktruss.cu | 127 ++++++---- cpp/tests/CMakeLists.txt | 1 + cpp/tests/c_api/legacy_k_truss_test.c | 236 ++++++++++++++++++ python/cugraph/CMakeLists.txt | 1 - .../cugraph/cugraph/community/CMakeLists.txt | 25 -- .../cugraph/community/ktruss_subgraph.pxd | 26 -- .../cugraph/community/ktruss_subgraph.py | 54 +++- .../community/ktruss_subgraph_wrapper.pyx | 43 ---- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 1 + python/pylibcugraph/pylibcugraph/__init__.py | 2 + .../_cugraph_c/community_algorithms.pxd | 10 + .../pylibcugraph/k_truss_subgraph.pyx | 163 ++++++++++++ 17 files changed, 759 insertions(+), 167 deletions(-) create mode 100644 cpp/src/c_api/legacy_k_truss.cpp create mode 100644 cpp/tests/c_api/legacy_k_truss_test.c delete mode 100644 python/cugraph/cugraph/community/CMakeLists.txt delete mode 100644 python/cugraph/cugraph/community/ktruss_subgraph.pxd delete mode 100644 python/cugraph/cugraph/community/ktruss_subgraph_wrapper.pyx create mode 100644 python/pylibcugraph/pylibcugraph/k_truss_subgraph.pyx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0d7bd86075d..d9f87d7dd72 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -435,6 +435,7 @@ add_library(cugraph_c src/c_api/labeling_result.cpp src/c_api/weakly_connected_components.cpp src/c_api/strongly_connected_components.cpp + src/c_api/legacy_k_truss.cpp ) add_library(cugraph::cugraph_c ALIAS cugraph_c) diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index b624ec5c0e0..78846bc5766 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -430,34 +430,39 @@ void connected_components(legacy::GraphCSRView const& graph, VT* labels); /** - * @brief Compute k truss for a graph + * @brief Compute k truss for a graph ** temporary * * K Truss is the maximal subgraph of a graph which contains at least three * vertices where every edge is incident to at least k-2 triangles. * - * Note that current implementation does not support a weighted graph. + * This version is a temporary solution to clean up python integration through the C API. * - * @throws cugraph::logic_error with a custom message when an error - * occurs. + * This version is only supported SG. * - * @tparam VT Type of vertex identifiers. Supported value : int (signed, - * 32-bit) - * @tparam ET Type of edge identifiers. Supported value : int (signed, - * 32-bit) - * @tparam WT Type of edge weights. Supported values : float or double. + * @throws cugraph::logic_error with a custom message when an error + * occurs. * - * @param[in] graph cuGraph graph descriptor, should contain the connectivity - * information as a COO - * @param[in] k The order of the truss - * @param[in] mr Memory resource used to allocate the returned graph - * @return Unique pointer to K Truss subgraph in COO format + * @tparam vertex_t Type of vertex identifiers. Supported value : int (signed, 32-bit) + * @tparam weight_t Type of edge weights. 
Supported values : float or double. * + * @param[in] handle Library handle (RAFT). + * @param[in] src Source vertices from COO + * @param[in] dst Destination vertices from COO + * @param[in] wgt Optional edge weights from COO + * @param[in] k The order of the truss + * @return Tuple containing extracted src, dst and optional weights for the + * subgraph */ -template -std::unique_ptr> k_truss_subgraph( - legacy::GraphCOOView const& graph, - int k, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +template +std::tuple, + rmm::device_uvector, + std::optional>> +k_truss_subgraph(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst, + std::optional> wgt, + size_t number_of_vertices, + int k); // FIXME: Internally distances is of int (signed 32-bit) data type, but current // template uses data from VT, ET, WT from the legacy::GraphCSR View even if weights diff --git a/cpp/include/cugraph/utilities/graph_traits.hpp b/cpp/include/cugraph/utilities/graph_traits.hpp index 7385630c011..e2737305aed 100644 --- a/cpp/include/cugraph/utilities/graph_traits.hpp +++ b/cpp/include/cugraph/utilities/graph_traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,6 +47,16 @@ struct is_vertex_edge_combo { (sizeof(vertex_t) <= sizeof(edge_t)); }; +// meta-function that constrains +// vertex_t and edge_t template param candidates to only int32_t: +// +template +struct is_vertex_edge_combo_legacy { + static constexpr bool value = is_one_of::value && + is_one_of::value && + (sizeof(vertex_t) <= sizeof(edge_t)); +}; + // meta-function that constrains // all 3 template param candidates: // @@ -56,4 +66,14 @@ struct is_candidate { is_vertex_edge_combo::value && is_one_of::value; }; +// meta-function that constrains +// all 3 template param candidates where vertex_t and edge_t +// are restricted to int32_t: +// +template +struct is_candidate_legacy { + static constexpr bool value = is_vertex_edge_combo_legacy::value && + is_one_of::value; +}; + } // namespace cugraph diff --git a/cpp/include/cugraph_c/community_algorithms.h b/cpp/include/cugraph_c/community_algorithms.h index e938c77cccd..8f1015f8632 100644 --- a/cpp/include/cugraph_c/community_algorithms.h +++ b/cpp/include/cugraph_c/community_algorithms.h @@ -227,6 +227,27 @@ cugraph_error_code_t cugraph_extract_ego( cugraph_induced_subgraph_result_t** result, cugraph_error_t** error); +/** + * @brief Extract k truss for a graph + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage + * needs to be transposed + * @param [in] k The order of the truss + * @param [in] do_expensive_check + * A flag to run expensive checks for input arguments (if set to true) + * @param [out] result Opaque object containing the extracted subgraph + * @param [out] error Pointer to an error object storing details of any error. 
Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_k_truss_subgraph(const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + size_t k, + bool_t do_expensive_check, + cugraph_induced_subgraph_result_t** result, + cugraph_error_t** error); + /** * @brief Opaque clustering output */ diff --git a/cpp/src/c_api/legacy_k_truss.cpp b/cpp/src/c_api/legacy_k_truss.cpp new file mode 100644 index 00000000000..90e0894783a --- /dev/null +++ b/cpp/src/c_api/legacy_k_truss.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace { + +struct k_truss_functor : public cugraph::c_api::abstract_functor { + raft::handle_t const& handle_; + cugraph::c_api::cugraph_graph_t* graph_; + size_t k_; + bool do_expensive_check_; + cugraph::c_api::cugraph_induced_subgraph_result_t* result_{}; + + k_truss_functor(::cugraph_resource_handle_t const* handle, + ::cugraph_graph_t* graph, + size_t k, + bool do_expensive_check) + : abstract_functor(), + handle_(*reinterpret_cast(handle)->handle_), + graph_(reinterpret_cast(graph)), + k_(k), + do_expensive_check_(do_expensive_check) + { + } + + template + void operator()() + { + if constexpr (!cugraph::is_candidate_legacy::value) { + unsupported(); + } else if constexpr (multi_gpu) { + unsupported(); + } else { + // k_truss expects store_transposed == false + if constexpr (store_transposed) { + error_code_ = cugraph::c_api:: + transpose_storage( + handle_, graph_, error_.get()); + if (error_code_ != CUGRAPH_SUCCESS) return; + } + + auto graph = + reinterpret_cast*>(graph_->graph_); + + auto edge_weights = reinterpret_cast< + cugraph::edge_property_t, + weight_t>*>(graph_->edge_weights_); + + auto number_map = reinterpret_cast*>(graph_->number_map_); + + auto graph_view = graph->view(); + rmm::device_uvector src(0, handle_.get_stream()); + rmm::device_uvector dst(0, handle_.get_stream()); + std::optional> wgt{std::nullopt}; + + std::tie(src, dst, wgt, std::ignore) = cugraph::decompress_to_edgelist( + handle_, + graph_view, + edge_weights ? std::make_optional(edge_weights->view()) : std::nullopt, + std::optional>{std::nullopt}, + std::optional>(std::nullopt), + do_expensive_check_); + + auto [result_src, result_dst, result_wgt] = cugraph::k_truss_subgraph( + handle_, + raft::device_span(src.data(), src.size()), + raft::device_span(dst.data(), dst.size()), + wgt ? 
std::make_optional(raft::device_span(wgt->data(), wgt->size())) + : std::nullopt, + graph_view.number_of_vertices(), + k_); + + cugraph::unrenumber_int_vertices( + handle_, + result_src.data(), + result_src.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + do_expensive_check_); + + cugraph::unrenumber_int_vertices( + handle_, + result_dst.data(), + result_dst.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + do_expensive_check_); + + rmm::device_uvector edge_offsets(2, handle_.get_stream()); + std::vector h_edge_offsets{{0, result_src.size()}}; + raft::update_device( + edge_offsets.data(), h_edge_offsets.data(), h_edge_offsets.size(), handle_.get_stream()); + + result_ = new cugraph::c_api::cugraph_induced_subgraph_result_t{ + new cugraph::c_api::cugraph_type_erased_device_array_t(result_src, graph_->vertex_type_), + new cugraph::c_api::cugraph_type_erased_device_array_t(result_dst, graph_->vertex_type_), + wgt ? new cugraph::c_api::cugraph_type_erased_device_array_t(*result_wgt, + graph_->weight_type_) + : NULL, + new cugraph::c_api::cugraph_type_erased_device_array_t(edge_offsets, + cugraph_data_type_id_t::SIZE_T)}; + } + } +}; + +} // namespace + +extern "C" cugraph_error_code_t cugraph_k_truss_subgraph(const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + size_t k, + bool_t do_expensive_check, + cugraph_induced_subgraph_result_t** result, + cugraph_error_t** error) +{ + k_truss_functor functor(handle, graph, k, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} diff --git a/cpp/src/community/legacy/ktruss.cu b/cpp/src/community/legacy/ktruss.cu index 74a871adb01..403593128c1 100644 --- a/cpp/src/community/legacy/ktruss.cu +++ b/cpp/src/community/legacy/ktruss.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,18 +34,24 @@ namespace cugraph { namespace detail { -template -std::unique_ptr> ktruss_subgraph_impl( - legacy::GraphCOOView const& graph, int k, rmm::mr::device_memory_resource* mr) +template +std::tuple, rmm::device_uvector> ktruss_subgraph_impl( + raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst, + size_t number_of_vertices, + int k) { - using HornetGraph = hornet::gpu::Hornet; - using UpdatePtr = hornet::BatchUpdatePtr; - using Update = hornet::gpu::BatchUpdate; - cudaStream_t stream{nullptr}; - UpdatePtr ptr(graph.number_of_edges, graph.src_indices, graph.dst_indices); + using HornetGraph = hornet::gpu::Hornet; + using UpdatePtr = hornet::BatchUpdatePtr; + using Update = hornet::gpu::BatchUpdate; + + HornetGraph hnt(number_of_vertices + 1); + + // NOTE: Should a constant pointer be passed for @src and @dst + UpdatePtr ptr(static_cast(src.size()), src.data(), dst.data()); Update batch(ptr); - HornetGraph hnt(graph.number_of_vertices + 1); hnt.insert(batch); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to initialize graph"); @@ -67,32 +73,42 @@ std::unique_ptr> ktruss_subgraph_impl( kt.runForK(k); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to run"); - auto out_graph = std::make_unique>( - graph.number_of_vertices, kt.getGraphEdgeCount(), graph.has_data(), stream, mr); + rmm::device_uvector result_src(kt.getGraphEdgeCount(), handle.get_stream()); + rmm::device_uvector result_dst(kt.getGraphEdgeCount(), handle.get_stream()); - kt.copyGraph(out_graph->src_indices(), out_graph->dst_indices()); + kt.copyGraph(result_src.data(), result_dst.data()); kt.release(); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to release"); - return out_graph; + return std::make_tuple(std::move(result_src), std::move(result_dst)); } -template -std::unique_ptr> weighted_ktruss_subgraph_impl( - legacy::GraphCOOView const& graph, int k, rmm::mr::device_memory_resource* mr) + +template +std::tuple, + rmm::device_uvector, + std::optional>> +weighted_ktruss_subgraph_impl(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst, + std::optional> wgt, + size_t number_of_vertices, + int k) { - using HornetGraph = hornet::gpu::Hornet>; - using UpdatePtr = hornet::BatchUpdatePtr, hornet::DeviceType::DEVICE>; - using Update = hornet::gpu::BatchUpdate>; - cudaStream_t stream{nullptr}; - UpdatePtr ptr(graph.number_of_edges, graph.src_indices, graph.dst_indices, graph.edge_data); + using HornetGraph = hornet::gpu::Hornet>; + using UpdatePtr = + hornet::BatchUpdatePtr, hornet::DeviceType::DEVICE>; + using Update = hornet::gpu::BatchUpdate>; + + HornetGraph hnt(number_of_vertices + 1); + + UpdatePtr ptr(static_cast(src.size()), src.data(), dst.data(), wgt->data()); Update batch(ptr); - HornetGraph hnt(graph.number_of_vertices + 1); hnt.insert(batch); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to initialize graph"); - KTrussWeighted kt(hnt); + KTrussWeighted kt(hnt); kt.init(); kt.reset(); @@ -110,41 +126,60 @@ std::unique_ptr> weighted_ktruss_subgraph_impl( kt.runForK(k); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to run"); - auto out_graph = std::make_unique>( - graph.number_of_vertices, kt.getGraphEdgeCount(), graph.has_data(), stream, mr); + rmm::device_uvector result_src(kt.getGraphEdgeCount(), handle.get_stream()); + rmm::device_uvector result_dst(kt.getGraphEdgeCount(), handle.get_stream()); + std::optional> 
result_wgt{std::nullopt}; - kt.copyGraph(out_graph->src_indices(), out_graph->dst_indices(), out_graph->edge_data()); + result_wgt = rmm::device_uvector(kt.getGraphEdgeCount(), handle.get_stream()); + kt.copyGraph(result_src.data(), result_dst.data(), result_wgt->data()); kt.release(); CUGRAPH_EXPECTS(cudaPeekAtLastError() == cudaSuccess, "KTruss : Failed to release"); - return out_graph; + return std::make_tuple(std::move(result_src), std::move(result_dst), std::move(result_wgt)); } } // namespace detail -template -std::unique_ptr> k_truss_subgraph( - legacy::GraphCOOView const& graph, int k, rmm::mr::device_memory_resource* mr) +template +std::tuple, + rmm::device_uvector, + std::optional>> +k_truss_subgraph(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst, + std::optional> wgt, + size_t number_of_vertices, + int k) { - CUGRAPH_EXPECTS(graph.src_indices != nullptr, "Graph source indices cannot be a nullptr"); - CUGRAPH_EXPECTS(graph.dst_indices != nullptr, "Graph destination indices cannot be a nullptr"); - - if (graph.edge_data == nullptr) { - return detail::ktruss_subgraph_impl(graph, k, mr); + if (wgt.has_value()) { + return detail::weighted_ktruss_subgraph_impl(handle, src, dst, wgt, number_of_vertices, k); } else { - return detail::weighted_ktruss_subgraph_impl(graph, k, mr); + auto [result_src, result_dst] = + detail::ktruss_subgraph_impl(handle, src, dst, number_of_vertices, k); + std::optional> result_wgt{std::nullopt}; + return std::make_tuple(std::move(result_src), std::move(result_dst), std::move(result_wgt)); } } -template std::unique_ptr> -k_truss_subgraph(legacy::GraphCOOView const&, - int, - rmm::mr::device_memory_resource*); - -template std::unique_ptr> -k_truss_subgraph(legacy::GraphCOOView const&, - int, - rmm::mr::device_memory_resource*); +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss_subgraph(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst, + std::optional> wgt, + size_t number_of_vertices, + int k); + +template std::tuple, + rmm::device_uvector, + std::optional>> +k_truss_subgraph(raft::handle_t const& handle, + raft::device_span src, + raft::device_span dst, + std::optional> wgt, + size_t number_of_vertices, + int k); } // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f08606df8ea..2a4bb8ab2a5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -741,5 +741,6 @@ ConfigureCTest(CAPI_K_CORE_TEST c_api/k_core_test.c) ConfigureCTest(CAPI_INDUCED_SUBGRAPH_TEST c_api/induced_subgraph_test.c) ConfigureCTest(CAPI_EGONET_TEST c_api/egonet_test.c) ConfigureCTest(CAPI_TWO_HOP_NEIGHBORS_TEST c_api/two_hop_neighbors_test.c) +ConfigureCTest(CAPI_LEGACY_K_TRUSS_TEST c_api/legacy_k_truss_test.c) rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing_c DESTINATION bin/gtests/libcugraph_c) diff --git a/cpp/tests/c_api/legacy_k_truss_test.c b/cpp/tests/c_api/legacy_k_truss_test.c new file mode 100644 index 00000000000..bc85f568688 --- /dev/null +++ b/cpp/tests/c_api/legacy_k_truss_test.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "c_test_utils.h" /* RUN_TEST */ + +#include +#include + +#include + +typedef int32_t vertex_t; +typedef int32_t edge_t; +typedef float weight_t; + +int generic_k_truss_test(vertex_t* h_src, + vertex_t* h_dst, + weight_t* h_wgt, + vertex_t* h_expected_src, + vertex_t* h_expected_dst, + weight_t* h_expected_wgt, + size_t* h_expected_offsets, + size_t num_vertices, + size_t num_edges, + size_t k, + size_t num_expected_offsets, + size_t num_expected_edges, + bool_t store_transposed) +{ + int test_ret_value = 0; + + cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; + cugraph_error_t* ret_error; + + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + data_type_id_t edge_id_tid = INT32; + data_type_id_t edge_type_tid = INT32; + + cugraph_resource_handle_t* resource_handle = NULL; + cugraph_graph_t* graph = NULL; + cugraph_type_erased_device_array_t* seeds = NULL; + cugraph_type_erased_device_array_view_t* seeds_view = NULL; + cugraph_induced_subgraph_result_t* result = NULL; + + resource_handle = cugraph_create_resource_handle(NULL); + TEST_ASSERT(test_ret_value, resource_handle != NULL, "resource handle creation failed."); + + ret_code = create_sg_test_graph( + resource_handle, + vertex_tid, + edge_tid, + h_src, + h_dst, + weight_tid, + h_wgt, + edge_type_tid, + NULL, + edge_id_tid, + NULL, + num_edges, + store_transposed, + FALSE, + TRUE, + FALSE, + &graph, + &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + ret_code = + cugraph_k_truss_subgraph(resource_handle, graph, k, FALSE, &result, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "cugraph_k_truss_subgraph failed."); + + if (test_ret_value == 0) { + cugraph_type_erased_device_array_view_t* src; + cugraph_type_erased_device_array_view_t* dst; + cugraph_type_erased_device_array_view_t* wgt; + cugraph_type_erased_device_array_view_t* offsets; + + src = cugraph_induced_subgraph_get_sources(result); + dst = cugraph_induced_subgraph_get_destinations(result); + wgt = cugraph_induced_subgraph_get_edge_weights(result); + offsets = cugraph_induced_subgraph_get_subgraph_offsets(result); + + size_t num_result_edges = cugraph_type_erased_device_array_view_size(src); + size_t num_result_offsets = cugraph_type_erased_device_array_view_size(offsets); + + vertex_t h_result_src[num_result_edges]; + vertex_t h_result_dst[num_result_edges]; + weight_t h_result_wgt[num_result_edges]; + size_t h_result_offsets[num_result_offsets]; + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + resource_handle, (byte_t*)h_result_src, src, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + resource_handle, (byte_t*)h_result_dst, dst, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, 
"copy_to_host failed."); + + if (wgt != NULL){ + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + resource_handle, (byte_t*)h_result_wgt, wgt, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + } + + + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + resource_handle, (byte_t*)h_result_offsets, offsets, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + + TEST_ASSERT(test_ret_value, num_result_edges == num_expected_edges, "results not the same size"); + + for (size_t i = 0; (i < num_expected_offsets) && (test_ret_value == 0); ++i) { + TEST_ASSERT(test_ret_value, + h_expected_offsets[i] == h_result_offsets[i], + "graph offsets should match"); + } + + for (size_t i = 0; (i < num_expected_edges) && (test_ret_value == 0); ++i) { + bool_t found = FALSE; + for (size_t j = 0; (j < num_expected_edges) && !found; ++j) { + if ((h_expected_src[i] == h_result_src[j]) && (h_expected_dst[i] == h_result_dst[j])) + if (wgt != NULL){ + found = (nearlyEqual(h_expected_wgt[i], h_result_wgt[j], 0.001)); + } + else{ + found = TRUE; + } + } + TEST_ASSERT(test_ret_value, found, "extracted an edge that doesn't match"); + } + + cugraph_type_erased_device_array_view_free(src); + cugraph_type_erased_device_array_view_free(dst); + cugraph_type_erased_device_array_view_free(wgt); + cugraph_type_erased_device_array_view_free(offsets); + cugraph_induced_subgraph_result_free(result); + } + + cugraph_sg_graph_free(graph); + cugraph_free_resource_handle(resource_handle); + cugraph_error_free(ret_error); + + return test_ret_value; +} + +int test_k_truss() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t k = 3; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = { + 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f, 0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + + vertex_t h_result_src[] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; + vertex_t h_result_dst[] = {1, 2, 0, 2, 3, 0, 1, 3, 1, 2}; + weight_t h_result_wgt[] = {0.1, 5.1, 0.1, 3.1, 2.1, 5.1, 3.1, 4.1, 2.1, 4.1}; + size_t h_result_offsets[] = {0, 10}; + size_t num_expected_edges = 10; + size_t num_expected_offsets = 2; + + return generic_k_truss_test(h_src, + h_dst, + h_wgt, + h_result_src, + h_result_dst, + h_result_wgt, + h_result_offsets, + num_vertices, + num_edges, + k, + num_expected_offsets, + num_expected_edges, + FALSE); +} + +int test_k_truss_no_weights() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t k = 3; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + + vertex_t h_result_src[] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; + vertex_t h_result_dst[] = {1, 2, 0, 2, 3, 0, 1, 3, 1, 2}; + size_t h_result_offsets[] = {0, 10}; + size_t num_expected_edges = 10; + size_t num_expected_offsets = 2; + + return generic_k_truss_test(h_src, + h_dst, + NULL, + h_result_src, + h_result_dst, + NULL, + h_result_offsets, + num_vertices, + num_edges, + k, + num_expected_offsets, + num_expected_edges, + FALSE); +} + + +/******************************************************************************/ + +int main(int argc, char** argv) +{ + int result = 0; + result |= RUN_TEST(test_k_truss); + result |= RUN_TEST(test_k_truss_no_weights); + return result; +} diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index 
ecfcb9b219f..64db9571dc9 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -82,7 +82,6 @@ endif() rapids_cython_init() -add_subdirectory(cugraph/community) add_subdirectory(cugraph/components) add_subdirectory(cugraph/dask/comms) add_subdirectory(cugraph/dask/structure) diff --git a/python/cugraph/cugraph/community/CMakeLists.txt b/python/cugraph/cugraph/community/CMakeLists.txt deleted file mode 100644 index 185f6accbab..00000000000 --- a/python/cugraph/cugraph/community/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources - ktruss_subgraph_wrapper.pyx -) - -set(linked_libraries cugraph::cugraph) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX community_ - ASSOCIATED_TARGETS cugraph -) diff --git a/python/cugraph/cugraph/community/ktruss_subgraph.pxd b/python/cugraph/cugraph/community/ktruss_subgraph.pxd deleted file mode 100644 index d993c31c375..00000000000 --- a/python/cugraph/cugraph/community/ktruss_subgraph.pxd +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.structure.graph_primtypes cimport * - - -cdef extern from "cugraph/algorithms.hpp" namespace "cugraph": - - cdef unique_ptr[GraphCOO[VT,ET,WT]] k_truss_subgraph[VT,ET,WT]( - const GraphCOOView[VT,ET,WT] &graph, - int k) except + diff --git a/python/cugraph/cugraph/community/ktruss_subgraph.py b/python/cugraph/cugraph/community/ktruss_subgraph.py index 0ebbe633317..15a10007610 100644 --- a/python/cugraph/cugraph/community/ktruss_subgraph.py +++ b/python/cugraph/cugraph/community/ktruss_subgraph.py @@ -11,14 +11,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from cugraph.community import ktruss_subgraph_wrapper from cugraph.structure.graph_classes import Graph +from typing import Union from cugraph.utilities import ( ensure_cugraph_obj_for_nx, cugraph_to_nx, ) +from pylibcugraph import k_truss_subgraph as pylibcugraph_k_truss_subgraph +from pylibcugraph import ResourceHandle +import warnings + from numba import cuda +import cudf +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in the type annotation for +# ktruss_subgraph() is specified using a string literal to avoid depending on +# and importing networkx. Instead, networkx is imported optionally, which may +# cause a problem for a type checker if run in an environment where networkx is +# not installed. +networkx = import_optional("networkx") # FIXME: special case for ktruss on CUDA 11.4: an 11.4 bug causes ktruss to @@ -39,7 +52,9 @@ def _ensure_compatible_cuda_version(): ) -def k_truss(G, k): +def k_truss( + G: Union[Graph, "networkx.Graph"], k: int +) -> Union[Graph, "networkx.Graph"]: """ Returns the K-Truss subgraph of a graph for a specific k. @@ -90,7 +105,11 @@ def k_truss(G, k): # FIXME: merge this function with k_truss -def ktruss_subgraph(G, k, use_weights=True): +def ktruss_subgraph( + G: Union[Graph, "networkx.Graph"], + k: int, + use_weights=True, # deprecated +) -> Graph: """ Returns the K-Truss subgraph of a graph for a specific k. @@ -103,7 +122,7 @@ def ktruss_subgraph(G, k, use_weights=True): finding the maximal k-clique is known to be NP-Hard. In contrast, finding a k-truss is computationally tractable as its - key building block, namely triangle counting counting, can be executed + key building block, namely triangle counting, can be executed in polnymomial time.Typically, it takes many iterations of triangle counting to find the k-truss of a graph. Yet these iterations operate on a weakly monotonically shrinking graph. @@ -141,7 +160,10 @@ def ktruss_subgraph(G, k, use_weights=True): The desired k to be used for extracting the k-truss subgraph. use_weights : bool, optional (default=True) - whether the output should contain the edge weights if G has them + Whether the output should contain the edge weights if G has them. + + Deprecated: If 'weights' were passed at the graph creation, they will + be used. Returns ------- @@ -162,7 +184,27 @@ def ktruss_subgraph(G, k, use_weights=True): if G.is_directed(): raise ValueError("input graph must be undirected") - subgraph_df = ktruss_subgraph_wrapper.ktruss_subgraph(G, k, use_weights) + if use_weights: + warning_msg = ( + "The use_weights flag is deprecated " + "and will be removed in the next release. if weights " + "were passed at the graph creation, they will be used." 
+ ) + warnings.warn(warning_msg, FutureWarning) + + sources, destinations, edge_weights, _ = pylibcugraph_k_truss_subgraph( + resource_handle=ResourceHandle(), + graph=G._plc_graph, + k=k, + do_expensive_check=True, + ) + + subgraph_df = cudf.DataFrame() + subgraph_df["src"] = sources + subgraph_df["dst"] = destinations + if edge_weights is not None: + subgraph_df["weight"] = edge_weights + if G.renumbered: subgraph_df = G.unrenumber(subgraph_df, "src") subgraph_df = G.unrenumber(subgraph_df, "dst") diff --git a/python/cugraph/cugraph/community/ktruss_subgraph_wrapper.pyx b/python/cugraph/cugraph/community/ktruss_subgraph_wrapper.pyx deleted file mode 100644 index 8b705e8a7b4..00000000000 --- a/python/cugraph/cugraph/community/ktruss_subgraph_wrapper.pyx +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 - -from cugraph.community.ktruss_subgraph cimport * -from cugraph.structure.graph_primtypes cimport * -from cugraph.structure import graph_primtypes_wrapper -import numpy as np - - -def ktruss_subgraph_float(input_graph, k, use_weights): - cdef GraphCOOViewFloat in_graph = get_coo_float_graph_view(input_graph, use_weights) - return coo_to_df(move(k_truss_subgraph[int,int,float](in_graph, k))) - - -def ktruss_subgraph_double(input_graph, k, use_weights): - cdef GraphCOOViewDouble in_graph = get_coo_double_graph_view(input_graph, use_weights) - return coo_to_df(move(k_truss_subgraph[int,int,double](in_graph, k))) - - -def ktruss_subgraph(input_graph, k, use_weights): - [input_graph.edgelist.edgelist_df['src'], - input_graph.edgelist.edgelist_df['dst']] = graph_primtypes_wrapper.datatype_cast([input_graph.edgelist.edgelist_df['src'], - input_graph.edgelist.edgelist_df['dst']], - [np.int32]) - if graph_primtypes_wrapper.weight_type(input_graph) == np.float64 and use_weights: - return ktruss_subgraph_double(input_graph, k, use_weights) - else: - return ktruss_subgraph_float(input_graph, k, use_weights) diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 2f7e63b5c55..6618c50122c 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -35,6 +35,7 @@ set(cython_sources hits.pyx induced_subgraph.pyx k_core.pyx + k_truss_subgraph.pyx jaccard_coefficients.pyx sorensen_coefficients.pyx overlap_coefficients.pyx diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 45f6de2f663..bd47c7da184 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -87,6 +87,8 @@ from pylibcugraph.generate_rmat_edgelists import generate_rmat_edgelists +from pylibcugraph.k_truss_subgraph import k_truss_subgraph + from pylibcugraph.jaccard_coefficients import jaccard_coefficients from 
pylibcugraph.overlap_coefficients import overlap_coefficients diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd index 64944e8773f..3c273b7d3fa 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/community_algorithms.pxd @@ -253,3 +253,13 @@ cdef extern from "cugraph_c/community_algorithms.h": cugraph_error_t** error ) + ########################################################################### + # K truss + cdef cugraph_error_code_t \ + cugraph_k_truss_subgraph( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + size_t k, + bool_t do_expensive_check, + cugraph_induced_subgraph_result_t** result, + cugraph_error_t** error) diff --git a/python/pylibcugraph/pylibcugraph/k_truss_subgraph.pyx b/python/pylibcugraph/pylibcugraph/k_truss_subgraph.pyx new file mode 100644 index 00000000000..cc91e76dd55 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/k_truss_subgraph.pyx @@ -0,0 +1,163 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_induced_subgraph_result_t, + cugraph_induced_subgraph_get_sources, + cugraph_induced_subgraph_get_destinations, + cugraph_induced_subgraph_get_edge_weights, + cugraph_induced_subgraph_get_subgraph_offsets, + cugraph_induced_subgraph_result_free, +) +from pylibcugraph._cugraph_c.community_algorithms cimport ( + cugraph_k_truss_subgraph, +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, +) + + +def k_truss_subgraph(ResourceHandle resource_handle, + _GPUGraph graph, + size_t k, + bool_t do_expensive_check): + """ + Extract k truss of a graph for a specific k. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph + The input graph. + + k: size_t + The desired k to be used for extracting the k-truss subgraph. + + do_expensive_check : bool_t + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the sources, destinations, + edge_weights and edge_offsets. 
+ + Examples + -------- + >>> import pylibcugraph, cupy, numpy + >>> srcs = cupy.asarray([0, 1, 1, 3, 1, 4, 2, 0, 2, 1, 2, + ... 3, 3, 4, 3, 5, 4, 5], dtype=numpy.int32) + >>> dsts = cupy.asarray([1, 0, 3, 1, 4, 1, 0, 2, 1, 2, 3, + ... 2, 4, 3, 5, 3, 5, 4], dtype=numpy.int32) + >>> weights = cupy.asarray( + ... [0.1, 0.1, 2.1, 2.1, 1.1, 1.1, 7.2, 7.2, 2.1, 2.1, + ... 1.1, 1.1, 7.2, 7.2, 3.2, 3.2, 6.1, 6.1] + ... ,dtype=numpy.float32) + >>> k = 2 + >>> resource_handle = pylibcugraph.ResourceHandle() + >>> graph_props = pylibcugraph.GraphProperties( + ... is_symmetric=True, is_multigraph=False) + >>> G = pylibcugraph.SGGraph( + ... resource_handle, graph_props, srcs, dsts, weights, + ... store_transposed=False, renumber=False, do_expensive_check=False) + >>> (sources, destinations, edge_weights, subgraph_offsets) = + ... pylibcugraph.k_truss_subgraph(resource_handle, G, k, False) + >>> sources + [0 0 1 1 1 1 2 2 2 3 3 3 3 4 4 4 5 5] + >>> destinations + [1 2 0 2 3 4 0 1 3 1 2 4 5 1 3 5 3 4] + >>> edge_weights + [0.1 7.2 0.1 2.1 2.1 1.1 7.2 2.1 1.1 2.1 1.1 7.2 3.2 1.1 7.2 6.1 3.2 6.1] + >>> subgraph_offsets + [0 18] + + """ + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + cdef cugraph_induced_subgraph_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + error_code = cugraph_k_truss_subgraph(c_resource_handle_ptr, + c_graph_ptr, + k, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_k_truss_subgraph") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. + cdef cugraph_type_erased_device_array_view_t* sources_ptr = \ + cugraph_induced_subgraph_get_sources(result_ptr) + cdef cugraph_type_erased_device_array_view_t* destinations_ptr = \ + cugraph_induced_subgraph_get_destinations(result_ptr) + cdef cugraph_type_erased_device_array_view_t* edge_weights_ptr = \ + cugraph_induced_subgraph_get_edge_weights(result_ptr) + cdef cugraph_type_erased_device_array_view_t* subgraph_offsets_ptr = \ + cugraph_induced_subgraph_get_subgraph_offsets(result_ptr) + + + # FIXME: Get ownership of the result data instead of performing a copy + # for perfomance improvement + cupy_sources = copy_to_cupy_array( + c_resource_handle_ptr, sources_ptr) + + cupy_destinations = copy_to_cupy_array( + c_resource_handle_ptr, destinations_ptr) + + if edge_weights_ptr is not NULL: + cupy_edge_weights = copy_to_cupy_array( + c_resource_handle_ptr, edge_weights_ptr) + else: + cupy_edge_weights = None + + # FIXME: Should we keep the offsets array or just drop it from the final + # solution? + cupy_subgraph_offsets = copy_to_cupy_array( + c_resource_handle_ptr, subgraph_offsets_ptr) + + + # Free pointer + cugraph_induced_subgraph_result_free(result_ptr) + + return (cupy_sources, cupy_destinations, cupy_edge_weights, cupy_subgraph_offsets) From 9dd01d82c17b6ada9c6701a9f3238e10053ce8a6 Mon Sep 17 00:00:00 2001 From: Don Acosta <97529984+acostadon@users.noreply.github.com> Date: Fri, 29 Sep 2023 15:17:22 -0400 Subject: [PATCH 64/72] fixes force atlas to allow string as vertex names (#3891) fixes force atlas to allow string as vertex names and removes need for mtx formated datasets. 
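For reference, a minimal sketch of the call pattern this change enables, using a made-up edge list with string vertex names (the dataset and column names below are hypothetical; the layout call itself is the existing force_atlas2 API):

    # Sketch only: string vertex ids are renumbered to integers internally.
    import cudf
    import cugraph

    edges = cudf.DataFrame(
        {
            "src": ["alice", "bob", "carol"],
            "dst": ["bob", "carol", "alice"],
            "wgt": [1.0, 2.0, 1.5],
        }
    )
    G = cugraph.Graph()
    G.from_cudf_edgelist(
        edges, source="src", destination="dst", edge_attr="wgt", renumber=True
    )

    # Positions come back as a cudf.DataFrame with vertex, x and y columns.
    pos = cugraph.force_atlas2(G, max_iter=500)
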
resolves #3610 Authors: - Don Acosta (https://github.com/acostadon) - Brad Rees (https://github.com/BradReesWork) Approvers: - Brad Rees (https://github.com/BradReesWork) - ralph (https://github.com/nv-rliu) URL: https://github.com/rapidsai/cugraph/pull/3891 --- .../cugraph/layout/force_atlas2_wrapper.pyx | 8 +- .../cugraph/tests/layout/test_force_atlas2.py | 145 ++++++++---------- 2 files changed, 67 insertions(+), 86 deletions(-) diff --git a/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx b/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx index 4258be3ef71..5a2784e2363 100644 --- a/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx +++ b/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx @@ -56,9 +56,11 @@ def force_atlas2(input_graph, if not input_graph.edgelist: input_graph.view_edge_list() - # FIXME: This implementation assumes that the number of vertices - # is the max vertex ID + 1 which is not always the case. - num_verts = input_graph.nodes().max() + 1 + # this code allows handling of renumbered graphs + if input_graph.is_renumbered(): + num_verts = input_graph.renumber_map.df_internal_to_external['id'].max()+1 + else: + num_verts = input_graph.nodes().max() + 1 num_edges = len(input_graph.edgelist.edgelist_df['src']) cdef GraphCOOView[int,int,float] graph_float diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py index 495a2d945c0..6b1fd6bcc4e 100644 --- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py @@ -13,13 +13,46 @@ import time import pytest -import scipy.io -from sklearn.manifold import trustworthiness -import cudf import cugraph +from cugraph.structure import number_map from cugraph.internals import GraphBasedDimRedCallback -from cugraph.datasets import karate, polbooks, dolphins, netscience +from sklearn.manifold import trustworthiness +import scipy.io +from cugraph.datasets import ( + karate, + polbooks, + dolphins, + netscience, + dining_prefs, +) + +# FIXME Removed the multi column positional due to it being non-deterministic +# need to replace this coverage. Issue 3890 in cuGraph repo was created. + +# This method renumbers a dataframe so it can be tested using Trustworthiness. +# it converts a dataframe with string vertex ids to a renumbered int one. 
+ + +def renumbered_edgelist(df): + renumbered_df, num_map = number_map.NumberMap.renumber(df, "src", "dst") + new_df = renumbered_df[["renumbered_src", "renumbered_dst", "wgt"]] + column_names = {"renumbered_src": "src", "renumbered_dst": "dst"} + new_df = new_df.rename(columns=column_names) + return new_df + + +# This method converts a dataframe to a sparce matrix that is required by +# scipy Trustworthiness to verify the layout +def get_coo_array(edgelist): + coo = edgelist + x = max(coo["src"].max(), coo["dst"].max()) + 1 + row = coo["src"].to_numpy() + col = coo["dst"].to_numpy() + data = coo["wgt"].to_numpy() + M = scipy.sparse.coo_array((data, (row, col)), shape=(x, x)) + + return M def cugraph_call( @@ -37,11 +70,15 @@ def cugraph_call( strong_gravity_mode, gravity, callback=None, + renumber=False, ): - G = cugraph.Graph() + if cu_M["src"] is not int or cu_M["dst"] is not int: + renumber = True + else: + renumber = False G.from_cudf_edgelist( - cu_M, source="src", destination="dst", edge_attr="wgt", renumber=False + cu_M, source="src", destination="dst", edge_attr="wgt", renumber=renumber ) t1 = time.time() @@ -66,7 +103,19 @@ def cugraph_call( return pos -DATASETS = [(karate, 0.70), (polbooks, 0.75), (dolphins, 0.66), (netscience, 0.66)] +DATASETS = [ + (karate, 0.70), + (polbooks, 0.75), + (dolphins, 0.66), + (netscience, 0.66), + (dining_prefs, 0.50), +] + +DATASETS2 = [ + (polbooks, 0.75), + (dolphins, 0.66), + (netscience, 0.66), +] MAX_ITERATIONS = [500] @@ -95,8 +144,7 @@ def on_train_end(self, positions): @pytest.mark.parametrize("max_iter", MAX_ITERATIONS) @pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE) def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): - cu_M = graph_file.get_edgelist() - dataset_path = graph_file.get_path() + cu_M = graph_file.get_edgelist(download=True) test_callback = TestCallback() cu_pos = cugraph_call( cu_M, @@ -126,9 +174,11 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): iterations on a given graph. 
""" - matrix_file = dataset_path.with_suffix(".mtx") - M = scipy.io.mmread(matrix_file) - M = M.toarray() + if "string" in graph_file.metadata["col_types"]: + df = renumbered_edgelist(graph_file.get_edgelist(download=True)) + M = get_coo_array(df) + else: + M = get_coo_array(graph_file.get_edgelist(download=True)) cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) print(cu_trust, score) assert cu_trust > score @@ -138,74 +188,3 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): assert test_callback.on_epoch_end_called_count == max_iter # verify `on_train_end` was only called once assert test_callback.on_train_end_called_count == 1 - - -# FIXME: this test occasionally fails - skipping to prevent CI failures but -# need to revisit ASAP -@pytest.mark.sg -@pytest.mark.skip(reason="non-deterministric - needs fixing!") -@pytest.mark.parametrize("graph_file, score", DATASETS[:-1]) -@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) -@pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE) -def test_force_atlas2_multi_column_pos_list( - graph_file, score, max_iter, barnes_hut_optimize -): - cu_M = graph_file.get_edgelist() - dataset_path = graph_file.get_path() - test_callback = TestCallback() - pos = cugraph_call( - cu_M, - max_iter=max_iter, - pos_list=None, - outbound_attraction_distribution=True, - lin_log_mode=False, - prevent_overlapping=False, - edge_weight_influence=1.0, - jitter_tolerance=1.0, - barnes_hut_optimize=False, - barnes_hut_theta=0.5, - scaling_ratio=2.0, - strong_gravity_mode=False, - gravity=1.0, - callback=test_callback, - ) - - cu_M.rename(columns={"0": "src_0", "1": "dst_0"}, inplace=True) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - - G = cugraph.Graph() - G.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"], edge_attr="2" - ) - - pos_list = cudf.DataFrame() - pos_list["vertex_0"] = pos["vertex"] - pos_list["vertex_1"] = pos_list["vertex_0"] + 1000 - pos_list["x"] = pos["x"] - pos_list["y"] = pos["y"] - - cu_pos = cugraph.force_atlas2( - G, - max_iter=max_iter, - pos_list=pos_list, - outbound_attraction_distribution=True, - lin_log_mode=False, - prevent_overlapping=False, - edge_weight_influence=1.0, - jitter_tolerance=1.0, - barnes_hut_optimize=False, - barnes_hut_theta=0.5, - scaling_ratio=2.0, - strong_gravity_mode=False, - gravity=1.0, - callback=test_callback, - ) - - cu_pos = cu_pos.sort_values("0_vertex") - matrix_file = dataset_path.with_suffix(".mtx") - M = scipy.io.mmread(matrix_file) - M = M.todense() - cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) - print(cu_trust, score) - assert cu_trust > score From c00c3f1b9b64dbda3173b8f343de37f62d70ad94 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:35:44 -0400 Subject: [PATCH 65/72] WholeGraph Feature Store for cuGraph-PyG and cuGraph-DGL (#3874) Created based on code from @dongxuy04 Adds support for `WholeGraph` `WholeMemory` in the cuGraph `FeatureStore` class. This enables both DGL and PyG to take advantage of distributed feature store functionality. Adds `pylibwholegraph` as a testing dependency so the feature store can be tested. Adds appropriate SG and MG tests. 
Authors: - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Ray Douglass (https://github.com/raydouglass) - Brad Rees (https://github.com/BradReesWork) - Vibhu Jawa (https://github.com/VibhuJawa) URL: https://github.com/rapidsai/cugraph/pull/3874 --- ci/release/update-version.sh | 4 + .../all_cuda-118_arch-x86_64.yaml | 1 + .../all_cuda-120_arch-x86_64.yaml | 1 + dependencies.yaml | 3 + .../gnn/feature_storage/feat_storage.py | 147 ++++++++++++++++-- .../tests/data_store/test_gnn_feat_storage.py | 1 - .../test_gnn_feat_storage_wholegraph.py | 85 ++++++++++ 7 files changed, 226 insertions(+), 16 deletions(-) create mode 100644 python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index adf3273e311..aaeaa715434 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -82,6 +82,9 @@ NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; prin DEPENDENCIES=( cudf cugraph + cugraph-dgl + cugraph-pyg + cugraph-service-server cugraph-service-client cuxfilter dask-cuda @@ -93,6 +96,7 @@ DEPENDENCIES=( librmm pylibcugraph pylibcugraphops + pylibwholegraph pylibraft pyraft raft-dask diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 952ec9317e2..87179ef892e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -53,6 +53,7 @@ dependencies: - pydata-sphinx-theme - pylibcugraphops==23.10.* - pylibraft==23.10.* +- pylibwholegraph==23.10.* - pytest - pytest-benchmark - pytest-cov diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 38936c78c38..d54dc0abf51 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -52,6 +52,7 @@ dependencies: - pydata-sphinx-theme - pylibcugraphops==23.10.* - pylibraft==23.10.* +- pylibwholegraph==23.10.* - pytest - pytest-benchmark - pytest-cov diff --git a/dependencies.yaml b/dependencies.yaml index f74ed13115b..292fcf0baed 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -481,6 +481,9 @@ dependencies: - *numpy - python-louvain - scikit-learn>=0.23.1 + - output_types: [conda] + packages: + - pylibwholegraph==23.10.* test_python_pylibcugraph: common: - output_types: [conda, pyproject] diff --git a/python/cugraph/cugraph/gnn/feature_storage/feat_storage.py b/python/cugraph/cugraph/gnn/feature_storage/feat_storage.py index e3fdeb7f150..77a53882fc4 100644 --- a/python/cugraph/cugraph/gnn/feature_storage/feat_storage.py +++ b/python/cugraph/cugraph/gnn/feature_storage/feat_storage.py @@ -17,23 +17,77 @@ import cupy as cp import numpy as np import pandas as pd -from cugraph.utilities.utils import import_optional +from cugraph.utilities.utils import import_optional, MissingModule torch = import_optional("torch") +wgth = import_optional("pylibwholegraph.torch") class FeatureStore: - """The feature-store class used to store feature data for GNNS""" + """The feature-store class used to store feature data for GNNs""" + + def __init__( + self, + backend: str = "numpy", + wg_comm: object = None, + wg_type: str = None, + wg_location: str = None, + ): + """ + Constructs a new FeatureStore object + + Parameters: + ---------- + backend: str ('numpy', 'torch', 'wholegraph') + Optional (default='numpy') + The name of the backend to use. 
+ + wg_comm: WholeMemoryCommunicator + Optional (default=automatic) + Only used with the 'wholegraph' backend. + The communicator to use to store features in WholeGraph. + + wg_type: str ('distributed', 'continuous', 'chunked') + Optional (default='distributed') + Only used with the 'wholegraph' backend. + The memory format (distributed, continuous, or chunked) of + this FeatureStore. For more information see the WholeGraph + documentation. + + wg_location: str ('cpu', 'cuda') + Optional (default='cuda') + Only used with the 'wholegraph' backend. + Where the data is stored (cpu or cuda). + Defaults to storing on the GPU (cuda). + """ - def __init__(self, backend="numpy"): self.fd = defaultdict(dict) - if backend not in ["numpy", "torch"]: + if backend not in ["numpy", "torch", "wholegraph"]: raise ValueError( - f"backend {backend} not supported. Supported backends are numpy, torch" + f"backend {backend} not supported. " + "Supported backends are numpy, torch, wholegraph" ) self.backend = backend - def add_data(self, feat_obj: Sequence, type_name: str, feat_name: str) -> None: + self.__wg_comm = None + self.__wg_type = None + self.__wg_location = None + + if backend == "wholegraph": + self.__wg_comm = ( + wg_comm if wg_comm is not None else wgth.get_local_node_communicator() + ) + self.__wg_type = wg_type if wg_type is not None else "distributed" + self.__wg_location = wg_location if wg_location is not None else "cuda" + + if self.__wg_type not in ["distributed", "chunked", "continuous"]: + raise ValueError(f"invalid memory format {self.__wg_type}") + if (self.__wg_location != "cuda") and (self.__wg_location != "cpu"): + raise ValueError(f"invalid location {self.__wg_location}") + + def add_data( + self, feat_obj: Sequence, type_name: str, feat_name: str, **kwargs + ) -> None: """ Add the feature data to the feature_storage class Parameters: @@ -49,9 +103,31 @@ def add_data(self, feat_obj: Sequence, type_name: str, feat_name: str) -> None: None """ self.fd[feat_name][type_name] = self._cast_feat_obj_to_backend( - feat_obj, self.backend + feat_obj, + self.backend, + wg_comm=self.__wg_comm, + wg_type=self.__wg_type, + wg_location=self.__wg_location, + **kwargs, ) + def add_data_no_cast(self, feat_obj, type_name: str, feat_name: str) -> None: + """ + Direct add the feature data to the feature_storage class with no cast + Parameters: + ---------- + feat_obj : array_like object + The feature object to save in feature store + type_name : str + The node-type/edge-type of the feature + feat_name: str + The name of the feature being stored + Returns: + ------- + None + """ + self.fd[feat_name][type_name] = feat_obj + def get_data( self, indices: Union[np.ndarray, torch.Tensor], @@ -87,26 +163,67 @@ def get_data( f" feature: {list(self.fd[feat_name].keys())}" ) - return self.fd[feat_name][type_name][indices] + feat = self.fd[feat_name][type_name] + if not isinstance(wgth, MissingModule) and isinstance( + feat, wgth.WholeMemoryEmbedding + ): + indices_tensor = ( + indices + if isinstance(indices, torch.Tensor) + else torch.as_tensor(indices, device="cuda") + ) + return feat.gather(indices_tensor) + else: + return feat[indices] def get_feature_list(self) -> list[str]: return {feat_name: feats.keys() for feat_name, feats in self.fd.items()} @staticmethod - def _cast_feat_obj_to_backend(feat_obj, backend: str): + def _cast_feat_obj_to_backend(feat_obj, backend: str, **kwargs): if backend == "numpy": if isinstance(feat_obj, (cudf.DataFrame, pd.DataFrame)): - return _cast_to_numpy_ar(feat_obj.values) + 
return _cast_to_numpy_ar(feat_obj.values, **kwargs) else: - return _cast_to_numpy_ar(feat_obj) + return _cast_to_numpy_ar(feat_obj, **kwargs) elif backend == "torch": if isinstance(feat_obj, (cudf.DataFrame, pd.DataFrame)): - return _cast_to_torch_tensor(feat_obj.values) + return _cast_to_torch_tensor(feat_obj.values, **kwargs) else: - return _cast_to_torch_tensor(feat_obj) + return _cast_to_torch_tensor(feat_obj, **kwargs) + elif backend == "wholegraph": + return _get_wg_embedding(feat_obj, **kwargs) + +def _get_wg_embedding(feat_obj, wg_comm=None, wg_type=None, wg_location=None, **kwargs): + wg_comm_obj = wg_comm or wgth.get_local_node_communicator() + wg_type_str = wg_type or "distributed" + wg_location_str = wg_location or "cuda" -def _cast_to_torch_tensor(ar): + if isinstance(feat_obj, (cudf.DataFrame, pd.DataFrame)): + th_tensor = _cast_to_torch_tensor(feat_obj.values) + else: + th_tensor = _cast_to_torch_tensor(feat_obj) + wg_embedding = wgth.create_embedding( + wg_comm_obj, + wg_type_str, + wg_location_str, + th_tensor.dtype, + th_tensor.shape, + ) + ( + local_wg_tensor, + local_ld_offset, + ) = wg_embedding.get_embedding_tensor().get_local_tensor() + local_th_tensor = th_tensor[ + local_ld_offset : local_ld_offset + local_wg_tensor.shape[0] + ] + local_wg_tensor.copy_(local_th_tensor) + wg_comm_obj.barrier() + return wg_embedding + + +def _cast_to_torch_tensor(ar, **kwargs): if isinstance(ar, cp.ndarray): ar = torch.as_tensor(ar, device="cuda") elif isinstance(ar, np.ndarray): @@ -116,7 +233,7 @@ def _cast_to_torch_tensor(ar): return ar -def _cast_to_numpy_ar(ar): +def _cast_to_numpy_ar(ar, **kwargs): if isinstance(ar, cp.ndarray): ar = ar.get() elif type(ar).__name__ == "Tensor": diff --git a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage.py b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage.py index 2d1537d11e3..2b0ec4b11d0 100644 --- a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage.py +++ b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage.py @@ -10,7 +10,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# Import FeatureStore class import pytest import numpy as np diff --git a/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py new file mode 100644 index 00000000000..1892e8a85a6 --- /dev/null +++ b/python/cugraph/cugraph/tests/data_store/test_gnn_feat_storage_wholegraph.py @@ -0,0 +1,85 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import numpy as np + +from cugraph.gnn import FeatureStore + +from cugraph.utilities.utils import import_optional, MissingModule + +pylibwholegraph = import_optional("pylibwholegraph") +wmb = import_optional("pylibwholegraph.binding.wholememory_binding") +torch = import_optional("torch") + + +def runtest(world_rank: int, world_size: int): + from pylibwholegraph.torch.initialize import init_torch_env_and_create_wm_comm + + wm_comm, _ = init_torch_env_and_create_wm_comm( + world_rank, + world_size, + world_rank, + world_size, + ) + wm_comm = wm_comm.wmb_comm + + generator = np.random.default_rng(62) + arr = ( + generator.integers(low=0, high=100, size=100_000) + .reshape(10_000, -1) + .astype("float64") + ) + + fs = FeatureStore(backend="wholegraph") + fs.add_data(arr, "type2", "feat1") + wm_comm.barrier() + + indices_to_fetch = np.random.randint(low=0, high=len(arr), size=1024) + output_fs = fs.get_data(indices_to_fetch, type_name="type2", feat_name="feat1") + assert isinstance(output_fs, torch.Tensor) + assert output_fs.is_cuda + expected = arr[indices_to_fetch] + np.testing.assert_array_equal(output_fs.cpu().numpy(), expected) + + wmb.finalize() + + +@pytest.mark.sg +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif( + isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" +) +def test_feature_storage_wholegraph_backend(): + from pylibwholegraph.utils.multiprocess import multiprocess_run + + gpu_count = wmb.fork_get_gpu_count() + print("gpu count:", gpu_count) + assert gpu_count > 0 + + multiprocess_run(1, runtest) + + +@pytest.mark.mg +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif( + isinstance(pylibwholegraph, MissingModule), reason="wholegraph not available" +) +def test_feature_storage_wholegraph_backend_mg(): + from pylibwholegraph.utils.multiprocess import multiprocess_run + + gpu_count = wmb.fork_get_gpu_count() + print("gpu count:", gpu_count) + assert gpu_count > 0 + + multiprocess_run(gpu_count, runtest) From a8638355d4e74351e58706c4f747dcc63d23bd81 Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Tue, 3 Oct 2023 13:45:14 -0400 Subject: [PATCH 66/72] Integrate renumbering and compression to `cugraph-dgl` to accelerate MFG creation (#3887) Allow cugraph-dgl dataloader to consume sampled outputs from BulkSampler in CSC format. 
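As a usage illustration of the new option (the graph storage object `gs`, the seed tensor `train_nids`, the output directory, and the DGL-style fanout list passed to `NeighborSampler` are all placeholders/assumptions):

    # Sketch only: `gs` is an existing cugraph_dgl graph storage object and
    # `train_nids` a tensor of seed vertices; both are assumed to exist.
    import cugraph_dgl

    sampler = cugraph_dgl.dataloading.NeighborSampler([10, 25])
    loader = cugraph_dgl.dataloading.DataLoader(
        gs,
        train_nids,
        sampler,
        sampling_output_dir="/tmp/samples",  # hypothetical path
        batch_size=1024,
        sparse_format="csc",  # new: emit cugraph_dgl.nn.SparseGraph MFGs
    )

    for input_nodes, output_nodes, blocks in loader:
        ...  # with sparse_format="csc", each block is a SparseGraph in CSC format
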
Authors: - Tingyu Wang (https://github.com/tingyu66) - Seunghwa Kang (https://github.com/seunghwak) - Alex Barghi (https://github.com/alexbarghi-nv) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Alex Barghi (https://github.com/alexbarghi-nv) - Vibhu Jawa (https://github.com/VibhuJawa) URL: https://github.com/rapidsai/cugraph/pull/3887 --- .../cugraph_dgl/dataloading/__init__.py | 2 +- .../cugraph_dgl/dataloading/dataloader.py | 49 ++++-- .../cugraph_dgl/dataloading/dataset.py | 37 +++-- .../dataloading/utils/sampling_helpers.py | 155 +++++++++++++++++- .../cugraph-dgl/cugraph_dgl/nn/conv/base.py | 7 + python/cugraph-dgl/tests/test_utils.py | 28 ++++ 6 files changed, 250 insertions(+), 28 deletions(-) diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py index 6cabea198f6..2fd7d29bd49 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/__init__.py @@ -13,7 +13,7 @@ from cugraph_dgl.dataloading.dataset import ( HomogenousBulkSamplerDataset, - HetrogenousBulkSamplerDataset, + HeterogenousBulkSamplerDataset, ) from cugraph_dgl.dataloading.neighbor_sampler import NeighborSampler from cugraph_dgl.dataloading.dataloader import DataLoader diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py index 0480f61807a..f154b096256 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py @@ -21,7 +21,7 @@ from dask.distributed import default_client, Event from cugraph_dgl.dataloading import ( HomogenousBulkSamplerDataset, - HetrogenousBulkSamplerDataset, + HeterogenousBulkSamplerDataset, ) from cugraph_dgl.dataloading.utils.extract_graph_helpers import ( create_cugraph_graph_from_edges_dict, @@ -47,19 +47,20 @@ def __init__( graph_sampler: cugraph_dgl.dataloading.NeighborSampler, sampling_output_dir: str, batches_per_partition: int = 50, - seeds_per_call: int = 400_000, + seeds_per_call: int = 200_000, device: torch.device = None, use_ddp: bool = False, ddp_seed: int = 0, batch_size: int = 1024, drop_last: bool = False, shuffle: bool = False, + sparse_format: str = "coo", **kwargs, ): """ Constructor for CuGraphStorage: ------------------------------- - graph : CuGraphStorage + graph : CuGraphStorage The graph. indices : Tensor or dict[ntype, Tensor] The set of indices. It can either be a tensor of @@ -89,7 +90,12 @@ def __init__( The seed for shuffling the dataset in :class:`torch.utils.data.distributed.DistributedSampler`. Only effective when :attr:`use_ddp` is True. - batch_size: int, + batch_size: int + Batch size. + sparse_format: str, default = "coo" + The sparse format of the emitted sampled graphs. Choose between "csc" + and "coo". When using "csc", the graphs are of type + cugraph_dgl.nn.SparseGraph. kwargs : dict Key-word arguments to be passed to the parent PyTorch :py:class:`torch.utils.data.DataLoader` class. Common arguments are: @@ -123,6 +129,12 @@ def __init__( ... for input_nodes, output_nodes, blocks in dataloader: ... """ + if sparse_format not in ["coo", "csc"]: + raise ValueError( + f"sparse_format must be one of 'coo', 'csc', " + f"but got {sparse_format}." 
+ ) + self.sparse_format = sparse_format self.ddp_seed = ddp_seed self.use_ddp = use_ddp @@ -156,11 +168,12 @@ def __init__( self.cugraph_dgl_dataset = HomogenousBulkSamplerDataset( total_number_of_nodes=graph.total_number_of_nodes, edge_dir=self.graph_sampler.edge_dir, + sparse_format=sparse_format, ) else: etype_id_to_etype_str_dict = {v: k for k, v in graph._etype_id_dict.items()} - self.cugraph_dgl_dataset = HetrogenousBulkSamplerDataset( + self.cugraph_dgl_dataset = HeterogenousBulkSamplerDataset( num_nodes_dict=graph.num_nodes_dict, etype_id_dict=etype_id_to_etype_str_dict, etype_offset_dict=graph._etype_offset_d, @@ -210,14 +223,23 @@ def __iter__(self): output_dir = os.path.join( self._sampling_output_dir, "epoch_" + str(self.epoch_number) ) + kwargs = {} if isinstance(self.cugraph_dgl_dataset, HomogenousBulkSamplerDataset): - deduplicate_sources = True - prior_sources_behavior = "carryover" - renumber = True + kwargs["deduplicate_sources"] = True + kwargs["prior_sources_behavior"] = "carryover" + kwargs["renumber"] = True + + if self.sparse_format == "csc": + kwargs["compression"] = "CSR" + kwargs["compress_per_hop"] = True + # The following kwargs will be deprecated in uniform sampler. + kwargs["use_legacy_names"] = False + kwargs["include_hop_column"] = False + else: - deduplicate_sources = False - prior_sources_behavior = None - renumber = False + kwargs["deduplicate_sources"] = False + kwargs["prior_sources_behavior"] = None + kwargs["renumber"] = False bs = BulkSampler( output_path=output_dir, @@ -227,10 +249,9 @@ def __iter__(self): seeds_per_call=self._seeds_per_call, fanout_vals=self.graph_sampler._reversed_fanout_vals, with_replacement=self.graph_sampler.replace, - deduplicate_sources=deduplicate_sources, - prior_sources_behavior=prior_sources_behavior, - renumber=renumber, + **kwargs, ) + if self.shuffle: self.tensorized_indices_ds.shuffle() diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py index e0d51bcf4cf..815fd30d8eb 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataset.py @@ -19,6 +19,7 @@ from cugraph_dgl.dataloading.utils.sampling_helpers import ( create_homogeneous_sampled_graphs_from_dataframe, create_heterogeneous_sampled_graphs_from_dataframe, + create_homogeneous_sampled_graphs_from_dataframe_csc, ) @@ -33,17 +34,19 @@ def __init__( total_number_of_nodes: int, edge_dir: str, return_type: str = "dgl.Block", + sparse_format: str = "coo", ): if return_type not in ["dgl.Block", "cugraph_dgl.nn.SparseGraph"]: raise ValueError( - "return_type must be either 'dgl.Block' or \ - 'cugraph_dgl.nn.SparseGraph' " + "return_type must be either 'dgl.Block' or " + "'cugraph_dgl.nn.SparseGraph'." 
) # TODO: Deprecate `total_number_of_nodes` # as it is no longer needed # in the next release self.total_number_of_nodes = total_number_of_nodes self.edge_dir = edge_dir + self.sparse_format = sparse_format self._current_batch_fn = None self._input_files = None self._return_type = return_type @@ -60,10 +63,20 @@ def __getitem__(self, idx: int): fn, batch_offset = self._batch_to_fn_d[idx] if fn != self._current_batch_fn: - df = _load_sampled_file(dataset_obj=self, fn=fn) - self._current_batches = create_homogeneous_sampled_graphs_from_dataframe( - sampled_df=df, edge_dir=self.edge_dir, return_type=self._return_type - ) + if self.sparse_format == "csc": + df = _load_sampled_file(dataset_obj=self, fn=fn, skip_rename=True) + self._current_batches = ( + create_homogeneous_sampled_graphs_from_dataframe_csc(df) + ) + else: + df = _load_sampled_file(dataset_obj=self, fn=fn) + self._current_batches = ( + create_homogeneous_sampled_graphs_from_dataframe( + sampled_df=df, + edge_dir=self.edge_dir, + return_type=self._return_type, + ) + ) current_offset = idx - batch_offset return self._current_batches[current_offset] @@ -87,7 +100,7 @@ def set_input_files( ) -class HetrogenousBulkSamplerDataset(torch.utils.data.Dataset): +class HeterogenousBulkSamplerDataset(torch.utils.data.Dataset): def __init__( self, num_nodes_dict: Dict[str, int], @@ -141,18 +154,18 @@ def set_input_files( ---------- input_directory: str input_directory which contains all the files that will be - loaded by HetrogenousBulkSamplerDataset + loaded by HeterogenousBulkSamplerDataset input_file_paths: List[str] - File names that will be loaded by the HetrogenousBulkSamplerDataset + File names that will be loaded by the HeterogenousBulkSamplerDataset """ _set_input_files( self, input_directory=input_directory, input_file_paths=input_file_paths ) -def _load_sampled_file(dataset_obj, fn): +def _load_sampled_file(dataset_obj, fn, skip_rename=False): df = cudf.read_parquet(os.path.join(fn)) - if dataset_obj.edge_dir == "in": + if dataset_obj.edge_dir == "in" and not skip_rename: df.rename( columns={"sources": "destinations", "destinations": "sources"}, inplace=True, @@ -181,7 +194,7 @@ def get_batch_to_fn_d(files): def _set_input_files( - dataset_obj: Union[HomogenousBulkSamplerDataset, HetrogenousBulkSamplerDataset], + dataset_obj: Union[HomogenousBulkSamplerDataset, HeterogenousBulkSamplerDataset], input_directory: Optional[str] = None, input_file_paths: Optional[List[str]] = None, ) -> None: diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py index bdac3b1a323..a4f64668348 100644 --- a/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py +++ b/python/cugraph-dgl/cugraph_dgl/dataloading/utils/sampling_helpers.py @@ -11,10 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import annotations -from typing import Tuple, Dict, Optional +from typing import List, Tuple, Dict, Optional from collections import defaultdict import cudf +import cupy from cugraph.utilities.utils import import_optional +from cugraph_dgl.nn import SparseGraph dgl = import_optional("dgl") torch = import_optional("torch") @@ -401,3 +403,154 @@ def create_heterogenous_dgl_block_from_tensors_dict( block = dgl.to_block(sampled_graph, dst_nodes=seed_nodes, src_nodes=src_d) block.edata[dgl.EID] = sampled_graph.edata[dgl.EID] return block + + +def _process_sampled_df_csc( + df: cudf.DataFrame, + reverse_hop_id: bool = True, +) -> Tuple[ + Dict[int, Dict[int, Dict[str, torch.Tensor]]], + List[torch.Tensor], + List[List[int, int]], +]: + """ + Convert a dataframe generated by BulkSampler to a dictionary of tensors, to + facilitate MFG creation. The sampled graphs in the dataframe use CSC-format. + + Parameters + ---------- + df: cudf.DataFrame + The output from BulkSampler compressed in CSC format. The dataframe + should be generated with `compression="CSR"` in BulkSampler, + since the sampling routine treats seed nodes as sources. + + reverse_hop_id: bool (default=True) + Reverse hop id. + + Returns + ------- + tensors_dict: dict + A nested dictionary keyed by batch id and hop id. + `tensor_dict[batch_id][hop_id]` holds "minors" and "major_offsets" + values for CSC MFGs. + + renumber_map_list: list + List of renumbering maps for looking up global indices of nodes. One + map for each batch. + + mfg_sizes: list + List of the number of nodes in each message passing layer. For the + k-th hop, mfg_sizes[k] and mfg_sizes[k+1] is the number of sources and + destinations, respectively. + """ + # dropna + major_offsets = df.major_offsets.dropna().values + label_hop_offsets = df.label_hop_offsets.dropna().values + renumber_map_offsets = df.renumber_map_offsets.dropna().values + renumber_map = df.map.dropna().values + minors = df.minors.dropna().values + + n_batches = renumber_map_offsets.size - 1 + n_hops = int((label_hop_offsets.size - 1) / n_batches) + + # make global offsets local + major_offsets -= major_offsets[0] + label_hop_offsets -= label_hop_offsets[0] + renumber_map_offsets -= renumber_map_offsets[0] + + # get the sizes of each adjacency matrix (for MFGs) + mfg_sizes = (label_hop_offsets[1:] - label_hop_offsets[:-1]).reshape( + (n_batches, n_hops) + ) + n_nodes = renumber_map_offsets[1:] - renumber_map_offsets[:-1] + mfg_sizes = cupy.hstack((mfg_sizes, n_nodes.reshape(n_batches, -1))) + if reverse_hop_id: + mfg_sizes = mfg_sizes[:, ::-1] + + tensors_dict = {} + renumber_map_list = [] + for batch_id in range(n_batches): + batch_dict = {} + + for hop_id in range(n_hops): + hop_dict = {} + idx = batch_id * n_hops + hop_id # idx in label_hop_offsets + major_offsets_start = label_hop_offsets[idx].item() + major_offsets_end = label_hop_offsets[idx + 1].item() + minors_start = major_offsets[major_offsets_start].item() + minors_end = major_offsets[major_offsets_end].item() + # Note: minors and major_offsets from BulkSampler are of type int32 + # and int64 respectively. Since pylibcugraphops binding code doesn't + # support distinct node and edge index type, we simply casting both + # to int32 for now. 
+ hop_dict["minors"] = torch.as_tensor( + minors[minors_start:minors_end], device="cuda" + ).int() + hop_dict["major_offsets"] = torch.as_tensor( + major_offsets[major_offsets_start : major_offsets_end + 1] + - major_offsets[major_offsets_start], + device="cuda", + ).int() + if reverse_hop_id: + batch_dict[n_hops - 1 - hop_id] = hop_dict + else: + batch_dict[hop_id] = hop_dict + + tensors_dict[batch_id] = batch_dict + + renumber_map_list.append( + torch.as_tensor( + renumber_map[ + renumber_map_offsets[batch_id] : renumber_map_offsets[batch_id + 1] + ], + device="cuda", + ) + ) + + return tensors_dict, renumber_map_list, mfg_sizes.tolist() + + +def _create_homogeneous_sparse_graphs_from_csc( + tensors_dict: Dict[int, Dict[int, Dict[str, torch.Tensor]]], + renumber_map_list: List[torch.Tensor], + mfg_sizes: List[int, int], +) -> List[List[torch.Tensor, torch.Tensor, List[SparseGraph]]]: + """Create mini-batches of MFGs. The input arguments are the outputs of + the function `_process_sampled_df_csc`. + + Returns + ------- + output: list + A list of mini-batches. Each mini-batch is a list that consists of + `input_nodes` tensor, `output_nodes` tensor and a list of MFGs. + """ + n_batches, n_hops = len(mfg_sizes), len(mfg_sizes[0]) - 1 + output = [] + for b_id in range(n_batches): + output_batch = [] + output_batch.append(renumber_map_list[b_id]) + output_batch.append(renumber_map_list[b_id][: mfg_sizes[b_id][-1]]) + mfgs = [ + SparseGraph( + size=(mfg_sizes[b_id][h_id], mfg_sizes[b_id][h_id + 1]), + src_ids=tensors_dict[b_id][h_id]["minors"], + cdst_ids=tensors_dict[b_id][h_id]["major_offsets"], + formats=["csc"], + reduce_memory=True, + ) + for h_id in range(n_hops) + ] + + output_batch.append(mfgs) + + output.append(output_batch) + + return output + + +def create_homogeneous_sampled_graphs_from_dataframe_csc(sampled_df: cudf.DataFrame): + """Public API to create mini-batches of MFGs using a dataframe output by + BulkSampler, where the sampled graph is compressed in CSC format.""" + return _create_homogeneous_sparse_graphs_from_csc( + *(_process_sampled_df_csc(sampled_df)) + ) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index 307eb33078e..ddd95a76366 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -248,6 +248,13 @@ def csr(self) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: value = value[self._perm_csc2csr] return csrc_ids, dst_ids, value + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(num_src_nodes={self._num_src_nodes}, " + f"num_dst_nodes={self._num_dst_nodes}, " + f"num_edges={self._src_ids.size(0)}, formats={self._formats})" + ) + class BaseConv(torch.nn.Module): r"""An abstract base class for cugraph-ops nn module.""" diff --git a/python/cugraph-dgl/tests/test_utils.py b/python/cugraph-dgl/tests/test_utils.py index 740db59ce7f..4be66758b43 100644 --- a/python/cugraph-dgl/tests/test_utils.py +++ b/python/cugraph-dgl/tests/test_utils.py @@ -22,6 +22,7 @@ create_homogeneous_sampled_graphs_from_dataframe, _get_source_destination_range, _create_homogeneous_cugraph_dgl_nn_sparse_graph, + create_homogeneous_sampled_graphs_from_dataframe_csc, ) from cugraph.utilities.utils import import_optional @@ -50,6 +51,23 @@ def get_dummy_sampled_df(): return df +def get_dummy_sampled_df_csc(): + df_dict = dict( + minors=np.array( + [1, 1, 2, 1, 0, 3, 1, 3, 2, 3, 2, 4, 0, 1, 1, 0, 3, 2], dtype=np.int32 + ), + major_offsets=np.arange(19, 
dtype=np.int64), + map=np.array( + [26, 29, 33, 22, 23, 32, 18, 29, 33, 33, 8, 30, 32], dtype=np.int32 + ), + renumber_map_offsets=np.array([0, 4, 9, 13], dtype=np.int64), + label_hop_offsets=np.array([0, 1, 3, 6, 7, 9, 13, 14, 16, 18], dtype=np.int64), + ) + + # convert values to Series so that NaNs are padded automatically + return cudf.DataFrame({k: cudf.Series(v) for k, v in df_dict.items()}) + + def test_get_renumber_map(): sampled_df = get_dummy_sampled_df() @@ -176,3 +194,13 @@ def test__create_homogeneous_cugraph_dgl_nn_sparse_graph(): assert sparse_graph.num_src_nodes() == 2 assert sparse_graph.num_dst_nodes() == seednodes_range + 1 assert isinstance(sparse_graph, cugraph_dgl.nn.SparseGraph) + + +def test_create_homogeneous_sampled_graphs_from_dataframe_csc(): + df = get_dummy_sampled_df_csc() + batches = create_homogeneous_sampled_graphs_from_dataframe_csc(df) + + assert len(batches) == 3 + assert torch.equal(batches[0][0], torch.IntTensor([26, 29, 33, 22]).cuda()) + assert torch.equal(batches[1][0], torch.IntTensor([23, 32, 18, 29, 33]).cuda()) + assert torch.equal(batches[2][0], torch.IntTensor([33, 8, 30, 32]).cuda()) From 5ce3ee1b11db62647337514a200845bbb392d351 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 3 Oct 2023 16:50:10 -0500 Subject: [PATCH 67/72] nx-cugraph: handle louvain with isolated nodes (#3897) This handles isolated nodes in `louvain_communities` similar to what is done in #3886. This is expected to be a temporary fix until pylibcugraph can handle isolated nodes. As a bonus, I added `isolates` algorithm :tada: CC @naimnv @rlratzel Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3897 --- python/nx-cugraph/_nx_cugraph/__init__.py | 7 ++- python/nx-cugraph/lint.yaml | 2 +- .../nx_cugraph/algorithms/__init__.py | 1 + .../algorithms/community/louvain.py | 28 +++++++-- .../nx_cugraph/algorithms/isolate.py | 63 +++++++++++++++++++ .../nx-cugraph/nx_cugraph/classes/digraph.py | 2 +- python/nx-cugraph/nx_cugraph/classes/graph.py | 6 +- python/nx-cugraph/nx_cugraph/convert.py | 2 +- .../nx_cugraph/tests/test_community.py | 53 ++++++++++++++++ 9 files changed, 151 insertions(+), 13 deletions(-) create mode 100644 python/nx-cugraph/nx_cugraph/algorithms/isolate.py create mode 100644 python/nx-cugraph/nx_cugraph/tests/test_community.py diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index 9b3332106ec..ebd13daded0 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -31,20 +31,23 @@ # BEGIN: functions "betweenness_centrality", "edge_betweenness_centrality", + "is_isolate", + "isolates", "louvain_communities", + "number_of_isolates", # END: functions }, "extra_docstrings": { # BEGIN: extra_docstrings "betweenness_centrality": "`weight` parameter is not yet supported.", "edge_betweenness_centrality": "`weight` parameter is not yet supported.", - "louvain_communities": "`threshold` and `seed` parameters are currently ignored.", + "louvain_communities": "`seed` parameter is currently ignored.", # END: extra_docstrings }, "extra_parameters": { # BEGIN: extra_parameters "louvain_communities": { - "max_level : int, optional": "Upper limit of the number of macro-iterations.", + "max_level : int, optional": "Upper limit of the number of macro-iterations (max: 500).", }, # END: extra_parameters }, diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index 
6a462a6af79..338ca7b230e 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -63,7 +63,7 @@ repos: # These versions need updated manually - flake8==6.1.0 - flake8-bugbear==23.9.16 - - flake8-simplify==0.20.0 + - flake8-simplify==0.21.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 hooks: diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py index 3a585452d6d..dfd9adfc61a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py @@ -12,3 +12,4 @@ # limitations under the License. from . import centrality, community from .centrality import * +from .isolate import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py index a183b59fe1d..dc209870c89 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py @@ -10,7 +10,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import sys +import warnings import pylibcugraph as plc @@ -22,19 +22,23 @@ not_implemented_for, ) +from ..isolate import _isolates + __all__ = ["louvain_communities"] @not_implemented_for("directed") @networkx_algorithm( extra_params={ - "max_level : int, optional": "Upper limit of the number of macro-iterations." + "max_level : int, optional": ( + "Upper limit of the number of macro-iterations (max: 500)." + ) } ) def louvain_communities( G, weight="weight", resolution=1, threshold=0.0000001, seed=None, *, max_level=None ): - """`threshold` and `seed` parameters are currently ignored.""" + """`seed` parameter is currently ignored.""" # NetworkX allows both directed and undirected, but cugraph only allows undirected. seed = _seed_to_int(seed) # Unused, but ensure it's valid for future compatibility G = _to_undirected_graph(G, weight) @@ -42,7 +46,14 @@ def louvain_communities( # TODO: PLC doesn't handle empty graphs gracefully! return [{key} for key in G._nodeiter_to_iter(range(len(G)))] if max_level is None: - max_level = sys.maxsize + max_level = 500 + elif max_level > 500: + warnings.warn( + f"max_level is set too high (={max_level}), setting it to 500.", + UserWarning, + stacklevel=2, + ) + max_level = 500 vertices, clusters, modularity = plc.louvain( resource_handle=plc.ResourceHandle(), graph=G._get_plc_graph(), @@ -52,7 +63,14 @@ def louvain_communities( do_expensive_check=False, ) groups = _groupby(clusters, vertices) - return [set(G._nodearray_to_list(node_ids)) for node_ids in groups.values()] + rv = [set(G._nodearray_to_list(node_ids)) for node_ids in groups.values()] + # TODO: PLC doesn't handle isolated vertices yet, so this is a temporary fix + isolates = _isolates(G) + if isolates.size > 0: + isolates = isolates[isolates > vertices.max()] + if isolates.size > 0: + rv.extend({node} for node in G._nodearray_to_list(isolates)) + return rv @louvain_communities._can_run diff --git a/python/nx-cugraph/nx_cugraph/algorithms/isolate.py b/python/nx-cugraph/nx_cugraph/algorithms/isolate.py new file mode 100644 index 00000000000..774627e84f6 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/isolate.py @@ -0,0 +1,63 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import TYPE_CHECKING + +import cupy as cp + +from nx_cugraph.convert import _to_graph +from nx_cugraph.utils import networkx_algorithm + +if TYPE_CHECKING: # pragma: no cover + from nx_cugraph.typing import IndexValue + +__all__ = ["is_isolate", "isolates", "number_of_isolates"] + + +@networkx_algorithm +def is_isolate(G, n): + G = _to_graph(G) + index = n if G.key_to_id is None else G.key_to_id[n] + return not ( + (G.row_indices == index).any().tolist() + or G.is_directed() + and (G.col_indices == index).any().tolist() + ) + + +def _mark_isolates(G) -> cp.ndarray[bool]: + """Return a boolean mask array indicating indices of isolated nodes.""" + mark_isolates = cp.ones(len(G), bool) + mark_isolates[G.row_indices] = False + if G.is_directed(): + mark_isolates[G.col_indices] = False + return mark_isolates + + +def _isolates(G) -> cp.ndarray[IndexValue]: + """Like isolates, but return an array of indices instead of an iterator of nodes.""" + G = _to_graph(G) + return cp.nonzero(_mark_isolates(G))[0] + + +@networkx_algorithm +def isolates(G): + G = _to_graph(G) + return G._nodeiter_to_iter(iter(_isolates(G).tolist())) + + +@networkx_algorithm +def number_of_isolates(G): + G = _to_graph(G) + return _mark_isolates(G).sum().tolist() diff --git a/python/nx-cugraph/nx_cugraph/classes/digraph.py b/python/nx-cugraph/nx_cugraph/classes/digraph.py index 0aaf88fd793..72a1bff21a9 100644 --- a/python/nx-cugraph/nx_cugraph/classes/digraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/digraph.py @@ -20,7 +20,7 @@ from .graph import Graph -if TYPE_CHECKING: +if TYPE_CHECKING: # pragma: no cover from nx_cugraph.typing import NodeKey __all__ = ["DiGraph"] diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py index 1432f68c752..ded4cc3943f 100644 --- a/python/nx-cugraph/nx_cugraph/classes/graph.py +++ b/python/nx-cugraph/nx_cugraph/classes/graph.py @@ -23,7 +23,7 @@ import nx_cugraph as nxcg -if TYPE_CHECKING: +if TYPE_CHECKING: # pragma: no cover from collections.abc import Iterable, Iterator from nx_cugraph.typing import ( @@ -245,9 +245,9 @@ def from_dcsc( def __new__(cls, incoming_graph_data=None, **attr) -> Graph: if incoming_graph_data is None: new_graph = cls.from_coo(0, cp.empty(0, np.int32), cp.empty(0, np.int32)) - elif incoming_graph_data.__class__ is new_graph.__class__: + elif incoming_graph_data.__class__ is cls: new_graph = incoming_graph_data.copy() - elif incoming_graph_data.__class__ is new_graph.to_networkx_class(): + elif incoming_graph_data.__class__ is cls.to_networkx_class(): new_graph = nxcg.from_networkx(incoming_graph_data, preserve_all_attrs=True) else: raise NotImplementedError diff --git a/python/nx-cugraph/nx_cugraph/convert.py b/python/nx-cugraph/nx_cugraph/convert.py index 9be8cac7877..1240ea71db7 100644 --- a/python/nx-cugraph/nx_cugraph/convert.py +++ b/python/nx-cugraph/nx_cugraph/convert.py @@ -24,7 +24,7 @@ import nx_cugraph as nxcg 
-if TYPE_CHECKING: +if TYPE_CHECKING: # pragma: no cover from nx_cugraph.typing import AttrKey, Dtype, EdgeValue, NodeValue __all__ = [ diff --git a/python/nx-cugraph/nx_cugraph/tests/test_community.py b/python/nx-cugraph/nx_cugraph/tests/test_community.py new file mode 100644 index 00000000000..126f45c14ae --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_community.py @@ -0,0 +1,53 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import networkx as nx +import pytest + +import nx_cugraph as nxcg + + +def test_louvain_isolated_nodes(): + is_nx_30_or_31 = hasattr(nx.classes, "backends") + + def check(left, right): + assert len(left) == len(right) + assert set(map(frozenset, left)) == set(map(frozenset, right)) + + # Empty graph (no nodes) + G = nx.Graph() + if is_nx_30_or_31: + with pytest.raises(ZeroDivisionError): + nx.community.louvain_communities(G) + else: + nx_result = nx.community.louvain_communities(G) + cg_result = nxcg.community.louvain_communities(G) + check(nx_result, cg_result) + # Graph with no edges + G.add_nodes_from(range(5)) + if is_nx_30_or_31: + with pytest.raises(ZeroDivisionError): + nx.community.louvain_communities(G) + else: + nx_result = nx.community.louvain_communities(G) + cg_result = nxcg.community.louvain_communities(G) + check(nx_result, cg_result) + # Graph with isolated nodes + G.add_edge(1, 2) + nx_result = nx.community.louvain_communities(G) + cg_result = nxcg.community.louvain_communities(G) + check(nx_result, cg_result) + # Another one + G.add_edge(4, 4) + nx_result = nx.community.louvain_communities(G) + cg_result = nxcg.community.louvain_communities(G) + check(nx_result, cg_result) From 26af14ebad6a6b1f115779d90d3c0a68f0d380ee Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Wed, 4 Oct 2023 11:44:01 -0400 Subject: [PATCH 68/72] cuGraph-PyG MFG Creation and Conversion (#3873) Integrates the new CSR bulk sampler output, allowing reading of batches without having to call CSC conversion or count the numbers of vertices and edges in each batch. Should result in major performance improvements, especially for small batches. 
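For readers new to the compressed sampler output mentioned above, the following is a minimal sketch (not part of this patch) of how one batch is sliced out of the `label_hop_offsets` / `major_offsets` / `minors` arrays the bulk sampler now writes. The helper name is invented for illustration; the toy values mirror the `bogus_samples` fixture used in the tests below.

```python
import torch

def slice_batch_csc(label_hop_offsets, major_offsets, minors, fanout_length, batch_idx):
    # Each batch owns `fanout_length` consecutive entries of label_hop_offsets.
    i = batch_idx * fanout_length
    batch_label_hop_offsets = label_hop_offsets[i : i + fanout_length + 1]

    # label_hop_offsets indexes into major_offsets ("row pointer"),
    # and major_offsets indexes into minors ("column indices").
    batch_major_offsets = major_offsets[
        batch_label_hop_offsets[0] : batch_label_hop_offsets[-1] + 1
    ]
    batch_minors = minors[batch_major_offsets[0] : batch_major_offsets[-1]]
    return batch_label_hop_offsets, batch_major_offsets, batch_minors

# Toy single-batch example with two hops (values follow the test fixture below).
label_hop_offsets = torch.tensor([0, 1, 4])
major_offsets = torch.tensor([0, 3, 5, 7, 8])
minors = torch.tensor([1, 2, 3, 0, 3, 4, 5, 1])
print(slice_batch_csc(label_hop_offsets, major_offsets, minors, fanout_length=2, batch_idx=0))
```

With this layout, no CSC-to-COO conversion or per-batch edge counting is required; the per-hop node and edge counts fall out of differences of the offset arrays.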
Authors: - Alex Barghi (https://github.com/alexbarghi-nv) - Seunghwa Kang (https://github.com/seunghwak) - Brad Rees (https://github.com/BradReesWork) Approvers: - Brad Rees (https://github.com/BradReesWork) - Ray Douglass (https://github.com/raydouglass) - Tingyu Wang (https://github.com/tingyu66) URL: https://github.com/rapidsai/cugraph/pull/3873 --- ci/test_python.sh | 7 +- .../cugraph_pyg/data/cugraph_store.py | 38 ++-- .../cugraph_pyg/loader/cugraph_node_loader.py | 199 +++++++++++++---- .../cugraph_pyg/sampler/cugraph_sampler.py | 130 +++++++++++- .../tests/mg/test_mg_cugraph_sampler.py | 10 +- .../tests/mg/test_mg_cugraph_store.py | 4 +- .../cugraph_pyg/tests/test_cugraph_loader.py | 200 ++++++++++++++++-- .../cugraph_pyg/tests/test_cugraph_sampler.py | 10 +- .../cugraph_pyg/tests/test_cugraph_store.py | 4 +- 9 files changed, 497 insertions(+), 105 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 7b0077991ae..825d5b242d5 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -200,8 +200,11 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then --channel pytorch \ --channel nvidia \ 'pyg=2.3' \ - 'pytorch>=2.0' \ - 'pytorch-cuda>=11.8' + 'pytorch=2.0.0' \ + 'pytorch-cuda=11.8' + + # Install pyg dependencies (which requires pip) + pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu118.html rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py index e0d318adbe0..fd2172e6ade 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py @@ -819,8 +819,8 @@ def _get_renumbered_edge_groups_from_sample( before this one to get the noi_index. 
Example Input: Series({ - 'sources': [0, 5, 11, 3], - 'destinations': [8, 2, 3, 5]}, + 'majors': [0, 5, 11, 3], + 'minors': [8, 2, 3, 5]}, 'edge_type': [1, 3, 5, 14] }), { @@ -865,24 +865,22 @@ def _get_renumbered_edge_groups_from_sample( index=cupy.asarray(id_table), ).sort_index() - # Renumber the sources using binary search + # Renumber the majors using binary search # Step 1: get the index of the new id ix_r = torch.searchsorted( torch.as_tensor(id_map.index.values, device="cuda"), - torch.as_tensor(sampling_results.sources.values, device="cuda"), + torch.as_tensor(sampling_results.majors.values, device="cuda"), ) # Step 2: Go from id indices to actual ids row_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[ ix_r ] - # Renumber the destinations using binary search + # Renumber the minors using binary search # Step 1: get the index of the new id ix_c = torch.searchsorted( torch.as_tensor(id_map.index.values, device="cuda"), - torch.as_tensor( - sampling_results.destinations.values, device="cuda" - ), + torch.as_tensor(sampling_results.minors.values, device="cuda"), ) # Step 2: Go from id indices to actual ids col_dict[t_pyg_type] = torch.as_tensor(id_map.values, device="cuda")[ @@ -897,7 +895,7 @@ def _get_renumbered_edge_groups_from_sample( "new_id": cupy.arange(dst_id_table.shape[0]), } ).set_index("dst") - dst = dst_id_map["new_id"].loc[sampling_results.destinations] + dst = dst_id_map["new_id"].loc[sampling_results.minors] col_dict[t_pyg_type] = torch.as_tensor(dst.values, device="cuda") src_id_table = noi_index[src_type] @@ -907,7 +905,7 @@ def _get_renumbered_edge_groups_from_sample( "new_id": cupy.arange(src_id_table.shape[0]), } ).set_index("src") - src = src_id_map["new_id"].loc[sampling_results.sources] + src = src_id_map["new_id"].loc[sampling_results.majors] row_dict[t_pyg_type] = torch.as_tensor(src.values, device="cuda") else: @@ -929,12 +927,12 @@ def _get_renumbered_edge_groups_from_sample( else: # CSC dst_type, _, src_type = pyg_can_edge_type - # Get the de-offsetted destinations + # Get the de-offsetted minors dst_num_type = self._numeric_vertex_type_from_name(dst_type) - destinations = torch.as_tensor( - sampling_results.destinations.iloc[ix].values, device="cuda" + minors = torch.as_tensor( + sampling_results.minors.iloc[ix].values, device="cuda" ) - destinations -= self.__vertex_type_offsets["start"][dst_num_type] + minors -= self.__vertex_type_offsets["start"][dst_num_type] # Create the col entry for this type dst_id_table = noi_index[dst_type] @@ -944,15 +942,15 @@ def _get_renumbered_edge_groups_from_sample( .rename(columns={"index": "new_id"}) .set_index("dst") ) - dst = dst_id_map["new_id"].loc[cupy.asarray(destinations)] + dst = dst_id_map["new_id"].loc[cupy.asarray(minors)] col_dict[pyg_can_edge_type] = torch.as_tensor(dst.values, device="cuda") - # Get the de-offsetted sources + # Get the de-offsetted majors src_num_type = self._numeric_vertex_type_from_name(src_type) - sources = torch.as_tensor( - sampling_results.sources.iloc[ix].values, device="cuda" + majors = torch.as_tensor( + sampling_results.majors.iloc[ix].values, device="cuda" ) - sources -= self.__vertex_type_offsets["start"][src_num_type] + majors -= self.__vertex_type_offsets["start"][src_num_type] # Create the row entry for this type src_id_table = noi_index[src_type] @@ -962,7 +960,7 @@ def _get_renumbered_edge_groups_from_sample( .rename(columns={"index": "new_id"}) .set_index("src") ) - src = src_id_map["new_id"].loc[cupy.asarray(sources)] + src = 
src_id_map["new_id"].loc[cupy.asarray(majors)] row_dict[pyg_can_edge_type] = torch.as_tensor(src.values, device="cuda") return row_dict, col_dict diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py index cf7eb330d67..8552e7412e0 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py @@ -25,7 +25,9 @@ from cugraph_pyg.data import CuGraphStore from cugraph_pyg.sampler.cugraph_sampler import ( _sampler_output_from_sampling_results_heterogeneous, - _sampler_output_from_sampling_results_homogeneous, + _sampler_output_from_sampling_results_homogeneous_csr, + _sampler_output_from_sampling_results_homogeneous_coo, + filter_cugraph_store_csc, ) from typing import Union, Tuple, Sequence, List, Dict @@ -58,6 +60,7 @@ def __init__( # Sampler args num_neighbors: Union[List[int], Dict[Tuple[str, str, str], List[int]]] = None, replace: bool = True, + compression: str = "COO", # Other kwargs for the BulkSampler **kwargs, ): @@ -128,6 +131,10 @@ def __init__( self.__batches_per_partition = batches_per_partition self.__starting_batch_id = starting_batch_id + self._total_read_time = 0.0 + self._total_convert_time = 0.0 + self._total_feature_time = 0.0 + if input_nodes is None: # Will be loading from disk self.__num_batches = input_nodes @@ -174,6 +181,10 @@ def __init__( with_replacement=replace, batches_per_partition=self.__batches_per_partition, renumber=renumber, + use_legacy_names=False, + deduplicate_sources=True, + prior_sources_behavior="exclude", + include_hop_column=(compression == "COO"), **kwargs, ) @@ -211,6 +222,10 @@ def __init__( self.__input_files = iter(os.listdir(self.__directory.name)) def __next__(self): + from time import perf_counter + + start_time_read_data = perf_counter() + # Load the next set of sampling results if necessary if self.__next_batch >= self.__end_exclusive: if self.__directory is None: @@ -245,51 +260,98 @@ def __next__(self): fname, ) - columns = { - "sources": "int64", - "destinations": "int64", - # 'edge_id':'int64', - "edge_type": "int32", - "batch_id": "int32", - "hop_id": "int32", - } - raw_sample_data = cudf.read_parquet(parquet_path) + if "map" in raw_sample_data.columns: - num_batches = end_inclusive - self.__start_inclusive + 1 + if "renumber_map_offsets" not in raw_sample_data.columns: + num_batches = end_inclusive - self.__start_inclusive + 1 - map_end = raw_sample_data["map"].iloc[num_batches] + map_end = raw_sample_data["map"].iloc[num_batches] - map = torch.as_tensor( - raw_sample_data["map"].iloc[0:map_end], device="cuda" - ) - raw_sample_data.drop("map", axis=1, inplace=True) + map = torch.as_tensor( + raw_sample_data["map"].iloc[0:map_end], device="cuda" + ) + raw_sample_data.drop("map", axis=1, inplace=True) - self.__renumber_map_offsets = map[0 : num_batches + 1] - map[0] - self.__renumber_map = map[num_batches + 1 :] + self.__renumber_map_offsets = map[0 : num_batches + 1] - map[0] + self.__renumber_map = map[num_batches + 1 :] + else: + self.__renumber_map = raw_sample_data["map"] + self.__renumber_map_offsets = raw_sample_data[ + "renumber_map_offsets" + ] + raw_sample_data.drop( + columns=["map", "renumber_map_offsets"], inplace=True + ) + + self.__renumber_map.dropna(inplace=True) + self.__renumber_map = torch.as_tensor( + self.__renumber_map, device="cuda" + ) + + self.__renumber_map_offsets.dropna(inplace=True) + self.__renumber_map_offsets = torch.as_tensor( + 
self.__renumber_map_offsets, device="cuda" + ) else: self.__renumber_map = None - self.__data = raw_sample_data[list(columns.keys())].astype(columns) - self.__data.dropna(inplace=True) + self.__data = raw_sample_data + self.__coo = "majors" in self.__data.columns + if self.__coo: + self.__data.dropna(inplace=True) if ( len(self.__graph_store.edge_types) == 1 and len(self.__graph_store.node_types) == 1 ): - group_cols = ["batch_id", "hop_id"] - self.__data_index = self.__data.groupby(group_cols, as_index=True).agg( - {"sources": "max", "destinations": "max"} - ) - self.__data_index.rename( - columns={"sources": "src_max", "destinations": "dst_max"}, - inplace=True, - ) - self.__data_index = self.__data_index.to_dict(orient="index") + if self.__coo: + group_cols = ["batch_id", "hop_id"] + self.__data_index = self.__data.groupby( + group_cols, as_index=True + ).agg({"majors": "max", "minors": "max"}) + self.__data_index.rename( + columns={"majors": "src_max", "minors": "dst_max"}, + inplace=True, + ) + self.__data_index = self.__data_index.to_dict(orient="index") + else: + self.__data_index = None + + self.__label_hop_offsets = self.__data["label_hop_offsets"] + self.__data.drop(columns=["label_hop_offsets"], inplace=True) + self.__label_hop_offsets.dropna(inplace=True) + self.__label_hop_offsets = torch.as_tensor( + self.__label_hop_offsets, device="cuda" + ) + self.__label_hop_offsets -= self.__label_hop_offsets[0].clone() + + self.__major_offsets = self.__data["major_offsets"] + self.__data.drop(columns="major_offsets", inplace=True) + self.__major_offsets.dropna(inplace=True) + self.__major_offsets = torch.as_tensor( + self.__major_offsets, device="cuda" + ) + self.__major_offsets -= self.__major_offsets[0].clone() + + self.__minors = self.__data["minors"] + self.__data.drop(columns="minors", inplace=True) + self.__minors.dropna(inplace=True) + self.__minors = torch.as_tensor(self.__minors, device="cuda") + + num_batches = self.__end_exclusive - self.__start_inclusive + offsets_len = len(self.__label_hop_offsets) - 1 + if offsets_len % num_batches != 0: + raise ValueError("invalid label-hop offsets") + self.__fanout_length = int(offsets_len / num_batches) + + end_time_read_data = perf_counter() + self._total_read_time += end_time_read_data - start_time_read_data # Pull the next set of sampling results out of the dataframe in memory - f = self.__data["batch_id"] == self.__next_batch + if self.__coo: + f = self.__data["batch_id"] == self.__next_batch if self.__renumber_map is not None: i = self.__next_batch - self.__start_inclusive @@ -301,18 +363,43 @@ def __next__(self): else: current_renumber_map = None + start_time_convert = perf_counter() # Get and return the sampled subgraph if ( len(self.__graph_store.edge_types) == 1 and len(self.__graph_store.node_types) == 1 ): - sampler_output = _sampler_output_from_sampling_results_homogeneous( - self.__data[f], - current_renumber_map, - self.__graph_store, - self.__data_index, - self.__next_batch, - ) + if self.__coo: + sampler_output = _sampler_output_from_sampling_results_homogeneous_coo( + self.__data[f], + current_renumber_map, + self.__graph_store, + self.__data_index, + self.__next_batch, + ) + else: + i = (self.__next_batch - self.__start_inclusive) * self.__fanout_length + current_label_hop_offsets = self.__label_hop_offsets[ + i : i + self.__fanout_length + 1 + ] + + current_major_offsets = self.__major_offsets[ + current_label_hop_offsets[0] : (current_label_hop_offsets[-1] + 1) + ] + + current_minors = self.__minors[ + 
current_major_offsets[0] : current_major_offsets[-1] + ] + + sampler_output = _sampler_output_from_sampling_results_homogeneous_csr( + current_major_offsets, + current_minors, + current_renumber_map, + self.__graph_store, + current_label_hop_offsets, + self.__data_index, + self.__next_batch, + ) else: sampler_output = _sampler_output_from_sampling_results_heterogeneous( self.__data[f], current_renumber_map, self.__graph_store @@ -321,18 +408,35 @@ def __next__(self): # Get ready for next iteration self.__next_batch += 1 + end_time_convert = perf_counter() + self._total_convert_time += end_time_convert - start_time_convert + + start_time_feature = perf_counter() # Create a PyG HeteroData object, loading the required features - out = torch_geometric.loader.utils.filter_custom_store( - self.__feature_store, - self.__graph_store, - sampler_output.node, - sampler_output.row, - sampler_output.col, - sampler_output.edge, - ) + if self.__coo: + out = torch_geometric.loader.utils.filter_custom_store( + self.__feature_store, + self.__graph_store, + sampler_output.node, + sampler_output.row, + sampler_output.col, + sampler_output.edge, + ) + else: + if self.__graph_store.order == "CSR": + raise ValueError("CSR format incompatible with CSC output") + + out = filter_cugraph_store_csc( + self.__feature_store, + self.__graph_store, + sampler_output.node, + sampler_output.row, + sampler_output.col, + sampler_output.edge, + ) # Account for CSR format in cuGraph vs. CSC format in PyG - if self.__graph_store.order == "CSC": + if self.__coo and self.__graph_store.order == "CSC": for node_type in out.edge_index_dict: out[node_type].edge_index[0], out[node_type].edge_index[1] = ( out[node_type].edge_index[1], @@ -342,6 +446,9 @@ def __next__(self): out.set_value_dict("num_sampled_nodes", sampler_output.num_sampled_nodes) out.set_value_dict("num_sampled_edges", sampler_output.num_sampled_edges) + end_time_feature = perf_counter() + self._total_feature_time = end_time_feature - start_time_feature + return out @property diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py index 6e8c4322418..300ca9beb5a 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py @@ -53,10 +53,10 @@ def _get_unique_nodes( The unique nodes of the given node type. 
""" if node_position == "src": - edge_index = "sources" + edge_index = "majors" edge_sel = 0 elif node_position == "dst": - edge_index = "destinations" + edge_index = "minors" edge_sel = -1 else: raise ValueError(f"Illegal value {node_position} for node_position") @@ -78,7 +78,7 @@ def _get_unique_nodes( return sampling_results_node[edge_index] -def _sampler_output_from_sampling_results_homogeneous( +def _sampler_output_from_sampling_results_homogeneous_coo( sampling_results: cudf.DataFrame, renumber_map: torch.Tensor, graph_store: CuGraphStore, @@ -133,11 +133,11 @@ def _sampler_output_from_sampling_results_homogeneous( noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")} row_dict = { - edge_type: torch.as_tensor(sampling_results.sources, device="cuda"), + edge_type: torch.as_tensor(sampling_results.majors, device="cuda"), } col_dict = { - edge_type: torch.as_tensor(sampling_results.destinations, device="cuda"), + edge_type: torch.as_tensor(sampling_results.minors, device="cuda"), } num_nodes_per_hop_dict[node_type][0] = data_index[batch_id, 0]["src_max"] + 1 @@ -177,6 +177,88 @@ def _sampler_output_from_sampling_results_homogeneous( ) +def _sampler_output_from_sampling_results_homogeneous_csr( + major_offsets: torch.Tensor, + minors: torch.Tensor, + renumber_map: torch.Tensor, + graph_store: CuGraphStore, + label_hop_offsets: torch.Tensor, + batch_id: int, + metadata: Sequence = None, +) -> HeteroSamplerOutput: + """ + Parameters + ---------- + major_offsets: torch.Tensor + The major offsets for the CSC/CSR matrix ("row pointer") + minors: torch.Tensor + The minors for the CSC/CSR matrix ("col index") + renumber_map: torch.Tensor + The tensor containing the renumber map. + Required. + graph_store: CuGraphStore + The graph store containing the structure of the sampled graph. + label_hop_offsets: torch.Tensor + The tensor containing the label-hop offsets. + batch_id: int + The current batch id, whose samples are being retrieved + from the sampling results and data index. + metadata: Tensor + The metadata for the sampled batch. 
+ + Returns + ------- + HeteroSamplerOutput + """ + + if len(graph_store.edge_types) > 1 or len(graph_store.node_types) > 1: + raise ValueError("Graph is heterogeneous") + + if renumber_map is None: + raise ValueError("Renumbered input is expected for homogeneous graphs") + + node_type = graph_store.node_types[0] + edge_type = graph_store.edge_types[0] + + major_offsets = major_offsets.clone() - major_offsets[0] + label_hop_offsets = label_hop_offsets.clone() - label_hop_offsets[0] + + num_edges_per_hop_dict = {edge_type: major_offsets[label_hop_offsets].diff().cpu()} + + label_hop_offsets = label_hop_offsets.cpu() + num_nodes_per_hop_dict = { + node_type: torch.concat( + [ + label_hop_offsets.diff(), + (renumber_map.shape[0] - label_hop_offsets[-1]).reshape((1,)), + ] + ).cpu() + } + + noi_index = {node_type: torch.as_tensor(renumber_map, device="cuda")} + + col_dict = { + edge_type: major_offsets, + } + + row_dict = { + edge_type: minors, + } + + if HeteroSamplerOutput is None: + raise ImportError("Error importing from pyg") + + return HeteroSamplerOutput( + node=noi_index, + row=row_dict, + col=col_dict, + edge=None, + num_sampled_nodes=num_nodes_per_hop_dict, + num_sampled_edges=num_edges_per_hop_dict, + metadata=metadata, + ) + + def _sampler_output_from_sampling_results_heterogeneous( sampling_results: cudf.DataFrame, renumber_map: cudf.Series, @@ -244,8 +326,8 @@ def _sampler_output_from_sampling_results_heterogeneous( cudf.Series( torch.concat( [ - torch.as_tensor(sampling_results_hop_0.sources, device="cuda"), - torch.as_tensor(sampling_results.destinations, device="cuda"), + torch.as_tensor(sampling_results_hop_0.majors, device="cuda"), + torch.as_tensor(sampling_results.minors, device="cuda"), ] ), name="nodes_of_interest", @@ -320,3 +402,37 @@ def _sampler_output_from_sampling_results_heterogeneous( num_sampled_edges=num_edges_per_hop_dict, metadata=metadata, ) + + +def filter_cugraph_store_csc( + feature_store: torch_geometric.data.FeatureStore, + graph_store: torch_geometric.data.GraphStore, + node_dict: Dict[str, torch.Tensor], + row_dict: Dict[str, torch.Tensor], + col_dict: Dict[str, torch.Tensor], + edge_dict: Dict[str, Tuple[torch.Tensor]], +) -> torch_geometric.data.HeteroData: + data = torch_geometric.data.HeteroData() + + for attr in graph_store.get_all_edge_attrs(): + key = attr.edge_type + if key in row_dict and key in col_dict: + data.put_edge_index( + (row_dict[key], col_dict[key]), + edge_type=key, + layout="csc", + is_sorted=True, + ) + + required_attrs = [] + for attr in feature_store.get_all_tensor_attrs(): + if attr.group_name in node_dict: + attr.index = node_dict[attr.group_name] + required_attrs.append(attr) + data[attr.group_name].num_nodes = attr.index.size(0) + + tensors = feature_store.multi_get_tensor(required_attrs) + for i, attr in enumerate(required_attrs): + data[attr.group_name][attr.attr_name] = tensors[i] + + return data diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py index a1a72a44d0c..80a2d0a6c79 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_sampler.py @@ -53,9 +53,10 @@ def test_neighbor_sample(dask_client, basic_graph_1): random_state=62, return_offsets=False, return_hops=True, + use_legacy_names=False, ) .compute() - .sort_values(by=["sources", "destinations"]) + .sort_values(by=["majors", "minors"]) ) out = 
_sampler_output_from_sampling_results_heterogeneous( @@ -116,8 +117,9 @@ def test_neighbor_sample_multi_vertex(dask_client, multi_edge_multi_vertex_graph random_state=62, return_offsets=False, with_batch_ids=True, + use_legacy_names=False, ) - .sort_values(by=["sources", "destinations"]) + .sort_values(by=["majors", "minors"]) .compute() ) @@ -193,8 +195,8 @@ def test_neighbor_sample_mock_sampling_results(dask_client): # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( { - "sources": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), - "destinations": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), + "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), + "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"), "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"), } diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index 43b1e5da5a0..ed7f70034e2 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -220,8 +220,8 @@ def test_renumber_edges(abc_graph, dask_client): # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( { - "sources": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), - "destinations": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), + "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), + "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"), "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"), } diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index 48a21cb7fd6..03274948158 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -22,6 +22,8 @@ from cugraph_pyg.loader import CuGraphNeighborLoader from cugraph_pyg.loader import BulkSampleLoader from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv + from cugraph.gnn import FeatureStore from cugraph.utilities.utils import import_optional, MissingModule @@ -98,8 +100,8 @@ def test_cugraph_loader_from_disk(): bogus_samples = cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6, 6], - "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "majors": [0, 1, 2, 3, 4, 5, 6, 6], + "minors": [5, 4, 3, 2, 2, 6, 5, 2], "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), @@ -130,12 +132,10 @@ def test_cugraph_loader_from_disk(): assert list(edge_index.shape) == [2, 8] assert ( - edge_index[0].tolist() - == bogus_samples.sources.dropna().values_host.tolist() + edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist() ) assert ( - edge_index[1].tolist() - == bogus_samples.destinations.dropna().values_host.tolist() + edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist() ) assert num_samples == 256 @@ -157,8 +157,8 @@ def test_cugraph_loader_from_disk_subset(): bogus_samples = cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6, 
6], - "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "majors": [0, 1, 2, 3, 4, 5, 6, 6], + "minors": [5, 4, 3, 2, 2, 6, 5, 2], "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), @@ -190,13 +190,77 @@ def test_cugraph_loader_from_disk_subset(): assert list(edge_index.shape) == [2, 8] assert ( - edge_index[0].tolist() - == bogus_samples.sources.dropna().values_host.tolist() + edge_index[0].tolist() == bogus_samples.majors.dropna().values_host.tolist() + ) + assert ( + edge_index[1].tolist() == bogus_samples.minors.dropna().values_host.tolist() ) + + assert num_samples == 100 + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +def test_cugraph_loader_from_disk_subset_csr(): + m = [2, 9, 99, 82, 11, 13] + n = torch.arange(1, 1 + len(m), dtype=torch.int32) + x = torch.zeros(256, dtype=torch.int32) + x[torch.tensor(m, dtype=torch.int32)] = n + F = FeatureStore() + F.add_data(x, "t0", "x") + + G = {("t0", "knows", "t0"): 9080} + N = {"t0": 256} + + cugraph_store = CuGraphStore(F, G, N) + + bogus_samples = cudf.DataFrame( + { + "major_offsets": [0, 3, 5, 7, 8, None, None, None], + "minors": [1, 2, 3, 0, 3, 4, 5, 1], + "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), + "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], + "label_hop_offsets": cudf.Series( + [0, 1, 4, None, None, None, None, None], dtype="int32" + ), + "renumber_map_offsets": cudf.Series([0, 6], dtype="int32"), + } + ) + map = cudf.Series(m, name="map") + bogus_samples["map"] = map + + tempdir = tempfile.TemporaryDirectory() + for s in range(256): + # offset the offsets + bogus_samples["batch_id"] = cupy.int32(s) + bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet")) + + loader = BulkSampleLoader( + feature_store=cugraph_store, + graph_store=cugraph_store, + directory=tempdir, + input_files=list(os.listdir(tempdir.name))[100:200], + ) + + num_samples = 0 + for sample in loader: + num_samples += 1 + assert sample["t0"]["num_nodes"] == 6 + + assert sample["t0"]["x"].tolist() == [1, 2, 3, 4, 5, 6] + + edge_index = sample[("t0", "knows", "t0")]["adj_t"] + assert edge_index.size(0) == 4 + assert edge_index.size(1) == 6 + + colptr, row, _ = edge_index.csr() + assert ( - edge_index[1].tolist() - == bogus_samples.destinations.dropna().values_host.tolist() + colptr.tolist() == bogus_samples.major_offsets.dropna().values_host.tolist() ) + assert row.tolist() == bogus_samples.minors.dropna().values_host.tolist() + + assert sample["t0"]["num_sampled_nodes"].tolist() == [1, 3, 2] + assert sample["t0", "knows", "t0"]["num_sampled_edges"].tolist() == [3, 5] assert num_samples == 100 @@ -215,8 +279,8 @@ def test_cugraph_loader_e2e_coo(): bogus_samples = cudf.DataFrame( { - "sources": [0, 1, 2, 3, 4, 5, 6, 6], - "destinations": [5, 4, 3, 2, 2, 6, 5, 2], + "majors": [0, 1, 2, 3, 4, 5, 6, 6], + "minors": [5, 4, 3, 2, 2, 6, 5, 2], "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 2], dtype="int32"), @@ -253,8 +317,6 @@ def test_cugraph_loader_e2e_coo(): num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"] num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"] - print(num_sampled_nodes, num_sampled_edges) - for i in range(len(convs)): x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) @@ -263,9 +325,111 @@ def 
test_cugraph_loader_e2e_coo(): x = convs[i](x, ei, size=(s, s)) x = relu(x) x = dropout(x, p=0.5) - print(x.shape) - print(x.shape) x = x.narrow(dim=0, start=0, length=x.shape[0] - num_sampled_nodes[1]) assert list(x.shape) == [3, 1] + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.parametrize("framework", ["pyg", "cugraph-ops"]) +def test_cugraph_loader_e2e_csc(framework): + m = [2, 9, 99, 82, 9, 3, 18, 1, 12] + x = torch.randint(3000, (256, 256)).to(torch.float32) + F = FeatureStore() + F.add_data(x, "t0", "x") + + G = {("t0", "knows", "t0"): 9999} + N = {"t0": 256} + + cugraph_store = CuGraphStore(F, G, N) + + bogus_samples = cudf.DataFrame( + { + "major_offsets": [0, 3, 5, 7, 8, None, None, None], + "minors": [1, 2, 3, 0, 3, 4, 5, 1], + "edge_type": cudf.Series([0, 0, 0, 0, 0, 0, 0, 0], dtype="int32"), + "edge_id": [5, 10, 15, 20, 25, 30, 35, 40], + "label_hop_offsets": cudf.Series( + [0, 1, 4, None, None, None, None, None], dtype="int32" + ), + "renumber_map_offsets": cudf.Series([0, 6], dtype="int32"), + } + ) + map = cudf.Series(m, name="map") + bogus_samples = bogus_samples.join(map, how="outer").sort_index() + + tempdir = tempfile.TemporaryDirectory() + for s in range(256): + bogus_samples["batch_id"] = cupy.int32(s) + bogus_samples.to_parquet(os.path.join(tempdir.name, f"batch={s}-{s}.parquet")) + + loader = BulkSampleLoader( + feature_store=cugraph_store, + graph_store=cugraph_store, + directory=tempdir, + input_files=list(os.listdir(tempdir.name))[100:200], + ) + + if framework == "pyg": + convs = [ + torch_geometric.nn.SAGEConv(256, 64, aggr="mean").cuda(), + torch_geometric.nn.SAGEConv(64, 1, aggr="mean").cuda(), + ] + else: + convs = [ + CuGraphSAGEConv(256, 64, aggr="mean").cuda(), + CuGraphSAGEConv(64, 1, aggr="mean").cuda(), + ] + + trim = trim_to_layer.TrimToLayer() + relu = torch.nn.functional.relu + dropout = torch.nn.functional.dropout + + for hetero_data in loader: + x = hetero_data["t0"]["x"].cuda() + + if framework == "pyg": + ei = hetero_data["t0", "knows", "t0"]["adj_t"].coo() + ei = torch.stack((ei[0], ei[1])) + else: + ei = hetero_data["t0", "knows", "t0"]["adj_t"].csr() + ei = [ei[1], ei[0], x.shape[0]] + + num_sampled_nodes = hetero_data["t0"]["num_sampled_nodes"] + num_sampled_edges = hetero_data["t0", "knows", "t0"]["num_sampled_edges"] + + s = x.shape[0] + for i in range(len(convs)): + if framework == "pyg": + x, ei, _ = trim(i, num_sampled_nodes, num_sampled_edges, x, ei, None) + else: + if i > 0: + x = x.narrow( + dim=0, + start=0, + length=s - num_sampled_nodes[-i], + ) + + ei[0] = ei[0].narrow( + dim=0, + start=0, + length=ei[0].size(0) - num_sampled_edges[-i], + ) + ei[1] = ei[1].narrow( + dim=0, start=0, length=ei[1].size(0) - num_sampled_nodes[-i] + ) + ei[2] = x.size(0) + + s = x.shape[0] + + if framework == "pyg": + x = convs[i](x, ei, size=(s, s)) + else: + x = convs[i](x, ei) + x = relu(x) + x = dropout(x, p=0.5) + + x = x.narrow(dim=0, start=0, length=s - num_sampled_nodes[1]) + + assert list(x.shape) == [1, 1] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py index 84f62e80c9d..e703d477b70 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_sampler.py @@ -49,7 +49,8 @@ def test_neighbor_sample(basic_graph_1): with_batch_ids=True, random_state=62, return_offsets=False, - ).sort_values(by=["sources", "destinations"]) + 
use_legacy_names=False, + ).sort_values(by=["majors", "minors"]) out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, @@ -107,7 +108,8 @@ def test_neighbor_sample_multi_vertex(multi_edge_multi_vertex_graph_1): random_state=62, return_offsets=False, with_batch_ids=True, - ).sort_values(by=["sources", "destinations"]) + use_legacy_names=False, + ).sort_values(by=["majors", "minors"]) out = _sampler_output_from_sampling_results_heterogeneous( sampling_results=sampling_results, @@ -154,8 +156,8 @@ def test_neighbor_sample_mock_sampling_results(abc_graph): # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( { - "sources": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), - "destinations": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), + "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), + "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"), "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"), } diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py index e815b813050..da3043760d4 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py @@ -204,8 +204,8 @@ def test_renumber_edges(abc_graph): # let 0, 1 be the start vertices, fanout = [2, 1, 2, 3] mock_sampling_results = cudf.DataFrame( { - "sources": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), - "destinations": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), + "majors": cudf.Series([0, 0, 1, 2, 3, 3, 1, 3, 3, 3], dtype="int64"), + "minors": cudf.Series([2, 3, 3, 8, 1, 7, 3, 1, 5, 7], dtype="int64"), "hop_id": cudf.Series([0, 0, 0, 1, 1, 1, 2, 3, 3, 3], dtype="int32"), "edge_type": cudf.Series([0, 0, 0, 2, 1, 2, 0, 1, 2, 2], dtype="int32"), } From 061ada449b7b11fd39789aca59a65a663f631a3e Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:41:24 -0500 Subject: [PATCH 69/72] Increase dask-related timeouts for CI testing (#3907) This PR increases the minimum timeout when waiting for the workers to complete their tasks. 
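As a rough aside (not part of this change), the same two Distributed timeouts that the CI scripts below export as environment variables could be applied in-process via `dask.config`; only those two settings are shown, and the config key names are an assumption based on Dask's usual `DASK_*` environment-variable mapping.

```python
import dask

# Assumed in-process equivalent of the env vars added below:
#   DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL   -> distributed.scheduler.worker-ttl
#   DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT -> distributed.comm.timeouts.connect
dask.config.set(
    {
        "distributed.scheduler.worker-ttl": "1000s",
        "distributed.comm.timeouts.connect": "1000s",
    }
)
```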
Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Brad Rees (https://github.com/BradReesWork) - Vibhu Jawa (https://github.com/VibhuJawa) - Rick Ratzel (https://github.com/rlratzel) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cugraph/pull/3907 --- ci/test_python.sh | 5 ++++- ci/test_wheel.sh | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 825d5b242d5..df0f34377a3 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -65,7 +65,10 @@ popd rapids-logger "pytest cugraph" pushd python/cugraph/cugraph -export DASK_WORKER_DEVICES="0" +DASK_WORKER_DEVICES="0" \ +DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL="1000s" \ +DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="1000s" \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ pytest \ -v \ --benchmark-disable \ diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 3ac3549f143..d6ec67cd9e9 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -21,5 +21,9 @@ arch=$(uname -m) if [[ "${arch}" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then python ./ci/wheel_smoke_test_${package_name}.py else - RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets python -m pytest ./python/${package_name}/${python_package_name}/tests + RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets \ + DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL="1000s" \ + DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="1000s" \ + DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \ + python -m pytest ./python/${package_name}/${python_package_name}/tests fi From d03cb0fd33e072756f6d87fbe009b561ff48bafb Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 5 Oct 2023 09:31:01 -0700 Subject: [PATCH 70/72] Remove `dask_cudf` dataframe for the `_make_plc_graph` while creating `cugraph.Graph` (#3895) This PR attempts to fix https://github.com/rapidsai/cugraph/issues/3790 Please note that I have not being able to cause failure locally so it is really hard for me to know if it actually fixes anything or not . MRE being used to test locally: https://gist.github.com/VibhuJawa/4b1ec24022b6e2dd7879cd2e8d3fab67 CC: @jnke2016 , @rlratzel , CC: @rjzamora , Please let me know what i can do better here. 
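For context, a minimal usage sketch (not part of this patch) of how the two helpers touched below are meant to be combined: partitions are persisted per worker as a dict of futures, and the edge count comes from per-partition lengths rather than from a materialized `dask_cudf` DataFrame. The surrounding function and variable names are placeholders.

```python
from cugraph.dask.common.part_utils import (
    get_length_of_parts,
    persist_dask_df_equal_parts_per_worker,
)

def persist_and_count_edges(ddf, client):
    # Persist each partition on a specific worker, keeping {worker: [futures]}
    # instead of rebuilding a dask_cudf.DataFrame from the persisted pieces.
    persisted_keys_d = persist_dask_df_equal_parts_per_worker(
        ddf, client, return_type="dict"
    )
    # Fetch per-partition lengths from the workers and sum them.
    length_of_parts = get_length_of_parts(persisted_keys_d, client)
    num_edges = sum(
        length for lengths in length_of_parts.values() for length in lengths
    )
    return persisted_keys_d, num_edges
```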
Authors: - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/3895 --- .../cugraph/cugraph/dask/common/part_utils.py | 62 ++++++++++++++++--- .../simpleDistributedGraph.py | 24 ++++--- 2 files changed, 71 insertions(+), 15 deletions(-) diff --git a/python/cugraph/cugraph/dask/common/part_utils.py b/python/cugraph/cugraph/dask/common/part_utils.py index 7c0aad6c3ee..25311902b29 100644 --- a/python/cugraph/cugraph/dask/common/part_utils.py +++ b/python/cugraph/cugraph/dask/common/part_utils.py @@ -99,19 +99,65 @@ def _chunk_lst(ls, num_parts): return [ls[i::num_parts] for i in range(num_parts)] -def persist_dask_df_equal_parts_per_worker(dask_df, client): +def persist_dask_df_equal_parts_per_worker( + dask_df, client, return_type="dask_cudf.DataFrame" +): + """ + Persist dask_df with equal parts per worker + Args: + dask_df: dask_cudf.DataFrame + client: dask.distributed.Client + return_type: str, "dask_cudf.DataFrame" or "dict" + Returns: + persisted_keys: dict of {worker: [persisted_keys]} + """ + if return_type not in ["dask_cudf.DataFrame", "dict"]: + raise ValueError("return_type must be either 'dask_cudf.DataFrame' or 'dict'") + ddf_keys = dask_df.to_delayed() workers = client.scheduler_info()["workers"].keys() ddf_keys_ls = _chunk_lst(ddf_keys, len(workers)) - persisted_keys = [] + persisted_keys_d = {} for w, ddf_k in zip(workers, ddf_keys_ls): - persisted_keys.extend( - client.persist(ddf_k, workers=w, allow_other_workers=False) + persisted_keys_d[w] = client.compute( + ddf_k, workers=w, allow_other_workers=False, pure=False ) - dask_df = dask_cudf.from_delayed(persisted_keys, meta=dask_df._meta).persist() - wait(dask_df) - client.rebalance(dask_df) - return dask_df + + persisted_keys_ls = [ + item for sublist in persisted_keys_d.values() for item in sublist + ] + wait(persisted_keys_ls) + if return_type == "dask_cudf.DataFrame": + dask_df = dask_cudf.from_delayed( + persisted_keys_ls, meta=dask_df._meta + ).persist() + wait(dask_df) + return dask_df + + return persisted_keys_d + + +def get_length_of_parts(persisted_keys_d, client): + """ + Get the length of each partition + Args: + persisted_keys_d: dict of {worker: [persisted_keys]} + client: dask.distributed.Client + Returns: + length_of_parts: dict of {worker: [length_of_parts]} + """ + length_of_parts = {} + for w, p_keys in persisted_keys_d.items(): + length_of_parts[w] = [ + client.submit( + len, p_key, pure=False, workers=[w], allow_other_workers=False + ) + for p_key in p_keys + ] + + for w, len_futures in length_of_parts.items(): + length_of_parts[w] = client.gather(len_futures) + return length_of_parts async def _extract_partitions( diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index fa94fa67625..935d0c597d4 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -36,6 +36,7 @@ from cugraph.structure.symmetrize import symmetrize from cugraph.dask.common.part_utils import ( get_persisted_df_worker_map, + get_length_of_parts, persist_dask_df_equal_parts_per_worker, ) from cugraph.dask import get_n_workers @@ -318,9 +319,14 @@ def __from_edgelist( is_symmetric=not self.properties.directed, ) ddf = 
ddf.repartition(npartitions=len(workers) * 2) - ddf = persist_dask_df_equal_parts_per_worker(ddf, _client) - num_edges = len(ddf) - ddf = get_persisted_df_worker_map(ddf, _client) + persisted_keys_d = persist_dask_df_equal_parts_per_worker( + ddf, _client, return_type="dict" + ) + del ddf + length_of_parts = get_length_of_parts(persisted_keys_d, _client) + num_edges = sum( + [item for sublist in length_of_parts.values() for item in sublist] + ) delayed_tasks_d = { w: delayed(simpleDistributedGraphImpl._make_plc_graph)( Comms.get_session_id(), @@ -331,14 +337,16 @@ def __from_edgelist( store_transposed, num_edges, ) - for w, edata in ddf.items() + for w, edata in persisted_keys_d.items() } - # FIXME: For now, don't delete the copied dataframe to avoid crash self._plc_graph = { - w: _client.compute(delayed_task, workers=w, allow_other_workers=False) + w: _client.compute( + delayed_task, workers=w, allow_other_workers=False, pure=False + ) for w, delayed_task in delayed_tasks_d.items() } wait(list(self._plc_graph.values())) + del persisted_keys_d del delayed_tasks_d _client.run(gc.collect) @@ -1192,5 +1200,7 @@ def _get_column_from_ls_dfs(lst_df, col_name): if len_df == 0: return lst_df[0][col_name] output_col = cudf.concat([df[col_name] for df in lst_df], ignore_index=True) - # FIXME: For now, don't delete the copied dataframe to avoid cras + for df in lst_df: + df.drop(columns=[col_name], inplace=True) + gc.collect() return output_col From 40a5f8e1dd868b6a9faf9f70dfcd6a70fa78e461 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 6 Oct 2023 16:05:30 -0400 Subject: [PATCH 71/72] Add wget to test_notebook dependencies (#3918) Running certain notebooks requires downloading datasets via https://github.com/rapidsai/cugraph/blob/branch-23.10/notebooks/cugraph_benchmarks/dataPrep.sh which uses `wget`, so this PR ensures the `test_notebooks` environment created in `rapidsai/docker` will have `wget` installed. 
Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - Jake Awe (https://github.com/AyodeAwe) --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-120_arch-x86_64.yaml | 1 + dependencies.yaml | 3 +++ 3 files changed, 5 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 87179ef892e..dea52887f23 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -74,5 +74,6 @@ dependencies: - sphinxcontrib-websupport - ucx-proc=*=gpu - ucx-py==0.34.* +- wget - wheel name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index d54dc0abf51..2d55f73c5d1 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -73,5 +73,6 @@ dependencies: - sphinxcontrib-websupport - ucx-proc=*=gpu - ucx-py==0.34.* +- wget - wheel name: all_cuda-120_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 292fcf0baed..f330361ba88 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -463,6 +463,9 @@ dependencies: packages: - ipython - notebook>=0.5.0 + - output_types: [conda] + packages: + - wget test_python_common: common: - output_types: [conda, pyproject] From bbc8fed271deb973de01d227e81a0a2981ac2b38 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 11 Oct 2023 10:28:07 -0400 Subject: [PATCH 72/72] Update Changelog [skip ci] --- CHANGELOG.md | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4832bc2fb04..33a5b2bc5e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,97 @@ +# cuGraph 23.10.00 (11 Oct 2023) + +## 🚨 Breaking Changes + +- Rename `cugraph-nx` to `nx-cugraph` ([#3840](https://github.com/rapidsai/cugraph/pull/3840)) [@eriknw](https://github.com/eriknw) +- Remove legacy betweenness centrality ([#3829](https://github.com/rapidsai/cugraph/pull/3829)) [@jnke2016](https://github.com/jnke2016) +- Remove Deprecated Sampling Options ([#3816](https://github.com/rapidsai/cugraph/pull/3816)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- cuGraph-PyG Loader Improvements ([#3795](https://github.com/rapidsai/cugraph/pull/3795)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Expose threshold in louvain ([#3792](https://github.com/rapidsai/cugraph/pull/3792)) [@ChuckHastings](https://github.com/ChuckHastings) +- Fix ValueError Caused By Batches With No Samples ([#3789](https://github.com/rapidsai/cugraph/pull/3789)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Update to Cython 3.0.0 ([#3716](https://github.com/rapidsai/cugraph/pull/3716)) [@vyasr](https://github.com/vyasr) + +## 🐛 Bug Fixes + +- Add wget to test_notebook dependencies ([#3918](https://github.com/rapidsai/cugraph/pull/3918)) [@raydouglass](https://github.com/raydouglass) +- Increase dask-related timeouts for CI testing ([#3907](https://github.com/rapidsai/cugraph/pull/3907)) [@jnke2016](https://github.com/jnke2016) +- Remove `dask_cudf` dataframe for the `_make_plc_graph` while creating `cugraph.Graph` ([#3895](https://github.com/rapidsai/cugraph/pull/3895)) [@VibhuJawa](https://github.com/VibhuJawa) +- Adds logic to handle isolated vertices at python layer ([#3886](https://github.com/rapidsai/cugraph/pull/3886)) [@naimnv](https://github.com/naimnv) +- Update Allocator Selection in 
cuGraph-DGL Example ([#3877](https://github.com/rapidsai/cugraph/pull/3877)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Add file to update-version.sh ([#3870](https://github.com/rapidsai/cugraph/pull/3870)) [@raydouglass](https://github.com/raydouglass) +- Fix torch seed in `cugraph-dgl` and `-pyg` tests for conv layers ([#3869](https://github.com/rapidsai/cugraph/pull/3869)) [@tingyu66](https://github.com/tingyu66) +- MFG C++ code bug fix ([#3865](https://github.com/rapidsai/cugraph/pull/3865)) [@seunghwak](https://github.com/seunghwak) +- Fix subtle memory leak in nbr_intersection primitive ([#3858](https://github.com/rapidsai/cugraph/pull/3858)) [@ChuckHastings](https://github.com/ChuckHastings) +- Uses `conda mambabuild` rather than `mamba mambabuild` ([#3853](https://github.com/rapidsai/cugraph/pull/3853)) [@rlratzel](https://github.com/rlratzel) +- Remove the assumption made on the client data's keys ([#3835](https://github.com/rapidsai/cugraph/pull/3835)) [@jnke2016](https://github.com/jnke2016) +- Disable mg tests ([#3833](https://github.com/rapidsai/cugraph/pull/3833)) [@naimnv](https://github.com/naimnv) +- Refactor python code for similarity algos to use latest CAPI ([#3828](https://github.com/rapidsai/cugraph/pull/3828)) [@naimnv](https://github.com/naimnv) +- [BUG] Fix Batch Renumbering of Empty Batches ([#3823](https://github.com/rapidsai/cugraph/pull/3823)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Temporarily disable the deletion of the dask dataframe ([#3814](https://github.com/rapidsai/cugraph/pull/3814)) [@jnke2016](https://github.com/jnke2016) +- Fix OD shortest distance matrix computation test failures. ([#3813](https://github.com/rapidsai/cugraph/pull/3813)) [@seunghwak](https://github.com/seunghwak) +- Use rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10 for docs build ([#3811](https://github.com/rapidsai/cugraph/pull/3811)) [@naimnv](https://github.com/naimnv) +- Fix ValueError Caused By Batches With No Samples ([#3789](https://github.com/rapidsai/cugraph/pull/3789)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Update `python_run_cugraph` in `dependencies.yaml` ([#3781](https://github.com/rapidsai/cugraph/pull/3781)) [@nv-rliu](https://github.com/nv-rliu) +- Fixes `KeyError` for `get_two_hop_neighbors` when called with a small start vertices list ([#3778](https://github.com/rapidsai/cugraph/pull/3778)) [@rlratzel](https://github.com/rlratzel) + +## 📖 Documentation + +- Update the docstrings of the similarity algorithms ([#3817](https://github.com/rapidsai/cugraph/pull/3817)) [@jnke2016](https://github.com/jnke2016) + +## 🚀 New Features + +- WholeGraph Feature Store for cuGraph-PyG and cuGraph-DGL ([#3874](https://github.com/rapidsai/cugraph/pull/3874)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- similarity notebook to compare link prediction algos ([#3868](https://github.com/rapidsai/cugraph/pull/3868)) [@acostadon](https://github.com/acostadon) +- adding dining preference dataset ([#3866](https://github.com/rapidsai/cugraph/pull/3866)) [@acostadon](https://github.com/acostadon) +- Integrate C++ Renumbering and Compression ([#3841](https://github.com/rapidsai/cugraph/pull/3841)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Sampling post processing functions to accelerate MFG creation. 
([#3815](https://github.com/rapidsai/cugraph/pull/3815)) [@seunghwak](https://github.com/seunghwak) +- [REVIEW] Add Pure DGL Dataloading benchmark ([#3660](https://github.com/rapidsai/cugraph/pull/3660)) [@VibhuJawa](https://github.com/VibhuJawa) + +## 🛠️ Improvements + +- nx-cugraph: handle louvain with isolated nodes ([#3897](https://github.com/rapidsai/cugraph/pull/3897)) [@eriknw](https://github.com/eriknw) +- Pin `dask` and `distributed` for `23.10` release ([#3896](https://github.com/rapidsai/cugraph/pull/3896)) [@galipremsagar](https://github.com/galipremsagar) +- Updates the source build docs to include libcugraphops as a build prerequisite ([#3893](https://github.com/rapidsai/cugraph/pull/3893)) [@rlratzel](https://github.com/rlratzel) +- fixes force atlas to allow string as vertex names ([#3891](https://github.com/rapidsai/cugraph/pull/3891)) [@acostadon](https://github.com/acostadon) +- Integrate renumbering and compression to `cugraph-dgl` to accelerate MFG creation ([#3887](https://github.com/rapidsai/cugraph/pull/3887)) [@tingyu66](https://github.com/tingyu66) +- Enable weights for MG similarity algorithms ([#3879](https://github.com/rapidsai/cugraph/pull/3879)) [@jnke2016](https://github.com/jnke2016) +- cuGraph-PyG MFG Creation and Conversion ([#3873](https://github.com/rapidsai/cugraph/pull/3873)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Update image names ([#3867](https://github.com/rapidsai/cugraph/pull/3867)) [@AyodeAwe](https://github.com/AyodeAwe) +- Update to clang 16.0.6. ([#3859](https://github.com/rapidsai/cugraph/pull/3859)) [@bdice](https://github.com/bdice) +- Updates to build and test `nx-cugraph` wheel as part of CI and nightly workflows ([#3852](https://github.com/rapidsai/cugraph/pull/3852)) [@rlratzel](https://github.com/rlratzel) +- Update `cugraph-dgl` conv layers to use improved graph class ([#3849](https://github.com/rapidsai/cugraph/pull/3849)) [@tingyu66](https://github.com/tingyu66) +- Add entry point to tell NetworkX about nx-cugraph without importing it. 
([#3848](https://github.com/rapidsai/cugraph/pull/3848)) [@eriknw](https://github.com/eriknw) +- [IMP] Add ability to get batch size from the loader in cuGraph-PyG ([#3846](https://github.com/rapidsai/cugraph/pull/3846)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Refactor legacy k truss ([#3843](https://github.com/rapidsai/cugraph/pull/3843)) [@jnke2016](https://github.com/jnke2016) +- Use new `raft::compiled_static` targets ([#3842](https://github.com/rapidsai/cugraph/pull/3842)) [@divyegala](https://github.com/divyegala) +- Rename `cugraph-nx` to `nx-cugraph` ([#3840](https://github.com/rapidsai/cugraph/pull/3840)) [@eriknw](https://github.com/eriknw) +- Add cuGraph devcontainers ([#3838](https://github.com/rapidsai/cugraph/pull/3838)) [@trxcllnt](https://github.com/trxcllnt) +- Enable temporarily disabled MG tests ([#3837](https://github.com/rapidsai/cugraph/pull/3837)) [@naimnv](https://github.com/naimnv) +- Remove legacy betweenness centrality ([#3829](https://github.com/rapidsai/cugraph/pull/3829)) [@jnke2016](https://github.com/jnke2016) +- Use `copy-pr-bot` ([#3827](https://github.com/rapidsai/cugraph/pull/3827)) [@ajschmidt8](https://github.com/ajschmidt8) +- Update README.md ([#3826](https://github.com/rapidsai/cugraph/pull/3826)) [@lmeyerov](https://github.com/lmeyerov) +- Adding metadata getter methods to datasets API ([#3821](https://github.com/rapidsai/cugraph/pull/3821)) [@nv-rliu](https://github.com/nv-rliu) +- Unpin `dask` and `distributed` for `23.10` development ([#3818](https://github.com/rapidsai/cugraph/pull/3818)) [@galipremsagar](https://github.com/galipremsagar) +- Remove Deprecated Sampling Options ([#3816](https://github.com/rapidsai/cugraph/pull/3816)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- [REVIEW] Cugraph dgl block improvements ([#3810](https://github.com/rapidsai/cugraph/pull/3810)) [@VibhuJawa](https://github.com/VibhuJawa) +- Simplify wheel build scripts and allow alphas of RAPIDS dependencies ([#3809](https://github.com/rapidsai/cugraph/pull/3809)) [@vyasr](https://github.com/vyasr) +- Allow cugraph-nx to run networkx tests for nx versions 3.0, 3.1, and 3.2 ([#3808](https://github.com/rapidsai/cugraph/pull/3808)) [@eriknw](https://github.com/eriknw) +- Add `louvain_communities` to cugraph-nx ([#3803](https://github.com/rapidsai/cugraph/pull/3803)) [@eriknw](https://github.com/eriknw) +- Adds missing copyright and license text to __init__.py package files ([#3799](https://github.com/rapidsai/cugraph/pull/3799)) [@rlratzel](https://github.com/rlratzel) +- cuGraph-PyG Loader Improvements ([#3795](https://github.com/rapidsai/cugraph/pull/3795)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Adds updates to build wheel and conda packages for `cugraph-nx` ([#3793](https://github.com/rapidsai/cugraph/pull/3793)) [@rlratzel](https://github.com/rlratzel) +- Expose threshold in louvain ([#3792](https://github.com/rapidsai/cugraph/pull/3792)) [@ChuckHastings](https://github.com/ChuckHastings) +- Allow models to use a lightweight sparse structure ([#3782](https://github.com/rapidsai/cugraph/pull/3782)) [@tingyu66](https://github.com/tingyu66) +- Clean-up old testing conventions in `test_ecg.py` ([#3779](https://github.com/rapidsai/cugraph/pull/3779)) [@nv-rliu](https://github.com/nv-rliu) +- Calling `dataset.get_edgelist()` returns a copy of an edge list instead of global ([#3777](https://github.com/rapidsai/cugraph/pull/3777)) [@nv-rliu](https://github.com/nv-rliu) +- Update dgl benchmarks 
([#3775](https://github.com/rapidsai/cugraph/pull/3775)) [@VibhuJawa](https://github.com/VibhuJawa) +- Forward-merge branch-23.08 to branch-23.10 ([#3774](https://github.com/rapidsai/cugraph/pull/3774)) [@nv-rliu](https://github.com/nv-rliu) +- Migrate upstream models to `cugraph-pyg` ([#3763](https://github.com/rapidsai/cugraph/pull/3763)) [@tingyu66](https://github.com/tingyu66) +- Branch 23.10 merge 23.08 ([#3743](https://github.com/rapidsai/cugraph/pull/3743)) [@vyasr](https://github.com/vyasr) +- Update to Cython 3.0.0 ([#3716](https://github.com/rapidsai/cugraph/pull/3716)) [@vyasr](https://github.com/vyasr) +- Testing util improvements and refactoring ([#3705](https://github.com/rapidsai/cugraph/pull/3705)) [@betochimas](https://github.com/betochimas) +- Add new cugraph-nx package (networkx backend using pylibcugraph) ([#3614](https://github.com/rapidsai/cugraph/pull/3614)) [@eriknw](https://github.com/eriknw) +- New mtmg API for integration ([#3521](https://github.com/rapidsai/cugraph/pull/3521)) [@ChuckHastings](https://github.com/ChuckHastings) + # cuGraph 23.08.00 (9 Aug 2023) ## 🚨 Breaking Changes