From dd7f0a24baa203b759a56d9dc2d20ee76b61b526 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 18 Jan 2024 14:59:02 -0500 Subject: [PATCH 1/7] DOC v24.04 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 4 +- .devcontainer/cuda11.8-pip/devcontainer.json | 6 +- .../cuda12.0-conda/devcontainer.json | 4 +- .devcontainer/cuda12.0-pip/devcontainer.json | 6 +- .github/workflows/build.yaml | 32 ++++----- .github/workflows/pr.yaml | 42 ++++++------ .github/workflows/test.yaml | 14 ++-- VERSION | 2 +- build.sh | 2 +- ci/build_docs.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 30 ++++---- .../all_cuda-120_arch-x86_64.yaml | 30 ++++---- .../cugraph-service/conda_build_config.yaml | 2 +- conda/recipes/cugraph/conda_build_config.yaml | 2 +- .../pylibcugraph/conda_build_config.yaml | 2 +- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 2 +- cpp/libcugraph_etl/CMakeLists.txt | 2 +- dependencies.yaml | 68 +++++++++---------- docs/cugraph/source/conf.py | 4 +- fetch_rapids.cmake | 2 +- .../conda/cugraph_dgl_dev_cuda-118.yaml | 4 +- python/cugraph-dgl/pyproject.toml | 4 +- .../conda/cugraph_pyg_dev_cuda-118.yaml | 4 +- python/cugraph-pyg/pyproject.toml | 4 +- python/cugraph-service/server/pyproject.toml | 16 ++--- python/cugraph/CMakeLists.txt | 2 +- python/cugraph/pyproject.toml | 22 +++--- python/nx-cugraph/_nx_cugraph/__init__.py | 4 +- python/nx-cugraph/pyproject.toml | 2 +- python/pylibcugraph/CMakeLists.txt | 2 +- python/pylibcugraph/pyproject.toml | 10 +-- 32 files changed, 167 insertions(+), 167 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index e48301e4d14..4c8eb45734e 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.02-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index a57ea0d163b..5d583a2f0f1 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,15 +5,15 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": { "version": "1.14.1" }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json index 10ba2f8fd3d..8e85395e572 100644 --- a/.devcontainer/cuda12.0-conda/devcontainer.json +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "12.0", 
"PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json index a112483a6db..632b6ba7f09 100644 --- a/.devcontainer/cuda12.0-pip/devcontainer.json +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -5,15 +5,15 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda12.0-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": { "version": "1.14.1" }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 273a8902eae..aafa4d51ce6 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -47,7 +47,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,13 +77,13 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel_pylibcugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.02 + extra-repo-sha: branch-24.04 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY node_type: cpu32 wheel-publish-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - 
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -93,7 +93,7 @@ jobs: wheel-build-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,12 +101,12 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel_cugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.02 + extra-repo-sha: branch-24.04 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY wheel-publish-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -116,7 +116,7 @@ jobs: wheel-build-nx-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: wheel-publish-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -136,7 +136,7 @@ jobs: wheel-build-cugraph-dgl: needs: wheel-publish-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -146,7 +146,7 @@ jobs: wheel-publish-cugraph-dgl: needs: wheel-build-cugraph-dgl secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -156,7 +156,7 @@ jobs: wheel-build-cugraph-pyg: needs: wheel-publish-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -166,7 +166,7 @@ jobs: wheel-publish-cugraph-pyg: needs: wheel-build-cugraph-pyg secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 84d22f8e896..8fde0522515 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -31,41 +31,41 @@ jobs: - wheel-tests-cugraph-pyg - devcontainer secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 with: build_type: pull-request node_type: cpu32 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: pull-request conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -75,7 +75,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -85,63 +85,63 @@ jobs: wheel-build-pylibcugraph: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_pylibcugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.02 + extra-repo-sha: branch-24.04 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY node_type: cpu32 wheel-tests-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_pylibcugraph.sh wheel-build-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_cugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-24.02 + extra-repo-sha: branch-24.04 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY wheel-tests-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: 
ci/test_wheel_cugraph.sh wheel-build-nx-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_nx-cugraph.sh wheel-tests-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_nx-cugraph.sh wheel-build-cugraph-dgl: needs: wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_cugraph-dgl.sh wheel-tests-cugraph-dgl: needs: wheel-build-cugraph-dgl secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_cugraph-dgl.sh @@ -149,21 +149,21 @@ jobs: wheel-build-cugraph-pyg: needs: wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_cugraph-pyg.sh wheel-tests-cugraph-pyg: needs: wheel-build-cugraph-pyg secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_cugraph-pyg.sh matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0")) devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 with: node_type: cpu32 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 773358ede8d..1c150653bc7 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -41,7 +41,7 @@ jobs: script: ci/test_wheel_pylibcugraph.sh wheel-tests-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -50,7 +50,7 @@ jobs: script: ci/test_wheel_cugraph.sh wheel-tests-nx-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: script: ci/test_wheel_nx-cugraph.sh wheel-tests-cugraph-dgl: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: script: ci/test_wheel_cugraph-dgl.sh wheel-tests-cugraph-pyg: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index 3c6c5e2b706..4a2fe8aa570 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.02.00 +24.04.00 diff --git a/build.sh b/build.sh index 5044b3a55b3..0ba3a4defed 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -RAPIDS_VERSION=24.02 +RAPIDS_VERSION=24.04 # Valid args to this script (all possible targets and options) - only one per line VALIDARGS=" diff --git a/ci/build_docs.sh b/ci/build_docs.sh index d88c7d7bcd7..0ed2e69ae90 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -39,7 +39,7 @@ rapids-mamba-retry install \ rapids-logger "Install cugraph-dgl" rapids-mamba-retry install "${PYTHON_CHANNEL}/linux-64/cugraph-dgl-*.tar.bz2" -export RAPIDS_VERSION_NUMBER="24.02" +export RAPIDS_VERSION_NUMBER="24.04" export RAPIDS_DOCS_DIR="$(mktemp -d)" for PROJECT in libcugraphops libwholegraph; do diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 76178269ab0..43bc60d91fb 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,12 +16,12 @@ dependencies: - cmake>=3.26.4 - cuda-version=11.8 - cudatoolkit -- cudf==24.2.* +- cudf==24.4.* - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.2.* -- dask-cudf==24.2.* +- dask-cuda==24.4.* +- dask-cudf==24.4.* - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* @@ -29,11 +29,11 @@ dependencies: - graphviz - gtest>=1.13.0 - ipython -- libcudf==24.2.* -- libcugraphops==24.2.* -- libraft-headers==24.2.* -- libraft==24.2.* -- librmm==24.2.* +- libcudf==24.4.* +- libcugraphops==24.4.* +- libraft-headers==24.4.* +- libraft==24.4.* +- librmm==24.4.* - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 @@ -49,20 +49,20 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibcugraphops==24.2.* -- pylibraft==24.2.* -- pylibwholegraph==24.2.* +- pylibcugraphops==24.4.* +- pylibraft==24.4.* +- pylibwholegraph==24.4.* - pytest - pytest-benchmark - pytest-cov - pytest-mpl - pytest-xdist - python-louvain -- raft-dask==24.2.* -- rapids-dask-dependency==24.2.* +- raft-dask==24.4.* +- rapids-dask-dependency==24.4.* - recommonmark - requests -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - scikit-learn>=0.23.1 - scipy @@ -72,7 +72,7 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - ucx-proc=*=gpu -- ucx-py==0.36.* 
+- ucx-py==0.37.* - wget - wheel name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 84a6525bf0c..1829f3dd860 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -16,12 +16,12 @@ dependencies: - cmake>=3.26.4 - cuda-nvcc - cuda-version=12.0 -- cudf==24.2.* +- cudf==24.4.* - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.2.* -- dask-cudf==24.2.* +- dask-cuda==24.4.* +- dask-cudf==24.4.* - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* @@ -29,11 +29,11 @@ dependencies: - graphviz - gtest>=1.13.0 - ipython -- libcudf==24.2.* -- libcugraphops==24.2.* -- libraft-headers==24.2.* -- libraft==24.2.* -- librmm==24.2.* +- libcudf==24.4.* +- libcugraphops==24.4.* +- libraft-headers==24.4.* +- libraft==24.4.* +- librmm==24.4.* - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 @@ -48,20 +48,20 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibcugraphops==24.2.* -- pylibraft==24.2.* -- pylibwholegraph==24.2.* +- pylibcugraphops==24.4.* +- pylibraft==24.4.* +- pylibwholegraph==24.4.* - pytest - pytest-benchmark - pytest-cov - pytest-mpl - pytest-xdist - python-louvain -- raft-dask==24.2.* -- rapids-dask-dependency==24.2.* +- raft-dask==24.4.* +- rapids-dask-dependency==24.4.* - recommonmark - requests -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - scikit-learn>=0.23.1 - scipy @@ -71,7 +71,7 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - ucx-proc=*=gpu -- ucx-py==0.36.* +- ucx-py==0.37.* - wget - wheel name: all_cuda-120_arch-x86_64 diff --git a/conda/recipes/cugraph-service/conda_build_config.yaml b/conda/recipes/cugraph-service/conda_build_config.yaml index 6a0124983fd..11ec3d6ebe4 100644 --- a/conda/recipes/cugraph-service/conda_build_config.yaml +++ b/conda/recipes/cugraph-service/conda_build_config.yaml @@ -1,2 +1,2 @@ ucx_py_version: - - "0.36.*" + - "0.37.*" diff --git a/conda/recipes/cugraph/conda_build_config.yaml b/conda/recipes/cugraph/conda_build_config.yaml index 387f3451d8d..6092360a404 100644 --- a/conda/recipes/cugraph/conda_build_config.yaml +++ b/conda/recipes/cugraph/conda_build_config.yaml @@ -17,4 +17,4 @@ sysroot_version: - "2.17" ucx_py_version: - - "0.36.*" + - "0.37.*" diff --git a/conda/recipes/pylibcugraph/conda_build_config.yaml b/conda/recipes/pylibcugraph/conda_build_config.yaml index 387f3451d8d..6092360a404 100644 --- a/conda/recipes/pylibcugraph/conda_build_config.yaml +++ b/conda/recipes/pylibcugraph/conda_build_config.yaml @@ -17,4 +17,4 @@ sysroot_version: - "2.17" ucx_py_version: - - "0.36.*" + - "0.37.*" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ecc2ebf06d3..c812cd8e4b3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUGRAPH) -project(CUGRAPH VERSION 24.02.00 LANGUAGES C CXX CUDA) +project(CUGRAPH VERSION 24.04.00 LANGUAGES C CXX CUDA) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 3b74956e121..d4652a79f91 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -48,7 +48,7 @@ PROJECT_NAME = libcugraph # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = 24.02 +PROJECT_NUMBER=24.04 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/libcugraph_etl/CMakeLists.txt b/cpp/libcugraph_etl/CMakeLists.txt index 8874c75896c..0d1f5d2c867 100644 --- a/cpp/libcugraph_etl/CMakeLists.txt +++ b/cpp/libcugraph_etl/CMakeLists.txt @@ -25,7 +25,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUGRAPH_ETL) -project(CUGRAPH_ETL VERSION 24.02.00 LANGUAGES C CXX CUDA) +project(CUGRAPH_ETL VERSION 24.04.00 LANGUAGES C CXX CUDA) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0) diff --git a/dependencies.yaml b/dependencies.yaml index 18ddb6c51dd..b91c1f2addc 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -323,10 +323,10 @@ dependencies: - cxx-compiler - gmock>=1.13.0 - gtest>=1.13.0 - - libcugraphops==24.2.* - - libraft-headers==24.2.* - - libraft==24.2.* - - librmm==24.2.* + - libcugraphops==24.4.* + - libraft-headers==24.4.* + - libraft==24.4.* + - librmm==24.4.* - openmpi # Required for building cpp-mgtests (multi-GPU tests) specific: - output_types: [conda] @@ -407,16 +407,16 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - &dask rapids-dask-dependency==24.2.* - - &dask_cuda dask-cuda==24.2.* + - &dask rapids-dask-dependency==24.4.* + - &dask_cuda dask-cuda==24.4.* - &numba numba>=0.57 - &numpy numpy>=1.21 - - &ucx_py ucx-py==0.36.* + - &ucx_py ucx-py==0.37.* - output_types: conda packages: - aiohttp - fsspec>=0.6.0 - - libcudf==24.2.* + - libcudf==24.4.* - requests - nccl>=2.9.9 - ucx-proc=*=gpu @@ -439,7 +439,7 @@ dependencies: - *numpy - output_types: [pyproject] packages: - - &cugraph cugraph==24.2.* + - &cugraph cugraph==24.4.* python_run_cugraph_pyg: common: - output_types: [conda, pyproject] @@ -467,7 +467,7 @@ dependencies: - output_types: pyproject packages: - *cugraph - - cugraph-service-client==24.2.* + - cugraph-service-client==24.4.* test_cpp: common: - output_types: conda @@ -502,7 +502,7 @@ dependencies: - scikit-learn>=0.23.1 - output_types: [conda] packages: - - pylibwholegraph==24.2.* + - pylibwholegraph==24.4.* test_python_pylibcugraph: common: - output_types: [conda, pyproject] @@ -519,7 +519,7 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==24.2.* + - cugraph==24.4.* - pytorch>=2.0 - pytorch-cuda==11.8 - dgl>=1.1.0.cu* @@ -527,7 +527,7 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==24.2.* + - cugraph==24.4.* - pytorch>=2.0 - pytorch-cuda==11.8 - pyg>=2.4.0 @@ -536,7 +536,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==24.2.* + - &rmm_conda rmm==24.4.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -547,12 +547,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &rmm_packages_pip_cu12 - - rmm-cu12==24.2.* + - rmm-cu12==24.4.* - {matrix: {cuda: "12.1"}, packages: *rmm_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *rmm_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &rmm_packages_pip_cu11 - - rmm-cu11==24.2.* + - rmm-cu11==24.4.* - {matrix: {cuda: "11.5"}, packages: *rmm_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *rmm_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *rmm_packages_pip_cu11} @@ -562,7 +562,7 @@ dependencies: common: - output_types: conda packages: - - &cudf_conda cudf==24.2.* + - &cudf_conda cudf==24.4.* - 
output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -573,12 +573,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &cudf_packages_pip_cu12 - - cudf-cu12==24.2.* + - cudf-cu12==24.4.* - {matrix: {cuda: "12.1"}, packages: *cudf_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *cudf_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &cudf_packages_pip_cu11 - - cudf-cu11==24.2.* + - cudf-cu11==24.4.* - {matrix: {cuda: "11.5"}, packages: *cudf_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *cudf_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *cudf_packages_pip_cu11} @@ -588,7 +588,7 @@ dependencies: common: - output_types: conda packages: - - &dask_cudf_conda dask-cudf==24.2.* + - &dask_cudf_conda dask-cudf==24.4.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -599,12 +599,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &dask_cudf_packages_pip_cu12 - - dask-cudf-cu12==24.2.* + - dask-cudf-cu12==24.4.* - {matrix: {cuda: "12.1"}, packages: *dask_cudf_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *dask_cudf_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &dask_cudf_packages_pip_cu11 - - dask-cudf-cu11==24.2.* + - dask-cudf-cu11==24.4.* - {matrix: {cuda: "11.5"}, packages: *dask_cudf_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *dask_cudf_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *dask_cudf_packages_pip_cu11} @@ -614,7 +614,7 @@ dependencies: common: - output_types: conda packages: - - &pylibraft_conda pylibraft==24.2.* + - &pylibraft_conda pylibraft==24.4.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -625,12 +625,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &pylibraft_packages_pip_cu12 - - pylibraft-cu12==24.2.* + - pylibraft-cu12==24.4.* - {matrix: {cuda: "12.1"}, packages: *pylibraft_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *pylibraft_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &pylibraft_packages_pip_cu11 - - pylibraft-cu11==24.2.* + - pylibraft-cu11==24.4.* - {matrix: {cuda: "11.5"}, packages: *pylibraft_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *pylibraft_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *pylibraft_packages_pip_cu11} @@ -640,7 +640,7 @@ dependencies: common: - output_types: conda packages: - - &raft_dask_conda raft-dask==24.2.* + - &raft_dask_conda raft-dask==24.4.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -651,12 +651,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &raft_dask_packages_pip_cu12 - - raft-dask-cu12==24.2.* + - raft-dask-cu12==24.4.* - {matrix: {cuda: "12.1"}, packages: *raft_dask_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *raft_dask_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &raft_dask_packages_pip_cu11 - - raft-dask-cu11==24.2.* + - raft-dask-cu11==24.4.* - {matrix: {cuda: "11.5"}, packages: *raft_dask_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *raft_dask_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *raft_dask_packages_pip_cu11} @@ -666,7 +666,7 @@ dependencies: common: - output_types: conda packages: - - &pylibcugraph_conda pylibcugraph==24.2.* + - &pylibcugraph_conda pylibcugraph==24.4.* - output_types: requirements packages: # pip recognizes the 
index as a global option for the requirements.txt file @@ -677,12 +677,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &pylibcugraph_packages_pip_cu12 - - pylibcugraph-cu12==24.2.* + - pylibcugraph-cu12==24.4.* - {matrix: {cuda: "12.1"}, packages: *pylibcugraph_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *pylibcugraph_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &pylibcugraph_packages_pip_cu11 - - pylibcugraph-cu11==24.2.* + - pylibcugraph-cu11==24.4.* - {matrix: {cuda: "11.5"}, packages: *pylibcugraph_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *pylibcugraph_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *pylibcugraph_packages_pip_cu11} @@ -692,7 +692,7 @@ dependencies: common: - output_types: conda packages: - - &pylibcugraphops_conda pylibcugraphops==24.2.* + - &pylibcugraphops_conda pylibcugraphops==24.4.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -703,12 +703,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &pylibcugraphops_packages_pip_cu12 - - pylibcugraphops-cu12==24.2.* + - pylibcugraphops-cu12==24.4.* - {matrix: {cuda: "12.1"}, packages: *pylibcugraphops_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *pylibcugraphops_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &pylibcugraphops_packages_pip_cu11 - - pylibcugraphops-cu11==24.2.* + - pylibcugraphops-cu11==24.4.* - {matrix: {cuda: "11.5"}, packages: *pylibcugraphops_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *pylibcugraphops_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *pylibcugraphops_packages_pip_cu11} diff --git a/docs/cugraph/source/conf.py b/docs/cugraph/source/conf.py index cef06a584fc..141e14a04ab 100644 --- a/docs/cugraph/source/conf.py +++ b/docs/cugraph/source/conf.py @@ -77,9 +77,9 @@ # built documents. # # The short X.Y version. -version = '24.02' +version = '24.04' # The full version, including alpha/beta/rc tags. -release = '24.02.00' +release = '24.04.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 1f099e8f85f..3f90238109a 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake ) endif() diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index c6df338ab9a..aab2834d445 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -10,11 +10,11 @@ channels: - conda-forge - nvidia dependencies: -- cugraph==24.2.* +- cugraph==24.4.* - dgl>=1.1.0.cu* - pandas - pre-commit -- pylibcugraphops==24.2.* +- pylibcugraphops==24.4.* - pytest - pytest-benchmark - pytest-cov diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml index 65ee414da44..78bd13bd36d 100644 --- a/python/cugraph-dgl/pyproject.toml +++ b/python/cugraph-dgl/pyproject.toml @@ -23,10 +23,10 @@ classifiers = [ "Programming Language :: Python", ] dependencies = [ - "cugraph==24.2.*", + "cugraph==24.4.*", "numba>=0.57", "numpy>=1.21", - "pylibcugraphops==24.2.*", + "pylibcugraphops==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index 0c63dc9d80e..84d5fea4eed 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -10,11 +10,11 @@ channels: - conda-forge - nvidia dependencies: -- cugraph==24.2.* +- cugraph==24.4.* - pandas - pre-commit - pyg>=2.4.0 -- pylibcugraphops==24.2.* +- pylibcugraphops==24.4.* - pytest - pytest-benchmark - pytest-cov diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index b8666c0d806..61c8bb2a81b 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -26,10 +26,10 @@ classifiers = [ "Programming Language :: Python :: 3.10", ] dependencies = [ - "cugraph==24.2.*", + "cugraph==24.4.*", "numba>=0.57", "numpy>=1.21", - "pylibcugraphops==24.2.*", + "pylibcugraphops==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index 84c0358668b..d0138b438c3 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -19,18 +19,18 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.2.*", - "cugraph-service-client==24.2.*", - "cugraph==24.2.*", + "cudf==24.4.*", + "cugraph-service-client==24.4.*", + "cugraph==24.4.*", "cupy-cuda11x>=12.0.0", - "dask-cuda==24.2.*", - "dask-cudf==24.2.*", + "dask-cuda==24.4.*", + "dask-cudf==24.4.*", "numba>=0.57", "numpy>=1.21", - "rapids-dask-dependency==24.2.*", - "rmm==24.2.*", + "rapids-dask-dependency==24.4.*", + "rmm==24.4.*", "thriftpy2", - "ucx-py==0.36.*", + "ucx-py==0.37.*", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index 92345a324e4..f3548192fe2 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cugraph_version 24.02.00) +set(cugraph_version 24.04.00) include(../../fetch_rapids.cmake) diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index 6a9d88bf5c8..700e256c01e 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -6,9 +6,9 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.0", "ninja", - "pylibcugraph==24.2.*", - "pylibraft==24.2.*", - "rmm==24.2.*", + "pylibcugraph==24.4.*", + "pylibraft==24.4.*", + "rmm==24.4.*", "scikit-build-core[pyproject]>=0.7.0", "setuptools>=61.0.0", "wheel", @@ -29,18 +29,18 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.2.*", + "cudf==24.4.*", "cupy-cuda11x>=12.0.0", - "dask-cuda==24.2.*", - "dask-cudf==24.2.*", + "dask-cuda==24.4.*", + "dask-cudf==24.4.*", "fsspec[http]>=0.6.0", "numba>=0.57", "numpy>=1.21", - "pylibcugraph==24.2.*", - "raft-dask==24.2.*", - "rapids-dask-dependency==24.2.*", - "rmm==24.2.*", - "ucx-py==0.36.*", + "pylibcugraph==24.4.*", + "raft-dask==24.4.*", + "rapids-dask-dependency==24.4.*", + "rmm==24.4.*", + "ucx-py==0.37.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index 69320e6b55c..3e30bb4514a 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -24,7 +24,7 @@ "backend_name": "cugraph", "project": "nx-cugraph", "package": "nx_cugraph", - "url": "https://github.com/rapidsai/cugraph/tree/branch-24.02/python/nx-cugraph", + "url": "https://github.com/rapidsai/cugraph/tree/branch-24.04/python/nx-cugraph", "short_summary": "GPU-accelerated backend.", # "description": "TODO", "functions": { @@ -182,7 +182,7 @@ def get_info(): # FIXME: can this use the standard VERSION file and update mechanism? -__version__ = "24.02.00" +__version__ = "24.04.00" if __name__ == "__main__": from pathlib import Path diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml index 63ac115918f..a7525530ac8 100644 --- a/python/nx-cugraph/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "networkx>=3.0", "numpy>=1.21", - "pylibcugraph==24.2.*", + "pylibcugraph==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.optional-dependencies] diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt index 6ef3bf9dd40..0901cce0ae9 100644 --- a/python/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(pylibcugraph_version 24.02.00) +set(pylibcugraph_version 24.04.00) include(../../fetch_rapids.cmake) diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml index 1d27d952af1..2452bfb6b63 100644 --- a/python/pylibcugraph/pyproject.toml +++ b/python/pylibcugraph/pyproject.toml @@ -6,8 +6,8 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.0", "ninja", - "pylibraft==24.2.*", - "rmm==24.2.*", + "pylibraft==24.4.*", + "rmm==24.4.*", "scikit-build-core[pyproject]>=0.7.0", "setuptools>=61.0.0", "wheel", @@ -28,8 +28,8 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "pylibraft==24.2.*", - "rmm==24.2.*", + "pylibraft==24.4.*", + "rmm==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -40,7 +40,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "cudf==24.2.*", + "cudf==24.4.*", "numpy>=1.21", "pandas", "pytest", From 6774abc0798eb7d7ef9184ebd6a63d53fe90b4ab Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Tue, 23 Jan 2024 15:41:15 -0500 Subject: [PATCH 2/7] Match weight-sharing option of GATConv in DGL (#4074) Closes #4060 Authors: - Tingyu Wang (https://github.com/tingyu66) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Vibhu Jawa (https://github.com/VibhuJawa) URL: https://github.com/rapidsai/cugraph/pull/4074 --- .../cugraph_dgl/nn/conv/gatconv.py | 70 +++++++----- .../cugraph_dgl/nn/conv/gatv2conv.py | 69 ++++++------ .../cugraph_dgl/nn/conv/relgraphconv.py | 10 +- .../cugraph_dgl/nn/conv/sageconv.py | 8 +- .../cugraph_dgl/nn/conv/transformerconv.py | 8 +- python/cugraph-dgl/tests/conftest.py | 10 +- python/cugraph-dgl/tests/nn/__init__.py | 12 -- python/cugraph-dgl/tests/nn/common.py | 23 ---- python/cugraph-dgl/tests/nn/test_gatconv.py | 100 ++++++++++------- python/cugraph-dgl/tests/nn/test_gatv2conv.py | 103 ++++++++++++------ .../cugraph-dgl/tests/nn/test_relgraphconv.py | 38 +++---- python/cugraph-dgl/tests/nn/test_sageconv.py | 36 +++--- .../tests/nn/test_transformerconv.py | 19 ++-- 13 files changed, 268 insertions(+), 238 deletions(-) delete mode 100644 python/cugraph-dgl/tests/nn/__init__.py delete mode 100644 python/cugraph-dgl/tests/nn/common.py diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py index 8843e61ad89..cc4ce474f2d 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -29,7 +29,7 @@ class GATConv(BaseConv): Parameters ---------- - in_feats : int or tuple + in_feats : int or (int, int) Input feature size. A pair denotes feature sizes of source and destination nodes. out_feats : int @@ -92,7 +92,7 @@ class GATConv(BaseConv): def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, num_heads: int, feat_drop: float = 0.0, @@ -104,14 +104,19 @@ def __init__( bias: bool = True, ): super().__init__() + + if isinstance(in_feats, int): + self.in_feats_src = self.in_feats_dst = in_feats + else: + self.in_feats_src, self.in_feats_dst = in_feats self.in_feats = in_feats self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.residual = residual self.allow_zero_in_degree = allow_zero_in_degree if isinstance(in_feats, int): @@ -126,28 +131,34 @@ def __init__( if edge_feats is not None: self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) - self.attn_weights = nn.Parameter(torch.Tensor(3 * num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(3 * num_heads * out_feats)) else: self.register_parameter("lin_edge", None) - self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(2 * num_heads * out_feats)) - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_feats)) + out_dim = num_heads * out_feats if concat else out_feats + if residual: + if self.in_feats_dst != out_dim: + self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) + else: + self.lin_res = nn.Identity() else: - self.register_buffer("bias", None) + self.register_buffer("lin_res", None) - self.residual = residual and self.in_feats_dst != out_feats * num_heads - if self.residual: - self.lin_res = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) + if bias and not isinstance(self.lin_res, nn.Linear): + if concat: + self.bias = nn.Parameter(torch.empty(num_heads, out_feats)) + else: + self.bias = nn.Parameter(torch.empty(out_feats)) else: - self.register_buffer("lin_res", None) + self.register_buffer("bias", None) self.reset_parameters() + def set_allow_zero_in_degree(self, set_value): + r"""Set allow_zero_in_degree flag.""" + self.allow_zero_in_degree = set_value + def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") @@ -172,7 +183,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -182,8 +193,10 @@ def forward( ---------- graph : DGLGraph or SparseGraph The graph. - nfeat : torch.Tensor - Input features of shape :math:`(N, D_{in})`. + nfeat : torch.Tensor or (torch.Tensor, torch.Tensor) + Node features. 
If given as a tuple, the two elements correspond to + the source and destination node features, respectively, in a + bipartite graph. efeat: torch.Tensor, optional Optional edge features. max_in_degree : int @@ -237,18 +250,17 @@ def forward( if bipartite: if not hasattr(self, "lin_src"): - raise RuntimeError( - f"{self.__class__.__name__}.in_feats must be a pair of " - f"integers to allow bipartite node features, but got " - f"{self.in_feats}." - ) - nfeat_src = self.lin_src(nfeat[0]) - nfeat_dst = self.lin_dst(nfeat[1]) + nfeat_src = self.lin(nfeat[0]) + nfeat_dst = self.lin(nfeat[1]) + else: + nfeat_src = self.lin_src(nfeat[0]) + nfeat_dst = self.lin_dst(nfeat[1]) else: if not hasattr(self, "lin"): raise RuntimeError( f"{self.__class__.__name__}.in_feats is expected to be an " - f"integer, but got {self.in_feats}." + f"integer when the graph is not bipartite, " + f"but got {self.in_feats}." ) nfeat = self.lin(nfeat) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py index 209a5fe1a8d..6c78b4df0b8 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -29,14 +29,11 @@ class GATv2Conv(BaseConv): Parameters ---------- - in_feats : int, or pair of ints - Input feature size; i.e, the number of dimensions of :math:`h_i^{(l)}`. - If the layer is to be applied to a unidirectional bipartite graph, `in_feats` - specifies the input feature size on both the source and destination nodes. - If a scalar is given, the source and destination node feature size - would take the same value. + in_feats : int or (int, int) + Input feature size. A pair denotes feature sizes of source and + destination nodes. out_feats : int - Output feature size; i.e, the number of dimensions of :math:`h_i^{(l+1)}`. + Output feature size. num_heads : int Number of heads in Multi-Head Attention. feat_drop : float, optional @@ -58,17 +55,15 @@ class GATv2Conv(BaseConv): input graph. By setting ``True``, it will suppress the check and let the users handle it by themselves. Defaults: ``False``. bias : bool, optional - If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) + If True, learns a bias term. Defaults: ``True``. share_weights : bool, optional - If set to :obj:`True`, the same matrix for :math:`W_{left}` and - :math:`W_{right}` in the above equations, will be applied to the source - and the target node of every edge. (default: :obj:`False`) + If ``True``, the same matrix will be applied to the source and the + destination node features. Defaults: ``False``. 
""" def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, num_heads: int, feat_drop: float = 0.0, @@ -81,16 +76,22 @@ def __init__( share_weights: bool = False, ): super().__init__() + + if isinstance(in_feats, int): + self.in_feats_src = self.in_feats_dst = in_feats + else: + self.in_feats_src, self.in_feats_dst = in_feats self.in_feats = in_feats self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.residual = residual self.allow_zero_in_degree = allow_zero_in_degree self.share_weights = share_weights + self.bias = bias self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) if share_weights: @@ -106,30 +107,28 @@ def __init__( self.in_feats_dst, num_heads * out_feats, bias=bias ) - self.attn = nn.Parameter(torch.Tensor(num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(num_heads * out_feats)) if edge_feats is not None: self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) else: self.register_parameter("lin_edge", None) - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_feats)) - else: - self.register_buffer("bias", None) - - self.residual = residual and self.in_feats_dst != out_feats * num_heads - if self.residual: - self.lin_res = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) + out_dim = num_heads * out_feats if concat else out_feats + if residual: + if self.in_feats_dst != out_dim: + self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) + else: + self.lin_res = nn.Identity() else: self.register_buffer("lin_res", None) self.reset_parameters() + def set_allow_zero_in_degree(self, set_value): + r"""Set allow_zero_in_degree flag.""" + self.allow_zero_in_degree = set_value + def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") @@ -137,7 +136,7 @@ def reset_parameters(self): nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) nn.init.xavier_normal_( - self.attn.view(-1, self.num_heads, self.out_feats), gain=gain + self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain ) if self.lin_edge is not None: self.lin_edge.reset_parameters() @@ -145,13 +144,10 @@ def reset_parameters(self): if self.lin_res is not None: self.lin_res.reset_parameters() - if self.bias is not None: - nn.init.zeros_(self.bias) - def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -225,7 +221,7 @@ def forward( out = ops_torch.operators.mha_gat_v2_n2n( nfeat, - self.attn, + self.attn_weights, _graph, num_heads=self.num_heads, activation="LeakyReLU", @@ -243,7 +239,4 @@ def forward( res = res.mean(dim=1) out = out + res - if self.bias is not None: - out = out + self.bias - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py index 54916674210..5c4b5dea441 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ 
b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -100,16 +100,16 @@ def __init__( self.self_loop = self_loop if regularizer is None: self.W = nn.Parameter( - torch.Tensor(num_rels + dim_self_loop, in_feats, out_feats) + torch.empty(num_rels + dim_self_loop, in_feats, out_feats) ) self.coeff = None elif regularizer == "basis": if num_bases is None: raise ValueError('Missing "num_bases" for basis regularization.') self.W = nn.Parameter( - torch.Tensor(num_bases + dim_self_loop, in_feats, out_feats) + torch.empty(num_bases + dim_self_loop, in_feats, out_feats) ) - self.coeff = nn.Parameter(torch.Tensor(num_rels, num_bases)) + self.coeff = nn.Parameter(torch.empty(num_rels, num_bases)) self.num_bases = num_bases else: raise ValueError( @@ -119,7 +119,7 @@ def __init__( self.regularizer = regularizer if bias: - self.bias = nn.Parameter(torch.Tensor(out_feats)) + self.bias = nn.Parameter(torch.empty(out_feats)) else: self.register_parameter("bias", None) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py index a3f946d7cb4..b6198903766 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -65,7 +65,7 @@ class SAGEConv(BaseConv): def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, aggregator_type: str = "mean", feat_drop: float = 0.0, @@ -111,7 +111,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + feat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], max_in_degree: Optional[int] = None, ) -> torch.Tensor: r"""Forward computation. diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py index 8481b9ee265..e77556fb76f 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -51,7 +51,7 @@ class TransformerConv(BaseConv): def __init__( self, - in_node_feats: Union[int, Tuple[int, int]], + in_node_feats: Union[int, tuple[int, int]], out_node_feats: int, num_heads: int, concat: bool = True, @@ -116,7 +116,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward computation. diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py index a3863ed81fa..ee1183f5cd1 100644 --- a/python/cugraph-dgl/tests/conftest.py +++ b/python/cugraph-dgl/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,6 +13,7 @@ import pytest +import dgl import torch from cugraph.testing.mg_utils import ( @@ -58,3 +59,10 @@ class SparseGraphData1: @pytest.fixture def sparse_graph_1(): return SparseGraphData1() + + +@pytest.fixture +def dgl_graph_1(): + src = torch.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) + dst = torch.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) + return dgl.graph((src, dst)) diff --git a/python/cugraph-dgl/tests/nn/__init__.py b/python/cugraph-dgl/tests/nn/__init__.py deleted file mode 100644 index a1dd01f33d4..00000000000 --- a/python/cugraph-dgl/tests/nn/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph-dgl/tests/nn/common.py b/python/cugraph-dgl/tests/nn/common.py deleted file mode 100644 index 34787d20c9a..00000000000 --- a/python/cugraph-dgl/tests/nn/common.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from cugraph.utilities.utils import import_optional - -th = import_optional("torch") -dgl = import_optional("dgl") - - -def create_graph1(): - u = th.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) - v = th.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) - g = dgl.graph((u, v)) - return g diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index ce145b2bc87..de27efc6329 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import GATConv as CuGraphGATConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -23,37 +22,49 @@ ATOL = 1e-6 -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_gatconv_equality( - bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format + dgl_graph_1, + mode, + idx_type, + max_in_degree, + num_heads, + residual, + to_block, + sparse_format, ): from dgl.nn.pytorch import GATConv torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - if bipartite: + if mode == "bipartite": in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.randn(size[0], in_feats[0]).to(device), + torch.randn(size[1], in_feats[1]).to(device), + ) + elif mode == "share_weights": + in_feats = 5 + nfeat = ( + torch.randn(size[0], in_feats).to(device), + torch.randn(size[1], in_feats).to(device), ) else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + in_feats = 7 + nfeat = torch.randn(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,24 +76,24 @@ def test_gatconv_equality( sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True} + kwargs = {"bias": False, "allow_zero_in_degree": True, "residual": residual} - conv1 = GATConv(*args, **kwargs).cuda() - out1 = conv1(g, nfeat) + conv1 = GATConv(*args, **kwargs).to(device) + conv2 = CuGraphGATConv(*args, **kwargs).to(device) - conv2 = CuGraphGATConv(*args, **kwargs).cuda() dim = num_heads * out_feats with torch.no_grad(): - conv2.attn_weights.data[:dim] = conv1.attn_l.data.flatten() - conv2.attn_weights.data[dim:] = conv1.attn_r.data.flatten() - if bipartite: - conv2.lin_src.weight.data = 
conv1.fc_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + conv2.attn_weights[:dim].copy_(conv1.attn_l.flatten()) + conv2.attn_weights[dim:].copy_(conv1.attn_r.flatten()) + if mode == "bipartite": + conv2.lin_src.weight.copy_(conv1.fc_src.weight) + conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) else: - conv2.lin.weight.data = conv1.fc.weight.data.detach().clone() - if residual and conv2.residual: - conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + conv2.lin.weight.copy_(conv1.fc.weight) + if residual and conv1.has_linear_res: + conv2.lin_res.weight.copy_(conv1.res_fc.weight) + out1 = conv1(g, nfeat) if sparse_format is not None: out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) else: @@ -90,12 +101,12 @@ def test_gatconv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out1 = torch.rand_like(out1) - grad_out2 = grad_out1.clone().detach() + grad_out1 = torch.randn_like(out1) + grad_out2 = grad_out1.detach().clone() out1.backward(grad_out1) out2.backward(grad_out2) - if bipartite: + if mode == "bipartite": assert torch.allclose( conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL ) @@ -105,25 +116,38 @@ def test_gatconv_equality( else: assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) + if residual and conv1.has_linear_res: + assert torch.allclose( + conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL + ) + assert torch.allclose( torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=ATOL, + atol=1e-5, # Note: using a loosened tolerance here due to numerical error ) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_gatconv_edge_feats( - bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats + dgl_graph_1, + bias, + bipartite, + concat, + max_in_degree, + num_heads, + to_block, + use_edge_feats, ): torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device) if to_block: g = dgl.to_block(g) @@ -131,17 +155,17 @@ def test_gatconv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.rand(g.num_src_nodes(), in_feats[0]).to(device), + torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).cuda() + efeat = torch.rand(g.num_edges(), edge_feats).to(device) else: edge_feats = None efeat = None @@ -154,8 +178,8 @@ def test_gatconv_edge_feats( edge_feats=edge_feats, bias=bias, allow_zero_in_degree=True, - ).cuda() + ).to(device) out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py index 
52003edacca..2d26b7fdc28 100644 --- a/python/cugraph-dgl/tests/nn/test_gatv2conv.py +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,45 +15,56 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") -ATOL = 1e-6 +ATOL = 1e-5 -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_gatv2conv_equality( - bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format + dgl_graph_1, + mode, + idx_type, + max_in_degree, + num_heads, + residual, + to_block, + sparse_format, ): from dgl.nn.pytorch import GATv2Conv torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - if bipartite: + if mode == "bipartite": in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.randn(size[0], in_feats[0]).to(device), + torch.randn(size[1], in_feats[1]).to(device), + ) + elif mode == "share_weights": + in_feats = 5 + nfeat = ( + torch.randn(size[0], in_feats).to(device), + torch.randn(size[1], in_feats).to(device), ) else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + in_feats = 7 + nfeat = torch.randn(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,19 +76,24 @@ def test_gatv2conv_equality( sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True} + kwargs = { + "bias": False, + "allow_zero_in_degree": True, + "residual": residual, + "share_weights": mode == "share_weights", + } - conv1 = GATv2Conv(*args, **kwargs).cuda() - out1 = conv1(g, nfeat) + conv1 = GATv2Conv(*args, **kwargs).to(device) + conv2 = CuGraphGATv2Conv(*args, **kwargs).to(device) - conv2 = CuGraphGATv2Conv(*args, **kwargs).cuda() with torch.no_grad(): - conv2.attn.data = conv1.attn.data.flatten() - conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() - if residual and conv2.residual: - conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + conv2.attn_weights.copy_(conv1.attn.flatten()) + conv2.lin_src.weight.copy_(conv1.fc_src.weight) + conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) + if residual: + conv2.lin_res.weight.copy_(conv1.res_fc.weight) + out1 = conv1(g, nfeat) if sparse_format is not None: out2 = 
conv2(sg, nfeat, max_in_degree=max_in_degree) else: @@ -85,8 +101,8 @@ def test_gatv2conv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out1 = torch.rand_like(out1) - grad_out2 = grad_out1.clone().detach() + grad_out1 = torch.randn_like(out1) + grad_out2 = grad_out1.detach().clone() out1.backward(grad_out1) out2.backward(grad_out2) @@ -97,21 +113,38 @@ def test_gatv2conv_equality( conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) - assert torch.allclose(conv1.attn.grad, conv1.attn.grad, atol=ATOL) + if residual: + assert torch.allclose( + conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL + ) + + assert torch.allclose( + conv1.attn.grad, + conv2.attn_weights.grad.view(1, num_heads, out_feats), + atol=ATOL, + ) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_gatv2conv_edge_feats( - bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats + dgl_graph_1, + bias, + bipartite, + concat, + max_in_degree, + num_heads, + to_block, + use_edge_feats, ): torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device) if to_block: g = dgl.to_block(g) @@ -119,17 +152,17 @@ def test_gatv2conv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.rand(g.num_src_nodes(), in_feats[0]).to(device), + torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).cuda() + efeat = torch.rand(g.num_edges(), edge_feats).to(device) else: edge_feats = None efeat = None @@ -142,8 +175,8 @@ def test_gatv2conv_edge_feats( edge_feats=edge_feats, bias=bias, allow_zero_in_degree=True, - ).cuda() + ).to(device) out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index bdaa89e57f2..b5d3686c609 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -23,7 +22,7 @@ ATOL = 1e-6 -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_bases", [1, 2, 5]) @pytest.mark.parametrize("regularizer", [None, "basis"]) @@ -31,7 +30,8 @@ @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_relgraphconv_equality( - idtype_int, + dgl_graph_1, + idx_type, max_in_degree, num_bases, regularizer, @@ -42,6 +42,12 @@ def test_relgraphconv_equality( from dgl.nn.pytorch import RelGraphConv torch.manual_seed(12345) + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) + + if to_block: + g = dgl.to_block(g) + in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) kwargs = { @@ -50,16 +56,10 @@ def test_relgraphconv_equality( "bias": False, "self_loop": self_loop, } - g = create_graph1().to("cuda") - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).cuda() - - if idtype_int: - g = g.int() - if to_block: - g = dgl.to_block(g) + g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = torch.rand(g.num_src_nodes(), in_feat).cuda() + feat = torch.rand(g.num_src_nodes(), in_feat).to(device) if sparse_format == "coo": sg = SparseGraph( @@ -76,18 +76,18 @@ def test_relgraphconv_equality( size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" ) - conv1 = RelGraphConv(*args, **kwargs).cuda() - conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).cuda() + conv1 = RelGraphConv(*args, **kwargs).to(device) + conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).to(device) with torch.no_grad(): if self_loop: - conv2.W.data[:-1] = conv1.linear_r.W.data - conv2.W.data[-1] = conv1.loop_weight.data + conv2.W[:-1].copy_(conv1.linear_r.W) + conv2.W[-1].copy_(conv1.loop_weight) else: - conv2.W.data = conv1.linear_r.W.data.detach().clone() + conv2.W.copy_(conv1.linear_r.W) if regularizer is not None: - conv2.coeff.data = conv1.linear_r.coeff.data.detach().clone() + conv2.coeff.copy_(conv1.linear_r.coeff) out1 = conv1(g, feat, g.edata[dgl.ETYPE]) @@ -98,7 +98,7 @@ def test_relgraphconv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out = torch.rand_like(out1) + grad_out = torch.randn_like(out1) out1.backward(grad_out) out2.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index b5d0a44b868..3f1c2b1b3fe 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -26,21 +25,19 @@ @pytest.mark.parametrize("aggr", ["mean", "pool"]) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_sageconv_equality( - aggr, bias, bipartite, idtype_int, max_in_degree, to_block, sparse_format + dgl_graph_1, aggr, bias, bipartite, idx_type, max_in_degree, to_block, sparse_format ): from dgl.nn.pytorch import SAGEConv torch.manual_seed(12345) - kwargs = {"aggregator_type": aggr, "bias": bias} - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) @@ -49,12 +46,12 @@ def test_sageconv_equality( if bipartite: in_feats = (5, 3) feat = ( - torch.rand(size[0], in_feats[0], requires_grad=True).cuda(), - torch.rand(size[1], in_feats[1], requires_grad=True).cuda(), + torch.rand(size[0], in_feats[0], requires_grad=True).to(device), + torch.rand(size[1], in_feats[1], requires_grad=True).to(device), ) else: in_feats = 5 - feat = torch.rand(size[0], in_feats).cuda() + feat = torch.rand(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,18 +62,19 @@ def test_sageconv_equality( offsets, indices, _ = g.adj_tensors("csc") sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - conv1 = SAGEConv(in_feats, out_feats, **kwargs).cuda() - conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).cuda() + kwargs = {"aggregator_type": aggr, "bias": bias} + conv1 = SAGEConv(in_feats, out_feats, **kwargs).to(device) + conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).to(device) in_feats_src = conv2.in_feats_src with torch.no_grad(): - conv2.lin.weight.data[:, :in_feats_src] = conv1.fc_neigh.weight.data - conv2.lin.weight.data[:, in_feats_src:] = conv1.fc_self.weight.data + conv2.lin.weight[:, :in_feats_src].copy_(conv1.fc_neigh.weight) + conv2.lin.weight[:, in_feats_src:].copy_(conv1.fc_self.weight) if bias: - conv2.lin.bias.data[:] = conv1.fc_self.bias.data + conv2.lin.bias.copy_(conv1.fc_self.bias) if aggr == "pool": - conv2.pre_lin.weight.data[:] = conv1.fc_pool.weight.data - conv2.pre_lin.bias.data[:] = conv1.fc_pool.bias.data + conv2.pre_lin.weight.copy_(conv1.fc_pool.weight) + conv2.pre_lin.bias.copy_(conv1.fc_pool.bias) out1 = conv1(g, feat) if sparse_format is not None: @@ -85,7 +83,7 @@ def test_sageconv_equality( out2 = conv2(g, feat, max_in_degree=max_in_degree) assert torch.allclose(out1, out2, atol=ATOL) - grad_out = torch.rand_like(out1) + grad_out = torch.randn_like(out1) out1.backward(grad_out) out2.backward(grad_out) assert torch.allclose( diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index 5ac4fd7bea7..28d13dedec8 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 
2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import TransformerConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -26,27 +25,25 @@ @pytest.mark.parametrize("beta", [False, True]) @pytest.mark.parametrize("bipartite_node_feats", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) -@pytest.mark.parametrize("num_heads", [1, 2, 3, 4]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) +@pytest.mark.parametrize("num_heads", [1, 3, 4]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_transformerconv( + dgl_graph_1, beta, bipartite_node_feats, concat, - idtype_int, + idx_type, num_heads, to_block, use_edge_feats, sparse_format, ): torch.manual_seed(12345) - device = "cuda" - g = create_graph1().to(device) - - if idtype_int: - g = g.int() + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) if to_block: g = dgl.to_block(g) @@ -92,5 +89,5 @@ def test_transformerconv( else: out = conv(g, nfeat, efeat) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) From 82552ab903b0cab9f5b673eb38c4bc4eac50eb48 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Wed, 24 Jan 2024 16:54:30 -0600 Subject: [PATCH 3/7] nx-cugraph: rename `plc=` to `_plc=` (#4106) As discussed here: https://github.com/rapidsai/cugraph/pull/4093#discussion_r1458012713 Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4106 --- python/nx-cugraph/lint.yaml | 6 +-- .../nx_cugraph/algorithms/bipartite/basic.py | 2 +- .../algorithms/centrality/betweenness.py | 4 +- .../algorithms/centrality/eigenvector.py | 2 +- .../nx_cugraph/algorithms/centrality/katz.py | 2 +- .../nx_cugraph/algorithms/cluster.py | 16 ++++++-- .../algorithms/community/louvain.py | 2 +- .../algorithms/components/connected.py | 13 +++---- .../components/strongly_connected.py | 6 +-- .../algorithms/components/weakly_connected.py | 6 +-- .../nx-cugraph/nx_cugraph/algorithms/core.py | 4 +- .../nx-cugraph/nx_cugraph/algorithms/dag.py | 4 +- .../algorithms/link_analysis/hits_alg.py | 2 +- .../algorithms/link_analysis/pagerank_alg.py | 2 +- .../algorithms/shortest_paths/unweighted.py | 4 +- .../traversal/breadth_first_search.py | 38 +++++++++++++++---- .../nx_cugraph/algorithms/tree/recognition.py | 8 ++-- .../nx-cugraph/nx_cugraph/utils/decorators.py | 16 ++++---- 18 files changed, 84 insertions(+), 53 deletions(-) diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index 0d4f0b59413..5a4773168b6 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -50,7 +50,7 @@ repos: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.1.14 hooks: - id: ruff args: [--fix-only, --show-fixes] # --unsafe-fixes] @@ -62,7 +62,7 @@ repos: additional_dependencies: &flake8_dependencies # These versions need updated manually 
- flake8==7.0.0 - - flake8-bugbear==23.12.2 + - flake8-bugbear==24.1.17 - flake8-simplify==0.21.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 @@ -77,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.1.14 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py index d0e9a5c7f1b..46c6b54075b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py @@ -21,7 +21,7 @@ ] -@networkx_algorithm(plc="triangle_count", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="triangle_count") def is_bipartite(G): G = _to_graph(G) # Counting triangles may not be the fastest way to do this, but it is simple. diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py index ba2b3d9c895..f6bb142cded 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py @@ -21,8 +21,8 @@ @networkx_algorithm( is_incomplete=True, # weight not supported is_different=True, # RNG with seed is different - plc="betweenness_centrality", version_added="23.10", + _plc="betweenness_centrality", ) def betweenness_centrality( G, k=None, normalized=True, weight=None, endpoints=False, seed=None @@ -54,8 +54,8 @@ def _(G, k=None, normalized=True, weight=None, endpoints=False, seed=None): @networkx_algorithm( is_incomplete=True, # weight not supported is_different=True, # RNG with seed is different - plc="edge_betweenness_centrality", version_added="23.10", + _plc="edge_betweenness_centrality", ) def edge_betweenness_centrality(G, k=None, normalized=True, weight=None, seed=None): """`weight` parameter is not yet supported, and RNG with seed may be different.""" diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py index 9e615955a8b..65a8633667a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py @@ -29,8 +29,8 @@ @networkx_algorithm( extra_params=_dtype_param, is_incomplete=True, # nstart not supported - plc="eigenvector_centrality", version_added="23.12", + _plc="eigenvector_centrality", ) def eigenvector_centrality( G, max_iter=100, tol=1.0e-6, nstart=None, weight=None, *, dtype=None diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py index a2fb950c1aa..4a0684f72ee 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py @@ -29,8 +29,8 @@ @networkx_algorithm( extra_params=_dtype_param, is_incomplete=True, # nstart and normalized=False not supported - plc="katz_centrality", version_added="23.12", + _plc="katz_centrality", ) def katz_centrality( G, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/cluster.py b/python/nx-cugraph/nx_cugraph/algorithms/cluster.py index 951c358ff26..a458e6c04db 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/cluster.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/cluster.py @@ -45,7 +45,7 @@ def _triangles(G, nodes, 
symmetrize=None): @not_implemented_for("directed") -@networkx_algorithm(plc="triangle_count", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="triangle_count") def triangles(G, nodes=None): G = _to_undirected_graph(G) node_ids, triangles, is_single_node = _triangles(G, nodes) @@ -57,9 +57,13 @@ def triangles(G, nodes=None): @not_implemented_for("directed") -@networkx_algorithm(is_incomplete=True, plc="triangle_count", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") def clustering(G, nodes=None, weight=None): """Directed graphs and `weight` parameter are not yet supported.""" + if weight is not None: + raise NotImplementedError( + "Weighted implementation of clustering not currently supported" + ) G = _to_undirected_graph(G) node_ids, triangles, is_single_node = _triangles(G, nodes) if len(G) == 0: @@ -83,9 +87,13 @@ def _(G, nodes=None, weight=None): @not_implemented_for("directed") -@networkx_algorithm(is_incomplete=True, plc="triangle_count", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") def average_clustering(G, nodes=None, weight=None, count_zeros=True): """Directed graphs and `weight` parameter are not yet supported.""" + if weight is not None: + raise NotImplementedError( + "Weighted implementation of average_clustering not currently supported" + ) G = _to_undirected_graph(G) node_ids, triangles, is_single_node = _triangles(G, nodes) if len(G) == 0: @@ -110,7 +118,7 @@ def _(G, nodes=None, weight=None, count_zeros=True): @not_implemented_for("directed") -@networkx_algorithm(is_incomplete=True, plc="triangle_count", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") def transitivity(G): """Directed graphs are not yet supported.""" G = _to_undirected_graph(G) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py index 413ff9ca5e3..f58f1000fc4 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py @@ -36,8 +36,8 @@ }, is_incomplete=True, # seed not supported; self-loops not supported is_different=True, # RNG different - plc="louvain", version_added="23.10", + _plc="louvain", ) def louvain_communities( G, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py index cdb9f54f6c4..24955e3eac8 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py @@ -26,7 +26,7 @@ @not_implemented_for("directed") -@networkx_algorithm(plc="weakly_connected_components", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def number_connected_components(G): G = _to_undirected_graph(G) return _number_connected_components(G) @@ -50,14 +50,11 @@ def _number_connected_components(G, symmetrize=None): @number_connected_components._can_run def _(G): # NetworkX <= 3.2.1 does not check directedness for us - try: - return not G.is_directed() - except Exception: - return False + return not G.is_directed() @not_implemented_for("directed") -@networkx_algorithm(plc="weakly_connected_components", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def connected_components(G): G 
= _to_undirected_graph(G) return _connected_components(G) @@ -80,7 +77,7 @@ def _connected_components(G, symmetrize=None): @not_implemented_for("directed") -@networkx_algorithm(plc="weakly_connected_components", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def is_connected(G): G = _to_undirected_graph(G) return _is_connected(G) @@ -106,7 +103,7 @@ def _is_connected(G, symmetrize=None): @not_implemented_for("directed") -@networkx_algorithm(plc="weakly_connected_components", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def node_connected_component(G, n): # We could also do plain BFS from n G = _to_undirected_graph(G) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py index 8fdf99ed5ea..d1713129703 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py @@ -51,7 +51,7 @@ def _strongly_connected_components(G): @not_implemented_for("undirected") -@networkx_algorithm(version_added="24.02", plc="strongly_connected_components") +@networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") def strongly_connected_components(G): G = _to_directed_graph(G) if G.src_indices.size == 0: @@ -62,7 +62,7 @@ def strongly_connected_components(G): @not_implemented_for("undirected") -@networkx_algorithm(version_added="24.02", plc="strongly_connected_components") +@networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") def number_strongly_connected_components(G): G = _to_directed_graph(G) if G.src_indices.size == 0: @@ -72,7 +72,7 @@ def number_strongly_connected_components(G): @not_implemented_for("undirected") -@networkx_algorithm(version_added="24.02", plc="strongly_connected_components") +@networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") def is_strongly_connected(G): G = _to_directed_graph(G) if len(G) == 0: diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py index 5b797b39118..e42acdd3d84 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py @@ -27,21 +27,21 @@ @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def weakly_connected_components(G): G = _to_directed_graph(G) return _connected_components(G, symmetrize="union") @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def number_weakly_connected_components(G): G = _to_directed_graph(G) return _number_connected_components(G, symmetrize="union") @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_weakly_connected(G): G = _to_directed_graph(G) return _is_connected(G, symmetrize="union") diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py index 
f323cdf6004..71f61abf45b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/core.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/core.py @@ -28,7 +28,7 @@ @not_implemented_for("directed") @not_implemented_for("multigraph") -@networkx_algorithm(is_incomplete=True, plc="core_number", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="core_number") def core_number(G): """Directed graphs are not yet supported.""" G = _to_undirected_graph(G) @@ -55,7 +55,7 @@ def _(G): @not_implemented_for("directed") @not_implemented_for("multigraph") -@networkx_algorithm(is_incomplete=True, plc="k_truss_subgraph", version_added="23.12") +@networkx_algorithm(is_incomplete=True, version_added="23.12", _plc="k_truss_subgraph") def k_truss(G, k): """ Currently raises `NotImplementedError` for graphs with more than one connected diff --git a/python/nx-cugraph/nx_cugraph/algorithms/dag.py b/python/nx-cugraph/nx_cugraph/algorithms/dag.py index ad5b7594aa1..64be0a58105 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/dag.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/dag.py @@ -45,11 +45,11 @@ def _ancestors_and_descendants(G, source, *, is_ancestors): return G._nodearray_to_set(node_ids[mask]) -@networkx_algorithm(plc="bfs", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="bfs") def descendants(G, source): return _ancestors_and_descendants(G, source, is_ancestors=False) -@networkx_algorithm(plc="bfs", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="bfs") def ancestors(G, source): return _ancestors_and_descendants(G, source, is_ancestors=True) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py index caa01327a56..9e723624a3b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py @@ -33,8 +33,8 @@ ), **_dtype_param, }, - plc="hits", version_added="23.12", + _plc="hits", ) def hits( G, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py index d45d019c1b7..55fcc3e520a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py @@ -29,8 +29,8 @@ @networkx_algorithm( extra_params=_dtype_param, is_incomplete=True, # dangling not supported - plc={"pagerank", "personalized_pagerank"}, version_added="23.12", + _plc={"pagerank", "personalized_pagerank"}, ) def pagerank( G, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py index b1032a8236b..2012495953e 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py @@ -21,12 +21,12 @@ __all__ = ["single_source_shortest_path_length", "single_target_shortest_path_length"] -@networkx_algorithm(plc="bfs", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="bfs") def single_source_shortest_path_length(G, source, cutoff=None): return _single_shortest_path_length(G, source, cutoff, "Source") -@networkx_algorithm(plc="bfs", version_added="23.12") +@networkx_algorithm(version_added="23.12", _plc="bfs") def single_target_shortest_path_length(G, target, cutoff=None): return 
_single_shortest_path_length(G, target, cutoff, "Target") diff --git a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py index aa671bbb7d4..ef1c011363a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py @@ -57,9 +57,17 @@ def _bfs(G, source, *, depth_limit=None, reverse=False): return distances[mask], predecessors[mask], node_ids[mask] -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def generic_bfs_edges(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): """`neighbors` and `sort_neighbors` parameters are not yet supported.""" + if neighbors is not None: + raise NotImplementedError( + "neighbors argument in generic_bfs_edges is not currently supported" + ) + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in generic_bfs_edges is not currently supported" + ) return bfs_edges(source, depth_limit=depth_limit) @@ -68,9 +76,13 @@ def _(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): return neighbors is None and sort_neighbors is None -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None): """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_edges is not currently supported" + ) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: return @@ -95,9 +107,13 @@ def _(G, source, reverse=False, depth_limit=None, sort_neighbors=None): return sort_neighbors is None -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def bfs_tree(G, source, reverse=False, depth_limit=None, sort_neighbors=None): """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_tree is not currently supported" + ) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: return nxcg.DiGraph.from_coo( @@ -149,9 +165,13 @@ def _(G, source, reverse=False, depth_limit=None, sort_neighbors=None): return sort_neighbors is None -@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def bfs_successors(G, source, depth_limit=None, sort_neighbors=None): """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_successors is not currently supported" + ) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: yield (source, []) @@ -173,7 +193,7 @@ def _(G, source, depth_limit=None, sort_neighbors=None): return sort_neighbors is None -@networkx_algorithm(plc="bfs", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="bfs") def bfs_layers(G, sources): G = _to_graph(G) if sources in G: @@ -201,9 +221,13 @@ def bfs_layers(G, sources): return (G._nodearray_to_list(groups[key]) for key in range(len(groups))) 
-@networkx_algorithm(is_incomplete=True, plc="bfs", version_added="24.02") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") def bfs_predecessors(G, source, depth_limit=None, sort_neighbors=None): """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_predecessors is not currently supported" + ) G = _check_G_and_source(G, source) if depth_limit is not None and depth_limit < 1: return @@ -227,7 +251,7 @@ def _(G, source, depth_limit=None, sort_neighbors=None): return sort_neighbors is None -@networkx_algorithm(plc="bfs", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="bfs") def descendants_at_distance(G, source, distance): G = _check_G_and_source(G, source) if distance is None or distance < 0: diff --git a/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py b/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py index 0b82f079d43..74f57b5ea5a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py @@ -21,20 +21,20 @@ @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_arborescence(G): G = _to_directed_graph(G) return is_tree(G) and int(G._in_degrees_array().max()) <= 1 @not_implemented_for("undirected") -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_branching(G): G = _to_directed_graph(G) return is_forest(G) and int(G._in_degrees_array().max()) <= 1 -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_forest(G): G = _to_graph(G) if len(G) == 0: @@ -60,7 +60,7 @@ def is_forest(G): return True -@networkx_algorithm(plc="weakly_connected_components", version_added="24.02") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") def is_tree(G): G = _to_graph(G) if len(G) == 0: diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py index d09a9e9617a..011ebfd6ef7 100644 --- a/python/nx-cugraph/nx_cugraph/utils/decorators.py +++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py @@ -59,7 +59,7 @@ def __new__( version_added: str, # Required is_incomplete: bool = False, # See self.extra_doc for details if True is_different: bool = False, # See self.extra_doc for details if True - plc: str | set[str] | None = None, # Hidden from user, may be removed someday + _plc: str | set[str] | None = None, # Hidden from user, may be removed someday ): if func is None: return partial( @@ -67,10 +67,10 @@ def __new__( name=name, extra_params=extra_params, nodes_or_number=nodes_or_number, - plc=plc, version_added=version_added, is_incomplete=is_incomplete, is_different=is_different, + _plc=_plc, ) instance = object.__new__(cls) if nodes_or_number is not None and nx.__version__[:3] > "3.2": @@ -89,12 +89,14 @@ def __new__( f"extra_params must be dict, str, or None; got {type(extra_params)}" ) instance.extra_params = extra_params - if plc is None or isinstance(plc, set): - instance._plc_names = plc - elif isinstance(plc, str): - instance._plc_names = {plc} + if _plc is None or isinstance(_plc, set): + instance._plc_names = 
_plc + elif isinstance(_plc, str): + instance._plc_names = {_plc} else: - raise TypeError(f"plc argument must be str, set, or None; got {type(plc)}") + raise TypeError( + f"_plc argument must be str, set, or None; got {type(_plc)}" + ) instance.version_added = version_added instance.is_incomplete = is_incomplete instance.is_different = is_different From 3526af4f8776f9e84c664d26b79768f1345aba8b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Wed, 24 Jan 2024 21:58:17 -0800 Subject: [PATCH 4/7] Implement has_edge() & compute_multiplicity() (#4096) `graph_view.has_edge()` query whether the graph has given (src, dst) pairs as edges. `graph_view.compute_multiplicity()` query the edge multiplicity of given (src, dst) pairs (assumes that multiplicity is 0 if there is no edge between a given pair). This function throws an exception if `graph_view.is_multigraph()` is false (better use `has_edge()` for non-multigraph). In addition to adding the above two functions, this PR includes few code cleanups. * `major_idx_from_major_nocheck()` to `edge_partition_device_view_t` * Move `count_invalid_vertex_pais` from `nbr_intersection.cuh` to `error_check_utils.cuh` * Update `cugraph::test::to_host`, `to_device`, `device_gatherv`, and `device_allgatherv` to support `bool` type (and to handle `std::vector` which stores `bool` values in a packed format) Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Naim (https://github.com/naimnv) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/4096 --- .../cugraph/edge_partition_device_view.cuh | 22 +- cpp/include/cugraph/graph_view.hpp | 25 +- cpp/src/link_prediction/similarity_impl.cuh | 3 +- cpp/src/prims/detail/nbr_intersection.cuh | 170 +-------- ..._v_pair_transform_dst_nbr_intersection.cuh | 3 +- cpp/src/prims/transform_e.cuh | 43 +-- cpp/src/structure/graph_view_impl.cuh | 345 +++++++++++++++++- cpp/src/utilities/error_check_utils.cuh | 137 +++++++ cpp/tests/CMakeLists.txt | 10 + .../count_self_loops_and_multi_edges_test.cpp | 17 +- ...has_edge_and_compute_multiplicity_test.cpp | 281 ++++++++++++++ ...has_edge_and_compute_multiplicity_test.cpp | 331 +++++++++++++++++ cpp/tests/utilities/device_comm_wrapper.cu | 18 +- cpp/tests/utilities/test_utilities.hpp | 48 +-- 14 files changed, 1217 insertions(+), 236 deletions(-) create mode 100644 cpp/src/utilities/error_check_utils.cuh create mode 100644 cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp create mode 100644 cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 213f9b9497a..d1c2cf3df52 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -298,6 +298,20 @@ class edge_partition_device_view_t major_idx_from_major_nocheck(vertex_t major) const noexcept + { + if (major_hypersparse_first_ && (major >= *major_hypersparse_first_)) { + auto major_hypersparse_idx = + detail::major_hypersparse_idx_from_major_nocheck_impl(*dcs_nzd_vertices_, major); + return major_hypersparse_idx + ? 
thrust::make_optional((*major_hypersparse_first_ - major_range_first_) + + *major_hypersparse_idx) + : thrust::nullopt; + } else { + return major - major_range_first_; + } + } + __device__ vertex_t major_from_major_idx_nocheck(vertex_t major_idx) const noexcept { if (major_hypersparse_first_) { @@ -339,6 +353,7 @@ class edge_partition_device_view_t{(*dcs_nzd_vertices_).data()} : thrust::nullopt; } + __host__ __device__ thrust::optional dcs_nzd_vertex_count() const { return dcs_nzd_vertices_ @@ -460,6 +475,11 @@ class edge_partition_device_view_t major_idx_from_major_nocheck(vertex_t major) const noexcept + { + return major_offset_from_major_nocheck(major); + } + __device__ vertex_t major_from_major_idx_nocheck(vertex_t major_idx) const noexcept { return major_from_major_offset_nocheck(major_idx); diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 53c66c6483e..93d884a56d9 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -631,6 +631,19 @@ class graph_view_t has_edge(raft::handle_t const& handle, + /* (edge_srcs, edge_dsts) should be pre-shuffled */ + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check = false); + + rmm::device_uvector compute_multiplicity( + raft::handle_t const& handle, + /* (edge_srcs, edge_dsts) should be pre-shuffled */ + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check = false); + template std::enable_if_t>> local_sorted_unique_edge_srcs() const @@ -928,6 +941,16 @@ class graph_view_t has_edge(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check = false); + + rmm::device_uvector compute_multiplicity(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check = false); + template std::enable_if_t>> local_sorted_unique_edge_srcs() const diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh index 55e8f5c88d7..7ac294d7719 100644 --- a/cpp/src/link_prediction/similarity_impl.cuh +++ b/cpp/src/link_prediction/similarity_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh index cefc1836fa6..8261ec747f9 100644 --- a/cpp/src/prims/detail/nbr_intersection.cuh +++ b/cpp/src/prims/detail/nbr_intersection.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -63,35 +64,6 @@ namespace cugraph { namespace detail { -// check vertices in the pair are valid and first element of the pair is within the local vertex -// partition range -template -struct is_invalid_input_vertex_pair_t { - vertex_t num_vertices{}; - raft::device_span edge_partition_major_range_firsts{}; - raft::device_span edge_partition_major_range_lasts{}; - vertex_t edge_partition_minor_range_first{}; - vertex_t edge_partition_minor_range_last{}; - - __device__ bool operator()(thrust::tuple pair) const - { - auto major = thrust::get<0>(pair); - auto minor = thrust::get<1>(pair); - if (!is_valid_vertex(num_vertices, major) || !is_valid_vertex(num_vertices, minor)) { - return true; - } - auto it = thrust::upper_bound(thrust::seq, - edge_partition_major_range_lasts.begin(), - edge_partition_major_range_lasts.end(), - major); - if (it == edge_partition_major_range_lasts.end()) { return true; } - auto edge_partition_idx = - static_cast(thrust::distance(edge_partition_major_range_lasts.begin(), it)); - if (major < edge_partition_major_range_firsts[edge_partition_idx]) { return true; } - return (minor < edge_partition_minor_range_first) || (minor >= edge_partition_minor_range_last); - } -}; - // group index determined by major_comm_rank (primary key) and local edge partition index (secondary // key) template @@ -154,24 +126,11 @@ struct update_rx_major_local_degree_t { auto major = rx_majors[rx_group_firsts[major_comm_rank * minor_comm_size + local_edge_partition_idx] + offset_in_local_edge_partition]; - vertex_t major_idx{0}; - edge_t local_degree{0}; - if (multi_gpu && (edge_partition.major_hypersparse_first() && - (major >= *(edge_partition.major_hypersparse_first())))) { - auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major); - if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - local_degree = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + auto local_degree = major_idx ? 
edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree = static_cast( count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree)); } @@ -325,29 +284,11 @@ struct pick_min_degree_t { edge_t local_degree0{0}; vertex_t major0 = thrust::get<0>(pair); if constexpr (std::is_same_v) { - vertex_t major_idx{0}; - if constexpr (multi_gpu) { - if (edge_partition.major_hypersparse_first() && - (major0 >= *(edge_partition.major_hypersparse_first()))) { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major0); - if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree0 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major0); - local_degree0 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major0); - local_degree0 = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major0); + local_degree0 = major_idx ? edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree0 > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree0 = count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree0); } @@ -360,29 +301,11 @@ struct pick_min_degree_t { edge_t local_degree1{0}; vertex_t major1 = thrust::get<1>(pair); if constexpr (std::is_same_v) { - vertex_t major_idx{0}; - if constexpr (multi_gpu) { - if (edge_partition.major_hypersparse_first() && - (major1 >= *(edge_partition.major_hypersparse_first()))) { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major1); - if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree1 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major1); - local_degree1 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major1); - local_degree1 = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major1); + local_degree1 = major_idx ? 
edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree1 > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree1 = count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree1); } @@ -699,77 +622,6 @@ struct gatherv_indices_t { } }; -template -size_t count_invalid_vertex_pairs(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexPairIterator vertex_pair_first, - VertexPairIterator vertex_pair_last) -{ - using vertex_t = typename GraphViewType::vertex_type; - - std::vector h_edge_partition_major_range_firsts( - graph_view.number_of_local_edge_partitions()); - std::vector h_edge_partition_major_range_lasts( - h_edge_partition_major_range_firsts.size()); - vertex_t edge_partition_minor_range_first{}; - vertex_t edge_partition_minor_range_last{}; - if constexpr (GraphViewType::is_multi_gpu) { - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i++) { - if constexpr (GraphViewType::is_storage_transposed) { - h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_dst_range_first(i); - h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); - } else { - h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_src_range_first(i); - h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); - } - } - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_minor_range_first = graph_view.local_edge_partition_src_range_first(); - edge_partition_minor_range_last = graph_view.local_edge_partition_src_range_last(); - } else { - edge_partition_minor_range_first = graph_view.local_edge_partition_dst_range_first(); - edge_partition_minor_range_last = graph_view.local_edge_partition_dst_range_last(); - } - } else { - h_edge_partition_major_range_firsts[0] = vertex_t{0}; - h_edge_partition_major_range_lasts[0] = graph_view.number_of_vertices(); - edge_partition_minor_range_first = vertex_t{0}; - edge_partition_minor_range_last = graph_view.number_of_vertices(); - } - rmm::device_uvector d_edge_partition_major_range_firsts( - h_edge_partition_major_range_firsts.size(), handle.get_stream()); - rmm::device_uvector d_edge_partition_major_range_lasts( - h_edge_partition_major_range_lasts.size(), handle.get_stream()); - raft::update_device(d_edge_partition_major_range_firsts.data(), - h_edge_partition_major_range_firsts.data(), - h_edge_partition_major_range_firsts.size(), - handle.get_stream()); - raft::update_device(d_edge_partition_major_range_lasts.data(), - h_edge_partition_major_range_lasts.data(), - h_edge_partition_major_range_lasts.size(), - handle.get_stream()); - - auto num_invalid_pairs = thrust::count_if( - handle.get_thrust_policy(), - vertex_pair_first, - vertex_pair_last, - is_invalid_input_vertex_pair_t{ - graph_view.number_of_vertices(), - raft::device_span(d_edge_partition_major_range_firsts.begin(), - d_edge_partition_major_range_firsts.end()), - raft::device_span(d_edge_partition_major_range_lasts.begin(), - d_edge_partition_major_range_lasts.end()), - edge_partition_minor_range_first, - edge_partition_minor_range_last}); - if constexpr (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - num_invalid_pairs = - host_scalar_allreduce(comm, num_invalid_pairs, raft::comms::op_t::SUM, handle.get_stream()); - } - - return num_invalid_pairs; -} - // In multi-GPU, the first 
element of every vertex pair in [vertex_pair_first, vertex_pair) should // be within the valid edge partition major range assigned to this process and the second element // should be within the valid edge partition minor range assigned to this process. diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index 201c08325d7..469bfcb4e47 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh index c6623621d24..93a2d040b60 100644 --- a/cpp/src/prims/transform_e.cuh +++ b/cpp/src/prims/transform_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -426,28 +426,15 @@ void transform_e(raft::handle_t const& handle, edge_first + edge_partition_offsets[i + 1], [edge_partition, edge_partition_e_mask] __device__(thrust::tuple edge) { - auto major = thrust::get<0>(edge); - auto minor = thrust::get<1>(edge); - vertex_t major_idx{}; - auto major_hypersparse_first = edge_partition.major_hypersparse_first(); - if (major_hypersparse_first) { - if (major < *major_hypersparse_first) { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - } else { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major); - if (!major_hypersparse_idx) { return true; } - major_idx = - edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) + - *major_hypersparse_idx; - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - } + auto major = thrust::get<0>(edge); + auto minor = thrust::get<1>(edge); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (!major_idx) { return true; } vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx); + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); if (*lower_it != minor) { return true; } @@ -494,24 +481,16 @@ void transform_e(raft::handle_t const& handle, auto major = thrust::get<0>(edge); auto minor = thrust::get<1>(edge); - auto major_hypersparse_first = edge_partition.major_hypersparse_first(); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t major_idx{major_offset}; - - if ((major_hypersparse_first) && (major >= *major_hypersparse_first)) { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major); - assert(major_hypersparse_idx); - major_idx = edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) + - *major_hypersparse_idx; - } + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + 
assert(major_idx); auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx); + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(*major_idx); auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); auto upper_it = thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor); diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index da0ecc991df..7928c61cf7b 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -414,6 +415,59 @@ edge_t count_edge_partition_multi_edges( } } +template +std::tuple, std::vector> +compute_edge_indices_and_edge_partition_offsets( + raft::handle_t const& handle, + graph_view_t const& graph_view, + raft::device_span edge_majors, + raft::device_span edge_minors) +{ + auto edge_first = thrust::make_zip_iterator(edge_majors.begin(), edge_minors.begin()); + + rmm::device_uvector edge_indices(edge_majors.size(), handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), edge_indices.begin(), edge_indices.end(), size_t{0}); + thrust::sort(handle.get_thrust_policy(), + edge_indices.begin(), + edge_indices.end(), + [edge_first] __device__(size_t lhs, size_t rhs) { + return *(edge_first + lhs) < *(edge_first + rhs); + }); + + std::vector h_major_range_lasts(graph_view.number_of_local_edge_partitions()); + for (size_t i = 0; i < h_major_range_lasts.size(); ++i) { + if constexpr (store_transposed) { + h_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); + } else { + h_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); + } + } + rmm::device_uvector d_major_range_lasts(h_major_range_lasts.size(), handle.get_stream()); + raft::update_device(d_major_range_lasts.data(), + h_major_range_lasts.data(), + h_major_range_lasts.size(), + handle.get_stream()); + rmm::device_uvector d_lower_bounds(d_major_range_lasts.size(), handle.get_stream()); + auto major_first = edge_majors.begin(); + auto sorted_major_first = thrust::make_transform_iterator( + edge_indices.begin(), + cugraph::detail::indirection_t{major_first}); + thrust::lower_bound(handle.get_thrust_policy(), + sorted_major_first, + sorted_major_first + edge_indices.size(), + d_major_range_lasts.begin(), + d_major_range_lasts.end(), + d_lower_bounds.begin()); + std::vector edge_partition_offsets(d_lower_bounds.size() + 1, 0); + raft::update_host(edge_partition_offsets.data() + 1, + d_lower_bounds.data(), + d_lower_bounds.size(), + handle.get_stream()); + handle.sync_stream(); + + return std::make_tuple(std::move(edge_indices), edge_partition_offsets); +} + } // namespace template @@ -751,4 +805,293 @@ edge_t graph_view_tlocal_edge_partition_segment_offsets()); } +template +rmm::device_uvector +graph_view_t>::has_edge( + raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input 
arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto [edge_indices, edge_partition_offsets] = + compute_edge_indices_and_edge_partition_offsets(handle, + *this, + store_transposed ? edge_dsts : edge_srcs, + store_transposed ? edge_srcs : edge_dsts); + + auto edge_mask_view = this->edge_mask_view(); + + auto sorted_edge_first = thrust::make_transform_iterator( + edge_indices.begin(), cugraph::detail::indirection_t{edge_first}); + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + for (size_t i = 0; i < this->number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + thrust::transform(handle.get_thrust_policy(), + sorted_edge_first + edge_partition_offsets[i], + sorted_edge_first + edge_partition_offsets[i + 1], + thrust::make_permutation_iterator( + ret.begin(), edge_indices.begin() + edge_partition_offsets[i]), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (major_idx) { + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); + auto it = thrust::lower_bound( + thrust::seq, indices, indices + local_degree, minor); + if ((it != indices + local_degree) && *it == minor) { + if (edge_partition_e_mask) { + return (*edge_partition_e_mask) + .get(local_edge_offset + thrust::distance(indices, it)); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + }); + } + + return ret; +} + +template +rmm::device_uvector +graph_view_t>::has_edge( + raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto edge_mask_view = this->edge_mask_view(); + + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view()); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, 0) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + edge_first, + edge_first + edge_srcs.size(), + ret.begin(), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + auto it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + if ((it != indices + local_degree) && *it == minor) { + if (edge_partition_e_mask) { + return (*edge_partition_e_mask).get(local_edge_offset + thrust::distance(indices, it)); + } else { + return true; + } + } else { + return false; + } + }); + + return ret; +} + +template +rmm::device_uvector +graph_view_t>:: + compute_multiplicity(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(this->is_multigraph(), "Use has_edge() instead for non-multigraphs."); + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto [edge_indices, edge_partition_offsets] = + compute_edge_indices_and_edge_partition_offsets(handle, + *this, + store_transposed ? edge_dsts : edge_srcs, + store_transposed ? edge_srcs : edge_dsts); + + auto edge_mask_view = this->edge_mask_view(); + + auto sorted_edge_first = thrust::make_transform_iterator( + edge_indices.begin(), cugraph::detail::indirection_t{edge_first}); + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + for (size_t i = 0; i < this->number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + sorted_edge_first + edge_partition_offsets[i], + sorted_edge_first + edge_partition_offsets[i + 1], + thrust::make_permutation_iterator(ret.begin(), + edge_indices.begin() + edge_partition_offsets[i]), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (major_idx) { + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); + auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor); + auto multiplicity = static_cast(thrust::distance(lower_it, upper_it)); + if (edge_partition_e_mask && (multiplicity > 0)) { + multiplicity = static_cast(detail::count_set_bits( + (*edge_partition_e_mask).value_first(), + static_cast(local_edge_offset + thrust::distance(indices, lower_it)), + static_cast(multiplicity))); + } + return multiplicity; + } else { + return edge_t{0}; + } + }); + } + + return ret; +} + +template +rmm::device_uvector +graph_view_t>:: + compute_multiplicity(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(this->is_multigraph(), "Use has_edge() instead for non-multigraphs."); + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto edge_mask_view = this->edge_mask_view(); + + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view()); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, 0) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + edge_first, + edge_first + edge_srcs.size(), + ret.begin(), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor); + auto multiplicity = static_cast(thrust::distance(lower_it, upper_it)); + if (edge_partition_e_mask && (multiplicity > 0)) { + multiplicity = static_cast(detail::count_set_bits( + (*edge_partition_e_mask).value_first(), + static_cast(local_edge_offset + thrust::distance(indices, lower_it)), + static_cast(multiplicity))); + } + return multiplicity; + }); + + return ret; +} + } // namespace cugraph diff --git a/cpp/src/utilities/error_check_utils.cuh b/cpp/src/utilities/error_check_utils.cuh new file mode 100644 index 00000000000..baaf513d93d --- /dev/null +++ b/cpp/src/utilities/error_check_utils.cuh @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cugraph { +namespace detail { + +// check vertices in the pair are in [0, num_vertices) and belongs to one of the local edge +// partitions. 
+template +struct is_invalid_input_vertex_pair_t { + vertex_t num_vertices{}; + raft::device_span edge_partition_major_range_firsts{}; + raft::device_span edge_partition_major_range_lasts{}; + vertex_t edge_partition_minor_range_first{}; + vertex_t edge_partition_minor_range_last{}; + + __device__ bool operator()(thrust::tuple pair) const + { + auto major = thrust::get<0>(pair); + auto minor = thrust::get<1>(pair); + if (!is_valid_vertex(num_vertices, major) || !is_valid_vertex(num_vertices, minor)) { + return true; + } + auto it = thrust::upper_bound(thrust::seq, + edge_partition_major_range_lasts.begin(), + edge_partition_major_range_lasts.end(), + major); + if (it == edge_partition_major_range_lasts.end()) { return true; } + auto edge_partition_idx = + static_cast(thrust::distance(edge_partition_major_range_lasts.begin(), it)); + if (major < edge_partition_major_range_firsts[edge_partition_idx]) { return true; } + return (minor < edge_partition_minor_range_first) || (minor >= edge_partition_minor_range_last); + } +}; + +template +size_t count_invalid_vertex_pairs(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexPairIterator vertex_pair_first, + VertexPairIterator vertex_pair_last) +{ + using vertex_t = typename GraphViewType::vertex_type; + + std::vector h_edge_partition_major_range_firsts( + graph_view.number_of_local_edge_partitions()); + std::vector h_edge_partition_major_range_lasts( + h_edge_partition_major_range_firsts.size()); + vertex_t edge_partition_minor_range_first{}; + vertex_t edge_partition_minor_range_last{}; + if constexpr (GraphViewType::is_multi_gpu) { + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i++) { + if constexpr (GraphViewType::is_storage_transposed) { + h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_dst_range_first(i); + h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); + } else { + h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_src_range_first(i); + h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); + } + } + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_minor_range_first = graph_view.local_edge_partition_src_range_first(); + edge_partition_minor_range_last = graph_view.local_edge_partition_src_range_last(); + } else { + edge_partition_minor_range_first = graph_view.local_edge_partition_dst_range_first(); + edge_partition_minor_range_last = graph_view.local_edge_partition_dst_range_last(); + } + } else { + h_edge_partition_major_range_firsts[0] = vertex_t{0}; + h_edge_partition_major_range_lasts[0] = graph_view.number_of_vertices(); + edge_partition_minor_range_first = vertex_t{0}; + edge_partition_minor_range_last = graph_view.number_of_vertices(); + } + rmm::device_uvector d_edge_partition_major_range_firsts( + h_edge_partition_major_range_firsts.size(), handle.get_stream()); + rmm::device_uvector d_edge_partition_major_range_lasts( + h_edge_partition_major_range_lasts.size(), handle.get_stream()); + raft::update_device(d_edge_partition_major_range_firsts.data(), + h_edge_partition_major_range_firsts.data(), + h_edge_partition_major_range_firsts.size(), + handle.get_stream()); + raft::update_device(d_edge_partition_major_range_lasts.data(), + h_edge_partition_major_range_lasts.data(), + h_edge_partition_major_range_lasts.size(), + handle.get_stream()); + + auto num_invalid_pairs = thrust::count_if( + handle.get_thrust_policy(), + vertex_pair_first, + 
vertex_pair_last, + is_invalid_input_vertex_pair_t{ + graph_view.number_of_vertices(), + raft::device_span(d_edge_partition_major_range_firsts.begin(), + d_edge_partition_major_range_firsts.end()), + raft::device_span(d_edge_partition_major_range_lasts.begin(), + d_edge_partition_major_range_lasts.end()), + edge_partition_minor_range_first, + edge_partition_minor_range_last}); + if constexpr (GraphViewType::is_multi_gpu) { + auto& comm = handle.get_comms(); + num_invalid_pairs = + host_scalar_allreduce(comm, num_invalid_pairs, raft::comms::op_t::SUM, handle.get_stream()); + } + + return num_invalid_pairs; +} + +} // namespace detail +} // namespace cugraph diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9d2f677abc..3df979fe5c2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -313,6 +313,11 @@ ConfigureTest(DEGREE_TEST structure/degree_test.cpp) ConfigureTest(COUNT_SELF_LOOPS_AND_MULTI_EDGES_TEST "structure/count_self_loops_and_multi_edges_test.cpp") +################################################################################################### +# - Query edge existence and multiplicity tests --------------------------------------------------- +ConfigureTest(HAS_EDGE_AND_COMPUTE_MULTIPLICITY_TEST + "structure/has_edge_and_compute_multiplicity_test.cpp") + ################################################################################################### # - Coarsening tests ------------------------------------------------------------------------------ ConfigureTest(COARSEN_GRAPH_TEST structure/coarsen_graph_test.cpp) @@ -479,6 +484,11 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTestMG(MG_COUNT_SELF_LOOPS_AND_MULTI_EDGES_TEST "structure/mg_count_self_loops_and_multi_edges_test.cpp") + ############################################################################################### + # - MG Query edge existence and multiplicity tests -------------------------------------------- + ConfigureTestMG(MG_HAS_EDGE_AND_COMPUTE_MULTIPLICITY_TEST + "structure/mg_has_edge_and_compute_multiplicity_test.cpp") + ############################################################################################### # - MG PAGERANK tests ------------------------------------------------------------------------- ConfigureTestMG(MG_PAGERANK_TEST link_analysis/mg_pagerank_test.cpp) diff --git a/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp b/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp index 68828d5eee1..b7f1dce2023 100644 --- a/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp +++ b/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
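For orientation, a minimal single-GPU calling sketch for the has_edge() and compute_multiplicity() members declared in graph_view.hpp above (illustration only, not part of this patch): the query_edges() helper name is hypothetical, the graph view is assumed to be non-transposed and already constructed by the caller, and compute_multiplicity() is only valid for multigraphs.

  #include <cugraph/graph_view.hpp>

  #include <raft/core/device_span.hpp>
  #include <raft/core/handle.hpp>

  #include <rmm/device_uvector.hpp>

  #include <tuple>

  // Hypothetical helper (illustration only): query existence and multiplicity of (src, dst) pairs
  // on a single-GPU, non-transposed graph view.
  template <typename vertex_t, typename edge_t>
  std::tuple<rmm::device_uvector<bool>, rmm::device_uvector<edge_t>> query_edges(
    raft::handle_t const& handle,
    cugraph::graph_view_t<vertex_t, edge_t, false, false>& graph_view,
    rmm::device_uvector<vertex_t> const& srcs,
    rmm::device_uvector<vertex_t> const& dsts)
  {
    raft::device_span<vertex_t const> src_span(srcs.data(), srcs.size());
    raft::device_span<vertex_t const> dst_span(dsts.data(), dsts.size());

    // One bool per queried (src, dst) pair; the last argument enables do_expensive_check,
    // which validates that the queried vertex IDs are in range.
    auto exists = graph_view.has_edge(handle, src_span, dst_span, true);

    // Number of parallel edges per queried pair; requires graph_view.is_multigraph().
    auto counts = graph_view.compute_multiplicity(handle, src_span, dst_span);

    return std::make_tuple(std::move(exists), std::move(counts));
  }

The dedicated single-GPU and multi-GPU tests added below exercise these queries against a host-side reference.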
@@ -208,10 +208,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_File, ::testing::Combine( // enable correctness checks - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); @@ -220,10 +217,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_Rmat, ::testing::Combine( // enable correctness checks - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( @@ -235,10 +229,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_Rmat, ::testing::Combine( // disable correctness checks for large graphs - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp new file mode 100644 index 00000000000..3ad6953ca03 --- /dev/null +++ b/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +struct HasEdgeAndComputeMultiplicity_Usecase { + size_t num_vertex_pairs{}; + bool check_correctness{true}; +}; + +template +class Tests_HasEdgeAndComputeMultiplicity + : public ::testing::TestWithParam< + std::tuple> { + public: + Tests_HasEdgeAndComputeMultiplicity() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test( + HasEdgeAndComputeMultiplicity_Usecase const& has_edge_and_compute_multiplicity_usecase, + input_usecase_t const& input_usecase) + { + using weight_t = float; + + constexpr bool renumber = true; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + cugraph::graph_t graph(handle); + std::optional> d_renumber_map_labels{std::nullopt}; + std::tie(graph, std::ignore, d_renumber_map_labels) = + cugraph::test::construct_graph( + handle, input_usecase, false, renumber); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto graph_view = graph.view(); + + raft::random::RngState rng_state(0); + rmm::device_uvector edge_srcs( + has_edge_and_compute_multiplicity_usecase.num_vertex_pairs, handle.get_stream()); + rmm::device_uvector edge_dsts(edge_srcs.size(), handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + edge_srcs.data(), + edge_srcs.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + cugraph::detail::uniform_random_fill(handle.get_stream(), + edge_dsts.data(), + edge_dsts.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Querying edge existence"); + } + + auto edge_exists = + graph_view.has_edge(handle, + raft::device_span(edge_srcs.data(), edge_srcs.size()), + raft::device_span(edge_dsts.data(), edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Computing multiplicity"); + } + + auto edge_multiplicities = graph_view.compute_multiplicity( + handle, + raft::device_span(edge_srcs.data(), edge_srcs.size()), + raft::device_span(edge_dsts.data(), edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (has_edge_and_compute_multiplicity_usecase.check_correctness) { + cugraph::graph_t unrenumbered_graph(handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore, std::ignore) = + cugraph::test::construct_graph( + handle, input_usecase, false, false); + } + auto unrenumbered_graph_view = renumber ? 
unrenumbered_graph.view() : graph_view; + + std::vector h_offsets = cugraph::test::to_host( + handle, unrenumbered_graph_view.local_edge_partition_view().offsets()); + std::vector h_indices = cugraph::test::to_host( + handle, unrenumbered_graph_view.local_edge_partition_view().indices()); + + rmm::device_uvector d_unrenumbered_edge_srcs(edge_srcs.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_edge_dsts(edge_dsts.size(), handle.get_stream()); + raft::copy_async( + d_unrenumbered_edge_srcs.data(), edge_srcs.data(), edge_srcs.size(), handle.get_stream()); + raft::copy_async( + d_unrenumbered_edge_dsts.data(), edge_dsts.data(), edge_dsts.size(), handle.get_stream()); + if (renumber) { + cugraph::unrenumber_local_int_vertices(handle, + d_unrenumbered_edge_srcs.data(), + d_unrenumbered_edge_srcs.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.number_of_vertices()); + cugraph::unrenumber_local_int_vertices(handle, + d_unrenumbered_edge_dsts.data(), + d_unrenumbered_edge_dsts.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.number_of_vertices()); + } + auto h_unrenumbered_edge_srcs = cugraph::test::to_host(handle, d_unrenumbered_edge_srcs); + auto h_unrenumbered_edge_dsts = cugraph::test::to_host(handle, d_unrenumbered_edge_dsts); + + auto h_cugraph_edge_exists = cugraph::test::to_host(handle, edge_exists); + auto h_cugraph_edge_multiplicities = cugraph::test::to_host(handle, edge_multiplicities); + std::vector h_reference_edge_exists(edge_srcs.size()); + std::vector h_reference_edge_multiplicities(edge_srcs.size()); + for (size_t i = 0; i < edge_srcs.size(); ++i) { + auto src = h_unrenumbered_edge_srcs[i]; + auto dst = h_unrenumbered_edge_dsts[i]; + auto major = store_transposed ? dst : src; + auto minor = store_transposed ? src : dst; + auto lower_it = std::lower_bound( + h_indices.begin() + h_offsets[major], h_indices.begin() + h_offsets[major + 1], minor); + auto upper_it = std::upper_bound( + h_indices.begin() + h_offsets[major], h_indices.begin() + h_offsets[major + 1], minor); + auto multiplicity = static_cast(std::distance(lower_it, upper_it)); + h_reference_edge_exists[i] = multiplicity > 0 ? 
true : false; + h_reference_edge_multiplicities[i] = multiplicity; + } + + ASSERT_TRUE(std::equal(h_reference_edge_exists.begin(), + h_reference_edge_exists.end(), + h_cugraph_edge_exists.begin())) + << "has_edge() return values do not match with the reference values."; + ASSERT_TRUE(std::equal(h_reference_edge_multiplicities.begin(), + h_reference_edge_multiplicities.end(), + h_cugraph_edge_multiplicities.begin())) + << "compute_multiplicity() return values do not match with the reference values."; + } + } +}; + +using Tests_HasEdgeAndComputeMultiplicity_File = + Tests_HasEdgeAndComputeMultiplicity; +using Tests_HasEdgeAndComputeMultiplicity_Rmat = + Tests_HasEdgeAndComputeMultiplicity; + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt64Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_HasEdgeAndComputeMultiplicity_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_HasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_HasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 1024 * 128, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp new file mode 100644 index 
00000000000..8079de7ebfe --- /dev/null +++ b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +struct HasEdgeAndComputeMultiplicity_Usecase { + size_t num_vertex_pairs{}; + bool check_correctness{true}; +}; + +template +class Tests_MGHasEdgeAndComputeMultiplicity + : public ::testing::TestWithParam< + std::tuple> { + public: + Tests_MGHasEdgeAndComputeMultiplicity() {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running has_edge & compute_multiplicity on multiple GPUs to that of + // a single-GPU run + template + void run_current_test( + HasEdgeAndComputeMultiplicity_Usecase const& has_edge_and_compute_multiplicity_usecase, + input_usecase_t const& input_usecase) + { + using weight_t = float; + using edge_type_id_t = int32_t; + + HighResTimer hr_timer{}; + + auto const comm_rank = handle_->get_comms().get_rank(); + auto const comm_size = handle_->get_comms().get_size(); + + // 1. create MG graph + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Construct graph"); + } + + cugraph::graph_t mg_graph(*handle_); + std::optional> mg_renumber_map{std::nullopt}; + std::tie(mg_graph, std::ignore, mg_renumber_map) = + cugraph::test::construct_graph( + *handle_, input_usecase, false, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto mg_graph_view = mg_graph.view(); + + // 2. create an edge list to query + + raft::random::RngState rng_state(comm_rank); + size_t num_vertex_pairs_this_gpu = + (has_edge_and_compute_multiplicity_usecase.num_vertex_pairs / comm_size) + + ((comm_rank < has_edge_and_compute_multiplicity_usecase.num_vertex_pairs % comm_size) + ? size_t{1} + : size_t{0}); + rmm::device_uvector d_mg_edge_srcs(num_vertex_pairs_this_gpu, handle_->get_stream()); + rmm::device_uvector d_mg_edge_dsts(d_mg_edge_srcs.size(), handle_->get_stream()); + cugraph::detail::uniform_random_fill(handle_->get_stream(), + d_mg_edge_srcs.data(), + d_mg_edge_srcs.size(), + vertex_t{0}, + mg_graph_view.number_of_vertices(), + rng_state); + cugraph::detail::uniform_random_fill(handle_->get_stream(), + d_mg_edge_dsts.data(), + d_mg_edge_dsts.size(), + vertex_t{0}, + mg_graph_view.number_of_vertices(), + rng_state); + + std::tie(store_transposed ? d_mg_edge_dsts : d_mg_edge_srcs, + store_transposed ? 
d_mg_edge_srcs : d_mg_edge_dsts, + std::ignore, + std::ignore, + std::ignore) = + cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< + vertex_t, + edge_t, + weight_t, + edge_type_id_t>(*handle_, + std::move(store_transposed ? d_mg_edge_dsts : d_mg_edge_srcs), + std::move(store_transposed ? d_mg_edge_srcs : d_mg_edge_dsts), + std::nullopt, + std::nullopt, + std::nullopt, + mg_graph_view.vertex_partition_range_lasts()); + + // 3. run MG has_edge & compute_multiplicity + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Querying edge existence"); + } + + auto d_mg_edge_exists = mg_graph_view.has_edge( + *handle_, + raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size()), + raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Computing multiplicity"); + } + + auto d_mg_edge_multiplicities = mg_graph_view.compute_multiplicity( + *handle_, + raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size()), + raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + // 4. compare SG & MG results + + if (has_edge_and_compute_multiplicity_usecase.check_correctness) { + // 4-1. aggregate MG results + + cugraph::unrenumber_int_vertices( + *handle_, + d_mg_edge_srcs.data(), + d_mg_edge_srcs.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + cugraph::unrenumber_int_vertices( + *handle_, + d_mg_edge_dsts.data(), + d_mg_edge_dsts.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + auto d_mg_aggregate_edge_srcs = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size())); + auto d_mg_aggregate_edge_dsts = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + auto d_mg_aggregate_edge_exists = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_exists.data(), d_mg_edge_exists.size())); + auto d_mg_aggregate_edge_multiplicities = cugraph::test::device_gatherv( + *handle_, + raft::device_span(d_mg_edge_multiplicities.data(), + d_mg_edge_multiplicities.size())); + + cugraph::graph_t sg_graph(*handle_); + std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( + *handle_, + mg_graph_view, + std::optional>{std::nullopt}, + std::make_optional>((*mg_renumber_map).data(), + (*mg_renumber_map).size()), + false); + + if (handle_->get_comms().get_rank() == 0) { + auto sg_graph_view = sg_graph.view(); + + // 4-2.
run SG count_self_loops & count_multi_edges + + auto d_sg_edge_exists = sg_graph_view.has_edge( + *handle_, + raft::device_span(d_mg_aggregate_edge_srcs.data(), + d_mg_aggregate_edge_srcs.size()), + raft::device_span(d_mg_aggregate_edge_dsts.data(), + d_mg_aggregate_edge_dsts.size())); + auto d_sg_edge_multiplicities = sg_graph_view.compute_multiplicity( + *handle_, + raft::device_span(d_mg_aggregate_edge_srcs.data(), + d_mg_aggregate_edge_srcs.size()), + raft::device_span(d_mg_aggregate_edge_dsts.data(), + d_mg_aggregate_edge_dsts.size())); + + // 4-3. compare + + auto h_mg_aggregate_edge_exists = + cugraph::test::to_host(*handle_, d_mg_aggregate_edge_exists); + auto h_mg_aggregate_edge_multiplicities = + cugraph::test::to_host(*handle_, d_mg_aggregate_edge_multiplicities); + auto h_sg_edge_exists = cugraph::test::to_host(*handle_, d_sg_edge_exists); + auto h_sg_edge_multiplicities = cugraph::test::to_host(*handle_, d_sg_edge_multiplicities); + + ASSERT_TRUE(std::equal(h_mg_aggregate_edge_exists.begin(), + h_mg_aggregate_edge_exists.end(), + h_sg_edge_exists.begin())); + ASSERT_TRUE(std::equal(h_mg_aggregate_edge_multiplicities.begin(), + h_mg_aggregate_edge_multiplicities.end(), + h_sg_edge_multiplicities.begin())); + } + } + } + + private: + static std::unique_ptr handle_; +}; + +template +std::unique_ptr Tests_MGHasEdgeAndComputeMultiplicity::handle_ = + nullptr; + +using Tests_MGHasEdgeAndComputeMultiplicity_File = + Tests_MGHasEdgeAndComputeMultiplicity; +using Tests_MGHasEdgeAndComputeMultiplicity_Rmat = + Tests_MGHasEdgeAndComputeMultiplicity; + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt64Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + file_tests, + Tests_MGHasEdgeAndComputeMultiplicity_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_tests, + Tests_MGHasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + 
rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_MGHasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 1024 * 128, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/device_comm_wrapper.cu b/cpp/tests/utilities/device_comm_wrapper.cu index cfc65b5d741..50727394ad7 100644 --- a/cpp/tests/utilities/device_comm_wrapper.cu +++ b/cpp/tests/utilities/device_comm_wrapper.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,9 +40,10 @@ rmm::device_uvector device_gatherv(raft::handle_t const& handle, rmm::device_uvector gathered_v( is_root ? std::reduce(rx_sizes.begin(), rx_sizes.end()) : size_t{0}, handle.get_stream()); + using comm_datatype_t = std::conditional_t, uint8_t, T>; cugraph::device_gatherv(handle.get_comms(), - d_input.data(), - gathered_v.data(), + reinterpret_cast(d_input.data()), + reinterpret_cast(gathered_v.data()), d_input.size(), rx_sizes, rx_displs, @@ -64,9 +65,10 @@ rmm::device_uvector device_allgatherv(raft::handle_t const& handle, rmm::device_uvector gathered_v(std::reduce(rx_sizes.begin(), rx_sizes.end()), handle.get_stream()); + using comm_datatype_t = std::conditional_t, uint8_t, T>; cugraph::device_allgatherv(handle.get_comms(), - d_input.data(), - gathered_v.data(), + reinterpret_cast(d_input.data()), + reinterpret_cast(gathered_v.data()), rx_sizes, rx_displs, handle.get_stream()); @@ -76,6 +78,9 @@ rmm::device_uvector device_allgatherv(raft::handle_t const& handle, // explicit instantiation +template rmm::device_uvector device_gatherv(raft::handle_t const& handle, + raft::device_span d_input); + template rmm::device_uvector device_gatherv(raft::handle_t const& handle, raft::device_span d_input); @@ -91,6 +96,9 @@ template rmm::device_uvector device_gatherv(raft::handle_t const& handle, template rmm::device_uvector device_gatherv(raft::handle_t const& handle, raft::device_span d_input); +template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input); + template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, raft::device_span d_input); diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 321a0536e02..3fa6ae089d3 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -377,18 +377,24 @@ template std::vector to_host(raft::handle_t const& handle, raft::device_span data) { std::vector h_data(data.size()); - raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); + if constexpr (std::is_same_v) { // std::vector stores values in a packed format + auto h_tmp = new bool[data.size()]; + raft::update_host(h_tmp, data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + std::transform( + h_tmp, h_tmp + data.size(), h_data.begin(), [](uint8_t v) { return static_cast(v); }); + delete[] h_tmp; + } else { + raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + } return h_data; } template std::vector to_host(raft::handle_t const& handle, rmm::device_uvector const& data) { - std::vector h_data(data.size()); - raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); - return h_data; + return to_host(handle, raft::device_span(data.data(), data.size())); } template @@ -396,11 +402,7 @@ std::optional> to_host(raft::handle_t const& handle, std::optional> data) { std::optional> h_data{std::nullopt}; - if (data) { - h_data = std::vector((*data).size()); - raft::update_host((*h_data).data(), (*data).data(), (*data).size(), handle.get_stream()); - handle.sync_stream(); - } + if (data) { h_data = to_host(handle, *data); } return h_data; } @@ -410,9 +412,7 @@ std::optional> to_host(raft::handle_t const& handle, { std::optional> h_data{std::nullopt}; if (data) { - h_data = std::vector((*data).size()); - raft::update_host((*h_data).data(), (*data).data(), (*data).size(), handle.get_stream()); - handle.sync_stream(); + h_data = to_host(handle, raft::device_span((*data).data(), (*data).size())); } return h_data; } @@ -430,8 +430,16 @@ template rmm::device_uvector to_device(raft::handle_t const& handle, std::vector const& data) { rmm::device_uvector d_data(data.size(), handle.get_stream()); - raft::update_device(d_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); + if constexpr (std::is_same_v) { // std::vector stores values in a packed format + auto h_tmp = new bool[data.size()]; + std::copy(data.begin(), data.end(), h_tmp); + raft::update_device(d_data.data(), h_tmp, h_tmp + data.size(), handle.get_stream()); + handle.sync_stream(); + delete[] h_tmp; + } else { + raft::update_device(d_data.data(), data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + } return d_data; } @@ -453,11 +461,7 @@ std::optional> to_device(raft::handle_t const& handle, std::optional> const& data) { std::optional> d_data{std::nullopt}; - if (data) { - d_data = rmm::device_uvector(data->size(), handle.get_stream()); - raft::update_host(d_data->data(), data->data(), data->size(), handle.get_stream()); - handle.sync_stream(); - } + if (data) { d_data = to_device(handle, *data); } return d_data; } From 9a261ffd1f5e055e9c8b751c6009f99e9c39d0c1 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Thu, 25 Jan 2024 14:05:45 -0600 Subject: [PATCH 5/7] nx-cugraph: add `complement` and `reverse` (#4103) We apparently already had `G.reverse()`, which made that function extra easy :) Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4103 --- python/nx-cugraph/_nx_cugraph/__init__.py | 2 + .../nx_cugraph/algorithms/__init__.py | 2 + .../algorithms/operators/__init__.py | 13 +++++ 
.../nx_cugraph/algorithms/operators/unary.py | 55 +++++++++++++++++++ 4 files changed, 72 insertions(+) create mode 100644 python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py create mode 100644 python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index 9bca031a2f0..2f283aa153c 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -43,6 +43,7 @@ "chvatal_graph", "circular_ladder_graph", "clustering", + "complement", "complete_bipartite_graph", "complete_graph", "complete_multipartite_graph", @@ -105,6 +106,7 @@ "path_graph", "petersen_graph", "reciprocity", + "reverse", "sedgewick_maze_graph", "single_source_shortest_path_length", "single_target_shortest_path_length", diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py index 08658ad94cb..7aafa85f5b7 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py @@ -17,6 +17,7 @@ community, components, link_analysis, + operators, shortest_paths, traversal, tree, @@ -29,6 +30,7 @@ from .dag import * from .isolate import * from .link_analysis import * +from .operators import * from .reciprocity import * from .shortest_paths import * from .traversal import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py new file mode 100644 index 00000000000..32fd45f5726 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .unary import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py new file mode 100644 index 00000000000..08abc9f2872 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import cupy as cp +import networkx as nx +import numpy as np + +import nx_cugraph as nxcg +from nx_cugraph.convert import _to_graph +from nx_cugraph.utils import index_dtype, networkx_algorithm + +__all__ = ["complement", "reverse"] + + +@networkx_algorithm(version_added="24.02") +def complement(G): + G = _to_graph(G) + N = G._N + # Upcast to int64 so indices don't overflow. + edges_a_b = N * G.src_indices.astype(np.int64) + G.dst_indices + # Now compute flattened indices for all edges except self-loops + # Alt (slower): + # edges_full = np.arange(N * N) + # edges_full = edges_full[(edges_full % (N + 1)).astype(bool)] + edges_full = cp.arange(1, N * (N - 1) + 1) + cp.repeat(cp.arange(N - 1), N) + edges_comp = cp.setdiff1d( + edges_full, + edges_a_b, + assume_unique=not G.is_multigraph(), + ) + src_indices, dst_indices = cp.divmod(edges_comp, N) + return G.__class__.from_coo( + N, + src_indices.astype(index_dtype), + dst_indices.astype(index_dtype), + key_to_id=G.key_to_id, + ) + + +@networkx_algorithm(version_added="24.02") +def reverse(G, copy=True): + if not G.is_directed(): + raise nx.NetworkXError("Cannot reverse an undirected graph.") + if isinstance(G, nx.Graph): + G = nxcg.from_networkx(G, preserve_all_attrs=True) + return G.reverse(copy=copy) From ff76a385c4048ac75d1408c3bcf9d4e3018a88b0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Thu, 25 Jan 2024 13:13:58 -0800 Subject: [PATCH 6/7] Update per_v_transform_reduce_incoming|outgoing_e to support edge masking (#4085) per_v_transform_reduce_(incoming|outgoing_e) now supports edge masking. Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Naim (https://github.com/naimnv) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/4085 --- .../eigenvector_centrality_impl.cuh | 4 +- cpp/src/link_analysis/pagerank_impl.cuh | 4 +- cpp/src/prims/detail/prim_functors.cuh | 60 +++ ...v_transform_reduce_incoming_outgoing_e.cuh | 442 +++++++++++------- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 18 +- .../traversal/od_shortest_distances_impl.cuh | 17 +- ..._v_transform_reduce_incoming_outgoing_e.cu | 27 +- 7 files changed, 381 insertions(+), 191 deletions(-) create mode 100644 cpp/src/prims/detail/prim_functors.cuh diff --git a/cpp/src/centrality/eigenvector_centrality_impl.cuh b/cpp/src/centrality/eigenvector_centrality_impl.cuh index 8d1bea4004d..2129dca6985 100644 --- a/cpp/src/centrality/eigenvector_centrality_impl.cuh +++ b/cpp/src/centrality/eigenvector_centrality_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -117,7 +117,7 @@ rmm::device_uvector eigenvector_centrality( edge_src_centralities.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), - [] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { return src_val * 1.0; }, + [] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { return src_val; }, weight_t{0}, reduce_op::plus{}, centralities.begin()); diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index 92c70fcff20..9a76ba73f92 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -288,7 +288,7 @@ centrality_algorithm_metadata_t pagerank( edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), [alpha] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { - return src_val * 1.0 * alpha; + return src_val * alpha; }, unvarying_part, reduce_op::plus{}, diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh new file mode 100644 index 00000000000..2785ba38dfd --- /dev/null +++ b/cpp/src/prims/detail/prim_functors.cuh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cugraph { + +namespace detail { + +template +struct call_e_op_t { + edge_partition_device_view_t const& edge_partition{}; + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input{}; + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{}; + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{}; + EdgeOp const& e_op{}; + typename GraphViewType::vertex_type major{}; + typename GraphViewType::vertex_type major_offset{}; + typename GraphViewType::vertex_type const* indices{nullptr}; + typename GraphViewType::edge_type edge_offset{}; + + __device__ auto operator()(typename GraphViewType::edge_type i) const + { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed ? minor : major; + auto dst = GraphViewType::is_storage_transposed ? major : minor; + auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; + auto dst_offset = GraphViewType::is_storage_transposed ? 
major_offset : minor_offset; + return e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + } +}; + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 1a7fc0130c4..24b4f0857b1 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -63,11 +64,84 @@ namespace detail { int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +template +struct transform_and_atomic_reduce_t { + edge_partition_device_view_t const& edge_partition{}; + result_t identity_element{}; + vertex_t const* indices{nullptr}; + TransformOp const& transform_op{}; + ResultValueOutputIteratorOrWrapper& result_value_output{}; + + __device__ void operator()(edge_t i) const + { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + if constexpr (multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } +}; + +template +__device__ void update_result_value_output( + edge_partition_device_view_t const& edge_partition, + vertex_t const* indices, + edge_t local_degree, + TransformOp const& transform_op, + result_t init, + ReduceOp const& reduce_op, + size_t output_idx /* relevent only when update_major === true */, + result_t identity_element, + ResultValueOutputIteratorOrWrapper& result_value_output) +{ + if constexpr (update_major) { + *(result_value_output + output_idx) = + thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } else { + thrust::for_each( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, identity_element, indices, transform_op, result_value_output}); + } +} + template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -104,6 +180,7 @@ __global__ void per_v_transform_reduce_e_hypersparse( while (idx < static_cast(dcs_nzd_vertex_count)) { auto major = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); auto major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region vertex_t const* indices{nullptr}; @@ -111,60 +188,50 @@ __global__ void per_v_transform_reduce_e_hypersparse( edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = 
edge_partition.local_edges(static_cast(major_idx)); - auto transform_op = [&edge_partition, - &edge_partition_src_value_input, - &edge_partition_dst_value_input, - &edge_partition_e_value_input, - &e_op, - major, - indices, - edge_offset] __device__(auto i) { - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed ? minor : major; - auto dst = GraphViewType::is_storage_transposed ? major : minor; - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - }; - if constexpr (update_major) { - *(result_value_output + (major - *(edge_partition.major_hypersparse_first()))) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + + if (edge_partition_e_mask) { + auto transform_op = + [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_e_op(i); + } else { + return identity_element; + } + }; + + update_result_value_output(edge_partition, + indices, + local_degree, + transform_op, + init, + reduce_op, + major - *(edge_partition).major_hypersparse_first(), + identity_element, + result_value_output); } else { - if constexpr (GraphViewType::is_multi_gpu) { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - }); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - }); - } + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + major - *(edge_partition).major_hypersparse_first(), + identity_element, + result_value_output); } idx += gridDim.x * blockDim.x; } @@ -175,6 +242,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -209,71 +279,57 @@ __global__ void per_v_transform_reduce_e_low_degree( auto idx = static_cast(tid); while (idx < 
static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(static_cast(major_offset)); - auto transform_op = [&edge_partition, - &edge_partition_src_value_input, - &edge_partition_dst_value_input, - &edge_partition_e_value_input, - &e_op, - major_offset, - indices, - edge_offset] __device__(auto i) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - }; - if constexpr (update_major) { - *(result_value_output + idx) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + + if (edge_partition_e_mask) { + auto transform_op = + [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_e_op(i); + } else { + return identity_element; + } + }; + + update_result_value_output(edge_partition, + indices, + local_degree, + transform_op, + init, + reduce_op, + idx, + identity_element, + result_value_output); } else { - if constexpr (GraphViewType::is_multi_gpu) { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - }); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - }); - } + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + idx, + identity_element, + result_value_output); } idx += gridDim.x * blockDim.x; } @@ -284,6 +340,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if 
update_major == true */, @@ -327,41 +385,61 @@ __global__ void per_v_transform_reduce_e_mid_degree( raft::warp_size()]; // relevant only if update_major == true while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + [[maybe_unused]] auto reduced_e_op_result = lane_id == 0 ? init : identity_element; // relevant only if update_major == true - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + if (edge_partition_e_mask) { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } } } } + if constexpr (update_major) { reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) .Reduce(reduced_e_op_result, reduce_op); @@ -377,6 +455,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -416,41 +496,61 @@ 
__global__ void per_v_transform_reduce_e_high_degree( typename BlockReduce::TempStorage temp_storage; // relevant only if update_major == true while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + [[maybe_unused]] auto reduced_e_op_result = threadIdx.x == 0 ? init : identity_element; // relevant only if update_major == true - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + if (edge_partition_e_mask) { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } } } } + if constexpr (update_major) { reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } @@ -656,10 +756,18 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream(); } + auto edge_mask_view = 
graph_view.edge_mask_view(); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; auto major_init = ReduceOp::identity_element; if constexpr (update_major) { @@ -737,9 +845,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } } @@ -761,9 +871,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { @@ -784,6 +896,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, @@ -806,6 +919,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, output_buffer, e_op, major_init, @@ -825,9 +939,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } } @@ -1056,8 +1172,6 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } @@ -1137,8 +1251,6 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 745f1a8fd8e..18e722d62cc 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,7 +80,7 @@ template -struct call_e_op_t { +struct transform_reduce_v_frontier_call_e_op_t { EdgeOp e_op{}; __device__ thrust::optional< @@ -331,13 +331,13 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, // 1. 
fill the buffer - detail::call_e_op_t + detail::transform_reduce_v_frontier_call_e_op_t e_op_wrapper{e_op}; auto [key_buffer, payload_buffer] = diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index cc69cb5f67f..58fae83bca0 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -639,13 +639,14 @@ rmm::device_uvector od_shortest_distances( static_cast(origins.size()), cutoff, invalid_distance}; - detail::call_e_op_t, - weight_t, - vertex_t, - thrust::nullopt_t, - thrust::nullopt_t, - weight_t, - e_op_t> + detail::transform_reduce_v_frontier_call_e_op_t< + thrust::tuple, + weight_t, + vertex_t, + thrust::nullopt_t, + thrust::nullopt_t, + weight_t, + e_op_t> e_op_wrapper{e_op}; auto new_frontier_tagged_vertex_buffer = diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index 677d6ce5022..fc8114a4652 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -150,8 +150,9 @@ struct result_compare { }; struct Prims_Usecase { - bool check_correctness{true}; bool test_weighted{false}; + bool edge_masking{false}; + bool check_correctness{true}; }; template @@ -200,6 +201,13 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. 
run MG transform reduce const int hash_bin_count = 5; @@ -674,7 +682,10 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGPerVTransformReduceIncomingOutgoingE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{true}), + ::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -682,7 +693,10 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -694,7 +708,10 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{false}), + ::testing::Values(Prims_Usecase{false, false, false}, + Prims_Usecase{false, true, false}, + Prims_Usecase{true, false, false}, + Prims_Usecase{true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() From 3ff2abd44f90cd143f1fe09acdd12c1ce848656f Mon Sep 17 00:00:00 2001 From: Tingyu Wang Date: Mon, 29 Jan 2024 10:16:52 -0500 Subject: [PATCH 7/7] Create `cugraph-equivariant` package (#4036) Bring up `cugraph-equivariant` package and add TensorProduct conv layers. 
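
For reviewers skimming the patch, the intended call pattern looks roughly like the sketch
below. It paraphrases the docstring and tests added later in this patch; the irreps strings,
tensor sizes, and random inputs are illustrative placeholders only and are not part of the PR.

    # Hypothetical usage sketch (not part of the PR); assumes torch, e3nn, and
    # pylibcugraphops are installed on a CUDA-capable machine.
    import torch
    from torch import nn
    from e3nn import o3
    from cugraph_equivariant.nn import FullyConnectedTensorProductConv

    in_irreps = o3.Irreps("10x0e + 10x1e")   # example node-feature irreps
    sh_irreps = o3.Irreps("0e + 1e")         # example edge spherical-harmonic irreps
    out_irreps = o3.Irreps("20x0e + 10x1e")  # example output irreps

    conv = FullyConnectedTensorProductConv(
        in_irreps, sh_irreps, out_irreps,
        mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU(),
    ).cuda()

    num_src, num_dst, num_edges = 100, 80, 300
    src = torch.randint(num_src, (num_edges,), device="cuda")
    dst = torch.randint(num_dst, (num_edges,), device="cuda")
    graph = ((src, dst), (num_src, num_dst))  # COO adjacency plus its shape

    src_features = torch.randn(num_src, in_irreps.dim, device="cuda")
    edge_sh = torch.randn(num_edges, sh_irreps.dim, device="cuda")  # stand-in for real SH values
    edge_emb = torch.randn(num_edges, 6, device="cuda")  # width must equal mlp_channels[0]

    out = conv(src_features, edge_sh, edge_emb, graph)  # shape: (num_dst, out_irreps.dim)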
Authors: - Tingyu Wang (https://github.com/tingyu66) Approvers: - Jake Awe (https://github.com/AyodeAwe) - https://github.com/DejunL - Maximilian Stadler (https://github.com/stadlmax) - Mario Geiger (https://github.com/mariogeiger) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4036 --- .github/workflows/build.yaml | 19 ++ .github/workflows/pr.yaml | 16 ++ .github/workflows/test.yaml | 9 + build.sh | 15 +- ci/build_python.sh | 5 + ci/build_wheel.sh | 3 +- ci/build_wheel_cugraph-equivariant.sh | 6 + ci/test_python.sh | 41 +++ ci/test_wheel_cugraph-equivariant.sh | 33 +++ conda/recipes/cugraph-equivariant/build.sh | 7 + conda/recipes/cugraph-equivariant/meta.yaml | 37 +++ dependencies.yaml | 22 ++ python/cugraph-equivariant/LICENSE | 1 + python/cugraph-equivariant/README.md | 5 + .../cugraph_equivariant/VERSION | 1 + .../cugraph_equivariant/__init__.py | 14 + .../cugraph_equivariant/_version.py | 27 ++ .../cugraph_equivariant/nn/__init__.py | 21 ++ .../nn/tensor_product_conv.py | 259 ++++++++++++++++++ .../cugraph_equivariant/tests/conftest.py | 31 +++ .../cugraph_equivariant/tests/test_scatter.py | 28 ++ .../tests/test_tensor_product_conv.py | 115 ++++++++ .../cugraph_equivariant/utils/__init__.py | 18 ++ .../cugraph_equivariant/utils/scatter.py | 43 +++ python/cugraph-equivariant/pyproject.toml | 64 +++++ python/cugraph-equivariant/setup.py | 20 ++ 26 files changed, 857 insertions(+), 3 deletions(-) create mode 100755 ci/build_wheel_cugraph-equivariant.sh create mode 100755 ci/test_wheel_cugraph-equivariant.sh create mode 100644 conda/recipes/cugraph-equivariant/build.sh create mode 100644 conda/recipes/cugraph-equivariant/meta.yaml create mode 120000 python/cugraph-equivariant/LICENSE create mode 100644 python/cugraph-equivariant/README.md create mode 120000 python/cugraph-equivariant/cugraph_equivariant/VERSION create mode 100644 python/cugraph-equivariant/cugraph_equivariant/__init__.py create mode 100644 python/cugraph-equivariant/cugraph_equivariant/_version.py create mode 100644 python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py create mode 100644 python/cugraph-equivariant/cugraph_equivariant/nn/tensor_product_conv.py create mode 100644 python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py create mode 100644 python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py create mode 100644 python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py create mode 100644 python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py create mode 100644 python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py create mode 100644 python/cugraph-equivariant/pyproject.toml create mode 100644 python/cugraph-equivariant/setup.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 273a8902eae..243c5f23ec0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -173,3 +173,22 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: cugraph-pyg + wheel-build-cugraph-equivariant: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_cugraph-equivariant.sh + wheel-publish-cugraph-equivariant: + needs: wheel-build-cugraph-equivariant + secrets: inherit + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: cugraph-equivariant diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 84d22f8e896..1bb2e0ab0a7 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -29,6 +29,8 @@ jobs: - wheel-tests-cugraph-dgl - wheel-build-cugraph-pyg - wheel-tests-cugraph-pyg + - wheel-build-cugraph-equivariant + - wheel-tests-cugraph-equivariant - devcontainer secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 @@ -161,6 +163,20 @@ jobs: build_type: pull-request script: ci/test_wheel_cugraph-pyg.sh matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0")) + wheel-build-cugraph-equivariant: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/build_wheel_cugraph-equivariant.sh + wheel-tests-cugraph-equivariant: + needs: wheel-build-cugraph-equivariant + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/test_wheel_cugraph-equivariant.sh + matrix_filter: map(select(.ARCH == "amd64")) devcontainer: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 773358ede8d..71051bcc529 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -75,3 +75,12 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/test_wheel_cugraph-pyg.sh + wheel-tests-cugraph-equivariant: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/test_wheel_cugraph-equivariant.sh diff --git a/build.sh b/build.sh index 5044b3a55b3..82de45ca9fb 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cugraph build script @@ -31,6 +31,7 @@ VALIDARGS=" cugraph-service cugraph-pyg cugraph-dgl + cugraph-equivariant nx-cugraph cpp-mgtests cpp-mtmgtests @@ -60,6 +61,7 @@ HELP="$0 [ ...] [ ...] cugraph-service - build the cugraph-service_client and cugraph-service_server Python package cugraph-pyg - build the cugraph-pyg Python package cugraph-dgl - build the cugraph-dgl extensions for DGL + cugraph-equivariant - build the cugraph-equivariant Python package nx-cugraph - build the nx-cugraph Python package cpp-mgtests - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency. cpp-mtmgtests - build libcugraph MTMG tests. Adds UCX as a dependency (temporary). @@ -222,7 +224,7 @@ if hasArg uninstall; then # removes the latest one and leaves the others installed. build.sh uninstall # can be run multiple times to remove all of them, but that is not obvious. 
pip uninstall -y pylibcugraph cugraph cugraph-service-client cugraph-service-server \ - cugraph-dgl cugraph-pyg nx-cugraph + cugraph-dgl cugraph-pyg cugraph-equivariant nx-cugraph fi if hasArg clean; then @@ -359,6 +361,15 @@ if hasArg cugraph-dgl || hasArg all; then fi fi +# Build and install the cugraph-equivariant Python package +if hasArg cugraph-equivariant || hasArg all; then + if hasArg --clean; then + cleanPythonDir ${REPODIR}/python/cugraph-equivariant + else + python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-equivariant + fi +fi + # Build and install the nx-cugraph Python package if hasArg nx-cugraph || hasArg all; then if hasArg --clean; then diff --git a/ci/build_python.sh b/ci/build_python.sh index a99e5ce63e8..07a4f59396b 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -89,4 +89,9 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then conda/recipes/cugraph-dgl fi +rapids-conda-retry mambabuild \ + --no-test \ + --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ + conda/recipes/cugraph-equivariant + rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 828d8948143..30a1c98c106 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -57,7 +57,8 @@ python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check # pure-python packages should not have auditwheel run on them. if [[ ${package_name} == "nx-cugraph" ]] || \ [[ ${package_name} == "cugraph-dgl" ]] || \ - [[ ${package_name} == "cugraph-pyg" ]]; then + [[ ${package_name} == "cugraph-pyg" ]] || \ + [[ ${package_name} == "cugraph-equivariant" ]]; then RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 dist else mkdir -p final_dist diff --git a/ci/build_wheel_cugraph-equivariant.sh b/ci/build_wheel_cugraph-equivariant.sh new file mode 100755 index 00000000000..fcc8e0f774c --- /dev/null +++ b/ci/build_wheel_cugraph-equivariant.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +./ci/build_wheel.sh cugraph-equivariant python/cugraph-equivariant diff --git a/ci/test_python.sh b/ci/test_python.sh index 7eb5a08edc8..5892c37e35b 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -247,5 +247,46 @@ else rapids-logger "skipping cugraph_pyg pytest on CUDA != 11.8" fi +# test cugraph-equivariant +if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then + if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then + # Reuse cugraph-dgl's test env for cugraph-equivariant + set +u + conda activate test_cugraph_dgl + set -u + rapids-mamba-retry install \ + --channel "${CPP_CHANNEL}" \ + --channel "${PYTHON_CHANNEL}" \ + --channel pytorch \ + --channel nvidia \ + cugraph-equivariant + pip install e3nn==0.5.1 + + rapids-print-env + + rapids-logger "pytest cugraph-equivariant" + pushd python/cugraph-equivariant/cugraph_equivariant + pytest \ + --cache-clear \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-equivariant.xml" \ + --cov-config=../../.coveragerc \ + --cov=cugraph_equivariant \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-equivariant-coverage.xml" \ + --cov-report=term \ + . 
+ popd + + # Reactivate the test environment back + set +u + conda deactivate + conda activate test + set -u + else + rapids-logger "skipping cugraph-equivariant pytest on ARM64" + fi +else + rapids-logger "skipping cugraph-equivariant pytest on CUDA!=11.8" +fi + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel_cugraph-equivariant.sh b/ci/test_wheel_cugraph-equivariant.sh new file mode 100755 index 00000000000..f054780b03a --- /dev/null +++ b/ci/test_wheel_cugraph-equivariant.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eoxu pipefail + +package_name="cugraph-equivariant" +package_dir="python/cugraph-equivariant" + +python_package_name=$(echo ${package_name}|sed 's/-/_/g') + +mkdir -p ./dist +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# use 'ls' to expand wildcard before adding `[extra]` requires for pip +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# pip creates wheels using python package names +python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] + + +PKG_CUDA_VER="$(echo ${CUDA_VERSION} | cut -d '.' -f1,2 | tr -d '.')" +PKG_CUDA_VER_MAJOR=${PKG_CUDA_VER:0:2} +if [[ "${PKG_CUDA_VER_MAJOR}" == "12" ]]; then + PYTORCH_CUDA_VER="121" +else + PYTORCH_CUDA_VER=$PKG_CUDA_VER +fi +PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}" + +rapids-logger "Installing PyTorch and e3nn" +rapids-retry python -m pip install torch --index-url ${PYTORCH_URL} +rapids-retry python -m pip install e3nn + +python -m pytest python/cugraph-equivariant/cugraph_equivariant/tests diff --git a/conda/recipes/cugraph-equivariant/build.sh b/conda/recipes/cugraph-equivariant/build.sh new file mode 100644 index 00000000000..f0ff1688b55 --- /dev/null +++ b/conda/recipes/cugraph-equivariant/build.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Copyright (c) 2024, NVIDIA CORPORATION. + +# This assumes the script is executed from the root of the repo directory + +./build.sh cugraph-equivariant diff --git a/conda/recipes/cugraph-equivariant/meta.yaml b/conda/recipes/cugraph-equivariant/meta.yaml new file mode 100644 index 00000000000..a952812f845 --- /dev/null +++ b/conda/recipes/cugraph-equivariant/meta.yaml @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cugraph-equivariant + version: {{ version }} + +source: + path: ../../.. + +build: + number: {{ GIT_DESCRIBE_NUMBER }} + build: + number: {{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + +requirements: + host: + - python + run: + - pylibcugraphops ={{ minor_version }} + - python + +tests: + imports: + - cugraph_equivariant + +about: + home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph + license: Apache-2.0 + license_file: ../../../LICENSE + summary: GPU-accelerated equivariant convolutional layers. 
diff --git a/dependencies.yaml b/dependencies.yaml index 18ddb6c51dd..e9badf2be9f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -198,6 +198,28 @@ files: key: test includes: - test_python_common + py_build_cugraph_equivariant: + output: pyproject + pyproject_dir: python/cugraph-equivariant + extras: + table: build-system + includes: + - python_build_wheel + py_run_cugraph_equivariant: + output: pyproject + pyproject_dir: python/cugraph-equivariant + extras: + table: project + includes: + - depends_on_pylibcugraphops + py_test_cugraph_equivariant: + output: pyproject + pyproject_dir: python/cugraph-equivariant + extras: + table: project.optional-dependencies + key: test + includes: + - test_python_common py_build_cugraph_service_client: output: pyproject pyproject_dir: python/cugraph-service/client diff --git a/python/cugraph-equivariant/LICENSE b/python/cugraph-equivariant/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/cugraph-equivariant/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/cugraph-equivariant/README.md b/python/cugraph-equivariant/README.md new file mode 100644 index 00000000000..d5de8852709 --- /dev/null +++ b/python/cugraph-equivariant/README.md @@ -0,0 +1,5 @@ +# cugraph-equivariant + +## Description + +cugraph-equivariant library provides fast symmetry-preserving (equivariant) operations and convolutional layers, to accelerate the equivariant neural networks in drug discovery and other domains. diff --git a/python/cugraph-equivariant/cugraph_equivariant/VERSION b/python/cugraph-equivariant/cugraph_equivariant/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cugraph-equivariant/cugraph_equivariant/__init__.py b/python/cugraph-equivariant/cugraph_equivariant/__init__.py new file mode 100644 index 00000000000..20507bd9329 --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cugraph_equivariant._version import __git_commit__, __version__ diff --git a/python/cugraph-equivariant/cugraph_equivariant/_version.py b/python/cugraph-equivariant/cugraph_equivariant/_version.py new file mode 100644 index 00000000000..31a707bb17e --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/_version.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.resources + +# Read VERSION file from the module that is symlinked to VERSION file +# in the root of the repo at build time or copied to the module at +# installation. VERSION is a separate file that allows CI build-time scripts +# to update version info (including commit hashes) without modifying +# source files. +__version__ = ( + importlib.resources.files("cugraph_equivariant") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py b/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py new file mode 100644 index 00000000000..8f4d8de0042 --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tensor_product_conv import FullyConnectedTensorProductConv + +DiffDockTensorProductConv = FullyConnectedTensorProductConv + +__all__ = [ + "FullyConnectedTensorProductConv", + "DiffDockTensorProductConv", +] diff --git a/python/cugraph-equivariant/cugraph_equivariant/nn/tensor_product_conv.py b/python/cugraph-equivariant/cugraph_equivariant/nn/tensor_product_conv.py new file mode 100644 index 00000000000..5120a23180d --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/nn/tensor_product_conv.py @@ -0,0 +1,259 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Sequence, Union + +import torch +from torch import nn +from e3nn import o3 +from e3nn.nn import BatchNorm + +from cugraph_equivariant.utils import scatter_reduce + +from pylibcugraphops.pytorch.operators import FusedFullyConnectedTensorProduct + + +class FullyConnectedTensorProductConv(nn.Module): + r"""Message passing layer for tensor products in DiffDock-like architectures. + The left operand of tensor product is the spherical harmonic representation + of edge vector; the right operand consists of node features in irreps. + + .. math:: + \sum_{b \in \mathcal{N}_a} Y\left(\hat{r}_{a b}\right) + \otimes_{\psi_{a b}} \mathbf{h}_b + + where the path weights :math:`\psi_{a b}` can be constructed from edge + embeddings and scalar features using an MLP: + + .. 
math:: + \psi_{a b} = \operatorname{MLP} + \left(e_{a b}, \mathbf{h}_a^0, \mathbf{h}_b^0\right) + + Users have the option to either directly input the weights or provide the + MLP parameters and scalar features from edges and nodes. + + Parameters + ---------- + in_irreps : e3nn.o3.Irreps + Irreps for the input node features. + + sh_irreps : e3nn.o3.Irreps + Irreps for the spherical harmonic representations of edge vectors. + + out_irreps : e3nn.o3.Irreps + Irreps for the output. + + batch_norm : bool, optional (default=True) + If true, batch normalization is applied. + + mlp_channels : sequence of ints, optional (default=None) + A sequence of integers defining number of neurons in each layer in MLP + before the output layer. If `None`, no MLP will be added. The input layer + contains edge embeddings and node scalar features. + + mlp_activation : nn.Module or sequence of nn.Module, optional (default=nn.GELU()) + A sequence of functions to be applied in between linear layers in MLP, + e.g., `nn.Sequential(nn.ReLU(), nn.Dropout(0.4))`. + + e3nn_compat_mode: bool, optional (default=False) + cugraph-ops and e3nn use different memory layout for Irreps-tensors. + The last (fastest moving) dimension is num_channels for cugraph-ops and + ir.dim for e3nn. When enabled, the input and output of this layer will + follow e3nn's memory layout. + + Examples + -------- + >>> # Case 1: MLP with the input layer having 6 channels and 2 hidden layers + >>> # having 16 channels. edge_emb.size(1) must match the size of + >>> # the input layer: 6 + >>> + >>> conv1 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, + >>> mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU()).cuda() + >>> out = conv1(src_features, edge_sh, edge_emb, graph) + >>> + >>> # Case 2: Same as case 1 but with the scalar features from edges, sources + >>> # and destinations passed in separately. 
+ >>> + >>> conv2 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, + >>> mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU()).cuda() + >>> out = conv3(src_features, edge_sh, edge_scalars, graph, + >>> src_scalars=src_scalars, dst_scalars=dst_scalars) + >>> + >>> # Case 3: No MLP, edge_emb will be directly used as the tensor product weights + >>> + >>> conv3 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, + >>> mlp_channels=None).cuda() + >>> out = conv2(src_features, edge_sh, edge_emb, graph) + + """ + + def __init__( + self, + in_irreps: o3.Irreps, + sh_irreps: o3.Irreps, + out_irreps: o3.Irreps, + batch_norm: bool = True, + mlp_channels: Optional[Sequence[int]] = None, + mlp_activation: Union[nn.Module, Sequence[nn.Module]] = nn.GELU(), + e3nn_compat_mode: bool = False, + ): + super().__init__() + self.in_irreps = in_irreps + self.out_irreps = out_irreps + self.sh_irreps = sh_irreps + self.e3nn_compat_mode = e3nn_compat_mode + + self.tp = FusedFullyConnectedTensorProduct( + in_irreps, sh_irreps, out_irreps, e3nn_compat_mode=e3nn_compat_mode + ) + + self.batch_norm = BatchNorm(out_irreps) if batch_norm else None + + if mlp_activation is None: + mlp_activation = [] + elif hasattr(mlp_activation, "__len__") and hasattr( + mlp_activation, "__getitem__" + ): + mlp_activation = list(mlp_activation) + else: + mlp_activation = [mlp_activation] + + if mlp_channels is not None: + dims = list(mlp_channels) + [self.tp.weight_numel] + mlp = [] + for i in range(len(dims) - 1): + mlp.append(nn.Linear(dims[i], dims[i + 1])) + if i != len(dims) - 2: + mlp.extend(mlp_activation) + self.mlp = nn.Sequential(*mlp) + else: + self.mlp = None + + def forward( + self, + src_features: torch.Tensor, + edge_sh: torch.Tensor, + edge_emb: torch.Tensor, + graph: tuple[torch.Tensor, tuple[int, int]], + src_scalars: Optional[torch.Tensor] = None, + dst_scalars: Optional[torch.Tensor] = None, + reduce: str = "mean", + edge_envelope: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass. + + Parameters + ---------- + src_features : torch.Tensor + Source node features. + Shape: (num_src_nodes, in_irreps.dim) + + edge_sh : torch.Tensor + The spherical harmonic representations of the edge vectors. + Shape: (num_edges, sh_irreps.dim) + + edge_emb: torch.Tensor + Edge embeddings that are fed into MLPs to generate tensor product weights. + Shape: (num_edges, dim), where `dim` should be: + - `tp.weight_numel` when the layer does not contain MLPs. + - num_edge_scalars, with the sum of num_[edge/src/dst]_scalars being + mlp_channels[0] + + graph : tuple + A tuple that stores the graph information, with the first element being + the adjacency matrix in COO, and the second element being its shape: + (num_src_nodes, num_dst_nodes). + + src_scalars: torch.Tensor, optional + Scalar features of source nodes. + Shape: (num_src_nodes, num_src_scalars) + + dst_scalars: torch.Tensor, optional + Scalar features of destination nodes. + Shape: (num_dst_nodes, num_dst_scalars) + + reduce : str, optional (default="mean") + Reduction operator. Choose between "mean" and "sum". + + edge_envelope: torch.Tensor, optional + Typically used as attenuation factors to fade out messages coming + from nodes close to the cutoff distance used to create the graph. + This is important to make the model smooth to the changes in node's + coordinates. + Shape: (num_edges,) + + Returns + ------- + torch.Tensor + Output node features. 
+            Shape: (num_dst_nodes, out_irreps.dim)
+        """
+        edge_emb_size = edge_emb.size(-1)
+        src_scalars_size = 0 if src_scalars is None else src_scalars.size(-1)
+        dst_scalars_size = 0 if dst_scalars is None else dst_scalars.size(-1)
+
+        if self.mlp is None:
+            if self.tp.weight_numel != edge_emb_size:
+                raise RuntimeError(
+                    f"When MLP is not present, edge_emb's last dimension must "
+                    f"equal tp.weight_numel (but got {edge_emb_size} and "
+                    f"{self.tp.weight_numel})"
+                )
+        else:
+            total_size = edge_emb_size + src_scalars_size + dst_scalars_size
+            if self.mlp[0].in_features != total_size:
+                raise RuntimeError(
+                    f"The size of MLP's input layer ({self.mlp[0].in_features}) "
+                    f"does not match the total number of scalar features from "
+                    f"edge_emb, src_scalars and dst_scalars ({total_size})"
+                )
+
+        if reduce not in ["mean", "sum"]:
+            raise RuntimeError(
+                f"reduce argument must be either 'mean' or 'sum', got {reduce}."
+            )
+
+        (src, dst), (num_src_nodes, num_dst_nodes) = graph
+
+        if self.mlp is not None:
+            if src_scalars is None and dst_scalars is None:
+                tp_weights = self.mlp(edge_emb)
+            else:
+                w_edge, w_src, w_dst = torch.split(
+                    self.mlp[0].weight,
+                    (edge_emb_size, src_scalars_size, dst_scalars_size),
+                    dim=-1,
+                )
+                tp_weights = edge_emb @ w_edge.T + self.mlp[0].bias
+
+                if src_scalars is not None:
+                    tp_weights += (src_scalars @ w_src.T)[src]
+
+                if dst_scalars is not None:
+                    tp_weights += (dst_scalars @ w_dst.T)[dst]
+
+                tp_weights = self.mlp[1:](tp_weights)
+        else:
+            tp_weights = edge_emb
+
+        out = self.tp(src_features[src], edge_sh, tp_weights)
+
+        if edge_envelope is not None:
+            out = out * edge_envelope.view(-1, 1)
+
+        out = scatter_reduce(out, dst, dim=0, dim_size=num_dst_nodes, reduce=reduce)
+
+        if self.batch_norm:
+            out = self.batch_norm(out)
+
+        return out
diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py b/python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py
new file mode 100644
index 00000000000..c7c6bad07db
--- /dev/null
+++ b/python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+
+@pytest.fixture
+def example_scatter_data():
+    src_feat = torch.Tensor([3, 1, 0, 1, 1, 2])
+    dst_indices = torch.Tensor([0, 1, 2, 2, 3, 1])
+
+    results = {
+        "sum": torch.Tensor([3.0, 3.0, 1.0, 1.0]),
+        "mean": torch.Tensor([3.0, 1.5, 0.5, 1.0]),
+        "prod": torch.Tensor([3.0, 2.0, 0.0, 1.0]),
+        "amax": torch.Tensor([3.0, 2.0, 1.0, 1.0]),
+        "amin": torch.Tensor([3.0, 1.0, 0.0, 1.0]),
+    }
+
+    return src_feat, dst_indices, results
diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py b/python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py
new file mode 100644
index 00000000000..ff8048468ee
--- /dev/null
+++ b/python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+from cugraph_equivariant.utils import scatter_reduce
+
+
+@pytest.mark.parametrize("reduce", ["sum", "mean", "prod", "amax", "amin"])
+def test_scatter_reduce(example_scatter_data, reduce):
+    device = torch.device("cuda:0")
+    src, index, out_true = example_scatter_data
+    src = src.to(device)
+    index = index.to(device)
+
+    out = scatter_reduce(src, index, dim=0, dim_size=None, reduce=reduce)
+
+    assert torch.allclose(out.cpu(), out_true[reduce])
diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py b/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py
new file mode 100644
index 00000000000..a2a13b32cd2
--- /dev/null
+++ b/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import torch
+from torch import nn
+from e3nn import o3
+from cugraph_equivariant.nn import FullyConnectedTensorProductConv
+
+device = torch.device("cuda:0")
+
+
+@pytest.mark.parametrize("e3nn_compat_mode", [True, False])
+@pytest.mark.parametrize("batch_norm", [True, False])
+@pytest.mark.parametrize(
+    "mlp_channels, mlp_activation, scalar_sizes",
+    [
+        [(30, 8, 8), nn.Sequential(nn.Dropout(0.3), nn.ReLU()), (15, 15, 0)],
+        [(7,), nn.GELU(), (2, 3, 2)],
+        [None, None, None],
+    ],
+)
+def test_tensor_product_conv_equivariance(
+    mlp_channels, mlp_activation, scalar_sizes, batch_norm, e3nn_compat_mode
+):
+    torch.manual_seed(12345)
+
+    in_irreps = o3.Irreps("10x0e + 10x1e")
+    out_irreps = o3.Irreps("20x0e + 10x1e")
+    sh_irreps = o3.Irreps.spherical_harmonics(lmax=2)
+
+    tp_conv = FullyConnectedTensorProductConv(
+        in_irreps=in_irreps,
+        sh_irreps=sh_irreps,
+        out_irreps=out_irreps,
+        mlp_channels=mlp_channels,
+        mlp_activation=mlp_activation,
+        batch_norm=batch_norm,
+        e3nn_compat_mode=e3nn_compat_mode,
+    ).to(device)
+
+    num_src_nodes, num_dst_nodes = 9, 7
+    num_edges = 40
+    src = torch.randint(num_src_nodes, (num_edges,), device=device)
+    dst = torch.randint(num_dst_nodes, (num_edges,), device=device)
+    edge_index = torch.vstack((src, dst))
+
+    src_pos = torch.randn(num_src_nodes, 3, device=device)
+    dst_pos = torch.randn(num_dst_nodes, 3, device=device)
+    edge_vec = dst_pos[dst] - src_pos[src]
+    edge_sh = o3.spherical_harmonics(
+        tp_conv.sh_irreps, edge_vec, normalize=True, normalization="component"
+    ).to(device)
+    src_features = torch.randn(num_src_nodes, in_irreps.dim, device=device)
+
+    rot = o3.rand_matrix()
+    D_in = tp_conv.in_irreps.D_from_matrix(rot).to(device)
+    D_sh = tp_conv.sh_irreps.D_from_matrix(rot).to(device)
+    D_out = tp_conv.out_irreps.D_from_matrix(rot).to(device)
+
+    if mlp_channels is None:
+        edge_emb = torch.randn(num_edges, tp_conv.tp.weight_numel, device=device)
+        src_scalars = dst_scalars = None
+    else:
+        if scalar_sizes:
+            edge_emb = torch.randn(num_edges, scalar_sizes[0], device=device)
+            src_scalars = (
+                None
+                if scalar_sizes[1] == 0
+                else torch.randn(num_src_nodes, scalar_sizes[1], device=device)
+            )
+            dst_scalars = (
+                None
+                if scalar_sizes[2] == 0
+                else torch.randn(num_dst_nodes, scalar_sizes[2], device=device)
+            )
+        else:
+            edge_emb = torch.randn(num_edges, tp_conv.mlp[0].in_features, device=device)
+            src_scalars = dst_scalars = None
+
+    # rotate before
+    out_before = tp_conv(
+        src_features=src_features @ D_in.T,
+        edge_sh=edge_sh @ D_sh.T,
+        edge_emb=edge_emb,
+        graph=(edge_index, (num_src_nodes, num_dst_nodes)),
+        src_scalars=src_scalars,
+        dst_scalars=dst_scalars,
+    )
+
+    # rotate after
+    out_after = (
+        tp_conv(
+            src_features=src_features,
+            edge_sh=edge_sh,
+            edge_emb=edge_emb,
+            graph=(edge_index, (num_src_nodes, num_dst_nodes)),
+            src_scalars=src_scalars,
+            dst_scalars=dst_scalars,
+        )
+        @ D_out.T
+    )
+
+    assert torch.allclose(out_before, out_after, rtol=1e-4, atol=1e-4)
diff --git a/python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py b/python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py
new file mode 100644
index 00000000000..b4acfe8d090
--- /dev/null
+++ b/python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .scatter import scatter_reduce
+
+__all__ = [
+    "scatter_reduce",
+]
diff --git a/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py b/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py
new file mode 100644
index 00000000000..45cc541fc7b
--- /dev/null
+++ b/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+
+
+def broadcast(src: torch.Tensor, ref: torch.Tensor, dim: int) -> torch.Tensor:
+    size = ((1,) * dim) + (-1,) + ((1,) * (ref.dim() - dim - 1))
+    return src.view(size).expand_as(ref)
+
+
+def scatter_reduce(
+    src: torch.Tensor,
+    index: torch.Tensor,
+    dim: int = 0,
+    dim_size: Optional[int] = None,  # value of out.size(dim)
+    reduce: str = "sum",  # "sum", "prod", "mean", "amax", "amin"
+):
+    # scatter() expects index to be int64
+    index = broadcast(index, src, dim).to(torch.int64)
+
+    size = list(src.size())
+
+    if dim_size is not None:
+        assert dim_size >= int(index.max()) + 1
+        size[dim] = dim_size
+    else:
+        size[dim] = int(index.max()) + 1
+
+    out = torch.zeros(size, dtype=src.dtype, device=src.device)
+    return out.scatter_reduce_(dim, index, src, reduce, include_self=False)
diff --git a/python/cugraph-equivariant/pyproject.toml b/python/cugraph-equivariant/pyproject.toml
new file mode 100644
index 00000000000..f261b0e3535
--- /dev/null
+++ b/python/cugraph-equivariant/pyproject.toml
@@ -0,0 +1,64 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[build-system]
+requires = [
+    "setuptools>=61.0.0",
+    "wheel",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project]
+name = "cugraph-equivariant"
+dynamic = ["version"]
+description = "Fast GPU-based equivariant operations and convolutional layers."
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.9"
+classifiers = [
+    "Intended Audience :: Developers",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+]
+dependencies = [
+    "pylibcugraphops==24.2.*",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/cugraph"
+Documentation = "https://docs.rapids.ai/api/cugraph/stable/api_docs/cugraph-ops/"
+
+[project.optional-dependencies]
+test = [
+    "pandas",
+    "pytest",
+    "pytest-benchmark",
+    "pytest-cov",
+    "pytest-xdist",
+    "scipy",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[tool.setuptools]
+license-files = ["LICENSE"]
+
+[tool.setuptools.dynamic]
+version = {file = "cugraph_equivariant/VERSION"}
+
+[tool.setuptools.packages.find]
+include = [
+    "cugraph_equivariant*",
+    "cugraph_equivariant.*",
+]
diff --git a/python/cugraph-equivariant/setup.py b/python/cugraph-equivariant/setup.py
new file mode 100644
index 00000000000..acd0df3f717
--- /dev/null
+++ b/python/cugraph-equivariant/setup.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from setuptools import find_packages, setup
+
+if __name__ == "__main__":
+    packages = find_packages(include=["cugraph_equivariant*"])
+    setup(
+        package_data={key: ["VERSION"] for key in packages},
+    )
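For readers trying out the new `cugraph_equivariant.utils.scatter_reduce` helper introduced above, the following is a minimal usage sketch, not part of the patch itself. It assumes torch is installed and the cugraph-equivariant package from this series is importable, and it reuses the values from the `example_scatter_data` fixture in conftest.py; the expected outputs in the comments are the same as the fixture's "sum" and "mean" entries.

# Minimal usage sketch (not part of the patch); CPU tensors are sufficient here.
import torch

from cugraph_equivariant.utils import scatter_reduce

# Same values as the example_scatter_data fixture in conftest.py.
src_feat = torch.tensor([3.0, 1.0, 0.0, 1.0, 1.0, 2.0])
dst_indices = torch.tensor([0, 1, 2, 2, 3, 1])

# Reduce six edge values onto four destination slots with an explicit dim_size.
out_sum = scatter_reduce(src_feat, dst_indices, dim=0, dim_size=4, reduce="sum")
# Expected: [3., 3., 1., 1.]

# With dim_size=None the output size is inferred as index.max() + 1.
out_mean = scatter_reduce(src_feat, dst_indices, dim=0, dim_size=None, reduce="mean")
# Expected: [3.0, 1.5, 0.5, 1.0]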