diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 83078a304ed..e48301e4d14 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,12 +5,12 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:23.12-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.02-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index d59742575b5..a57ea0d163b 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,15 +5,15 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {
       "version": "1.14.1"
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json
index bc4a2cb6fb4..10ba2f8fd3d 100644
--- a/.devcontainer/cuda12.0-conda/devcontainer.json
+++ b/.devcontainer/cuda12.0-conda/devcontainer.json
@@ -5,12 +5,12 @@
     "args": {
       "CUDA": "12.0",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:23.12-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json
index 3eacf726bf0..a112483a6db 100644
--- a/.devcontainer/cuda12.0-pip/devcontainer.json
+++ b/.devcontainer/cuda12.0-pip/devcontainer.json
@@ -5,15 +5,15 @@
     "args": {
       "CUDA": "12.0",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda12.0-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {
       "version": "1.14.1"
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.github/labeler.yml b/.github/labeler.yml
index c589fda6099..368bf328b99 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -9,17 +9,6 @@ python:
 benchmarks:
   - 'benchmarks/**'
 
-doc:
-  - 'docs/**'
-  - '**/*.md'
-  - 'datasets/**'
-  - 'notebooks/**'
-  - '**/*.txt'
-  - '**/*.rst'
-  - '**/*.ipynb'
-  - '**/*.pdf'
-  - '**/*.png'
-
 datasets:
   - 'datasets/**'
 
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 0f490283795..85ac682daf4 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -47,7 +47,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-pylibcugraph:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -77,13 +77,13 @@ jobs:
       date: ${{ inputs.date }}
       script: ci/build_wheel_pylibcugraph.sh
       extra-repo: rapidsai/cugraph-ops
-      extra-repo-sha: branch-23.12
+      extra-repo-sha: branch-24.02
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
       node_type: cpu32
   wheel-publish-pylibcugraph:
     needs: wheel-build-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -93,7 +93,7 @@ jobs:
   wheel-build-cugraph:
     needs: wheel-publish-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -101,12 +101,12 @@ jobs:
       date: ${{ inputs.date }}
       script: ci/build_wheel_cugraph.sh
       extra-repo: rapidsai/cugraph-ops
-      extra-repo-sha: branch-23.12
+      extra-repo-sha: branch-24.02
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
   wheel-publish-cugraph:
     needs: wheel-build-cugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -116,7 +116,7 @@ jobs:
   wheel-build-nx-cugraph:
     needs: wheel-publish-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -126,7 +126,7 @@ jobs:
   wheel-publish-nx-cugraph:
     needs: wheel-build-nx-cugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 9d20074381e..82c71efffdb 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -27,41 +27,41 @@ jobs:
       - wheel-tests-nx-cugraph
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02
     with:
       build_type: pull-request
       node_type: cpu32
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02
     with:
       build_type: pull-request
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -71,7 +71,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -81,55 +81,55 @@ jobs:
   wheel-build-pylibcugraph:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibcugraph.sh
       extra-repo: rapidsai/cugraph-ops
-      extra-repo-sha: branch-23.12
+      extra-repo-sha: branch-24.02
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
       node_type: cpu32
   wheel-tests-pylibcugraph:
     needs: wheel-build-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/test_wheel_pylibcugraph.sh
   wheel-build-cugraph:
     needs: wheel-tests-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/build_wheel_cugraph.sh
       extra-repo: rapidsai/cugraph-ops
-      extra-repo-sha: branch-23.12
+      extra-repo-sha: branch-24.02
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
   wheel-tests-cugraph:
     needs: wheel-build-cugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/test_wheel_cugraph.sh
   wheel-build-nx-cugraph:
     needs: wheel-tests-pylibcugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/build_wheel_nx-cugraph.sh
   wheel-tests-nx-cugraph:
     needs: wheel-build-nx-cugraph
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/test_wheel_nx-cugraph.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02
     with:
       node_type: cpu32
       extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a0ecb67712c..0d9f4d291c3 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -32,7 +32,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibcugraph:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -41,7 +41,7 @@ jobs:
       script: ci/test_wheel_pylibcugraph.sh
   wheel-tests-cugraph:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -50,7 +50,7 @@ jobs:
       script: ci/test_wheel_cugraph.sh
   wheel-tests-nx-cugraph:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/VERSION b/VERSION
index a193fff41e8..3c6c5e2b706 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-23.12.00
+24.02.00
diff --git a/build.sh b/build.sh
index eef19046d85..fa7a4f6f363 100755
--- a/build.sh
+++ b/build.sh
@@ -18,7 +18,7 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-RAPIDS_VERSION=23.12
+RAPIDS_VERSION=24.02
 
 # Valid args to this script (all possible targets and options) - only one per line
 VALIDARGS="
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 3f765704bdb..d88c7d7bcd7 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -39,7 +39,7 @@ rapids-mamba-retry install \
 rapids-logger "Install cugraph-dgl"
 rapids-mamba-retry install "${PYTHON_CHANNEL}/linux-64/cugraph-dgl-*.tar.bz2"
 
-export RAPIDS_VERSION_NUMBER="23.12"
+export RAPIDS_VERSION_NUMBER="24.02"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 for PROJECT in libcugraphops libwholegraph; do
diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh
index 5b5061f67c2..0a722c88c3e 100755
--- a/ci/build_wheel_cugraph.sh
+++ b/ci/build_wheel_cugraph.sh
@@ -12,6 +12,6 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME=pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibcugraph
 export PIP_FIND_LINKS=$(pwd)/local-pylibcugraph
 
-export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/"
+export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/"
 
 ./ci/build_wheel.sh cugraph python/cugraph
diff --git a/ci/build_wheel_pylibcugraph.sh b/ci/build_wheel_pylibcugraph.sh
index 8d365bc250b..9e236c145ce 100755
--- a/ci/build_wheel_pylibcugraph.sh
+++ b/ci/build_wheel_pylibcugraph.sh
@@ -3,6 +3,6 @@
 
 set -euo pipefail
 
-export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/"
+export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/"
 
 ./ci/build_wheel.sh pylibcugraph python/pylibcugraph
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index aa38defcd7c..dd5b6a25f6c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -16,12 +16,12 @@ dependencies:
 - cmake>=3.26.4
 - cuda-version=11.8
 - cudatoolkit
-- cudf==23.12.*
+- cudf==24.2.*
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-cuda==23.12.*
-- dask-cudf==23.12.*
+- dask-cuda==24.2.*
+- dask-cudf==24.2.*
 - doxygen
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
@@ -29,11 +29,11 @@ dependencies:
 - graphviz
 - gtest>=1.13.0
 - ipython
-- libcudf==23.12.*
-- libcugraphops==23.12.*
-- libraft-headers==23.12.*
-- libraft==23.12.*
-- librmm==23.12.*
+- libcudf==24.2.*
+- libcugraphops==24.2.*
+- libraft-headers==24.2.*
+- libraft==24.2.*
+- librmm==24.2.*
 - nbsphinx
 - nccl>=2.9.9
 - networkx>=2.5.1
@@ -49,20 +49,20 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
-- pylibcugraphops==23.12.*
-- pylibraft==23.12.*
-- pylibwholegraph==23.12.*
+- pylibcugraphops==24.2.*
+- pylibraft==24.2.*
+- pylibwholegraph==24.2.*
 - pytest
 - pytest-benchmark
 - pytest-cov
 - pytest-mpl
 - pytest-xdist
 - python-louvain
-- raft-dask==23.12.*
-- rapids-dask-dependency==23.12.*
+- raft-dask==24.2.*
+- rapids-dask-dependency==24.2.*
 - recommonmark
 - requests
-- rmm==23.12.*
+- rmm==24.2.*
 - scikit-build>=0.13.1
 - scikit-learn>=0.23.1
 - scipy
@@ -72,7 +72,7 @@ dependencies:
 - sphinx<6
 - sphinxcontrib-websupport
 - ucx-proc=*=gpu
-- ucx-py==0.35.*
+- ucx-py==0.36.*
 - wget
 - wheel
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index a9f793b15f5..a3130310a27 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -16,12 +16,12 @@ dependencies:
 - cmake>=3.26.4
 - cuda-nvcc
 - cuda-version=12.0
-- cudf==23.12.*
+- cudf==24.2.*
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-cuda==23.12.*
-- dask-cudf==23.12.*
+- dask-cuda==24.2.*
+- dask-cudf==24.2.*
 - doxygen
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
@@ -29,11 +29,11 @@ dependencies:
 - graphviz
 - gtest>=1.13.0
 - ipython
-- libcudf==23.12.*
-- libcugraphops==23.12.*
-- libraft-headers==23.12.*
-- libraft==23.12.*
-- librmm==23.12.*
+- libcudf==24.2.*
+- libcugraphops==24.2.*
+- libraft-headers==24.2.*
+- libraft==24.2.*
+- librmm==24.2.*
 - nbsphinx
 - nccl>=2.9.9
 - networkx>=2.5.1
@@ -48,20 +48,20 @@ dependencies:
 - pandas
 - pre-commit
 - pydata-sphinx-theme
-- pylibcugraphops==23.12.*
-- pylibraft==23.12.*
-- pylibwholegraph==23.12.*
+- pylibcugraphops==24.2.*
+- pylibraft==24.2.*
+- pylibwholegraph==24.2.*
 - pytest
 - pytest-benchmark
 - pytest-cov
 - pytest-mpl
 - pytest-xdist
 - python-louvain
-- raft-dask==23.12.*
-- rapids-dask-dependency==23.12.*
+- raft-dask==24.2.*
+- rapids-dask-dependency==24.2.*
 - recommonmark
 - requests
-- rmm==23.12.*
+- rmm==24.2.*
 - scikit-build>=0.13.1
 - scikit-learn>=0.23.1
 - scipy
@@ -71,7 +71,7 @@ dependencies:
 - sphinx<6
 - sphinxcontrib-websupport
 - ucx-proc=*=gpu
-- ucx-py==0.35.*
+- ucx-py==0.36.*
 - wget
 - wheel
 name: all_cuda-120_arch-x86_64
diff --git a/conda/recipes/cugraph-service/conda_build_config.yaml b/conda/recipes/cugraph-service/conda_build_config.yaml
index b971a73fd39..6a0124983fd 100644
--- a/conda/recipes/cugraph-service/conda_build_config.yaml
+++ b/conda/recipes/cugraph-service/conda_build_config.yaml
@@ -1,2 +1,2 @@
 ucx_py_version:
-  - "0.35.*"
+  - "0.36.*"
diff --git a/conda/recipes/cugraph/conda_build_config.yaml b/conda/recipes/cugraph/conda_build_config.yaml
index c03d515b9f6..387f3451d8d 100644
--- a/conda/recipes/cugraph/conda_build_config.yaml
+++ b/conda/recipes/cugraph/conda_build_config.yaml
@@ -17,4 +17,4 @@ sysroot_version:
   - "2.17"
 
 ucx_py_version:
-  - "0.35.*"
+  - "0.36.*"
diff --git a/conda/recipes/pylibcugraph/conda_build_config.yaml b/conda/recipes/pylibcugraph/conda_build_config.yaml
index c03d515b9f6..387f3451d8d 100644
--- a/conda/recipes/pylibcugraph/conda_build_config.yaml
+++ b/conda/recipes/pylibcugraph/conda_build_config.yaml
@@ -17,4 +17,4 @@ sysroot_version:
   - "2.17"
 
 ucx_py_version:
-  - "0.35.*"
+  - "0.36.*"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2faaf5860ee..8e4dfbbf23c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(CUGRAPH)
 
-project(CUGRAPH VERSION 23.12.00 LANGUAGES C CXX CUDA)
+project(CUGRAPH VERSION 24.02.00 LANGUAGES C CXX CUDA)
 
 if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
@@ -424,7 +424,7 @@ add_library(cugraph_c
         src/c_api/core_result.cpp
         src/c_api/extract_ego.cpp
         src/c_api/k_core.cpp
-	src/c_api/hierarchical_clustering_result.cpp
+        src/c_api/hierarchical_clustering_result.cpp
         src/c_api/induced_subgraph.cpp
         src/c_api/capi_helper.cu
         src/c_api/legacy_spectral.cpp
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 6946bd38bfe..3b74956e121 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -48,7 +48,7 @@ PROJECT_NAME           = libcugraph
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 23.12
+PROJECT_NUMBER         = 24.02
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp
index 60b9f1a4054..a723fde24df 100644
--- a/cpp/include/cugraph/graph.hpp
+++ b/cpp/include/cugraph/graph.hpp
@@ -90,24 +90,25 @@ class graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<mu
     graph_meta_t<vertex_t, edge_t, multi_gpu> meta,
     bool do_expensive_check = false);
 
+  edge_t number_of_edges() const { return this->number_of_edges_; }
+
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> view() const
   {
-    std::vector<edge_t const*> offsets(edge_partition_offsets_.size(), nullptr);
-    std::vector<vertex_t const*> indices(edge_partition_indices_.size(), nullptr);
-    auto dcs_nzd_vertices      = edge_partition_dcs_nzd_vertices_
-                                   ? std::make_optional<std::vector<vertex_t const*>>(
-                                  (*edge_partition_dcs_nzd_vertices_).size(), nullptr)
-                                   : std::nullopt;
-    auto dcs_nzd_vertex_counts = edge_partition_dcs_nzd_vertex_counts_
-                                   ? std::make_optional<std::vector<vertex_t>>(
-                                       (*edge_partition_dcs_nzd_vertex_counts_).size(), vertex_t{0})
-                                   : std::nullopt;
+    std::vector<raft::device_span<edge_t const>> offsets(edge_partition_offsets_.size());
+    std::vector<raft::device_span<vertex_t const>> indices(edge_partition_indices_.size());
+    auto dcs_nzd_vertices = edge_partition_dcs_nzd_vertices_
+                              ? std::make_optional<std::vector<raft::device_span<vertex_t const>>>(
+                                  (*edge_partition_dcs_nzd_vertices_).size())
+                              : std::nullopt;
     for (size_t i = 0; i < offsets.size(); ++i) {
-      offsets[i] = edge_partition_offsets_[i].data();
-      indices[i] = edge_partition_indices_[i].data();
+      offsets[i] = raft::device_span<edge_t const>(edge_partition_offsets_[i].data(),
+                                                   edge_partition_offsets_[i].size());
+      indices[i] = raft::device_span<vertex_t const>(edge_partition_indices_[i].data(),
+                                                     edge_partition_indices_[i].size());
       if (dcs_nzd_vertices) {
-        (*dcs_nzd_vertices)[i]      = (*edge_partition_dcs_nzd_vertices_)[i].data();
-        (*dcs_nzd_vertex_counts)[i] = (*edge_partition_dcs_nzd_vertex_counts_)[i];
+        (*dcs_nzd_vertices)[i] =
+          raft::device_span<vertex_t const>((*edge_partition_dcs_nzd_vertices_)[i].data(),
+                                            (*edge_partition_dcs_nzd_vertices_)[i].size());
       }
     }
 
@@ -196,15 +197,13 @@ class graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<mu
     }
 
     return graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>(
-      *(this->handle_ptr()),
       offsets,
       indices,
       dcs_nzd_vertices,
-      dcs_nzd_vertex_counts,
       graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu>{
         this->number_of_vertices(),
         this->number_of_edges(),
-        this->graph_properties(),
+        this->properties_,
         partition_,
         edge_partition_segment_offsets_,
         local_sorted_unique_edge_srcs,
@@ -224,7 +223,6 @@ class graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<mu
   // nzd: nonzero (local) degree
   std::optional<std::vector<rmm::device_uvector<vertex_t>>> edge_partition_dcs_nzd_vertices_{
     std::nullopt};
-  std::optional<std::vector<vertex_t>> edge_partition_dcs_nzd_vertex_counts_{std::nullopt};
   partition_t<vertex_t> partition_{};
 
   // segment offsets within the vertex partition based on vertex degree
@@ -283,16 +281,15 @@ class graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!m
           graph_meta_t<vertex_t, edge_t, multi_gpu> meta,
           bool do_expensive_check = false);
 
+  edge_t number_of_edges() const { return this->number_of_edges_; }
+
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> view() const
   {
     return graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>(
-      *(this->handle_ptr()),
-      offsets_.data(),
-      indices_.data(),
-      graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu>{this->number_of_vertices(),
-                                                                       this->number_of_edges(),
-                                                                       this->graph_properties(),
-                                                                       segment_offsets_});
+      raft::device_span<edge_t const>(offsets_.data(), offsets_.size()),
+      raft::device_span<vertex_t const>(indices_.data(), indices_.size()),
+      graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu>{
+        this->number_of_vertices(), this->number_of_edges(), this->properties_, segment_offsets_});
   }
 
  private:
diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp
index f30a8b7e2af..53c66c6483e 100644
--- a/cpp/include/cugraph/graph_view.hpp
+++ b/cpp/include/cugraph/graph_view.hpp
@@ -258,17 +258,12 @@ class graph_base_t {
  public:
   graph_base_t() = default;
 
-  graph_base_t(raft::handle_t const& handle,
-               vertex_t number_of_vertices,
-               edge_t number_of_edges,
-               graph_properties_t properties)
-    : handle_ptr_(&handle),
-      number_of_vertices_(number_of_vertices),
+  graph_base_t(vertex_t number_of_vertices, edge_t number_of_edges, graph_properties_t properties)
+    : number_of_vertices_(number_of_vertices),
       number_of_edges_(number_of_edges),
       properties_(properties){};
 
   vertex_t number_of_vertices() const { return number_of_vertices_; }
-  edge_t number_of_edges() const { return number_of_edges_; }
 
   template <typename vertex_type = vertex_t>
   std::enable_if_t<std::is_signed<vertex_type>::value, bool> is_valid_vertex(vertex_type v) const
@@ -286,16 +281,11 @@ class graph_base_t {
   bool is_multigraph() const { return properties_.is_multigraph; }
 
  protected:
-  raft::handle_t const* handle_ptr() const { return handle_ptr_; };
-  graph_properties_t graph_properties() const { return properties_; }
+  edge_t number_of_edges_{0};
+  graph_properties_t properties_{};
 
  private:
-  raft::handle_t const* handle_ptr_{nullptr};
-
   vertex_t number_of_vertices_{0};
-  edge_t number_of_edges_{0};
-
-  graph_properties_t properties_{};
 };
 
 }  // namespace detail
@@ -385,11 +375,10 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
   static constexpr bool is_storage_transposed = store_transposed;
   static constexpr bool is_multi_gpu          = multi_gpu;
 
-  graph_view_t(raft::handle_t const& handle,
-               std::vector<edge_t const*> const& edge_partition_offsets,
-               std::vector<vertex_t const*> const& edge_partition_indices,
-               std::optional<std::vector<vertex_t const*>> const& edge_partition_dcs_nzd_vertices,
-               std::optional<std::vector<vertex_t>> const& edge_partition_dcs_nzd_vertex_counts,
+  graph_view_t(std::vector<raft::device_span<edge_t const>> const& edge_partition_offsets,
+               std::vector<raft::device_span<vertex_t const>> const& edge_partition_indices,
+               std::optional<std::vector<raft::device_span<vertex_t const>>> const&
+                 edge_partition_dcs_nzd_vertices,
                graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu> meta);
 
   std::vector<vertex_t> vertex_partition_range_offsets() const
@@ -604,25 +593,16 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
         this->local_edge_partition_src_value_start_offset(partition_idx);
     }
     std::optional<vertex_t> major_hypersparse_first{std::nullopt};
-    vertex_t offset_size = (major_range_last - major_range_first) + 1;
     if (this->use_dcs()) {
       major_hypersparse_first =
         major_range_first + (*(this->local_edge_partition_segment_offsets(
                               partition_idx)))[detail::num_sparse_segments_per_vertex_partition];
-      offset_size = ((*major_hypersparse_first) - major_range_first) +
-                    (*edge_partition_dcs_nzd_vertex_counts_)[partition_idx] + 1;
     }
     return edge_partition_view_t<vertex_t, edge_t, true>(
-      raft::device_span<edge_t const>(edge_partition_offsets_[partition_idx],
-                                      edge_partition_offsets_[partition_idx] + offset_size),
-      raft::device_span<vertex_t const>(
-        edge_partition_indices_[partition_idx],
-        edge_partition_indices_[partition_idx] + edge_partition_number_of_edges_[partition_idx]),
+      edge_partition_offsets_[partition_idx],
+      edge_partition_indices_[partition_idx],
       edge_partition_dcs_nzd_vertices_
-        ? std::make_optional<raft::device_span<vertex_t const>>(
-            (*edge_partition_dcs_nzd_vertices_)[partition_idx],
-            (*edge_partition_dcs_nzd_vertices_)[partition_idx] +
-              (*edge_partition_dcs_nzd_vertex_counts_)[partition_idx])
+        ? std::make_optional((*edge_partition_dcs_nzd_vertices_)[partition_idx])
         : std::nullopt,
       major_hypersparse_first,
       major_range_first,
@@ -632,6 +612,16 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
       major_value_range_start_offset);
   }
 
+  // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge
+  // masking)
+  edge_t number_of_edges() const
+  {
+    CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
+    return this->number_of_edges_;
+  }
+
+  edge_t compute_number_of_edges(raft::handle_t const& handle) const;
+
   rmm::device_uvector<edge_t> compute_in_degrees(raft::handle_t const& handle) const;
   rmm::device_uvector<edge_t> compute_out_degrees(raft::handle_t const& handle) const;
 
@@ -746,14 +736,11 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
   }
 
  private:
-  std::vector<edge_t const*> edge_partition_offsets_{};
-  std::vector<vertex_t const*> edge_partition_indices_{};
+  std::vector<raft::device_span<edge_t const>> edge_partition_offsets_{};
+  std::vector<raft::device_span<vertex_t const>> edge_partition_indices_{};
 
   // relevant only if we use the CSR + DCSR (or CSC + DCSC) hybrid format
-  std::optional<std::vector<vertex_t const*>> edge_partition_dcs_nzd_vertices_{};
-  std::optional<std::vector<vertex_t>> edge_partition_dcs_nzd_vertex_counts_{};
-
-  std::vector<edge_t> edge_partition_number_of_edges_{};
+  std::optional<std::vector<raft::device_span<vertex_t const>>> edge_partition_dcs_nzd_vertices_{};
 
   partition_t<vertex_t> partition_{};
 
@@ -804,9 +791,8 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
   static constexpr bool is_storage_transposed = store_transposed;
   static constexpr bool is_multi_gpu          = multi_gpu;
 
-  graph_view_t(raft::handle_t const& handle,
-               edge_t const* offsets,
-               vertex_t const* indices,
+  graph_view_t(raft::device_span<edge_t const> offsets,
+               raft::device_span<vertex_t const> indices,
                graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu> meta);
 
   std::vector<vertex_t> vertex_partition_range_offsets() const
@@ -920,11 +906,19 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
   {
     assert(partition_idx == 0);  // there is only one edge partition in single-GPU
     return edge_partition_view_t<vertex_t, edge_t, false>(
-      raft::device_span<edge_t const>(offsets_, offsets_ + (this->number_of_vertices() + 1)),
-      raft::device_span<vertex_t const>(indices_, indices_ + this->number_of_edges()),
-      this->number_of_vertices());
+      offsets_, indices_, this->number_of_vertices());
   }
 
+  // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge
+  // masking)
+  edge_t number_of_edges() const
+  {
+    CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
+    return this->number_of_edges_;
+  }
+
+  edge_t compute_number_of_edges(raft::handle_t const& handle) const;
+
   rmm::device_uvector<edge_t> compute_in_degrees(raft::handle_t const& handle) const;
   rmm::device_uvector<edge_t> compute_out_degrees(raft::handle_t const& handle) const;
 
@@ -1027,8 +1021,8 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if
   }
 
  private:
-  edge_t const* offsets_{nullptr};
-  vertex_t const* indices_{nullptr};
+  raft::device_span<edge_t const> offsets_{};
+  raft::device_span<vertex_t const> indices_{};
 
   // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered
   std::optional<std::vector<vertex_t>> segment_offsets_{std::nullopt};
diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp
index c4cacb401af..5fbe7bc9f01 100644
--- a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp
+++ b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp
@@ -57,10 +57,10 @@ class device_shared_wrapper_t {
   {
     std::lock_guard<std::mutex> lock(lock_);
 
-    auto pos = objects_.find(handle.get_local_rank());
+    auto pos = objects_.find(handle.get_rank());
     CUGRAPH_EXPECTS(pos == objects_.end(), "Cannot overwrite wrapped object");
 
-    objects_.insert(std::make_pair(handle.get_local_rank(), std::move(obj)));
+    objects_.insert(std::make_pair(handle.get_rank(), std::move(obj)));
   }
 
   /**
@@ -79,7 +79,6 @@ class device_shared_wrapper_t {
     objects_.insert(std::make_pair(local_rank, std::move(obj)));
   }
 
- public:
   /**
    * @brief Get reference to an object for a particular thread
    *
@@ -90,7 +89,7 @@ class device_shared_wrapper_t {
   {
     std::lock_guard<std::mutex> lock(lock_);
 
-    auto pos = objects_.find(handle.get_local_rank());
+    auto pos = objects_.find(handle.get_rank());
     CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object");
 
     return pos->second;
@@ -106,7 +105,7 @@ class device_shared_wrapper_t {
   {
     std::lock_guard<std::mutex> lock(lock_);
 
-    auto pos = objects_.find(handle.get_local_rank());
+    auto pos = objects_.find(handle.get_rank());
 
     CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object");
 
diff --git a/cpp/include/cugraph/mtmg/graph_view.hpp b/cpp/include/cugraph/mtmg/graph_view.hpp
index 94347e016ea..8e202ab4904 100644
--- a/cpp/include/cugraph/mtmg/graph_view.hpp
+++ b/cpp/include/cugraph/mtmg/graph_view.hpp
@@ -27,8 +27,27 @@ namespace mtmg {
  * @brief Graph view for each GPU
  */
 template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
-using graph_view_t = detail::device_shared_wrapper_t<
-  cugraph::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>>;
+class graph_view_t : public detail::device_shared_wrapper_t<
+                       cugraph::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>> {
+ public:
+  /**
+   * @brief Get the vertex_partition_view for this graph
+   */
+  vertex_partition_view_t<vertex_t, multi_gpu> get_vertex_partition_view(
+    cugraph::mtmg::handle_t const& handle) const
+  {
+    return this->get(handle).local_vertex_partition_view();
+  }
+
+  /**
+   * @brief Get the vertex_partition_range_lasts for this graph
+   */
+  std::vector<vertex_t> get_vertex_partition_range_lasts(
+    cugraph::mtmg::handle_t const& handle) const
+  {
+    return this->get(handle).vertex_partition_range_lasts();
+  }
+};
 
 }  // namespace mtmg
 }  // namespace cugraph
diff --git a/cpp/include/cugraph/mtmg/handle.hpp b/cpp/include/cugraph/mtmg/handle.hpp
index 6223de1781d..0b02091a3cc 100644
--- a/cpp/include/cugraph/mtmg/handle.hpp
+++ b/cpp/include/cugraph/mtmg/handle.hpp
@@ -32,18 +32,19 @@ namespace mtmg {
  *
  */
 class handle_t {
+  handle_t(handle_t const&)           = delete;
+  handle_t operator=(handle_t const&) = delete;
+
  public:
   /**
    * @brief Constructor
    *
    * @param raft_handle   Raft handle for the resources
    * @param thread_rank   Rank for this thread
+   * @param device_id     Device id for the device this handle operates on
    */
-  handle_t(raft::handle_t const& raft_handle, int thread_rank, size_t device_id)
-    : raft_handle_(raft_handle),
-      thread_rank_(thread_rank),
-      local_rank_(raft_handle.get_comms().get_rank()),  // FIXME: update for multi-node
-      device_id_(device_id)
+  handle_t(raft::handle_t const& raft_handle, int thread_rank, rmm::cuda_device_id device_id)
+    : raft_handle_(raft_handle), thread_rank_(thread_rank), device_id_raii_(device_id)
   {
   }
 
@@ -118,18 +119,10 @@ class handle_t {
    */
   int get_rank() const { return raft_handle_.get_comms().get_rank(); }
 
-  /**
-   * @brief Get local gpu rank
-   *
-   * @return local gpu rank
-   */
-  int get_local_rank() const { return local_rank_; }
-
  private:
   raft::handle_t const& raft_handle_;
   int thread_rank_;
-  int local_rank_;
-  size_t device_id_;
+  rmm::cuda_set_device_raii device_id_raii_;
 };
 
 }  // namespace mtmg
diff --git a/cpp/include/cugraph/mtmg/instance_manager.hpp b/cpp/include/cugraph/mtmg/instance_manager.hpp
index f819a5a0abe..f60063c4101 100644
--- a/cpp/include/cugraph/mtmg/instance_manager.hpp
+++ b/cpp/include/cugraph/mtmg/instance_manager.hpp
@@ -47,15 +47,10 @@ class instance_manager_t {
 
   ~instance_manager_t()
   {
-    int current_device{};
-    RAFT_CUDA_TRY(cudaGetDevice(&current_device));
-
     for (size_t i = 0; i < nccl_comms_.size(); ++i) {
-      RAFT_CUDA_TRY(cudaSetDevice(device_ids_[i].value()));
+      rmm::cuda_set_device_raii local_set_device(device_ids_[i]);
       RAFT_NCCL_TRY(ncclCommDestroy(*nccl_comms_[i]));
     }
-
-    RAFT_CUDA_TRY(cudaSetDevice(current_device));
   }
 
   /**
@@ -75,8 +70,7 @@ class instance_manager_t {
     int gpu_id    = local_id % raft_handle_.size();
     int thread_id = local_id / raft_handle_.size();
 
-    RAFT_CUDA_TRY(cudaSetDevice(device_ids_[gpu_id].value()));
-    return handle_t(*raft_handle_[gpu_id], thread_id, static_cast<size_t>(gpu_id));
+    return handle_t(*raft_handle_[gpu_id], thread_id, device_ids_[gpu_id]);
   }
 
   /**
diff --git a/cpp/include/cugraph/mtmg/resource_manager.hpp b/cpp/include/cugraph/mtmg/resource_manager.hpp
index 127944cf7ba..bc312c9ae77 100644
--- a/cpp/include/cugraph/mtmg/resource_manager.hpp
+++ b/cpp/include/cugraph/mtmg/resource_manager.hpp
@@ -89,7 +89,7 @@ class resource_manager_t {
 
     local_rank_map_.insert(std::pair(global_rank, local_device_id));
 
-    RAFT_CUDA_TRY(cudaSetDevice(local_device_id.value()));
+    rmm::cuda_set_device_raii local_set_device(local_device_id);
 
     // FIXME: There is a bug in the cuda_memory_resource that results in a Hang.
     //   using the pool resource as a work-around.
@@ -182,14 +182,12 @@ class resource_manager_t {
       --gpu_row_comm_size;
     }
 
-    int current_device{};
-    RAFT_CUDA_TRY(cudaGetDevice(&current_device));
     RAFT_NCCL_TRY(ncclGroupStart());
 
     for (size_t i = 0; i < local_ranks_to_include.size(); ++i) {
       int rank = local_ranks_to_include[i];
       auto pos = local_rank_map_.find(rank);
-      RAFT_CUDA_TRY(cudaSetDevice(pos->second.value()));
+      rmm::cuda_set_device_raii local_set_device(pos->second);
 
       nccl_comms.push_back(std::make_unique<ncclComm_t>());
       handles.push_back(
@@ -204,7 +202,6 @@ class resource_manager_t {
         handles[i].get(), *nccl_comms[i], ranks_to_include.size(), rank);
     }
     RAFT_NCCL_TRY(ncclGroupEnd());
-    RAFT_CUDA_TRY(cudaSetDevice(current_device));
 
     std::vector<std::thread> running_threads;
 
@@ -217,9 +214,7 @@ class resource_manager_t {
                                     &device_ids,
                                     &nccl_comms,
                                     &handles]() {
-        int rank = local_ranks_to_include[idx];
-        RAFT_CUDA_TRY(cudaSetDevice(device_ids[idx].value()));
-
+        rmm::cuda_set_device_raii local_set_device(device_ids[idx]);
         cugraph::partition_manager::init_subcomm(*handles[idx], gpu_row_comm_size);
       });
     }
diff --git a/cpp/include/cugraph/mtmg/vertex_result_view.hpp b/cpp/include/cugraph/mtmg/vertex_result_view.hpp
index a349bb95333..42b80cea62f 100644
--- a/cpp/include/cugraph/mtmg/vertex_result_view.hpp
+++ b/cpp/include/cugraph/mtmg/vertex_result_view.hpp
@@ -39,11 +39,12 @@ class vertex_result_view_t : public detail::device_shared_device_span_t<result_t
   /**
    * @brief Gather results from specified vertices into a device vector
    */
-  template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
+  template <typename vertex_t, bool multi_gpu>
   rmm::device_uvector<result_t> gather(
     handle_t const& handle,
     raft::device_span<vertex_t const> vertices,
-    cugraph::mtmg::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+    std::vector<vertex_t> const& vertex_partition_range_lasts,
+    cugraph::vertex_partition_view_t<vertex_t, multi_gpu> vertex_partition_view,
     std::optional<cugraph::mtmg::renumber_map_view_t<vertex_t>>& renumber_map_view);
 };
 
diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh
index a62e8ce85ec..04aeac49c9d 100644
--- a/cpp/include/cugraph/utilities/misc_utils.cuh
+++ b/cpp/include/cugraph/utilities/misc_utils.cuh
@@ -19,12 +19,15 @@
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <cuda/atomic>
 #include <thrust/binary_search.h>
 #include <thrust/gather.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/optional.h>
 
+#include <cuda/functional>
+
 #include <optional>
 #include <tuple>
 #include <vector>
@@ -43,7 +46,8 @@ std::tuple<std::vector<vertex_t>, std::vector<edge_t>> compute_offset_aligned_ed
 {
   auto search_offset_first = thrust::make_transform_iterator(
     thrust::make_counting_iterator(size_t{1}),
-    [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; });
+    cuda::proclaim_return_type<size_t>(
+      [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; }));
   auto num_chunks = (num_edges + approx_edge_chunk_size - 1) / approx_edge_chunk_size;
 
   if (num_chunks > 1) {
diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index ab6a54cc1c0..414d9b36992 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -40,6 +40,8 @@
 #include <thrust/tabulate.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <numeric>
 #include <vector>
@@ -197,12 +199,13 @@ void multi_partition(ValueIterator value_first,
     value_last,
     thrust::make_zip_iterator(
       thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
-    [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) {
-      auto group_id = value_to_group_id_op(value);
-      cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
-      return thrust::make_tuple(group_id,
-                                counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
-    });
+    cuda::proclaim_return_type<thrust::tuple<int, size_t>>(
+      [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) {
+        auto group_id = value_to_group_id_op(value);
+        cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
+        return thrust::make_tuple(group_id,
+                                  counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
+      }));
 
   rmm::device_uvector<size_t> displacements(num_groups, stream_view);
   thrust::exclusive_scan(
@@ -245,17 +248,19 @@ void multi_partition(KeyIterator key_first,
   rmm::device_uvector<int> group_ids(num_keys, stream_view);
   rmm::device_uvector<size_t> intra_partition_offsets(num_keys, stream_view);
   thrust::fill(rmm::exec_policy(stream_view), counts.begin(), counts.end(), size_t{0});
-  thrust::transform(rmm::exec_policy(stream_view),
-                    key_first,
-                    key_last,
-                    thrust::make_zip_iterator(
-                      thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
-                    [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) {
-                      auto group_id = key_to_group_id_op(key);
-                      cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
-                      return thrust::make_tuple(
-                        group_id, counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
-                    });
+  thrust::transform(
+    rmm::exec_policy(stream_view),
+    key_first,
+    key_last,
+    thrust::make_zip_iterator(
+      thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
+    cuda::proclaim_return_type<thrust::tuple<int, size_t>>(
+      [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) {
+        auto group_id = key_to_group_id_op(key);
+        cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
+        return thrust::make_tuple(group_id,
+                                  counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
+      }));
 
   rmm::device_uvector<size_t> displacements(num_groups, stream_view);
   thrust::exclusive_scan(
@@ -761,8 +766,9 @@ rmm::device_uvector<size_t> groupby_and_count(ValueIterator tx_value_first /* [I
                              stream_view);
 
   auto group_id_first = thrust::make_transform_iterator(
-    tx_value_first,
-    [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); });
+    tx_value_first, cuda::proclaim_return_type<int>([value_to_group_id_op] __device__(auto value) {
+      return value_to_group_id_op(value);
+    }));
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
   auto rank_count_pair_first = thrust::make_zip_iterator(
@@ -795,7 +801,9 @@ rmm::device_uvector<size_t> groupby_and_count(VertexIterator tx_key_first /* [IN
                              stream_view);
 
   auto group_id_first = thrust::make_transform_iterator(
-    tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); });
+    tx_key_first, cuda::proclaim_return_type<int>([key_to_group_id_op] __device__(auto key) {
+      return key_to_group_id_op(key);
+    }));
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
   auto rank_count_pair_first = thrust::make_zip_iterator(
diff --git a/cpp/libcugraph_etl/CMakeLists.txt b/cpp/libcugraph_etl/CMakeLists.txt
index ac0cb6959e8..8874c75896c 100644
--- a/cpp/libcugraph_etl/CMakeLists.txt
+++ b/cpp/libcugraph_etl/CMakeLists.txt
@@ -25,7 +25,7 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(CUGRAPH_ETL)
 
-project(CUGRAPH_ETL VERSION 23.12.00 LANGUAGES C CXX CUDA)
+project(CUGRAPH_ETL VERSION 24.02.00 LANGUAGES C CXX CUDA)
 
 if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0)
diff --git a/cpp/src/c_api/capi_helper.cu b/cpp/src/c_api/capi_helper.cu
index 0ee49f87265..f08af4137db 100644
--- a/cpp/src/c_api/capi_helper.cu
+++ b/cpp/src/c_api/capi_helper.cu
@@ -74,6 +74,104 @@ template void sort_by_key(raft::handle_t const& handle,
                           raft::device_span<int64_t> keys,
                           raft::device_span<int64_t> values);
 
+template <typename vertex_t, typename weight_t>
+std::tuple<rmm::device_uvector<size_t>,
+           rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>>
+reorder_extracted_egonets(raft::handle_t const& handle,
+                          rmm::device_uvector<size_t>&& source_indices,
+                          rmm::device_uvector<size_t>&& offsets,
+                          rmm::device_uvector<vertex_t>&& edge_srcs,
+                          rmm::device_uvector<vertex_t>&& edge_dsts,
+                          std::optional<rmm::device_uvector<weight_t>>&& edge_weights)
+{
+  rmm::device_uvector<size_t> sort_indices(edge_srcs.size(), handle.get_stream());
+  thrust::tabulate(
+    handle.get_thrust_policy(),
+    sort_indices.begin(),
+    sort_indices.end(),
+    [offset_lasts   = raft::device_span<size_t const>(offsets.begin() + 1, offsets.end()),
+     source_indices = raft::device_span<size_t const>(source_indices.data(),
+                                                      source_indices.size())] __device__(size_t i) {
+      auto idx = static_cast<size_t>(thrust::distance(
+        offset_lasts.begin(),
+        thrust::upper_bound(thrust::seq, offset_lasts.begin(), offset_lasts.end(), i)));
+      return source_indices[idx];
+    });
+  source_indices.resize(0, handle.get_stream());
+  source_indices.shrink_to_fit(handle.get_stream());
+
+  auto triplet_first =
+    thrust::make_zip_iterator(sort_indices.begin(), edge_srcs.begin(), edge_dsts.begin());
+  if (edge_weights) {
+    thrust::sort_by_key(handle.get_thrust_policy(),
+                        triplet_first,
+                        triplet_first + sort_indices.size(),
+                        (*edge_weights).begin());
+  } else {
+    thrust::sort(handle.get_thrust_policy(), triplet_first, triplet_first + sort_indices.size());
+  }
+
+  thrust::tabulate(
+    handle.get_thrust_policy(),
+    offsets.begin() + 1,
+    offsets.end(),
+    [sort_indices = raft::device_span<size_t const>(sort_indices.data(),
+                                                    sort_indices.size())] __device__(size_t i) {
+      return static_cast<size_t>(thrust::distance(
+        sort_indices.begin(),
+        thrust::upper_bound(thrust::seq, sort_indices.begin(), sort_indices.end(), i)));
+    });
+
+  return std::make_tuple(
+    std::move(offsets), std::move(edge_srcs), std::move(edge_dsts), std::move(edge_weights));
+}
+
+template std::tuple<rmm::device_uvector<size_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>>
+reorder_extracted_egonets(raft::handle_t const& handle,
+                          rmm::device_uvector<size_t>&& source_indices,
+                          rmm::device_uvector<size_t>&& offsets,
+                          rmm::device_uvector<int32_t>&& edge_srcs,
+                          rmm::device_uvector<int32_t>&& edge_dsts,
+                          std::optional<rmm::device_uvector<float>>&& edge_weights);
+
+template std::tuple<rmm::device_uvector<size_t>,
+                    rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>>
+reorder_extracted_egonets(raft::handle_t const& handle,
+                          rmm::device_uvector<size_t>&& source_indices,
+                          rmm::device_uvector<size_t>&& offsets,
+                          rmm::device_uvector<int32_t>&& edge_srcs,
+                          rmm::device_uvector<int32_t>&& edge_dsts,
+                          std::optional<rmm::device_uvector<double>>&& edge_weights);
+
+template std::tuple<rmm::device_uvector<size_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>>
+reorder_extracted_egonets(raft::handle_t const& handle,
+                          rmm::device_uvector<size_t>&& source_indices,
+                          rmm::device_uvector<size_t>&& offsets,
+                          rmm::device_uvector<int64_t>&& edge_srcs,
+                          rmm::device_uvector<int64_t>&& edge_dsts,
+                          std::optional<rmm::device_uvector<float>>&& edge_weights);
+
+template std::tuple<rmm::device_uvector<size_t>,
+                    rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>>
+reorder_extracted_egonets(raft::handle_t const& handle,
+                          rmm::device_uvector<size_t>&& source_indices,
+                          rmm::device_uvector<size_t>&& offsets,
+                          rmm::device_uvector<int64_t>&& edge_srcs,
+                          rmm::device_uvector<int64_t>&& edge_dsts,
+                          std::optional<rmm::device_uvector<double>>&& edge_weights);
+
 }  // namespace detail
 }  // namespace c_api
 }  // namespace cugraph
diff --git a/cpp/src/c_api/capi_helper.hpp b/cpp/src/c_api/capi_helper.hpp
index ce08e8d90d3..56401606477 100644
--- a/cpp/src/c_api/capi_helper.hpp
+++ b/cpp/src/c_api/capi_helper.hpp
@@ -36,6 +36,18 @@ void sort_by_key(raft::handle_t const& handle,
                  raft::device_span<key_t> keys,
                  raft::device_span<value_t> values);
 
+template <typename vertex_t, typename weight_t>
+std::tuple<rmm::device_uvector<size_t>,
+           rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>>
+reorder_extracted_egonets(raft::handle_t const& handle,
+                          rmm::device_uvector<size_t>&& source_indices,
+                          rmm::device_uvector<size_t>&& offsets,
+                          rmm::device_uvector<vertex_t>&& edge_srcs,
+                          rmm::device_uvector<vertex_t>&& edge_dsts,
+                          std::optional<rmm::device_uvector<weight_t>>&& edge_weights);
+
 }  // namespace detail
 }  // namespace c_api
 }  // namespace cugraph
diff --git a/cpp/src/c_api/extract_ego.cpp b/cpp/src/c_api/extract_ego.cpp
index 931d58b5185..cbe07af2e77 100644
--- a/cpp/src/c_api/extract_ego.cpp
+++ b/cpp/src/c_api/extract_ego.cpp
@@ -17,6 +17,7 @@
 #include <cugraph_c/algorithms.h>
 
 #include <c_api/abstract_functor.hpp>
+#include <c_api/capi_helper.hpp>
 #include <c_api/graph.hpp>
 #include <c_api/induced_subgraph_result.hpp>
 #include <c_api/resource_handle.hpp>
@@ -26,7 +27,10 @@
 #include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/utilities/device_comm.hpp>
+#include <cugraph/utilities/host_scalar_comm.hpp>
 
+#include <numeric>
 #include <optional>
 
 namespace {
@@ -91,9 +95,22 @@ struct extract_ego_functor : public cugraph::c_api::abstract_functor {
                  source_vertices.size(),
                  handle_.get_stream());
 
+      std::optional<rmm::device_uvector<size_t>> source_indices{std::nullopt};
+
       if constexpr (multi_gpu) {
-        source_vertices = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
-          handle_, std::move(source_vertices));
+        auto displacements = cugraph::host_scalar_allgather(
+          handle_.get_comms(), source_vertices.size(), handle_.get_stream());
+        std::exclusive_scan(
+          displacements.begin(), displacements.end(), displacements.begin(), size_t{0});
+        source_indices = rmm::device_uvector<size_t>(source_vertices.size(), handle_.get_stream());
+        cugraph::detail::sequence_fill(handle_.get_stream(),
+                                       (*source_indices).data(),
+                                       (*source_indices).size(),
+                                       displacements[handle_.get_comms().get_rank()]);
+
+        std::tie(source_vertices, source_indices) =
+          cugraph::detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+            handle_, std::move(source_vertices), std::move(*source_indices));
       }
 
       cugraph::renumber_ext_vertices<vertex_t, multi_gpu>(
@@ -130,6 +147,31 @@ struct extract_ego_functor : public cugraph::c_api::abstract_functor {
         graph_view.vertex_partition_range_lasts(),
         do_expensive_check_);
 
+      if constexpr (multi_gpu) {
+        auto recvcounts = cugraph::host_scalar_allgather(
+          handle_.get_comms(), (*source_indices).size(), handle_.get_stream());
+        std::vector<size_t> displacements(recvcounts.size());
+        std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0});
+        rmm::device_uvector<size_t> allgathered_indices(displacements.back() + recvcounts.back(),
+                                                        handle_.get_stream());
+        cugraph::device_allgatherv(handle_.get_comms(),
+                                   (*source_indices).begin(),
+                                   allgathered_indices.begin(),
+                                   recvcounts,
+                                   displacements,
+                                   handle_.get_stream());
+        source_indices = std::move(allgathered_indices);
+
+        std::tie(edge_offsets, src, dst, wgt) =
+          cugraph::c_api::detail::reorder_extracted_egonets<vertex_t, weight_t>(
+            handle_,
+            std::move(*source_indices),
+            std::move(edge_offsets),
+            std::move(src),
+            std::move(dst),
+            std::move(wgt));
+      }
+
       result_ = new cugraph::c_api::cugraph_induced_subgraph_result_t{
         new cugraph::c_api::cugraph_type_erased_device_array_t(src, graph_->vertex_type_),
         new cugraph::c_api::cugraph_type_erased_device_array_t(dst, graph_->vertex_type_),
diff --git a/cpp/src/community/detail/mis_impl.cuh b/cpp/src/community/detail/mis_impl.cuh
index bcd71af5a08..2659a982183 100644
--- a/cpp/src/community/detail/mis_impl.cuh
+++ b/cpp/src/community/detail/mis_impl.cuh
@@ -37,6 +37,8 @@
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 
+#include <cuda/functional>
+
 #include <cmath>
 
 namespace cugraph {
@@ -78,13 +80,13 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
   thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
 
   // Set ranks of zero out-degree vetices to std::numeric_limits<vertex_t>::lowest()
-  thrust::transform_if(
-    handle.get_thrust_policy(),
-    out_degrees.begin(),
-    out_degrees.end(),
-    ranks.begin(),
-    [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); },
-    [] __device__(auto deg) { return deg == 0; });
+  thrust::transform_if(handle.get_thrust_policy(),
+                       out_degrees.begin(),
+                       out_degrees.end(),
+                       ranks.begin(),
+                       cuda::proclaim_return_type<vertex_t>(
+                         [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
+                       [] __device__(auto deg) { return deg == 0; });
 
   out_degrees.resize(0, handle.get_stream());
   out_degrees.shrink_to_fit(handle.get_stream());
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index ebaae498d04..eb874657f01 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -46,6 +46,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 CUCO_DECLARE_BITWISE_COMPARABLE(float)
 CUCO_DECLARE_BITWISE_COMPARABLE(double)
 // FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched.
@@ -213,16 +215,17 @@ refine_clustering(
                                 : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
                                     louvain_assignment_of_vertices.data(), vertex_t{0}),
     *edge_weight_view,
-    [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) {
-      weight_t weighted_cut_contribution{0};
+    cuda::proclaim_return_type<weight_t>(
+      [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) {
+        weight_t weighted_cut_contribution{0};
 
-      if (src == dst)  // self loop
-        weighted_cut_contribution = 0;
-      else if (src_cluster == dst_cluster)
-        weighted_cut_contribution = wt;
+        if (src == dst)  // self loop
+          weighted_cut_contribution = 0;
+        else if (src_cluster == dst_cluster)
+          weighted_cut_contribution = wt;
 
-      return weighted_cut_contribution;
-    },
+        return weighted_cut_contribution;
+      }),
     weight_t{0},
     cugraph::reduce_op::plus<weight_t>{},
     weighted_cut_of_vertices_to_louvain.begin());
@@ -243,13 +246,14 @@ refine_clustering(
                     wcut_deg_and_cluster_vol_triple_begin,
                     wcut_deg_and_cluster_vol_triple_end,
                     singleton_and_connected_flags.begin(),
-                    [resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) {
+                    cuda::proclaim_return_type<uint8_t>([resolution, total_edge_weight] __device__(
+                                                          auto wcut_wdeg_and_louvain_volume) {
                       auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
                       auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
                       auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
-                      return wcut >
-                             (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight);
-                    });
+                      return static_cast<uint8_t>(
+                        wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight));
+                    }));
 
   edge_src_property_t<GraphViewType, weight_t> src_louvain_cluster_weight_cache(handle);
   edge_src_property_t<GraphViewType, weight_t> src_cut_to_louvain_cache(handle);
@@ -718,11 +722,12 @@ refine_clustering(
       vertices_in_mis.begin(),
       vertices_in_mis.end(),
       dst_vertices.begin(),
-      [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
-       v_first   = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
-        auto dst = *(dst_first + v - v_first);
-        return dst;
-      });
+      cuda::proclaim_return_type<vertex_t>(
+        [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
+         v_first   = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
+          auto dst = *(dst_first + v - v_first);
+          return dst;
+        }));
 
     cugraph::resize_dataframe_buffer(gain_and_dst_output_pairs, 0, handle.get_stream());
     cugraph::shrink_to_fit_dataframe_buffer(gain_and_dst_output_pairs, handle.get_stream());
diff --git a/cpp/src/community/leiden_impl.cuh b/cpp/src/community/leiden_impl.cuh
index b6e20272de9..1e2b8f2ad44 100644
--- a/cpp/src/community/leiden_impl.cuh
+++ b/cpp/src/community/leiden_impl.cuh
@@ -568,17 +568,17 @@ void flatten_leiden_dendrogram(raft::handle_t const& handle,
   leiden_partition_at_level<vertex_t, multi_gpu>(
     handle, dendrogram, clustering, dendrogram.num_levels());
 
-  rmm::device_uvector<vertex_t> unique_cluster_ids(graph_view.number_of_vertices(),
+  rmm::device_uvector<vertex_t> unique_cluster_ids(graph_view.local_vertex_partition_range_size(),
                                                    handle.get_stream());
   thrust::copy(handle.get_thrust_policy(),
                clustering,
-               clustering + graph_view.number_of_vertices(),
+               clustering + graph_view.local_vertex_partition_range_size(),
                unique_cluster_ids.begin());
 
   remove_duplicates<vertex_t, multi_gpu>(handle, unique_cluster_ids);
 
   relabel_cluster_ids<vertex_t, multi_gpu>(
-    handle, unique_cluster_ids, clustering, graph_view.number_of_vertices());
+    handle, unique_cluster_ids, clustering, graph_view.local_vertex_partition_range_size());
 }
 
 }  // namespace detail
diff --git a/cpp/src/detail/collect_local_vertex_values.cu b/cpp/src/detail/collect_local_vertex_values.cu
index 9d5d2cb553b..795902dfd87 100644
--- a/cpp/src/detail/collect_local_vertex_values.cu
+++ b/cpp/src/detail/collect_local_vertex_values.cu
@@ -19,6 +19,8 @@
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
+#include <cuda/functional>
+
 namespace cugraph {
 namespace detail {
 
@@ -64,7 +66,8 @@ rmm::device_uvector<value_t> collect_local_vertex_values_from_ext_vertex_value_p
 
   auto vertex_iterator = thrust::make_transform_iterator(
     d_vertices.begin(),
-    [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; });
+    cuda::proclaim_return_type<vertex_t>(
+      [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; }));
 
   d_local_values.resize(local_vertex_last - local_vertex_first, handle.get_stream());
   thrust::fill(
diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/detail/shuffle_vertices.cu
index bc450ce3bbf..94729a770f7 100644
--- a/cpp/src/detail/shuffle_vertices.cu
+++ b/cpp/src/detail/shuffle_vertices.cu
@@ -200,6 +200,12 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   rmm::device_uvector<int32_t>&& vertices,
   rmm::device_uvector<int32_t>&& values);
 
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<size_t>>
+shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int32_t>&& vertices,
+  rmm::device_uvector<size_t>&& values);
+
 template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>>
 shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   raft::handle_t const& handle,
@@ -224,6 +230,12 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   rmm::device_uvector<int64_t>&& vertices,
   rmm::device_uvector<int64_t>&& values);
 
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<size_t>>
+shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+  raft::handle_t const& handle,
+  rmm::device_uvector<int64_t>&& vertices,
+  rmm::device_uvector<size_t>&& values);
+
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<float>>
 shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   raft::handle_t const& handle,
diff --git a/cpp/src/generators/erdos_renyi_generator.cu b/cpp/src/generators/erdos_renyi_generator.cu
index 6d847ae0bde..8448eeaf960 100644
--- a/cpp/src/generators/erdos_renyi_generator.cu
+++ b/cpp/src/generators/erdos_renyi_generator.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,6 +28,8 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 namespace cugraph {
 
 template <typename vertex_t>
@@ -42,12 +44,13 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle,
                   "Implementation cannot support specified value");
 
   auto random_iterator = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<size_t>(0), [seed] __device__(size_t index) {
+    thrust::make_counting_iterator<size_t>(0),
+    cuda::proclaim_return_type<float>([seed] __device__(size_t index) {
       thrust::default_random_engine rng(seed);
       thrust::uniform_real_distribution<float> dist(0.0, 1.0);
       rng.discard(index);
       return dist(rng);
-    });
+    }));
 
   size_t count = thrust::count_if(handle.get_thrust_policy(),
                                   random_iterator,
@@ -69,13 +72,15 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle,
                     indices_v.begin(),
                     indices_v.end(),
-                    thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), src_v.end())),
-                    [num_vertices] __device__(size_t index) {
-                      size_t src = index / num_vertices;
-                      size_t dst = index % num_vertices;
-
-                      return thrust::make_tuple(static_cast<vertex_t>(src),
-                                                static_cast<vertex_t>(dst));
-                    });
+                    // zip src_v with dst_v so the dst half of each tuple is stored in dst_v
+                    thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), dst_v.begin())),
+                    cuda::proclaim_return_type<thrust::tuple<vertex_t, vertex_t>>(
+                      [num_vertices] __device__(size_t index) {
+                        size_t src = index / num_vertices;
+                        size_t dst = index % num_vertices;
+
+                        return thrust::make_tuple(static_cast<vertex_t>(src),
+                                                  static_cast<vertex_t>(dst));
+                      }));
 
   handle.sync_stream();
 
diff --git a/cpp/src/generators/simple_generators.cu b/cpp/src/generators/simple_generators.cu
index 6dba63909c3..65647be5de0 100644
--- a/cpp/src/generators/simple_generators.cu
+++ b/cpp/src/generators/simple_generators.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,8 @@
 #include <thrust/sequence.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <numeric>
 
 namespace cugraph {
@@ -264,23 +266,24 @@ generate_complete_graph_edgelist(
 
     auto transform_iter = thrust::make_transform_iterator(
       thrust::make_counting_iterator<size_t>(0),
-      [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
-        size_t graph_index = index / (num_vertices * num_vertices);
-        size_t local_index = index % (num_vertices * num_vertices);
-
-        vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
-        vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);
-
-        if (src == dst) {
-          src = invalid_vertex;
-          dst = invalid_vertex;
-        } else {
-          src += (graph_index * num_vertices);
-          dst += (graph_index * num_vertices);
-        }
-
-        return thrust::make_tuple(src, dst);
-      });
+      cuda::proclaim_return_type<thrust::tuple<vertex_t, vertex_t>>(
+        [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
+          size_t graph_index = index / (num_vertices * num_vertices);
+          size_t local_index = index % (num_vertices * num_vertices);
+
+          vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
+          vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);
+
+          if (src == dst) {
+            src = invalid_vertex;
+            dst = invalid_vertex;
+          } else {
+            src += (graph_index * num_vertices);
+            dst += (graph_index * num_vertices);
+          }
+
+          return thrust::make_tuple(src, dst);
+        }));
 
     output_iterator = thrust::copy_if(handle.get_thrust_policy(),
                                       transform_iter,
diff --git a/cpp/src/link_analysis/hits_impl.cuh b/cpp/src/link_analysis/hits_impl.cuh
index 674046745b1..5cdf1b9dc6a 100644
--- a/cpp/src/link_analysis/hits_impl.cuh
+++ b/cpp/src/link_analysis/hits_impl.cuh
@@ -80,6 +80,7 @@ std::tuple<result_t, size_t> hits(raft::handle_t const& handle,
   if (num_vertices == 0) { return std::make_tuple(diff_sum, final_iteration_count); }
 
   CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative.");
+  auto tolerance = static_cast<result_t>(graph_view.number_of_vertices()) * epsilon;
 
   // Check validity of initial guess if supplied
   if (has_initial_hubs_guess && do_expensive_check) {
@@ -171,7 +172,7 @@ std::tuple<result_t, size_t> hits(raft::handle_t const& handle,
     std::swap(prev_hubs, curr_hubs);
     iter++;
 
-    if (diff_sum < epsilon) {
+    if (diff_sum < tolerance) {
       break;
     } else if (iter >= max_iterations) {
       CUGRAPH_FAIL("HITS failed to converge.");
diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu
index 97fcd291c87..414f1bdfa88 100644
--- a/cpp/src/mtmg/vertex_result.cu
+++ b/cpp/src/mtmg/vertex_result.cu
@@ -21,21 +21,21 @@
 
 #include <detail/graph_partition_utils.cuh>
 
+#include <thrust/functional.h>
 #include <thrust/gather.h>
 
 namespace cugraph {
 namespace mtmg {
 
 template <typename result_t>
-template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
+template <typename vertex_t, bool multi_gpu>
 rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
   handle_t const& handle,
   raft::device_span<vertex_t const> vertices,
-  cugraph::mtmg::graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> const& graph_view,
+  std::vector<vertex_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<vertex_t, multi_gpu> vertex_partition_view,
   std::optional<cugraph::mtmg::renumber_map_view_t<vertex_t>>& renumber_map_view)
 {
-  auto this_gpu_graph_view = graph_view.get(handle);
-
   rmm::device_uvector<vertex_t> local_vertices(vertices.size(), handle.get_stream());
   rmm::device_uvector<int> vertex_gpu_ids(vertices.size(), handle.get_stream());
   rmm::device_uvector<size_t> vertex_pos(vertices.size(), handle.get_stream());
@@ -47,11 +47,11 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
   cugraph::detail::sequence_fill(
     handle.get_stream(), vertex_pos.data(), vertex_pos.size(), size_t{0});
 
-  rmm::device_uvector<vertex_t> d_vertex_partition_range_lasts(
-    this_gpu_graph_view.vertex_partition_range_lasts().size(), handle.get_stream());
+  rmm::device_uvector<vertex_t> d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(),
+                                                               handle.get_stream());
   raft::update_device(d_vertex_partition_range_lasts.data(),
-                      this_gpu_graph_view.vertex_partition_range_lasts().data(),
-                      this_gpu_graph_view.vertex_partition_range_lasts().size(),
+                      vertex_partition_range_lasts.data(),
+                      vertex_partition_range_lasts.size(),
                       handle.get_stream());
 
   if (renumber_map_view) {
@@ -60,8 +60,8 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
       local_vertices.data(),
       local_vertices.size(),
       renumber_map_view->get(handle).data(),
-      this_gpu_graph_view.local_vertex_partition_range_first(),
-      this_gpu_graph_view.local_vertex_partition_range_last());
+      vertex_partition_view.local_vertex_partition_range_first(),
+      vertex_partition_view.local_vertex_partition_range_last());
   }
 
   auto const major_comm_size =
@@ -89,13 +89,14 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
 
   auto& wrapped = this->get(handle);
 
-  auto vertex_partition = vertex_partition_device_view_t<vertex_t, multi_gpu>(
-    this_gpu_graph_view.local_vertex_partition_view());
+  auto vertex_partition =
+    vertex_partition_device_view_t<vertex_t, multi_gpu>(vertex_partition_view);
 
-  auto iter =
-    thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) {
+  auto iter = thrust::make_transform_iterator(
+    local_vertices.begin(),
+    cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
       return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-    });
+    }));
 
   thrust::gather(handle.get_thrust_policy(),
                  iter,
@@ -112,7 +113,7 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
       vertex_gpu_ids.begin(),
       vertex_gpu_ids.end(),
       thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()),
-      [] __device__(int gpu) { return gpu; },
+      thrust::identity{},
       handle.get_stream());
 
   //
@@ -130,37 +131,85 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
 template rmm::device_uvector<float> vertex_result_view_t<float>::gather(
   handle_t const& handle,
   raft::device_span<int32_t const> vertices,
-  cugraph::mtmg::graph_view_t<int32_t, int32_t, true, false> const& graph_view,
+  std::vector<int32_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int32_t, false> vertex_partition_view,
   std::optional<cugraph::mtmg::renumber_map_view_t<int32_t>>& renumber_map_view);
 
+template rmm::device_uvector<float> vertex_result_view_t<float>::gather(
+  handle_t const& handle,
+  raft::device_span<int64_t const> vertices,
+  std::vector<int64_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int64_t, false> vertex_partition_view,
+  std::optional<cugraph::mtmg::renumber_map_view_t<int64_t>>& renumber_map_view);
+
 template rmm::device_uvector<float> vertex_result_view_t<float>::gather(
   handle_t const& handle,
   raft::device_span<int32_t const> vertices,
-  cugraph::mtmg::graph_view_t<int32_t, int64_t, true, false> const& graph_view,
+  std::vector<int32_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int32_t, true> vertex_partition_view,
   std::optional<cugraph::mtmg::renumber_map_view_t<int32_t>>& renumber_map_view);
 
 template rmm::device_uvector<float> vertex_result_view_t<float>::gather(
   handle_t const& handle,
   raft::device_span<int64_t const> vertices,
-  cugraph::mtmg::graph_view_t<int64_t, int64_t, true, false> const& graph_view,
+  std::vector<int64_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int64_t, true> vertex_partition_view,
   std::optional<cugraph::mtmg::renumber_map_view_t<int64_t>>& renumber_map_view);
 
-template rmm::device_uvector<float> vertex_result_view_t<float>::gather(
+template rmm::device_uvector<double> vertex_result_view_t<double>::gather(
   handle_t const& handle,
   raft::device_span<int32_t const> vertices,
-  cugraph::mtmg::graph_view_t<int32_t, int32_t, true, true> const& graph_view,
+  std::vector<int32_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int32_t, false> vertex_partition_view,
   std::optional<cugraph::mtmg::renumber_map_view_t<int32_t>>& renumber_map_view);
 
-template rmm::device_uvector<float> vertex_result_view_t<float>::gather(
+template rmm::device_uvector<double> vertex_result_view_t<double>::gather(
+  handle_t const& handle,
+  raft::device_span<int64_t const> vertices,
+  std::vector<int64_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int64_t, false> vertex_partition_view,
+  std::optional<cugraph::mtmg::renumber_map_view_t<int64_t>>& renumber_map_view);
+
+template rmm::device_uvector<double> vertex_result_view_t<double>::gather(
   handle_t const& handle,
   raft::device_span<int32_t const> vertices,
-  cugraph::mtmg::graph_view_t<int32_t, int64_t, true, true> const& graph_view,
+  std::vector<int32_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int32_t, true> vertex_partition_view,
   std::optional<cugraph::mtmg::renumber_map_view_t<int32_t>>& renumber_map_view);
 
-template rmm::device_uvector<float> vertex_result_view_t<float>::gather(
+template rmm::device_uvector<double> vertex_result_view_t<double>::gather(
+  handle_t const& handle,
+  raft::device_span<int64_t const> vertices,
+  std::vector<int64_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int64_t, true> vertex_partition_view,
+  std::optional<cugraph::mtmg::renumber_map_view_t<int64_t>>& renumber_map_view);
+
+template rmm::device_uvector<int32_t> vertex_result_view_t<int32_t>::gather(
+  handle_t const& handle,
+  raft::device_span<int32_t const> vertices,
+  std::vector<int32_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int32_t, false> vertex_partition_view,
+  std::optional<cugraph::mtmg::renumber_map_view_t<int32_t>>& renumber_map_view);
+
+template rmm::device_uvector<int32_t> vertex_result_view_t<int32_t>::gather(
+  handle_t const& handle,
+  raft::device_span<int32_t const> vertices,
+  std::vector<int32_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int32_t, true> vertex_partition_view,
+  std::optional<cugraph::mtmg::renumber_map_view_t<int32_t>>& renumber_map_view);
+
+template rmm::device_uvector<int64_t> vertex_result_view_t<int64_t>::gather(
+  handle_t const& handle,
+  raft::device_span<int64_t const> vertices,
+  std::vector<int64_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int64_t, false> vertex_partition_view,
+  std::optional<cugraph::mtmg::renumber_map_view_t<int64_t>>& renumber_map_view);
+
+template rmm::device_uvector<int64_t> vertex_result_view_t<int64_t>::gather(
   handle_t const& handle,
   raft::device_span<int64_t const> vertices,
-  cugraph::mtmg::graph_view_t<int64_t, int64_t, true, true> const& graph_view,
+  std::vector<int64_t> const& vertex_partition_range_lasts,
+  vertex_partition_view_t<int64_t, true> vertex_partition_view,
   std::optional<cugraph::mtmg::renumber_map_view_t<int64_t>>& renumber_map_view);
 
 }  // namespace mtmg
diff --git a/cpp/src/prims/count_if_e.cuh b/cpp/src/prims/count_if_e.cuh
index f6e4bc9bead..9cff4f5eceb 100644
--- a/cpp/src/prims/count_if_e.cuh
+++ b/cpp/src/prims/count_if_e.cuh
@@ -74,8 +74,6 @@ typename GraphViewType::edge_type count_if_e(raft::handle_t const& handle,
   using vertex_t = typename GraphViewType::vertex_type;
   using edge_t   = typename GraphViewType::edge_type;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/fill_edge_property.cuh b/cpp/src/prims/fill_edge_property.cuh
index d446944b65b..e6875576044 100644
--- a/cpp/src/prims/fill_edge_property.cuh
+++ b/cpp/src/prims/fill_edge_property.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <cugraph/edge_partition_edge_property_device_view.cuh>
 #include <cugraph/edge_property.hpp>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/error.hpp>
@@ -23,6 +24,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/fill.h>
+#include <thrust/iterator/constant_iterator.h>
 
 #include <cstddef>
 
@@ -38,21 +40,78 @@ void fill_edge_property(raft::handle_t const& handle,
 {
   static_assert(std::is_same_v<T, typename EdgePropertyOutputWrapper::value_type>);
 
+  using edge_t = typename GraphViewType::edge_type;
+
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   auto value_firsts = edge_property_output.value_firsts();
   auto edge_counts  = edge_property_output.edge_counts();
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
+
     if constexpr (cugraph::has_packed_bool_element<
                     std::remove_reference_t<decltype(value_firsts[i])>,
                     T>()) {
       static_assert(std::is_arithmetic_v<T>, "unimplemented for thrust::tuple types.");
       auto packed_input = input ? packed_bool_full_mask() : packed_bool_empty_mask();
-      thrust::fill_n(handle.get_thrust_policy(),
-                     value_firsts[i],
-                     packed_bool_size(static_cast<size_t>(edge_counts[i])),
-                     packed_input);
+      auto rem          = edge_counts[i] % packed_bools_per_word();
+      if (edge_partition_e_mask) {
+        auto input_first =
+          thrust::make_zip_iterator(value_firsts[i], (*edge_partition_e_mask).value_first());
+        thrust::transform(handle.get_thrust_policy(),
+                          input_first,
+                          input_first + packed_bool_size(static_cast<size_t>(edge_counts[i] - rem)),
+                          value_firsts[i],
+                          [packed_input] __device__(thrust::tuple<uint32_t, uint32_t> pair) {
+                            auto old_value = thrust::get<0>(pair);  // whole packed word
+                            auto mask      = thrust::get<1>(pair);
+                            return (old_value & ~mask) | (packed_input & mask);
+                          });
+        if (rem > 0) {
+          thrust::transform(
+            handle.get_thrust_policy(),
+            input_first + packed_bool_size(static_cast<size_t>(edge_counts[i] - rem)),
+            input_first + packed_bool_size(static_cast<size_t>(edge_counts[i])),
+            value_firsts[i] + packed_bool_size(static_cast<size_t>(edge_counts[i] - rem)),
+            [packed_input, rem] __device__(thrust::tuple<uint32_t, uint32_t> pair) {
+              auto old_value = thrust::get<0>(pair);  // whole packed word
+              auto mask      = thrust::get<1>(pair);
+              return ((old_value & ~mask) | (packed_input & mask)) & packed_bool_partial_mask(rem);
+            });
+        }
+      } else {
+        thrust::fill_n(handle.get_thrust_policy(),
+                       value_firsts[i],
+                       packed_bool_size(static_cast<size_t>(edge_counts[i] - rem)),
+                       packed_input);
+        if (rem > 0) {
+          thrust::fill_n(
+            handle.get_thrust_policy(),
+            value_firsts[i] + packed_bool_size(static_cast<size_t>(edge_counts[i] - rem)),
+            1,
+            packed_input & packed_bool_partial_mask(rem));
+        }
+      }
     } else {
-      thrust::fill_n(
-        handle.get_thrust_policy(), value_firsts[i], static_cast<size_t>(edge_counts[i]), input);
+      if (edge_partition_e_mask) {
+        thrust::transform_if(handle.get_thrust_policy(),
+                             thrust::make_constant_iterator(input),
+                             thrust::make_constant_iterator(input) + edge_counts[i],
+                             thrust::make_counting_iterator(edge_t{0}),
+                             value_firsts[i],
+                             thrust::identity<T>{},
+                             [edge_partition_e_mask = *edge_partition_e_mask] __device__(edge_t i) {
+                               return edge_partition_e_mask.get(i);
+                             });
+      } else {
+        thrust::fill_n(
+          handle.get_thrust_policy(), value_firsts[i], static_cast<size_t>(edge_counts[i]), input);
+      }
     }
   }
 }
@@ -79,8 +138,6 @@ void fill_edge_property(raft::handle_t const& handle,
                         edge_property_t<GraphViewType, T>& edge_property_output,
                         bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 5fee97790f1..4c5c43c7d1e 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -44,6 +44,8 @@
 #include <thrust/tuple.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 #include <optional>
 #include <tuple>
 
@@ -596,8 +598,9 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
                                         multiplier_t<size_t>{high_partition_over_sampling_K}),
         thrust::make_transform_iterator(
           thrust::make_counting_iterator(size_t{0}),
-          [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
-            size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }),
+          cuda::proclaim_return_type<size_t>(
+            [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
+              size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
         handle.get_stream());
       if (tmp_storage_bytes > d_tmp_storage.size()) {
         d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
@@ -615,8 +618,9 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
                                         multiplier_t<size_t>{high_partition_over_sampling_K}),
         thrust::make_transform_iterator(
           thrust::make_counting_iterator(size_t{0}),
-          [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
-            size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }),
+          cuda::proclaim_return_type<size_t>(
+            [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
+              size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
         handle.get_stream());
 
       // copy the neighbor indices back to sample_nbr_indices
diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
index 0b6c6a554bb..1a7fc0130c4 100644
--- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
@@ -51,6 +51,8 @@
 #include <thrust/tuple.h>
 #include <thrust/type_traits/integer_sequence.h>
 
+#include <cuda/functional>
+
 #include <numeric>
 #include <type_traits>
 #include <utility>
@@ -591,10 +593,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
           value_size = sizeof(T);
         }
 
-        auto avg_vertex_degree = graph_view.number_of_vertices() > 0
-                                   ? (static_cast<double>(graph_view.number_of_edges()) /
-                                      static_cast<double>(graph_view.number_of_vertices()))
-                                   : double{0.0};
+        auto avg_vertex_degree =
+          graph_view.number_of_vertices() > 0
+            ? (static_cast<double>(graph_view.compute_number_of_edges(handle)) /
+               static_cast<double>(graph_view.number_of_vertices()))
+            : double{0.0};
 
         num_streams =
           std::min(static_cast<size_t>(avg_vertex_degree * (static_cast<double>(sizeof(vertex_t)) /
@@ -940,16 +943,19 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
                      minor_init);
         auto value_first = thrust::make_transform_iterator(
           view.value_first(),
-          [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); });
-        thrust::scatter(
-          handle.get_thrust_policy(),
-          value_first + (*minor_key_offsets)[i],
-          value_first + (*minor_key_offsets)[i + 1],
-          thrust::make_transform_iterator(
-            (*(view.keys())).begin() + (*minor_key_offsets)[i],
-            [key_first = graph_view.vertex_partition_range_first(
-               this_segment_vertex_partition_id)] __device__(auto key) { return key - key_first; }),
-          tx_buffer_first);
+          cuda::proclaim_return_type<T>(
+            [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); }));
+        thrust::scatter(handle.get_thrust_policy(),
+                        value_first + (*minor_key_offsets)[i],
+                        value_first + (*minor_key_offsets)[i + 1],
+                        thrust::make_transform_iterator(
+                          (*(view.keys())).begin() + (*minor_key_offsets)[i],
+                          cuda::proclaim_return_type<vertex_t>(
+                            [key_first = graph_view.vertex_partition_range_first(
+                               this_segment_vertex_partition_id)] __device__(auto key) {
+                              return key - key_first;
+                            })),
+                        tx_buffer_first);
         device_reduce(major_comm,
                       tx_buffer_first,
                       vertex_value_output_first,
diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh
index edacdc8a970..c6623621d24 100644
--- a/cpp/src/prims/transform_e.cuh
+++ b/cpp/src/prims/transform_e.cuh
@@ -16,10 +16,12 @@
 #pragma once
 
 #include <cugraph/edge_partition_device_view.cuh>
+#include <cugraph/edge_partition_edge_property_device_view.cuh>
 #include <cugraph/edge_partition_endpoint_property_device_view.cuh>
 #include <cugraph/edge_src_dst_property.hpp>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/error.hpp>
+#include <cugraph/utilities/mask_utils.cuh>
 #include <cugraph/utilities/packed_bool_utils.hpp>
 
 #include <raft/core/handle.hpp>
@@ -44,6 +46,7 @@ template <typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename EdgePartitionEdgeValueOutputWrapper,
           typename EdgeOp>
 __global__ void transform_e_packed_bool(
@@ -53,6 +56,7 @@ __global__ void transform_e_packed_bool(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   EdgePartitionEdgeValueOutputWrapper edge_partition_e_value_output,
   EdgeOp e_op)
 {
@@ -68,11 +72,14 @@ __global__ void transform_e_packed_bool(
 
   auto num_edges = edge_partition.number_of_edges();
   while (idx < static_cast<edge_t>(packed_bool_size(num_edges))) {
+    auto edge_mask = packed_bool_full_mask();
+    if (edge_partition_e_mask) { edge_mask = *((*edge_partition_e_mask).value_first() + idx); }
+
     auto local_edge_idx =
       idx * static_cast<edge_t>(packed_bools_per_word()) + static_cast<edge_t>(lane_id);
-    uint32_t mask{0};
     int predicate{0};
-    if (local_edge_idx < num_edges) {
+
+    if ((local_edge_idx < num_edges) && (edge_mask & packed_bool_mask(lane_id))) {
       auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(local_edge_idx);
       auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
       auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
@@ -91,8 +98,15 @@ __global__ void transform_e_packed_bool(
                           ? int{1}
                           : int{0};
     }
-    mask = __ballot_sync(uint32_t{0xffffffff}, predicate);
-    if (lane_id == 0) { *(edge_partition_e_value_output.value_first() + idx) = mask; }
+    uint32_t new_val = __ballot_sync(uint32_t{0xffffffff}, predicate);
+    if (lane_id == 0) {
+      if (edge_mask == packed_bool_full_mask()) {
+        *(edge_partition_e_value_output.value_first() + idx) = new_val;
+      } else {
+        auto old_val = *(edge_partition_e_value_output.value_first() + idx);
+        *(edge_partition_e_value_output.value_first() + idx) = (old_val & ~edge_mask) | new_val;
+      }
+    }
 
     idx += static_cast<edge_t>(gridDim.x * (blockDim.x / raft::warp_size()));
   }
@@ -178,12 +192,18 @@ void transform_e(raft::handle_t const& handle,
     typename EdgeValueOutputWrapper::value_iterator,
     typename EdgeValueOutputWrapper::value_type>;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
+  auto edge_mask_view = graph_view.edge_mask_view();
 
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
 
     edge_partition_src_input_device_view_t edge_partition_src_value_input{};
     edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
@@ -214,35 +234,40 @@ void transform_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             edge_partition_e_value_output,
             e_op);
       }
     } else {
-      thrust::transform(
+      thrust::for_each(
         handle.get_thrust_policy(),
         thrust::make_counting_iterator(edge_t{0}),
         thrust::make_counting_iterator(num_edges),
-        edge_partition_e_value_output.value_first(),
         [e_op,
          edge_partition,
          edge_partition_src_value_input,
          edge_partition_dst_value_input,
-         edge_partition_e_value_input] __device__(edge_t i) {
-          auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(i);
-          auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
-          auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-          auto minor        = *(edge_partition.indices() + i);
-          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-
-          auto src        = GraphViewType::is_storage_transposed ? minor : major;
-          auto dst        = GraphViewType::is_storage_transposed ? major : minor;
-          auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-          auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-          return e_op(src,
-                      dst,
-                      edge_partition_src_value_input.get(src_offset),
-                      edge_partition_dst_value_input.get(dst_offset),
-                      edge_partition_e_value_input.get(i));
+         edge_partition_e_value_input,
+         edge_partition_e_mask,
+         edge_partition_e_value_output] __device__(edge_t i) {
+          if (!edge_partition_e_mask || (*edge_partition_e_mask).get(i)) {
+            auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(i);
+            auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
+            auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+            auto minor        = *(edge_partition.indices() + i);
+            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+            auto src         = GraphViewType::is_storage_transposed ? minor : major;
+            auto dst         = GraphViewType::is_storage_transposed ? major : minor;
+            auto src_offset  = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+            auto dst_offset  = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+            auto e_op_result = e_op(src,
+                                    dst,
+                                    edge_partition_src_value_input.get(src_offset),
+                                    edge_partition_dst_value_input.get(dst_offset),
+                                    edge_partition_e_value_input.get(i));
+            edge_partition_e_value_output.set(i, e_op_result);
+          }
         });
     }
   }
@@ -336,14 +361,12 @@ void transform_e(raft::handle_t const& handle,
     typename EdgeValueOutputWrapper::value_iterator,
     typename EdgeValueOutputWrapper::value_type>;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   auto major_first =
     GraphViewType::is_storage_transposed ? edge_list.dst_begin() : edge_list.src_begin();
   auto minor_first =
     GraphViewType::is_storage_transposed ? edge_list.src_begin() : edge_list.dst_begin();
 
-  auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(major_first, minor_first));
+  auto edge_first = thrust::make_zip_iterator(major_first, minor_first);
 
   if (do_expensive_check) {
     CUGRAPH_EXPECTS(
@@ -382,10 +405,18 @@ void transform_e(raft::handle_t const& handle,
     edge_partition_offsets.back() = edge_list.size();
   }
 
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
 
     if (do_expensive_check) {
       CUGRAPH_EXPECTS(
@@ -393,7 +424,8 @@ void transform_e(raft::handle_t const& handle,
           handle.get_thrust_policy(),
           edge_first + edge_partition_offsets[i],
           edge_first + edge_partition_offsets[i + 1],
-          [edge_partition] __device__(thrust::tuple<vertex_t, vertex_t> edge) {
+          [edge_partition,
+           edge_partition_e_mask] __device__(thrust::tuple<vertex_t, vertex_t> edge) {
             auto major = thrust::get<0>(edge);
             auto minor = thrust::get<1>(edge);
             vertex_t major_idx{};
@@ -416,8 +448,19 @@ void transform_e(raft::handle_t const& handle,
             edge_t edge_offset{};
             edge_t local_degree{};
             thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
-            auto it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
-            return *it != minor;
+            auto lower_it =
+              thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
+            if ((lower_it == indices + local_degree) || (*lower_it != minor)) { return true; }
+            if (edge_partition_e_mask) {
+              auto upper_it =
+                thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor);
+              if (detail::count_set_bits((*edge_partition_e_mask).value_first(),
+                                         edge_offset + thrust::distance(indices, lower_it),
+                                         thrust::distance(lower_it, upper_it)) == 0) {
+                return true;
+              }
+            }
+            return false;
           }) == 0,
         "Invalid input arguments: edge_list contains edges that do not exist in the input graph.");
     }
@@ -446,6 +489,7 @@ void transform_e(raft::handle_t const& handle,
        edge_partition_src_value_input,
        edge_partition_dst_value_input,
        edge_partition_e_value_input,
+       edge_partition_e_mask,
        edge_partition_e_value_output] __device__(thrust::tuple<vertex_t, vertex_t> edge) {
         auto major = thrust::get<0>(edge);
         auto minor = thrust::get<1>(edge);
@@ -469,7 +513,7 @@ void transform_e(raft::handle_t const& handle,
         edge_t local_degree{};
         thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
         auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
-        auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor);
+        auto upper_it = thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor);
 
         auto src        = GraphViewType::is_storage_transposed ? minor : major;
         auto dst        = GraphViewType::is_storage_transposed ? major : minor;
@@ -478,14 +522,17 @@ void transform_e(raft::handle_t const& handle,
 
         for (auto it = lower_it; it != upper_it; ++it) {
           assert(*it == minor);
-          auto e_op_result =
-            e_op(src,
-                 dst,
-                 edge_partition_src_value_input.get(src_offset),
-                 edge_partition_dst_value_input.get(dst_offset),
-                 edge_partition_e_value_input.get(edge_offset + thrust::distance(indices, it)));
-          edge_partition_e_value_output.set(edge_offset + thrust::distance(indices, it),
-                                            e_op_result);
+          if (!edge_partition_e_mask ||
+              ((*edge_partition_e_mask).get(edge_offset + thrust::distance(indices, it)))) {
+            auto e_op_result =
+              e_op(src,
+                   dst,
+                   edge_partition_src_value_input.get(src_offset),
+                   edge_partition_dst_value_input.get(dst_offset),
+                   edge_partition_e_value_input.get(edge_offset + thrust::distance(indices, it)));
+            edge_partition_e_value_output.set(edge_offset + thrust::distance(indices, it),
+                                              e_op_result);
+          }
         }
       });
   }
diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh
index 9c23f3fca18..483ab64dcd9 100644
--- a/cpp/src/prims/transform_reduce_e.cuh
+++ b/cpp/src/prims/transform_reduce_e.cuh
@@ -56,6 +56,7 @@ template <typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
 __global__ void transform_reduce_e_hypersparse(
@@ -65,6 +66,7 @@ __global__ void transform_reduce_e_hypersparse(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   ResultIterator result_iter /* size 1 */,
   EdgeOp e_op)
 {
@@ -101,24 +103,31 @@ __global__ void transform_reduce_e_hypersparse(
        &edge_partition_src_value_input,
        &edge_partition_dst_value_input,
        &edge_partition_e_value_input,
+       &edge_partition_e_mask,
        &e_op,
        major,
        indices,
        edge_offset] __device__(auto i) {
-        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-        auto minor        = indices[i];
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        auto src          = GraphViewType::is_storage_transposed ? minor : major;
-        auto dst          = GraphViewType::is_storage_transposed ? major : minor;
-        auto src_offset =
-          GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-        auto dst_offset =
-          GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-        return e_op(src,
-                    dst,
-                    edge_partition_src_value_input.get(src_offset),
-                    edge_partition_dst_value_input.get(dst_offset),
-                    edge_partition_e_value_input.get(edge_offset + i));
+        if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
+          auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+          auto minor        = indices[i];
+          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+          auto src          = GraphViewType::is_storage_transposed ? minor : major;
+          auto dst          = GraphViewType::is_storage_transposed ? major : minor;
+          auto src_offset   = GraphViewType::is_storage_transposed
+                                                                       ? minor_offset
+                                                                       : static_cast<vertex_t>(major_offset);
+          auto dst_offset   = GraphViewType::is_storage_transposed
+                                                                       ? static_cast<vertex_t>(major_offset)
+                                                                       : minor_offset;
+          return e_op(src,
+                      dst,
+                      edge_partition_src_value_input.get(src_offset),
+                      edge_partition_dst_value_input.get(dst_offset),
+                      edge_partition_e_value_input.get(edge_offset + i));
+        } else {
+          return e_op_result_t{};
+        }
       },
       e_op_result_t{},
       edge_property_add);
@@ -135,6 +144,7 @@ template <typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
 __global__ void transform_reduce_e_low_degree(
@@ -146,6 +156,7 @@ __global__ void transform_reduce_e_low_degree(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   ResultIterator result_iter /* size 1 */,
   EdgeOp e_op)
 {
@@ -177,27 +188,34 @@ __global__ void transform_reduce_e_low_degree(
        &edge_partition_src_value_input,
        &edge_partition_dst_value_input,
        &edge_partition_e_value_input,
+       &edge_partition_e_mask,
        &e_op,
        major_offset,
        indices,
        edge_offset] __device__(auto i) {
-        auto minor        = indices[i];
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        auto src          = GraphViewType::is_storage_transposed
-                                                                     ? minor
-                                                                     : edge_partition.major_from_major_offset_nocheck(major_offset);
-        auto dst          = GraphViewType::is_storage_transposed
-                                                                     ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                                                                     : minor;
-        auto src_offset =
-          GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-        auto dst_offset =
-          GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-        return e_op(src,
-                    dst,
-                    edge_partition_src_value_input.get(src_offset),
-                    edge_partition_dst_value_input.get(dst_offset),
-                    edge_partition_e_value_input.get(edge_offset + i));
+        if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
+          auto minor        = indices[i];
+          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+          auto src          = GraphViewType::is_storage_transposed
+                                                                       ? minor
+                                                                       : edge_partition.major_from_major_offset_nocheck(major_offset);
+          auto dst          = GraphViewType::is_storage_transposed
+                                                                       ? edge_partition.major_from_major_offset_nocheck(major_offset)
+                                                                       : minor;
+          auto src_offset   = GraphViewType::is_storage_transposed
+                                                                       ? minor_offset
+                                                                       : static_cast<vertex_t>(major_offset);
+          auto dst_offset   = GraphViewType::is_storage_transposed
+                                                                       ? static_cast<vertex_t>(major_offset)
+                                                                       : minor_offset;
+          return e_op(src,
+                      dst,
+                      edge_partition_src_value_input.get(src_offset),
+                      edge_partition_dst_value_input.get(dst_offset),
+                      edge_partition_e_value_input.get(edge_offset + i));
+        } else {
+          return e_op_result_t{};
+        }
       },
       e_op_result_t{},
       edge_property_add);
@@ -214,6 +232,7 @@ template <typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
 __global__ void transform_reduce_e_mid_degree(
@@ -225,6 +244,7 @@ __global__ void transform_reduce_e_mid_degree(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   ResultIterator result_iter /* size 1 */,
   EdgeOp e_op)
 {
@@ -250,24 +270,26 @@ __global__ void transform_reduce_e_mid_degree(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
     for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
-      auto minor        = indices[i];
-      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-      auto src          = GraphViewType::is_storage_transposed
-                            ? minor
-                            : edge_partition.major_from_major_offset_nocheck(major_offset);
-      auto dst          = GraphViewType::is_storage_transposed
-                            ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                            : minor;
-      auto src_offset =
-        GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-      auto dst_offset =
-        GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-      auto e_op_result = e_op(src,
-                              dst,
-                              edge_partition_src_value_input.get(src_offset),
-                              edge_partition_dst_value_input.get(dst_offset),
-                              edge_partition_e_value_input.get(edge_offset + i));
-      e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
+      if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
+        auto minor        = indices[i];
+        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+        auto src          = GraphViewType::is_storage_transposed
+                              ? minor
+                              : edge_partition.major_from_major_offset_nocheck(major_offset);
+        auto dst          = GraphViewType::is_storage_transposed
+                              ? edge_partition.major_from_major_offset_nocheck(major_offset)
+                              : minor;
+        auto src_offset =
+          GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
+        auto dst_offset =
+          GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
+        auto e_op_result = e_op(src,
+                                dst,
+                                edge_partition_src_value_input.get(src_offset),
+                                edge_partition_dst_value_input.get(dst_offset),
+                                edge_partition_e_value_input.get(edge_offset + i));
+        e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
+      }
     }
     idx += gridDim.x * (blockDim.x / raft::warp_size());
   }
@@ -280,6 +302,7 @@ template <typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
           typename ResultIterator,
           typename EdgeOp>
 __global__ void transform_reduce_e_high_degree(
@@ -291,6 +314,7 @@ __global__ void transform_reduce_e_high_degree(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
+  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
   ResultIterator result_iter /* size 1 */,
   EdgeOp e_op)
 {
@@ -313,24 +337,26 @@ __global__ void transform_reduce_e_high_degree(
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
     for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
-      auto minor        = indices[i];
-      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-      auto src          = GraphViewType::is_storage_transposed
-                            ? minor
-                            : edge_partition.major_from_major_offset_nocheck(major_offset);
-      auto dst          = GraphViewType::is_storage_transposed
-                            ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                            : minor;
-      auto src_offset =
-        GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-      auto dst_offset =
-        GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-      auto e_op_result = e_op(src,
-                              dst,
-                              edge_partition_src_value_input.get(src_offset),
-                              edge_partition_dst_value_input.get(dst_offset),
-                              edge_partition_e_value_input.get(edge_offset + i));
-      e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
+      if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
+        auto minor        = indices[i];
+        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+        auto src          = GraphViewType::is_storage_transposed
+                              ? minor
+                              : edge_partition.major_from_major_offset_nocheck(major_offset);
+        auto dst          = GraphViewType::is_storage_transposed
+                              ? edge_partition.major_from_major_offset_nocheck(major_offset)
+                              : minor;
+        auto src_offset =
+          GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
+        auto dst_offset =
+          GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
+        auto e_op_result = e_op(src,
+                                dst,
+                                edge_partition_src_value_input.get(src_offset),
+                                edge_partition_dst_value_input.get(dst_offset),
+                                edge_partition_e_value_input.get(edge_offset + i));
+        e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
+      }
     }
     idx += gridDim.x;
   }
@@ -417,8 +443,6 @@ T transform_reduce_e(raft::handle_t const& handle,
       typename EdgeValueInputWrapper::value_iterator,
       typename EdgeValueInputWrapper::value_type>>;
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -431,10 +455,18 @@ T transform_reduce_e(raft::handle_t const& handle,
                get_dataframe_buffer_begin(result_buffer) + 1,
                T{});
 
+  auto edge_mask_view = graph_view.edge_mask_view();
+
   for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) {
     auto edge_partition =
       edge_partition_device_view_t<vertex_t, edge_t, GraphViewType::is_multi_gpu>(
         graph_view.local_edge_partition_view(i));
+    auto edge_partition_e_mask =
+      edge_mask_view
+        ? thrust::make_optional<
+            detail::edge_partition_edge_property_device_view_t<edge_t, uint32_t const*, bool>>(
+            *edge_mask_view, i)
+        : thrust::nullopt;
 
     edge_partition_src_input_device_view_t edge_partition_src_value_input{};
     edge_partition_dst_input_device_view_t edge_partition_dst_value_input{};
@@ -467,6 +499,7 @@ T transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_dataframe_buffer_begin(result_buffer),
             e_op);
       }
@@ -482,6 +515,7 @@ T transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_dataframe_buffer_begin(result_buffer),
             e_op);
       }
@@ -497,6 +531,7 @@ T transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_dataframe_buffer_begin(result_buffer),
             e_op);
       }
@@ -510,6 +545,7 @@ T transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_dataframe_buffer_begin(result_buffer),
             e_op);
       }
@@ -527,6 +563,7 @@ T transform_reduce_e(raft::handle_t const& handle,
             edge_partition_src_value_input,
             edge_partition_dst_value_input,
             edge_partition_e_value_input,
+            edge_partition_e_mask,
             get_dataframe_buffer_begin(result_buffer),
             e_op);
       }
@@ -601,8 +638,6 @@ auto transform_reduce_e(raft::handle_t const& handle,
     edge_op_result_type<vertex_t, vertex_t, src_value_t, dst_value_t, e_value_t, EdgeOp>::type;
   static_assert(!std::is_same_v<T, void>);
 
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh
index 2d72a075ca5..0c7058cccb4 100644
--- a/cpp/src/prims/update_edge_src_dst_property.cuh
+++ b/cpp/src/prims/update_edge_src_dst_property.cuh
@@ -44,6 +44,8 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/scatter.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <numeric>
 #include <type_traits>
@@ -181,13 +183,14 @@ void update_edge_major_property(raft::handle_t const& handle,
                        handle.get_stream());
           auto bool_first = thrust::make_transform_iterator(
             (*edge_partition_keys)[i].begin(),
-            [rx_value_first,
-             v_first = graph_view.vertex_partition_range_first(
-               major_range_vertex_partition_id)] __device__(auto v) {
-              auto v_offset = v - v_first;
-              return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
-                                       packed_bool_mask(v_offset));
-            });
+            cuda::proclaim_return_type<bool>(
+              [rx_value_first,
+               v_first = graph_view.vertex_partition_range_first(
+                 major_range_vertex_partition_id)] __device__(auto v) {
+                auto v_offset = v - v_first;
+                return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                         packed_bool_mask(v_offset));
+              }));
           pack_bools(handle,
                      bool_first,
                      bool_first + (*edge_partition_keys)[i].size(),
@@ -202,8 +205,9 @@ void update_edge_major_property(raft::handle_t const& handle,
 
           auto v_offset_first = thrust::make_transform_iterator(
             (*edge_partition_keys)[i].begin(),
-            [v_first = graph_view.vertex_partition_range_first(
-               major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
+            cuda::proclaim_return_type<vertex_t>(
+              [v_first = graph_view.vertex_partition_range_first(
+                 major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }));
           thrust::gather(handle.get_thrust_policy(),
                          v_offset_first,
                          v_offset_first + (*edge_partition_keys)[i].size(),
@@ -312,21 +316,24 @@ void update_edge_major_property(raft::handle_t const& handle,
             graph_view.local_vertex_partition_view());
         if constexpr (packed_bool) {
           auto bool_first = thrust::make_transform_iterator(
-            vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) {
+            vertex_first,
+            cuda::proclaim_return_type<bool>([vertex_property_input_first,
+                                              vertex_partition] __device__(auto v) {
               auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
               return static_cast<bool>(
                 *(vertex_property_input_first + packed_bool_offset(v_offset)) &
                 packed_bool_mask(v_offset));
-            });
+            }));
           pack_bools(handle,
                      bool_first,
                      bool_first + thrust::distance(vertex_first, vertex_last),
                      rx_value_first);
         } else {
-          auto map_first =
-            thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+          auto map_first = thrust::make_transform_iterator(
+            vertex_first,
+            cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
               return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-            });
+            }));
           // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
           // permutation iterator (and directly gathers to the internal buffer)
           thrust::gather(handle.get_thrust_policy(),
@@ -391,9 +398,10 @@ void update_edge_major_property(raft::handle_t const& handle,
             });
         } else {
           auto map_first = thrust::make_transform_iterator(
-            rx_vertices.begin(), [edge_partition] __device__(auto v) {
+            rx_vertices.begin(),
+            cuda::proclaim_return_type<vertex_t>([edge_partition] __device__(auto v) {
               return edge_partition.major_offset_from_major_nocheck(v);
-            });
+            }));
           // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
           // directly scatters from the internal buffer)
           thrust::scatter(handle.get_thrust_policy(),
@@ -471,7 +479,8 @@ void update_edge_minor_property(raft::handle_t const& handle,
       bcast_size *= sizeof(typename EdgeMinorPropertyOutputWrapper::value_type);
     }
     auto num_concurrent_bcasts =
-      (static_cast<size_t>(graph_view.number_of_edges() / comm_size) * sizeof(vertex_t)) /
+      (static_cast<size_t>(graph_view.compute_number_of_edges(handle) / comm_size) *
+       sizeof(vertex_t)) /
       std::max(bcast_size, size_t{1});
     num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1});
     num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast<size_t>(major_comm_size));
@@ -593,13 +602,14 @@ void update_edge_minor_property(raft::handle_t const& handle,
 
                 auto bool_first = thrust::make_transform_iterator(
                   (*edge_partition_keys).begin() + key_offsets[j],
-                  [rx_value_first,
-                   v_first = graph_view.vertex_partition_range_first(
-                     minor_range_vertex_partition_id)] __device__(auto v) {
-                    auto v_offset = v - v_first;
-                    return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
-                                             packed_bool_mask(v_offset));
-                  });
+                  cuda::proclaim_return_type<bool>(
+                    [rx_value_first,
+                     v_first = graph_view.vertex_partition_range_first(
+                       minor_range_vertex_partition_id)] __device__(auto v) {
+                      auto v_offset = v - v_first;
+                      return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                               packed_bool_mask(v_offset));
+                    }));
                 pack_unaligned_bools(
                   handle,
                   bool_first,
@@ -611,10 +621,10 @@ void update_edge_minor_property(raft::handle_t const& handle,
                   std::get<std::vector<size_t>>(key_offsets_or_rx_displacements);
                 auto bool_first = thrust::make_transform_iterator(
                   thrust::make_counting_iterator(vertex_t{0}),
-                  [rx_value_first] __device__(vertex_t v_offset) {
+                  cuda::proclaim_return_type<bool>([rx_value_first] __device__(vertex_t v_offset) {
                     return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
                                              packed_bool_mask(v_offset));
-                  });
+                  }));
                 pack_unaligned_bools(
                   handle,
                   bool_first,
@@ -630,8 +640,9 @@ void update_edge_minor_property(raft::handle_t const& handle,
 
               auto v_offset_first = thrust::make_transform_iterator(
                 (*edge_partition_keys).begin() + key_offsets[j],
-                [v_first = graph_view.vertex_partition_range_first(
-                   minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
+                cuda::proclaim_return_type<vertex_t>(
+                  [v_first = graph_view.vertex_partition_range_first(
+                     minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }));
               thrust::gather(handle.get_thrust_policy(),
                              v_offset_first,
                              v_offset_first + (key_offsets[j + 1] - key_offsets[j]),
@@ -718,21 +729,24 @@ void update_edge_minor_property(raft::handle_t const& handle,
             graph_view.local_vertex_partition_view());
         if constexpr (packed_bool) {
           auto bool_first = thrust::make_transform_iterator(
-            vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) {
+            vertex_first,
+            cuda::proclaim_return_type<bool>([vertex_property_input_first,
+                                              vertex_partition] __device__(auto v) {
               auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
               return static_cast<bool>(
                 *(vertex_property_input_first + packed_bool_offset(v_offset)) &
                 packed_bool_mask(v_offset));
-            });
+            }));
           pack_bools(handle,
                      bool_first,
                      bool_first + thrust::distance(vertex_first, vertex_last),
                      rx_value_first);
         } else {
-          auto map_first =
-            thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+          auto map_first = thrust::make_transform_iterator(
+            vertex_first,
+            cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
               return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-            });
+            }));
           // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
           // permutation iterator (and directly gathers to the internal buffer)
           thrust::gather(handle.get_thrust_policy(),
@@ -799,9 +813,10 @@ void update_edge_minor_property(raft::handle_t const& handle,
             });
         } else {
           auto map_first = thrust::make_transform_iterator(
-            rx_vertices.begin(), [edge_partition] __device__(auto v) {
+            rx_vertices.begin(),
+            cuda::proclaim_return_type<vertex_t>([edge_partition] __device__(auto v) {
               return edge_partition.minor_offset_from_minor_nocheck(v);
-            });
+            }));
           // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
           // directly scatters from the internal buffer)
           thrust::scatter(handle.get_thrust_policy(),
@@ -866,8 +881,6 @@ void update_edge_src_property(
     edge_src_property_output,
   bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -917,8 +930,6 @@ void update_edge_src_property(
     edge_src_property_output,
   bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     auto num_invalids = thrust::count_if(
       handle.get_thrust_policy(),
@@ -985,8 +996,6 @@ void update_edge_dst_property(
     edge_dst_property_output,
   bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     // currently, nothing to do
   }
@@ -1036,8 +1045,6 @@ void update_edge_dst_property(
     edge_dst_property_output,
   bool do_expensive_check = false)
 {
-  CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
-
   if (do_expensive_check) {
     auto num_invalids = thrust::count_if(
       handle.get_thrust_policy(),
diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 6a7334e9f1a..5a9ded02009 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -52,6 +52,8 @@
 #include <thrust/transform_scan.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <cassert>
 #include <cstdlib>  // FIXME: requirement for temporary std::getenv()
 #include <limits>
@@ -378,7 +380,8 @@ struct random_walker_t {
 
     // scatter d_src_init_v to coalesced vertex vector:
     //
-    auto dlambda = [stride = max_depth_] __device__(auto indx) { return indx * stride; };
+    auto dlambda = cuda::proclaim_return_type<index_t>(
+      [stride = max_depth_] __device__(auto indx) { return indx * stride; });
 
     // use the transform iterator as map:
     //
@@ -539,10 +542,11 @@ struct random_walker_t {
 
     // delta = ptr_d_sizes[indx] - 1
     //
-    auto dlambda = [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) {
-      auto delta = ptr_d_sizes[indx] - 1;
-      return ptr_d_coalesced[indx * stride + delta];
-    };
+    auto dlambda = cuda::proclaim_return_type<vertex_t>(
+      [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) {
+        auto delta = ptr_d_sizes[indx] - 1;
+        return ptr_d_coalesced[indx * stride + delta];
+      });
 
     // use the transform iterator as map:
     //
@@ -587,10 +591,11 @@ struct random_walker_t {
   {
     index_t const* ptr_d_sizes = original::raw_const_ptr(d_sizes);
 
-    auto dlambda = [stride, adjust, ptr_d_sizes] __device__(auto indx) {
-      auto delta = ptr_d_sizes[indx] - adjust - 1;
-      return indx * stride + delta;
-    };
+    auto dlambda =
+      cuda::proclaim_return_type<index_t>([stride, adjust, ptr_d_sizes] __device__(auto indx) {
+        auto delta = ptr_d_sizes[indx] - adjust - 1;
+        return indx * stride + delta;
+      });
 
     // use the transform iterator as map:
     //
diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh
index 77d4f2d865f..852d82e78ab 100644
--- a/cpp/src/sampling/sampling_post_processing_impl.cuh
+++ b/cpp/src/sampling/sampling_post_processing_impl.cuh
@@ -40,6 +40,8 @@
 #include <thrust/tuple.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 #include <optional>
 
 namespace cugraph {
@@ -1229,10 +1231,12 @@ renumber_and_compress_sampled_edgelist(
           auto pair_first       = thrust::make_zip_iterator((*compressed_label_indices).begin(),
                                                       (*compressed_hops).begin());
           auto value_pair_first = thrust::make_transform_iterator(
-            thrust::make_counting_iterator(size_t{0}), [num_hops] __device__(size_t i) {
-              return thrust::make_tuple(static_cast<label_index_t>(i / num_hops),
-                                        static_cast<int32_t>(i % num_hops));
-            });
+            thrust::make_counting_iterator(size_t{0}),
+            cuda::proclaim_return_type<thrust::tuple<label_index_t, int32_t>>(
+              [num_hops] __device__(size_t i) {
+                return thrust::make_tuple(static_cast<label_index_t>(i / num_hops),
+                                          static_cast<int32_t>(i % num_hops));
+              }));
           thrust::upper_bound(handle.get_thrust_policy(),
                               pair_first,
                               pair_first + (*compressed_label_indices).size(),
diff --git a/cpp/src/structure/detail/structure_utils.cuh b/cpp/src/structure/detail/structure_utils.cuh
index c49b62e4543..f0f729bce18 100644
--- a/cpp/src/structure/detail/structure_utils.cuh
+++ b/cpp/src/structure/detail/structure_utils.cuh
@@ -20,6 +20,7 @@
 #include <cugraph/utilities/dataframe_buffer.hpp>
 #include <cugraph/utilities/device_functors.cuh>
 #include <cugraph/utilities/error.hpp>
+#include <cugraph/utilities/mask_utils.cuh>
 #include <cugraph/utilities/misc_utils.cuh>
 #include <cugraph/utilities/packed_bool_utils.hpp>
 
@@ -524,34 +525,21 @@ std::tuple<size_t, rmm::device_uvector<uint32_t>> mark_entries(raft::handle_t co
                      return word;
                    });
 
-  size_t bit_count = thrust::transform_reduce(
-    handle.get_thrust_policy(),
-    marked_entries.begin(),
-    marked_entries.end(),
-    [] __device__(auto word) { return __popc(word); },
-    size_t{0},
-    thrust::plus<size_t>());
+  size_t bit_count = detail::count_set_bits(handle, marked_entries.begin(), num_entries);
 
   return std::make_tuple(bit_count, std::move(marked_entries));
 }
 
 template <typename T>
-rmm::device_uvector<T> remove_flagged_elements(raft::handle_t const& handle,
-                                               rmm::device_uvector<T>&& vector,
-                                               raft::device_span<uint32_t const> remove_flags,
-                                               size_t remove_count)
+rmm::device_uvector<T> keep_flagged_elements(raft::handle_t const& handle,
+                                             rmm::device_uvector<T>&& vector,
+                                             raft::device_span<uint32_t const> keep_flags,
+                                             size_t keep_count)
 {
-  rmm::device_uvector<T> result(vector.size() - remove_count, handle.get_stream());
-
-  thrust::copy_if(
-    handle.get_thrust_policy(),
-    thrust::make_counting_iterator(size_t{0}),
-    thrust::make_counting_iterator(vector.size()),
-    thrust::make_transform_output_iterator(result.begin(),
-                                           indirection_t<size_t, T*>{vector.data()}),
-    [remove_flags] __device__(size_t i) {
-      return !(remove_flags[cugraph::packed_bool_offset(i)] & cugraph::packed_bool_mask(i));
-    });
+  rmm::device_uvector<T> result(keep_count, handle.get_stream());
+
+  detail::copy_if_mask_set(
+    handle, vector.begin(), vector.end(), keep_flags.begin(), result.begin());
 
   return result;
 }
diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index 75862266789..6568b5e3b9e 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -133,8 +133,7 @@ update_local_sorted_unique_edge_majors_minors(
   graph_meta_t<vertex_t, edge_t, multi_gpu> const& meta,
   std::vector<rmm::device_uvector<edge_t>> const& edge_partition_offsets,
   std::vector<rmm::device_uvector<vertex_t>> const& edge_partition_indices,
-  std::optional<std::vector<rmm::device_uvector<vertex_t>>> const& edge_partition_dcs_nzd_vertices,
-  std::optional<std::vector<vertex_t>> const& edge_partition_dcs_nzd_vertex_counts)
+  std::optional<std::vector<rmm::device_uvector<vertex_t>>> const& edge_partition_dcs_nzd_vertices)
 {
   auto& comm                 = handle.get_comms();
   auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
@@ -341,8 +340,7 @@ update_local_sorted_unique_edge_majors_minors(
       if (use_dcs) {
         thrust::copy(handle.get_thrust_policy(),
                      (*edge_partition_dcs_nzd_vertices)[i].begin(),
-                     (*edge_partition_dcs_nzd_vertices)[i].begin() +
-                       (*edge_partition_dcs_nzd_vertex_counts)[i],
+                     (*edge_partition_dcs_nzd_vertices)[i].end(),
                      unique_edge_majors.begin() + cur_size);
       }
 
@@ -390,7 +388,7 @@ graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gp
   graph_meta_t<vertex_t, edge_t, multi_gpu> meta,
   bool do_expensive_check)
   : detail::graph_base_t<vertex_t, edge_t>(
-      handle, meta.number_of_vertices, meta.number_of_edges, meta.properties),
+      meta.number_of_vertices, meta.number_of_edges, meta.properties),
     partition_(meta.partition)
 {
   CUGRAPH_EXPECTS(
@@ -408,14 +406,6 @@ graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gp
   edge_partition_offsets_          = std::move(edge_partition_offsets);
   edge_partition_indices_          = std::move(edge_partition_indices);
   edge_partition_dcs_nzd_vertices_ = std::move(edge_partition_dcs_nzd_vertices);
-  if (edge_partition_dcs_nzd_vertices_) {
-    edge_partition_dcs_nzd_vertex_counts_ =
-      std::vector<vertex_t>((*edge_partition_dcs_nzd_vertices_).size());
-    for (size_t i = 0; i < (*edge_partition_dcs_nzd_vertex_counts_).size(); ++i) {
-      (*edge_partition_dcs_nzd_vertex_counts_)[i] =
-        static_cast<vertex_t>((*edge_partition_dcs_nzd_vertices_)[i].size());
-    }
-  }
 
   // update local sorted unique edge sources/destinations (only if key, value pair will be used)
 
@@ -432,8 +422,7 @@ graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gp
         meta,
         edge_partition_offsets_,
         edge_partition_indices_,
-        edge_partition_dcs_nzd_vertices_,
-        edge_partition_dcs_nzd_vertex_counts_);
+        edge_partition_dcs_nzd_vertices_);
   } else {
     std::tie(local_sorted_unique_edge_srcs_,
              local_sorted_unique_edge_src_chunk_start_offsets_,
@@ -447,8 +436,7 @@ graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gp
         meta,
         edge_partition_offsets_,
         edge_partition_indices_,
-        edge_partition_dcs_nzd_vertices_,
-        edge_partition_dcs_nzd_vertex_counts_);
+        edge_partition_dcs_nzd_vertices_);
   }
 }
 
@@ -460,7 +448,7 @@ graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_g
   graph_meta_t<vertex_t, edge_t, multi_gpu> meta,
   bool do_expensive_check)
   : detail::graph_base_t<vertex_t, edge_t>(
-      handle, meta.number_of_vertices, static_cast<edge_t>(indices.size()), meta.properties),
+      meta.number_of_vertices, static_cast<edge_t>(indices.size()), meta.properties),
     offsets_(std::move(offsets)),
     indices_(std::move(indices)),
     segment_offsets_(meta.segment_offsets)
diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh
index 64a8a3212b3..da0ecc991df 100644
--- a/cpp/src/structure/graph_view_impl.cuh
+++ b/cpp/src/structure/graph_view_impl.cuh
@@ -51,6 +51,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <cstdint>
 #include <type_traits>
@@ -70,44 +72,15 @@ struct out_of_range_t {
   __device__ bool operator()(vertex_t v) const { return (v < min) || (v >= max); }
 };
 
-template <typename vertex_t, typename edge_t>
-std::vector<edge_t> update_edge_partition_edge_counts(
-  std::vector<edge_t const*> const& edge_partition_offsets,
-  std::optional<std::vector<vertex_t>> const& edge_partition_dcs_nzd_vertex_counts,
-  partition_t<vertex_t> const& partition,
-  std::vector<vertex_t> const& edge_partition_segment_offsets,
-  cudaStream_t stream)
-{
-  std::vector<edge_t> edge_partition_edge_counts(partition.number_of_local_edge_partitions(), 0);
-  auto use_dcs = edge_partition_dcs_nzd_vertex_counts.has_value();
-  for (size_t i = 0; i < edge_partition_offsets.size(); ++i) {
-    auto [major_range_first, major_range_last] = partition.local_edge_partition_major_range(i);
-    auto segment_offset_size_per_partition =
-      edge_partition_segment_offsets.size() / edge_partition_offsets.size();
-    raft::update_host(
-      &(edge_partition_edge_counts[i]),
-      edge_partition_offsets[i] +
-        (use_dcs
-           ? (edge_partition_segment_offsets[segment_offset_size_per_partition * i +
-                                             detail::num_sparse_segments_per_vertex_partition] +
-              (*edge_partition_dcs_nzd_vertex_counts)[i])
-           : (major_range_last - major_range_first)),
-      1,
-      stream);
-  }
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  return edge_partition_edge_counts;
-}
-
 // compute out-degrees (if we are internally storing edges in the sparse 2D matrix using sources as
 // major indices) or in-degrees (otherwise)
 template <typename vertex_t, typename edge_t>
 rmm::device_uvector<edge_t> compute_major_degrees(
   raft::handle_t const& handle,
-  std::vector<edge_t const*> const& edge_partition_offsets,
-  std::optional<std::vector<vertex_t const*>> const& edge_partition_dcs_nzd_vertices,
-  std::optional<std::vector<vertex_t>> const& edge_partition_dcs_nzd_vertex_counts,
-  std::optional<std::vector<uint32_t const*>> const& edge_partition_masks,
+  std::vector<raft::device_span<edge_t const>> const& edge_partition_offsets,
+  std::optional<std::vector<raft::device_span<vertex_t const>>> const&
+    edge_partition_dcs_nzd_vertices,
+  std::optional<std::vector<raft::device_span<uint32_t const>>> const& edge_partition_masks,
   partition_t<vertex_t> const& partition,
   std::vector<vertex_t> const& edge_partition_segment_offsets)
 {
@@ -159,39 +132,39 @@ rmm::device_uvector<edge_t> compute_major_degrees(
                       thrust::make_counting_iterator(vertex_t{0}),
                       thrust::make_counting_iterator(major_hypersparse_first - major_range_first),
                       local_degrees.begin(),
-                      [offsets, masks] __device__(auto i) {
+                      cuda::proclaim_return_type<edge_t>([offsets, masks] __device__(auto i) {
                         auto local_degree = offsets[i + 1] - offsets[i];
                         if (masks) {
                           local_degree = static_cast<edge_t>(
-                            detail::count_set_bits(*masks, offsets[i], local_degree));
+                            detail::count_set_bits((*masks).begin(), offsets[i], local_degree));
                         }
                         return local_degree;
-                      });
+                      }));
     if (use_dcs) {
-      auto dcs_nzd_vertices     = (*edge_partition_dcs_nzd_vertices)[i];
-      auto dcs_nzd_vertex_count = (*edge_partition_dcs_nzd_vertex_counts)[i];
+      auto dcs_nzd_vertices = (*edge_partition_dcs_nzd_vertices)[i];
       thrust::fill(execution_policy,
                    local_degrees.begin() + (major_hypersparse_first - major_range_first),
                    local_degrees.begin() + (major_range_last - major_range_first),
                    edge_t{0});
-      thrust::for_each(execution_policy,
-                       thrust::make_counting_iterator(vertex_t{0}),
-                       thrust::make_counting_iterator(dcs_nzd_vertex_count),
-                       [offsets,
-                        dcs_nzd_vertices,
-                        masks,
-                        major_range_first,
-                        major_hypersparse_first,
-                        local_degrees = local_degrees.data()] __device__(auto i) {
-                         auto major_idx    = (major_hypersparse_first - major_range_first) + i;
-                         auto local_degree = offsets[major_idx + 1] - offsets[major_idx];
-                         if (masks) {
-                           local_degree = static_cast<edge_t>(
-                             detail::count_set_bits(*masks, offsets[major_idx], local_degree));
-                         }
-                         auto v                               = dcs_nzd_vertices[i];
-                         local_degrees[v - major_range_first] = local_degree;
-                       });
+      thrust::for_each(
+        execution_policy,
+        thrust::make_counting_iterator(vertex_t{0}),
+        thrust::make_counting_iterator(static_cast<vertex_t>(dcs_nzd_vertices.size())),
+        [offsets,
+         dcs_nzd_vertices,
+         masks,
+         major_range_first,
+         major_hypersparse_first,
+         local_degrees = local_degrees.data()] __device__(auto i) {
+          auto major_idx    = (major_hypersparse_first - major_range_first) + i;
+          auto local_degree = offsets[major_idx + 1] - offsets[major_idx];
+          if (masks) {
+            local_degree = static_cast<edge_t>(
+              detail::count_set_bits((*masks).begin(), offsets[major_idx], local_degree));
+          }
+          auto v                               = dcs_nzd_vertices[i];
+          local_degrees[v - major_range_first] = local_degree;
+        });
     }
     minor_comm.reduce(local_degrees.data(),
                       i == minor_comm_rank ? degrees.data() : static_cast<edge_t*>(nullptr),
@@ -207,10 +180,11 @@ rmm::device_uvector<edge_t> compute_major_degrees(
 // compute out-degrees (if we are internally storing edges in the sparse 2D matrix using sources as
 // major indices) or in-degrees (otherwise)
 template <typename vertex_t, typename edge_t>
-rmm::device_uvector<edge_t> compute_major_degrees(raft::handle_t const& handle,
-                                                  edge_t const* offsets,
-                                                  std::optional<uint32_t const*> masks,
-                                                  vertex_t number_of_vertices)
+rmm::device_uvector<edge_t> compute_major_degrees(
+  raft::handle_t const& handle,
+  raft::device_span<edge_t const> offsets,
+  std::optional<raft::device_span<uint32_t const>> masks,
+  vertex_t number_of_vertices)
 {
   rmm::device_uvector<edge_t> degrees(number_of_vertices, handle.get_stream());
   thrust::tabulate(
@@ -221,7 +195,7 @@ rmm::device_uvector<edge_t> compute_major_degrees(raft::handle_t const& handle,
       auto local_degree = offsets[i + 1] - offsets[i];
       if (masks) {
         local_degree =
-          static_cast<edge_t>(detail::count_set_bits(*masks, offsets[i], local_degree));
+          static_cast<edge_t>(detail::count_set_bits((*masks).begin(), offsets[i], local_degree));
       }
       return local_degree;
     });
@@ -444,24 +418,16 @@ edge_t count_edge_partition_multi_edges(
 
 template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
 graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
-  graph_view_t(raft::handle_t const& handle,
-               std::vector<edge_t const*> const& edge_partition_offsets,
-               std::vector<vertex_t const*> const& edge_partition_indices,
-               std::optional<std::vector<vertex_t const*>> const& edge_partition_dcs_nzd_vertices,
-               std::optional<std::vector<vertex_t>> const& edge_partition_dcs_nzd_vertex_counts,
+  graph_view_t(std::vector<raft::device_span<edge_t const>> const& edge_partition_offsets,
+               std::vector<raft::device_span<vertex_t const>> const& edge_partition_indices,
+               std::optional<std::vector<raft::device_span<vertex_t const>>> const&
+                 edge_partition_dcs_nzd_vertices,
                graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu> meta)
   : detail::graph_base_t<vertex_t, edge_t>(
-      handle, meta.number_of_vertices, meta.number_of_edges, meta.properties),
+      meta.number_of_vertices, meta.number_of_edges, meta.properties),
     edge_partition_offsets_(edge_partition_offsets),
     edge_partition_indices_(edge_partition_indices),
     edge_partition_dcs_nzd_vertices_(edge_partition_dcs_nzd_vertices),
-    edge_partition_dcs_nzd_vertex_counts_(edge_partition_dcs_nzd_vertex_counts),
-    edge_partition_number_of_edges_(
-      update_edge_partition_edge_counts(edge_partition_offsets,
-                                        edge_partition_dcs_nzd_vertex_counts,
-                                        meta.partition,
-                                        meta.edge_partition_segment_offsets,
-                                        handle.get_stream())),
     partition_(meta.partition),
     edge_partition_segment_offsets_(meta.edge_partition_segment_offsets),
     local_sorted_unique_edge_srcs_(meta.local_sorted_unique_edge_srcs),
@@ -479,51 +445,42 @@ graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<mul
 {
   // cheap error checks
 
-  auto const minor_comm_size =
-    this->handle_ptr()->get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size();
-
   auto use_dcs = edge_partition_dcs_nzd_vertices.has_value();
 
   CUGRAPH_EXPECTS(edge_partition_offsets.size() == edge_partition_indices.size(),
                   "Internal Error: edge_partition_offsets.size() and "
                   "edge_partition_indices.size() should coincide.");
-  CUGRAPH_EXPECTS(edge_partition_dcs_nzd_vertex_counts.has_value() == use_dcs,
-                  "edge_partition_dcs_nzd_vertices.has_value() and "
-                  "edge_partition_dcs_nzd_vertex_counts.has_value() should coincide");
-  CUGRAPH_EXPECTS(!use_dcs || ((*edge_partition_dcs_nzd_vertices).size() ==
-                               (*edge_partition_dcs_nzd_vertex_counts).size()),
-                  "Internal Error: edge_partition_dcs_nzd_vertices.size() and "
-                  "edge_partition_dcs_nzd_vertex_counts.size() should coincide (if used).");
   CUGRAPH_EXPECTS(
     !use_dcs || ((*edge_partition_dcs_nzd_vertices).size() == edge_partition_offsets.size()),
     "Internal Error: edge_partition_dcs_nzd_vertices.size() should coincide "
     "with edge_partition_offsets.size() (if used).");
 
-  CUGRAPH_EXPECTS(edge_partition_offsets.size() == static_cast<size_t>(minor_comm_size),
-                  "Internal Error: erroneous edge_partition_offsets.size().");
-
-  CUGRAPH_EXPECTS(
-    meta.edge_partition_segment_offsets.size() ==
-      minor_comm_size * (detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 3 : 2)),
-    "Internal Error: invalid edge_partition_segment_offsets.size().");
+  CUGRAPH_EXPECTS(meta.edge_partition_segment_offsets.size() ==
+                    edge_partition_offsets.size() *
+                      (detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 3 : 2)),
+                  "Internal Error: invalid edge_partition_segment_offsets.size().");
 
   // skip expensive error checks as this function is only called by graph_t
 }
 
 template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
 graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
-  graph_view_t(raft::handle_t const& handle,
-               edge_t const* offsets,
-               vertex_t const* indices,
+  graph_view_t(raft::device_span<edge_t const> offsets,
+               raft::device_span<vertex_t const> indices,
                graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu> meta)
   : detail::graph_base_t<vertex_t, edge_t>(
-      handle, meta.number_of_vertices, meta.number_of_edges, meta.properties),
+      meta.number_of_vertices, meta.number_of_edges, meta.properties),
     offsets_(offsets),
     indices_(indices),
     segment_offsets_(meta.segment_offsets)
 {
   // cheap error checks
 
+  CUGRAPH_EXPECTS(offsets.size() == static_cast<size_t>(meta.number_of_vertices + 1),
+                  "Internal Error: offsets.size() returns an invalid value.");
+  CUGRAPH_EXPECTS(indices.size() == static_cast<size_t>(meta.number_of_edges),
+                  "Internal Error: indices.size() returns an invalid value.");
+
   CUGRAPH_EXPECTS(
     !(meta.segment_offsets).has_value() ||
       ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)),
@@ -532,23 +489,66 @@ graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!mu
   // skip expensive error checks as this function is only called by graph_t
 }
 
+template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
+edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
+  compute_number_of_edges(raft::handle_t const& handle) const
+{
+  if (this->has_edge_mask()) {  // masked: recount from the mask, ignore number_of_edges_
+    edge_t ret{};  // local total, accumulated over every (value_firsts[i], edge_counts[i]) pair
+    auto value_firsts = (*(this->edge_mask_view())).value_firsts();
+    auto edge_counts  = (*(this->edge_mask_view())).edge_counts();
+    for (size_t i = 0; i < value_firsts.size(); ++i) {
+      ret += static_cast<edge_t>(detail::count_set_bits(handle, value_firsts[i], edge_counts[i]));
+    }
+    ret =  // SUM all-reduce of the local total over handle.get_comms()
+      host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream());
+    return ret;
+  } else {
+    return this->number_of_edges_;  // no mask attached: stored count is returned unchanged
+  }
+}
+
+template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
+edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
+  compute_number_of_edges(raft::handle_t const& handle) const
+{
+  if (this->has_edge_mask()) {  // masked: recount from the mask, ignore number_of_edges_
+    auto value_firsts = (*(this->edge_mask_view())).value_firsts();
+    auto edge_counts  = (*(this->edge_mask_view())).edge_counts();
+    assert(value_firsts.size() == 1);  // exactly one entry expected: element [0] is read below
+    assert(edge_counts.size() == 1);   // same for the matching count
+    return static_cast<edge_t>(detail::count_set_bits(handle, value_firsts[0], edge_counts[0]));
+  } else {
+    return this->number_of_edges_;  // no mask attached: stored count is returned unchanged
+  }
+}
+
 template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_gpu>
 rmm::device_uvector<edge_t>
 graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   compute_in_degrees(raft::handle_t const& handle) const
 {
   if (store_transposed) {
+    std::optional<std::vector<raft::device_span<uint32_t const>>> edge_partition_masks{
+      std::nullopt};  // stays nullopt when no edge mask is attached
+    if (this->has_edge_mask()) {  // build one span per entry of edge_partition_offsets_
+      edge_partition_masks =
+        std::vector<raft::device_span<uint32_t const>>(this->edge_partition_offsets_.size());
+      auto value_firsts = (*(this->edge_mask_view())).value_firsts();
+      auto edge_counts  = (*(this->edge_mask_view())).edge_counts();
+      for (size_t i = 0; i < (*edge_partition_masks).size(); ++i) {
+        (*edge_partition_masks)[i] =
+          raft::device_span<uint32_t const>(value_firsts[i], edge_counts[i]);
+      }  // NOTE(review): span extent is edge_counts[i]; confirm unit (edges vs uint32_t words)
+    }
     return compute_major_degrees(handle,
                                  this->edge_partition_offsets_,
                                  this->edge_partition_dcs_nzd_vertices_,
-                                 this->edge_partition_dcs_nzd_vertex_counts_,
-                                 this->has_edge_mask()
-                                   ? std::make_optional((*(this->edge_mask_view())).value_firsts())
-                                   : std::nullopt,
+                                 edge_partition_masks,
                                  this->partition_,
                                  this->edge_partition_segment_offsets_);
   } else {
-    CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+    CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
     return compute_minor_degrees(handle, *this);
   }
 }
@@ -559,14 +559,16 @@ graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!mu
   compute_in_degrees(raft::handle_t const& handle) const
 {
   if (store_transposed) {
-    return compute_major_degrees(
-      handle,
-      this->offsets_,
-      this->has_edge_mask() ? std::make_optional((*(this->edge_mask_view())).value_firsts()[0])
-                            : std::nullopt,
-      this->local_vertex_partition_range_size());
+    return compute_major_degrees(handle,
+                                 this->offsets_,
+                                 this->has_edge_mask()
+                                   ? std::make_optional(raft::device_span<uint32_t const>(
+                                       (*(this->edge_mask_view())).value_firsts()[0],
+                                       (*(this->edge_mask_view())).edge_counts()[0]))
+                                   : std::nullopt,
+                                 this->local_vertex_partition_range_size());
   } else {
-    CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+    CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
     return compute_minor_degrees(handle, *this);
   }
 }
@@ -577,16 +579,25 @@ graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<mul
   compute_out_degrees(raft::handle_t const& handle) const
 {
   if (store_transposed) {
-    CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+    CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
     return compute_minor_degrees(handle, *this);
   } else {
+    std::optional<std::vector<raft::device_span<uint32_t const>>> edge_partition_masks{
+      std::nullopt};
+    if (this->has_edge_mask()) {
+      edge_partition_masks =
+        std::vector<raft::device_span<uint32_t const>>(this->edge_partition_offsets_.size());
+      auto value_firsts = (*(this->edge_mask_view())).value_firsts();
+      auto edge_counts  = (*(this->edge_mask_view())).edge_counts();
+      for (size_t i = 0; i < (*edge_partition_masks).size(); ++i) {
+        (*edge_partition_masks)[i] =
+          raft::device_span<uint32_t const>(value_firsts[i], edge_counts[i]);
+      }
+    }
     return compute_major_degrees(handle,
                                  this->edge_partition_offsets_,
                                  this->edge_partition_dcs_nzd_vertices_,
-                                 this->edge_partition_dcs_nzd_vertex_counts_,
-                                 this->has_edge_mask()
-                                   ? std::make_optional((*(this->edge_mask_view())).value_firsts())
-                                   : std::nullopt,
+                                 edge_partition_masks,
                                  this->partition_,
                                  this->edge_partition_segment_offsets_);
   }
@@ -598,15 +609,17 @@ graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!mu
   compute_out_degrees(raft::handle_t const& handle) const
 {
   if (store_transposed) {
-    CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+    CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
     return compute_minor_degrees(handle, *this);
   } else {
-    return compute_major_degrees(
-      handle,
-      this->offsets_,
-      this->has_edge_mask() ? std::make_optional((*(this->edge_mask_view())).value_firsts()[0])
-                            : std::nullopt,
-      this->local_vertex_partition_range_size());
+    return compute_major_degrees(handle,
+                                 this->offsets_,
+                                 this->has_edge_mask()
+                                   ? std::make_optional(raft::device_span<uint32_t const>(
+                                       (*(this->edge_mask_view())).value_firsts()[0],
+                                       (*(this->edge_mask_view())).edge_counts()[0]))
+                                   : std::nullopt,
+                                 this->local_vertex_partition_range_size());
   }
 }
 
@@ -614,7 +627,7 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   compute_max_in_degree(raft::handle_t const& handle) const
 {
-  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+  CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
 
   auto in_degrees = compute_in_degrees(handle);
   auto it = thrust::max_element(handle.get_thrust_policy(), in_degrees.begin(), in_degrees.end());
@@ -632,7 +645,7 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   compute_max_in_degree(raft::handle_t const& handle) const
 {
-  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+  CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
 
   auto in_degrees = compute_in_degrees(handle);
   auto it = thrust::max_element(handle.get_thrust_policy(), in_degrees.begin(), in_degrees.end());
@@ -646,7 +659,7 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   compute_max_out_degree(raft::handle_t const& handle) const
 {
-  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+  CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
 
   auto out_degrees = compute_out_degrees(handle);
   auto it = thrust::max_element(handle.get_thrust_policy(), out_degrees.begin(), out_degrees.end());
@@ -664,7 +677,7 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   compute_max_out_degree(raft::handle_t const& handle) const
 {
-  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+  CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
 
   auto out_degrees = compute_out_degrees(handle);
   auto it = thrust::max_element(handle.get_thrust_policy(), out_degrees.begin(), out_degrees.end());
@@ -678,7 +691,7 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   count_self_loops(raft::handle_t const& handle) const
 {
-  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+  CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
 
   return count_if_e(
     handle,
@@ -693,7 +706,7 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   count_self_loops(raft::handle_t const& handle) const
 {
-  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+  CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
 
   return count_if_e(
     handle,
@@ -708,7 +721,7 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>::
   count_multi_edges(raft::handle_t const& handle) const
 {
-  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+  CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
 
   if (!this->is_multigraph()) { return edge_t{0}; }
 
@@ -728,7 +741,7 @@ template <typename vertex_t, typename edge_t, bool store_transposed, bool multi_
 edge_t graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>::
   count_multi_edges(raft::handle_t const& handle) const
 {
-  CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented.");
+  CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
 
   if (!this->is_multigraph()) { return edge_t{0}; }
 
diff --git a/cpp/src/structure/remove_multi_edges_impl.cuh b/cpp/src/structure/remove_multi_edges_impl.cuh
index ab6b1fba8eb..fdd3059f874 100644
--- a/cpp/src/structure/remove_multi_edges_impl.cuh
+++ b/cpp/src/structure/remove_multi_edges_impl.cuh
@@ -254,50 +254,47 @@ remove_multi_edges(raft::handle_t const& handle,
     }
   }
 
-  auto [multi_edge_count, multi_edges_to_delete] =
-    detail::mark_entries(handle,
-                         edgelist_srcs.size(),
-                         [d_edgelist_srcs = edgelist_srcs.data(),
-                          d_edgelist_dsts = edgelist_dsts.data()] __device__(auto idx) {
-                           return (idx > 0) && (d_edgelist_srcs[idx - 1] == d_edgelist_srcs[idx]) &&
-                                  (d_edgelist_dsts[idx - 1] == d_edgelist_dsts[idx]);
-                         });
-
-  if (multi_edge_count > 0) {
-    edgelist_srcs = detail::remove_flagged_elements(
+  auto [keep_count, keep_flags] = detail::mark_entries(
+    handle,
+    edgelist_srcs.size(),
+    [d_edgelist_srcs = edgelist_srcs.data(),
+     d_edgelist_dsts = edgelist_dsts.data()] __device__(auto idx) {
+      return !((idx > 0) && (d_edgelist_srcs[idx - 1] == d_edgelist_srcs[idx]) &&
+               (d_edgelist_dsts[idx - 1] == d_edgelist_dsts[idx]));
+    });
+
+  if (keep_count < edgelist_srcs.size()) {
+    edgelist_srcs = detail::keep_flagged_elements(
       handle,
       std::move(edgelist_srcs),
-      raft::device_span<uint32_t const>{multi_edges_to_delete.data(), multi_edges_to_delete.size()},
-      multi_edge_count);
-    edgelist_dsts = detail::remove_flagged_elements(
+      raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+      keep_count);
+    edgelist_dsts = detail::keep_flagged_elements(
       handle,
       std::move(edgelist_dsts),
-      raft::device_span<uint32_t const>{multi_edges_to_delete.data(), multi_edges_to_delete.size()},
-      multi_edge_count);
+      raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+      keep_count);
 
     if (edgelist_weights)
-      edgelist_weights = detail::remove_flagged_elements(
+      edgelist_weights = detail::keep_flagged_elements(
         handle,
         std::move(*edgelist_weights),
-        raft::device_span<uint32_t const>{multi_edges_to_delete.data(),
-                                          multi_edges_to_delete.size()},
-        multi_edge_count);
+        raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+        keep_count);
 
     if (edgelist_edge_ids)
-      edgelist_edge_ids = detail::remove_flagged_elements(
+      edgelist_edge_ids = detail::keep_flagged_elements(
         handle,
         std::move(*edgelist_edge_ids),
-        raft::device_span<uint32_t const>{multi_edges_to_delete.data(),
-                                          multi_edges_to_delete.size()},
-        multi_edge_count);
+        raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+        keep_count);
 
     if (edgelist_edge_types)
-      edgelist_edge_types = detail::remove_flagged_elements(
+      edgelist_edge_types = detail::keep_flagged_elements(
         handle,
         std::move(*edgelist_edge_types),
-        raft::device_span<uint32_t const>{multi_edges_to_delete.data(),
-                                          multi_edges_to_delete.size()},
-        multi_edge_count);
+        raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+        keep_count);
   }
 
   return std::make_tuple(std::move(edgelist_srcs),
diff --git a/cpp/src/structure/remove_self_loops_impl.cuh b/cpp/src/structure/remove_self_loops_impl.cuh
index 161ffeae28e..dafe26cd1c5 100644
--- a/cpp/src/structure/remove_self_loops_impl.cuh
+++ b/cpp/src/structure/remove_self_loops_impl.cuh
@@ -44,44 +44,44 @@ remove_self_loops(raft::handle_t const& handle,
                   std::optional<rmm::device_uvector<edge_t>>&& edgelist_edge_ids,
                   std::optional<rmm::device_uvector<edge_type_t>>&& edgelist_edge_types)
 {
-  auto [self_loop_count, self_loops_to_delete] =
+  auto [keep_count, keep_flags] =
     detail::mark_entries(handle,
                          edgelist_srcs.size(),
                          [d_srcs = edgelist_srcs.data(), d_dsts = edgelist_dsts.data()] __device__(
-                           size_t i) { return d_srcs[i] == d_dsts[i]; });
+                           size_t i) { return d_srcs[i] != d_dsts[i]; });
 
-  if (self_loop_count > 0) {
-    edgelist_srcs = detail::remove_flagged_elements(
+  if (keep_count < edgelist_srcs.size()) {
+    edgelist_srcs = detail::keep_flagged_elements(
       handle,
       std::move(edgelist_srcs),
-      raft::device_span<uint32_t const>{self_loops_to_delete.data(), self_loops_to_delete.size()},
-      self_loop_count);
-    edgelist_dsts = detail::remove_flagged_elements(
+      raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+      keep_count);
+    edgelist_dsts = detail::keep_flagged_elements(
       handle,
       std::move(edgelist_dsts),
-      raft::device_span<uint32_t const>{self_loops_to_delete.data(), self_loops_to_delete.size()},
-      self_loop_count);
+      raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+      keep_count);
 
     if (edgelist_weights)
-      edgelist_weights = detail::remove_flagged_elements(
+      edgelist_weights = detail::keep_flagged_elements(
         handle,
         std::move(*edgelist_weights),
-        raft::device_span<uint32_t const>{self_loops_to_delete.data(), self_loops_to_delete.size()},
-        self_loop_count);
+        raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+        keep_count);
 
     if (edgelist_edge_ids)
-      edgelist_edge_ids = detail::remove_flagged_elements(
+      edgelist_edge_ids = detail::keep_flagged_elements(
         handle,
         std::move(*edgelist_edge_ids),
-        raft::device_span<uint32_t const>{self_loops_to_delete.data(), self_loops_to_delete.size()},
-        self_loop_count);
+        raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+        keep_count);
 
     if (edgelist_edge_types)
-      edgelist_edge_types = detail::remove_flagged_elements(
+      edgelist_edge_types = detail::keep_flagged_elements(
         handle,
         std::move(*edgelist_edge_types),
-        raft::device_span<uint32_t const>{self_loops_to_delete.data(), self_loops_to_delete.size()},
-        self_loop_count);
+        raft::device_span<uint32_t const>{keep_flags.data(), keep_flags.size()},
+        keep_count);
   }
 
   return std::make_tuple(std::move(edgelist_srcs),
diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh
index 6a0c5a4a675..cc69cb5f67f 100644
--- a/cpp/src/traversal/od_shortest_distances_impl.cuh
+++ b/cpp/src/traversal/od_shortest_distances_impl.cuh
@@ -432,7 +432,7 @@ rmm::device_uvector<weight_t> od_shortest_distances(
   // 1. check input arguments
 
   auto const num_vertices = graph_view.number_of_vertices();
-  auto const num_edges    = graph_view.number_of_edges();
+  auto const num_edges    = graph_view.compute_number_of_edges(handle);
 
   CUGRAPH_EXPECTS(num_vertices != 0 || (origins.size() == 0 && destinations.size() == 0),
                   "Invalid input argument: the input graph is empty but origins.size() > 0 or "
@@ -1049,7 +1049,7 @@ rmm::device_uvector<weight_t> od_shortest_distances(
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
   auto const num_vertices = graph_view.number_of_vertices();
-  auto const num_edges    = graph_view.number_of_edges();
+  auto const num_edges    = graph_view.compute_number_of_edges(handle);
 
   weight_t average_vertex_degree =
     static_cast<weight_t>(num_edges) / static_cast<weight_t>(num_vertices);
diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh
index c78fa3839e2..5a6d536c6f5 100644
--- a/cpp/src/traversal/sssp_impl.cuh
+++ b/cpp/src/traversal/sssp_impl.cuh
@@ -93,7 +93,7 @@ void sssp(raft::handle_t const& handle,
                 "GraphViewType should support the push model.");
 
   auto const num_vertices = push_graph_view.number_of_vertices();
-  auto const num_edges    = push_graph_view.number_of_edges();
+  auto const num_edges    = push_graph_view.compute_number_of_edges(handle);
   if (num_vertices == 0) { return; }
 
   // implements the Near-Far Pile method in
diff --git a/cpp/src/utilities/cugraph_ops_utils.hpp b/cpp/src/utilities/cugraph_ops_utils.hpp
index 9aea4183866..880a2c8d104 100644
--- a/cpp/src/utilities/cugraph_ops_utils.hpp
+++ b/cpp/src/utilities/cugraph_ops_utils.hpp
@@ -30,7 +30,7 @@ ops::graph::csc<EdgeTypeT, NodeTypeT> get_graph(
   ops::graph::csc<EdgeTypeT, NodeTypeT> graph;
   graph.n_src_nodes = gview.number_of_vertices();
   graph.n_dst_nodes = gview.number_of_vertices();
-  graph.n_indices   = gview.number_of_edges();
+  graph.n_indices   = gview.local_edge_partition_view().number_of_edges();
   // FIXME this is sufficient for now, but if there is a fast (cached) way
   // of getting max degree, use that instead
   graph.dst_max_in_degree = std::numeric_limits<EdgeTypeT>::max();
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 3d2f51f7df8..d3224df2860 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -257,7 +257,7 @@ ConfigureTest(BALANCED_TEST community/balanced_edge_test.cpp)
 
 ###################################################################################################
 # - EGO tests -------------------------------------------------------------------------------------
-ConfigureTest(EGO_TEST community/egonet_test.cpp GPUS 1 PERCENT 75)
+ConfigureTest(EGONET_TEST community/egonet_test.cpp GPUS 1 PERCENT 75)
 
 ###################################################################################################
 # - FORCE ATLAS 2  tests --------------------------------------------------------------------------
@@ -531,7 +531,7 @@ if(BUILD_CUGRAPH_MG_TESTS)
 
     ###############################################################################################
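-    # - MG LOUVAIN tests --------------------------------------------------------------------------
+    # - MG EGONET tests ---------------------------------------------------------------------------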
-    ConfigureTestMG(MG_EGO_TEST community/mg_egonet_test.cu)
+    ConfigureTestMG(MG_EGONET_TEST community/mg_egonet_test.cu)
 
     ###############################################################################################
     # - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------
@@ -742,9 +742,16 @@ if (BUILD_CUGRAPH_MTMG_TESTS)
     # - MTMG tests -------------------------------------------------------------------------
     ConfigureTest(MTMG_TEST mtmg/threaded_test.cu)
     target_link_libraries(MTMG_TEST
-                        PRIVATE
-                        UCP::UCP
-                        )
+                          PRIVATE
+                          UCP::UCP
+                         )
+
+    ConfigureTest(MTMG_LOUVAIN_TEST mtmg/threaded_test_louvain.cu)
+    target_link_libraries(MTMG_LOUVAIN_TEST
+                          PRIVATE
+                          cugraphmgtestutil
+                          UCP::UCP
+                         )
 
     if(BUILD_CUGRAPH_MG_TESTS)
         ###############################################################################################
diff --git a/cpp/tests/c_api/hits_test.c b/cpp/tests/c_api/hits_test.c
index c275d883d11..1ebd4f82a51 100644
--- a/cpp/tests/c_api/hits_test.c
+++ b/cpp/tests/c_api/hits_test.c
@@ -163,7 +163,7 @@ int test_hits()
   weight_t h_hubs[]        = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0};
   weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   // hits wants store_transposed = TRUE
@@ -195,7 +195,7 @@ int test_hits_with_transpose()
   weight_t h_hubs[]        = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0};
   weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   // Hits wants store_transposed = TRUE
@@ -232,7 +232,7 @@ int test_hits_with_initial()
   vertex_t h_initial_vertices[] = {0, 1, 2, 3, 4};
   weight_t h_initial_hubs[]     = {0.347296, 0.532089, 1, 0.00003608, 0.00003608};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   return generic_hits_test(h_src,
diff --git a/cpp/tests/c_api/mg_create_graph_test.c b/cpp/tests/c_api/mg_create_graph_test.c
index fec319d1881..7156647b025 100644
--- a/cpp/tests/c_api/mg_create_graph_test.c
+++ b/cpp/tests/c_api/mg_create_graph_test.c
@@ -175,18 +175,18 @@ int test_create_mg_graph_multiple_edge_lists(const cugraph_resource_handle_t* ha
   int my_rank = cugraph_resource_handle_get_rank(handle);
   int comm_size = cugraph_resource_handle_get_comm_size(handle);
 
-  size_t local_num_vertices = (num_vertices + comm_size - 1) / comm_size;
+  size_t local_num_vertices = num_vertices / comm_size;
   size_t local_start_vertex = my_rank * local_num_vertices;
-  size_t local_num_edges = (num_edges + comm_size - 1) / comm_size;
+  size_t local_num_edges = num_edges / comm_size;
   size_t local_start_edge = my_rank * local_num_edges;
 
-  local_num_edges = (local_num_edges < (num_edges - local_start_edge)) ? local_num_edges : (num_edges - local_start_edge);
-  local_num_vertices = (local_num_vertices < (num_vertices - local_start_vertex)) ? local_num_vertices : (num_vertices - local_start_vertex);
+  local_num_edges = (my_rank != (comm_size - 1)) ? local_num_edges : (num_edges - local_start_edge);
+  local_num_vertices = (my_rank != (comm_size - 1)) ? local_num_vertices : (num_vertices - local_start_vertex);
 
   for (size_t i = 0 ; i < num_local_arrays ; ++i) {
-    size_t vertex_count = (local_num_vertices + num_local_arrays - 1) / num_local_arrays;
+    size_t vertex_count = local_num_vertices / num_local_arrays;
     size_t vertex_start = i * vertex_count;
-    vertex_count = (vertex_count < (local_num_vertices - vertex_start)) ? vertex_count : (local_num_vertices - vertex_start);
+    vertex_count = (i != (num_local_arrays - 1)) ? vertex_count : (local_num_vertices - vertex_start);
     
     ret_code =
       cugraph_type_erased_device_array_create(handle, vertex_count, vertex_tid, vertices + i, &ret_error);
@@ -363,18 +363,18 @@ int test_create_mg_graph_multiple_edge_lists_multi_edge(const cugraph_resource_h
   int my_rank = cugraph_resource_handle_get_rank(handle);
   int comm_size = cugraph_resource_handle_get_comm_size(handle);
 
-  size_t local_num_vertices = (num_vertices + comm_size - 1) / comm_size;
+  size_t local_num_vertices = num_vertices / comm_size;
   size_t local_start_vertex = my_rank * local_num_vertices;
-  size_t local_num_edges = (num_edges + comm_size - 1) / comm_size;
+  size_t local_num_edges = num_edges / comm_size;
   size_t local_start_edge = my_rank * local_num_edges;
 
-  local_num_edges = (local_num_edges < (num_edges - local_start_edge)) ? local_num_edges : (num_edges - local_start_edge);
-  local_num_vertices = (local_num_vertices < (num_vertices - local_start_vertex)) ? local_num_vertices : (num_vertices - local_start_vertex);
+  local_num_edges = (my_rank != (comm_size - 1)) ? local_num_edges : (num_edges - local_start_edge);
+  local_num_vertices = (my_rank != (comm_size - 1)) ? local_num_vertices : (num_vertices - local_start_vertex);
 
   for (size_t i = 0 ; i < num_local_arrays ; ++i) {
     size_t vertex_count = (local_num_vertices + num_local_arrays - 1) / num_local_arrays;
     size_t vertex_start = i * vertex_count;
-    vertex_count = (vertex_count < (local_num_vertices - vertex_start)) ? vertex_count : (local_num_vertices - vertex_start);
+    vertex_count = (i != (num_local_arrays - 1)) ? vertex_count : (local_num_vertices - vertex_start);
     
     ret_code =
       cugraph_type_erased_device_array_create(handle, vertex_count, vertex_tid, vertices + i, &ret_error);
diff --git a/cpp/tests/c_api/mg_hits_test.c b/cpp/tests/c_api/mg_hits_test.c
index 87371093613..3e10bfc05d6 100644
--- a/cpp/tests/c_api/mg_hits_test.c
+++ b/cpp/tests/c_api/mg_hits_test.c
@@ -171,7 +171,7 @@ int test_hits(const cugraph_resource_handle_t* handle)
   weight_t h_hubs[]        = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0};
   weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   // hits wants store_transposed = TRUE
@@ -203,7 +203,7 @@ int test_hits_with_transpose(const cugraph_resource_handle_t* handle)
   weight_t h_hubs[]        = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0};
   weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   // Hits wants store_transposed = TRUE
diff --git a/cpp/tests/community/triangle_count_test.cpp b/cpp/tests/community/triangle_count_test.cpp
index 836bab59457..592924c3c47 100644
--- a/cpp/tests/community/triangle_count_test.cpp
+++ b/cpp/tests/community/triangle_count_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -232,7 +232,7 @@ class Tests_TriangleCount
       for (size_t i = 0; i < h_cugraph_vertices.size(); ++i) {
         auto v     = h_cugraph_vertices[i];
         auto count = h_cugraph_triangle_counts[i];
-        ASSERT_TRUE(count == h_reference_triangle_counts[v])
+        ASSERT_EQ(count, h_reference_triangle_counts[v])
           << "Triangle count values do not match with the reference values.";
       }
     }
diff --git a/cpp/tests/link_analysis/hits_test.cpp b/cpp/tests/link_analysis/hits_test.cpp
index d0e77769034..cf35356bb76 100644
--- a/cpp/tests/link_analysis/hits_test.cpp
+++ b/cpp/tests/link_analysis/hits_test.cpp
@@ -52,9 +52,11 @@ std::tuple<std::vector<result_t>, std::vector<result_t>, double, size_t> hits_re
   size_t max_iterations,
   std::optional<result_t const*> starting_hub_values,
   bool normalized,
-  double tolerance)
+  double epsilon)
 {
   CUGRAPH_EXPECTS(num_vertices > 1, "number of vertices expected to be non-zero");
+  auto tolerance = static_cast<result_t>(num_vertices) * epsilon;
+
   std::vector<result_t> prev_hubs(num_vertices, result_t{1.0} / num_vertices);
   std::vector<result_t> prev_authorities(num_vertices, result_t{1.0} / num_vertices);
   std::vector<result_t> curr_hubs(num_vertices);
@@ -127,8 +129,8 @@ std::tuple<std::vector<result_t>, std::vector<result_t>, double, size_t> hits_re
 }
 
 struct Hits_Usecase {
-  bool check_correctness{true};
   bool check_initial_input{false};
+  bool check_correctness{true};
 };
 
 template <typename input_usecase_t>
@@ -175,8 +177,8 @@ class Tests_Hits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, inpu
     // 3. run hits
 
     auto graph_view         = graph.view();
-    auto maximum_iterations = 500;
-    weight_t tolerance      = 1e-5;
+    auto maximum_iterations = 200;
+    weight_t epsilon        = 1e-7;
     rmm::device_uvector<weight_t> d_hubs(graph_view.local_vertex_partition_range_size(),
                                          handle.get_stream());
 
@@ -201,7 +203,7 @@ class Tests_Hits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, inpu
                                 graph_view,
                                 d_hubs.data(),
                                 d_authorities.data(),
-                                tolerance,
+                                epsilon,
                                 maximum_iterations,
                                 hits_usecase.check_initial_input,
                                 true,
@@ -227,12 +229,12 @@ class Tests_Hits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, inpu
         offsets.data(),
         indices.data(),
         unrenumbered_graph_view.number_of_vertices(),
-        unrenumbered_graph_view.number_of_edges(),
+        unrenumbered_graph_view.compute_number_of_edges(handle),
         maximum_iterations,
         (hits_usecase.check_initial_input) ? std::make_optional(initial_random_hubs.data())
                                            : std::nullopt,
         true,
-        tolerance);
+        epsilon);
 
       std::vector<weight_t> h_cugraph_hits{};
       if (renumber) {
@@ -246,8 +248,7 @@ class Tests_Hits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, inpu
       handle.sync_stream();
       auto threshold_ratio = 1e-3;
       auto threshold_magnitude =
-        (1.0 / static_cast<weight_t>(graph_view.number_of_vertices())) *
-        threshold_ratio;  // skip comparison for low hits vertices (lowly ranked vertices)
+        1e-6;  // skip comparison for low hits vertices (lowly ranked vertices)
       auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
         return std::abs(lhs - rhs) <=
                std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
@@ -294,14 +295,17 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_Hits_File,
   ::testing::Combine(
     // enable correctness checks
-    ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}),
+    ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"),
                       cugraph::test::File_Usecase("test/datasets/dolphins.mtx"))));
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_Hits_Rmat,
                          // enable correctness checks
-                         ::testing::Combine(::testing::Values(Hits_Usecase{true, false},
+                         ::testing::Combine(::testing::Values(Hits_Usecase{false, true},
                                                               Hits_Usecase{true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
@@ -315,7 +319,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_Hits_File,
   ::testing::Combine(
     // disable correctness checks
-    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}),
+    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
 
 INSTANTIATE_TEST_SUITE_P(
@@ -327,7 +331,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_Hits_Rmat,
   // disable correctness checks for large graphs
   ::testing::Combine(
-    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}),
+    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp
index cf95d03681d..5c89bafd08e 100644
--- a/cpp/tests/link_analysis/mg_hits_test.cpp
+++ b/cpp/tests/link_analysis/mg_hits_test.cpp
@@ -33,8 +33,8 @@
 #include <gtest/gtest.h>
 
 struct Hits_Usecase {
-  bool check_correctness{true};
   bool check_initial_input{false};
+  bool check_correctness{true};
 };
 
 template <typename input_usecase_t>
@@ -81,7 +81,7 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
     auto mg_graph_view = mg_graph.view();
 
     auto maximum_iterations = 200;
-    weight_t tolerance      = 1e-8;
+    weight_t epsilon        = 1e-7;
     rmm::device_uvector<weight_t> d_mg_hubs(mg_graph_view.local_vertex_partition_range_size(),
                                             handle_->get_stream());
 
@@ -110,7 +110,7 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
                                 mg_graph_view,
                                 d_mg_hubs.data(),
                                 d_mg_authorities.data(),
-                                tolerance,
+                                epsilon,
                                 maximum_iterations,
                                 hits_usecase.check_initial_input,
                                 true,
@@ -205,7 +205,7 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
                                     sg_graph_view,
                                     d_sg_hubs.data(),
                                     d_sg_authorities.data(),
-                                    tolerance,
+                                    epsilon,
                                     maximum_iterations,
                                     hits_usecase.check_initial_input,
                                     true,
@@ -218,9 +218,7 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
 
         auto threshold_ratio = 1e-3;
         auto threshold_magnitude =
-          (1.0 / static_cast<result_t>(mg_graph_view.number_of_vertices())) *
-          threshold_ratio;  // skip comparison for low Hits verties (lowly ranked
-                            // vertices)
+          1e-6;  // skip comparison for low Hits vertices (lowly ranked vertices)
         auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
           return std::abs(lhs - rhs) <
                  std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
@@ -274,7 +272,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_File,
   ::testing::Combine(
     // enable correctness checks
-    ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}),
+    ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -285,7 +283,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_Rmat,
   ::testing::Combine(
     // enable correctness checks
-    ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}),
+    ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}),
     ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
 INSTANTIATE_TEST_SUITE_P(
@@ -297,7 +295,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_Rmat,
   ::testing::Combine(
     // disable correctness checks for large graphs
-    ::testing::Values(Hits_Usecase{false, false}),
+    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/mtmg/multi_node_threaded_test.cu b/cpp/tests/mtmg/multi_node_threaded_test.cu
index e5a7de07781..17aed4fdecf 100644
--- a/cpp/tests/mtmg/multi_node_threaded_test.cu
+++ b/cpp/tests/mtmg/multi_node_threaded_test.cu
@@ -311,7 +311,8 @@ class Tests_Multithreaded
         auto d_my_pageranks = pageranks_view.gather(
           thread_handle,
           raft::device_span<vertex_t const>{d_my_vertex_list.data(), d_my_vertex_list.size()},
-          graph_view,
+          graph_view.get_vertex_partition_range_lasts(thread_handle),
+          graph_view.get_vertex_partition_view(thread_handle),
           renumber_map_view);
 
         std::vector<result_t> my_pageranks(d_my_pageranks.size());
diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu
index bc4d8cfef6a..a5df0199cac 100644
--- a/cpp/tests/mtmg/threaded_test.cu
+++ b/cpp/tests/mtmg/threaded_test.cu
@@ -155,10 +155,25 @@ class Tests_Multithreaded
       input_usecase.template construct_edgelist<vertex_t, weight_t>(
         handle, multithreaded_usecase.test_weighted, false, false);
 
+    rmm::device_uvector<vertex_t> d_unique_vertices(2 * d_src_v.size(), handle.get_stream());
+    thrust::copy(
+      handle.get_thrust_policy(), d_src_v.begin(), d_src_v.end(), d_unique_vertices.begin());
+    thrust::copy(handle.get_thrust_policy(),
+                 d_dst_v.begin(),
+                 d_dst_v.end(),
+                 d_unique_vertices.begin() + d_src_v.size());
+    thrust::sort(handle.get_thrust_policy(), d_unique_vertices.begin(), d_unique_vertices.end());
+
+    d_unique_vertices.resize(thrust::distance(d_unique_vertices.begin(),
+                                              thrust::unique(handle.get_thrust_policy(),
+                                                             d_unique_vertices.begin(),
+                                                             d_unique_vertices.end())),
+                             handle.get_stream());
+
     auto h_src_v         = cugraph::test::to_host(handle, d_src_v);
     auto h_dst_v         = cugraph::test::to_host(handle, d_dst_v);
     auto h_weights_v     = cugraph::test::to_host(handle, d_weights_v);
-    auto unique_vertices = cugraph::test::to_host(handle, d_vertices_v);
+    auto unique_vertices = cugraph::test::to_host(handle, d_unique_vertices);
 
     // Load edgelist from different threads.  We'll use more threads than GPUs here
     for (int i = 0; i < num_threads; ++i) {
@@ -293,13 +308,13 @@ class Tests_Multithreaded
                                     num_threads]() {
         auto thread_handle = instance_manager->get_handle();
 
-        auto number_of_vertices = unique_vertices->size();
+        auto number_of_vertices = unique_vertices.size();
 
         std::vector<vertex_t> my_vertex_list;
         my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads);
 
         for (size_t j = i; j < number_of_vertices; j += num_threads) {
-          my_vertex_list.push_back((*unique_vertices)[j]);
+          my_vertex_list.push_back(unique_vertices[j]);
         }
 
         rmm::device_uvector<vertex_t> d_my_vertex_list(my_vertex_list.size(),
@@ -312,7 +327,8 @@ class Tests_Multithreaded
         auto d_my_pageranks = pageranks_view.gather(
           thread_handle,
           raft::device_span<vertex_t const>{d_my_vertex_list.data(), d_my_vertex_list.size()},
-          graph_view,
+          graph_view.get_vertex_partition_range_lasts(thread_handle),
+          graph_view.get_vertex_partition_view(thread_handle),
           renumber_map_view);
 
         std::vector<result_t> my_pageranks(d_my_pageranks.size());
diff --git a/cpp/tests/mtmg/threaded_test_louvain.cu b/cpp/tests/mtmg/threaded_test_louvain.cu
new file mode 100644
index 00000000000..c1395037646
--- /dev/null
+++ b/cpp/tests/mtmg/threaded_test_louvain.cu
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <utilities/base_fixture.hpp>
+#include <utilities/device_comm_wrapper.hpp>
+#include <utilities/test_graphs.hpp>
+#include <utilities/test_utilities.hpp>
+#include <utilities/thrust_wrapper.hpp>
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/mtmg/edgelist.hpp>
+#include <cugraph/mtmg/graph.hpp>
+#include <cugraph/mtmg/per_thread_edgelist.hpp>
+#include <cugraph/mtmg/renumber_map.hpp>
+#include <cugraph/mtmg/resource_manager.hpp>
+#include <cugraph/mtmg/vertex_result.hpp>
+
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <nccl.h>
+
+#include <vector>
+
+#include <thrust/count.h>
+#include <thrust/unique.h>
+
+struct Multithreaded_Usecase {  // per-test switches, paired with an input usecase in the params
+  bool test_weighted{false};     // creates the edge-weight property; passed to construct_edgelist
+  bool check_correctness{true};  // when set, clusters are compared against an SG Louvain run
+};
+
+template <typename input_usecase_t>
+class Tests_Multithreaded
+  : public ::testing::TestWithParam<std::tuple<Multithreaded_Usecase, input_usecase_t>> {
+ public:
+  Tests_Multithreaded() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  std::vector<int> get_gpu_list()
+  {
+    // Query the CUDA runtime for the number of devices.
+    int device_count{1};
+    RAFT_CUDA_TRY(cudaGetDeviceCount(&device_count));
+    // The returned ids are the consecutive integers 0 .. device_count - 1.
+    std::vector<int> device_ids(device_count);
+    std::iota(device_ids.begin(), device_ids.end(), 0);
+    return device_ids;
+  }
+
+  template <typename vertex_t,
+            typename edge_t,
+            typename weight_t,
+            typename result_t,
+            bool multi_gpu>
+  void run_current_test(
+    std::tuple<Multithreaded_Usecase const&, input_usecase_t const&> const& param,
+    std::vector<int> gpu_list)
+  {
+    using edge_type_t = int32_t;
+
+    constexpr bool renumber           = true;
+    constexpr bool do_expensive_check = false;
+
+    auto [multithreaded_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+
+    size_t max_level{1};  // Louvain is non-deterministic in MG if max_level > 1
+    weight_t threshold{1e-6};
+    weight_t resolution{1};
+
+    size_t device_buffer_size{64 * 1024 * 1024};
+    size_t thread_buffer_size{4 * 1024 * 1024};
+
+    int num_gpus    = gpu_list.size();
+    int num_threads = num_gpus * 4;
+
+    cugraph::mtmg::resource_manager_t resource_manager;
+
+    std::for_each(gpu_list.begin(), gpu_list.end(), [&resource_manager](int gpu_id) {
+      resource_manager.register_local_gpu(gpu_id, rmm::cuda_device_id{gpu_id});
+    });
+
+    ncclUniqueId instance_manager_id;
+    ncclGetUniqueId(&instance_manager_id);
+
+    auto instance_manager = resource_manager.create_instance_manager(
+      resource_manager.registered_ranks(), instance_manager_id);
+
+    cugraph::mtmg::edgelist_t<vertex_t, weight_t, edge_t, edge_type_t> edgelist;
+    cugraph::mtmg::graph_t<vertex_t, edge_t, false, multi_gpu> graph;
+    cugraph::mtmg::graph_view_t<vertex_t, edge_t, false, multi_gpu> graph_view;
+    cugraph::mtmg::vertex_result_t<vertex_t> louvain_clusters;
+    std::optional<cugraph::mtmg::renumber_map_t<vertex_t>> renumber_map =
+      std::make_optional<cugraph::mtmg::renumber_map_t<vertex_t>>();
+
+    auto edge_weights = multithreaded_usecase.test_weighted
+                          ? std::make_optional<cugraph::mtmg::edge_property_t<
+                              cugraph::mtmg::graph_view_t<vertex_t, edge_t, false, multi_gpu>,
+                              weight_t>>()
+                          : std::nullopt;
+
+    //
+    // Simulate graph creation by spawning threads to walk through the
+    // local COO and add edges
+    //
+    std::vector<std::thread> running_threads;
+
+    //  Initialize shared edgelist object, one per GPU
+    for (int i = 0; i < num_gpus; ++i) {
+      running_threads.emplace_back([&instance_manager,
+                                    &edgelist,
+                                    device_buffer_size,
+                                    use_weight    = true,
+                                    use_edge_id   = false,
+                                    use_edge_type = false]() {
+        auto thread_handle = instance_manager->get_handle();
+
+        edgelist.set(thread_handle, device_buffer_size, use_weight, use_edge_id, use_edge_type);
+      });
+    }
+
+    // Wait for CPU threads to complete
+    std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); });
+    running_threads.resize(0);
+    instance_manager->reset_threads();
+
+    // Load SG edge list
+    auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] =
+      input_usecase.template construct_edgelist<vertex_t, weight_t>(
+        handle, multithreaded_usecase.test_weighted, false, false);
+
+    rmm::device_uvector<vertex_t> d_unique_vertices(2 * d_src_v.size(), handle.get_stream());
+    thrust::copy(
+      handle.get_thrust_policy(), d_src_v.begin(), d_src_v.end(), d_unique_vertices.begin());
+    thrust::copy(handle.get_thrust_policy(),
+                 d_dst_v.begin(),
+                 d_dst_v.end(),
+                 d_unique_vertices.begin() + d_src_v.size());
+    thrust::sort(handle.get_thrust_policy(), d_unique_vertices.begin(), d_unique_vertices.end());
+
+    d_unique_vertices.resize(thrust::distance(d_unique_vertices.begin(),
+                                              thrust::unique(handle.get_thrust_policy(),
+                                                             d_unique_vertices.begin(),
+                                                             d_unique_vertices.end())),
+                             handle.get_stream());
+
+    auto h_src_v         = cugraph::test::to_host(handle, d_src_v);
+    auto h_dst_v         = cugraph::test::to_host(handle, d_dst_v);
+    auto h_weights_v     = cugraph::test::to_host(handle, d_weights_v);
+    auto unique_vertices = cugraph::test::to_host(handle, d_unique_vertices);
+
+    // Load edgelist from different threads.  We'll use more threads than GPUs here
+    for (int i = 0; i < num_threads; ++i) {
+      running_threads.emplace_back([&instance_manager,
+                                    thread_buffer_size,
+                                    &edgelist,
+                                    &h_src_v,
+                                    &h_dst_v,
+                                    &h_weights_v,
+                                    i,
+                                    num_threads]() {
+        auto thread_handle = instance_manager->get_handle();
+        cugraph::mtmg::per_thread_edgelist_t<vertex_t, weight_t, edge_t, edge_type_t>
+          per_thread_edgelist(edgelist.get(thread_handle), thread_buffer_size);
+
+        for (size_t j = i; j < h_src_v.size(); j += num_threads) {
+          per_thread_edgelist.append(
+            thread_handle,
+            h_src_v[j],
+            h_dst_v[j],
+            h_weights_v ? std::make_optional((*h_weights_v)[j]) : std::nullopt,
+            std::nullopt,
+            std::nullopt);
+        }
+
+        per_thread_edgelist.flush(thread_handle);
+      });
+    }
+
+    // Wait for CPU threads to complete
+    std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); });
+    running_threads.resize(0);
+    instance_manager->reset_threads();
+
+    for (int i = 0; i < num_gpus; ++i) {
+      running_threads.emplace_back([&instance_manager,
+                                    &graph,
+                                    &edge_weights,
+                                    &edgelist,
+                                    &renumber_map,
+                                    is_symmetric = is_symmetric,
+                                    renumber,
+                                    do_expensive_check]() {
+        auto thread_handle = instance_manager->get_handle();
+
+        if (thread_handle.get_thread_rank() > 0) return;
+
+        std::optional<cugraph::mtmg::edge_property_t<
+          cugraph::mtmg::graph_view_t<vertex_t, edge_t, false, multi_gpu>,
+          edge_t>>
+          edge_ids{std::nullopt};
+        std::optional<cugraph::mtmg::edge_property_t<
+          cugraph::mtmg::graph_view_t<vertex_t, edge_t, false, multi_gpu>,
+          int32_t>>
+          edge_types{std::nullopt};
+
+        edgelist.finalize_buffer(thread_handle);
+        edgelist.consolidate_and_shuffle(thread_handle, false);
+
+        cugraph::mtmg::
+          create_graph_from_edgelist<vertex_t, edge_t, weight_t, edge_t, int32_t, false, multi_gpu>(
+            thread_handle,
+            edgelist,
+            cugraph::graph_properties_t{is_symmetric, true},
+            renumber,
+            graph,
+            edge_weights,
+            edge_ids,
+            edge_types,
+            renumber_map,
+            do_expensive_check);
+      });
+    }
+
+    // Wait for CPU threads to complete
+    std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); });
+    running_threads.resize(0);
+    instance_manager->reset_threads();
+
+    graph_view             = graph.view();
+    auto renumber_map_view = renumber_map ? std::make_optional(renumber_map->view()) : std::nullopt;
+
+    weight_t modularity{0};
+
+    for (int i = 0; i < num_threads; ++i) {
+      running_threads.emplace_back([&instance_manager,
+                                    &graph_view,
+                                    &edge_weights,
+                                    &louvain_clusters,
+                                    &modularity,
+                                    &renumber_map,
+                                    max_level,
+                                    threshold,
+                                    resolution]() {
+        auto thread_handle = instance_manager->get_handle();
+
+        if (thread_handle.get_thread_rank() > 0) return;
+
+        rmm::device_uvector<vertex_t> local_louvain_clusters(
+          graph_view.get(thread_handle).local_vertex_partition_range_size(),
+          thread_handle.get_stream());
+
+        std::tie(std::ignore, modularity) = cugraph::louvain<vertex_t, edge_t, weight_t, true>(
+          thread_handle.raft_handle(),
+          graph_view.get(thread_handle),
+          edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) : std::nullopt,
+          local_louvain_clusters.data(),
+          max_level,
+          threshold,
+          resolution);
+
+        louvain_clusters.set(thread_handle, std::move(local_louvain_clusters));
+      });
+    }
+
+    // Wait for CPU threads to complete
+    std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); });
+    running_threads.resize(0);
+    instance_manager->reset_threads();
+
+    std::vector<std::tuple<std::vector<vertex_t>, std::vector<vertex_t>>> computed_clusters_v;
+    std::mutex computed_clusters_lock{};
+
+    auto louvain_clusters_view = louvain_clusters.view();
+    std::vector<vertex_t> h_renumber_map;
+
+    // Load computed_clusters_v from different threads.
+    for (int i = 0; i < num_gpus; ++i) {
+      running_threads.emplace_back([&instance_manager,
+                                    &graph_view,
+                                    &renumber_map_view,
+                                    &louvain_clusters_view,
+                                    &computed_clusters_lock,
+                                    &computed_clusters_v,
+                                    &h_src_v,
+                                    &h_dst_v,
+                                    &h_weights_v,
+                                    &h_renumber_map,
+                                    &unique_vertices,
+                                    i,
+                                    num_threads]() {
+        auto thread_handle = instance_manager->get_handle();
+
+        auto number_of_vertices = unique_vertices.size();
+
+        std::vector<vertex_t> my_vertex_list;
+        my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads);
+
+        for (size_t j = i; j < number_of_vertices; j += num_threads) {
+          my_vertex_list.push_back(unique_vertices[j]);
+        }
+
+        rmm::device_uvector<vertex_t> d_my_vertex_list(my_vertex_list.size(),
+                                                       thread_handle.raft_handle().get_stream());
+        raft::update_device(d_my_vertex_list.data(),
+                            my_vertex_list.data(),
+                            my_vertex_list.size(),
+                            thread_handle.raft_handle().get_stream());
+
+        auto d_my_clusters = louvain_clusters_view.gather(
+          thread_handle,
+          raft::device_span<vertex_t const>{d_my_vertex_list.data(), d_my_vertex_list.size()},
+          graph_view.get_vertex_partition_range_lasts(thread_handle),
+          graph_view.get_vertex_partition_view(thread_handle),
+          renumber_map_view);
+
+        std::vector<vertex_t> my_clusters(d_my_clusters.size());
+        raft::update_host(my_clusters.data(),
+                          d_my_clusters.data(),
+                          d_my_clusters.size(),
+                          thread_handle.raft_handle().get_stream());
+
+        auto gathered_renumber_map = cugraph::test::to_host(
+          thread_handle.raft_handle(),
+          cugraph::test::device_allgatherv(thread_handle.raft_handle(),
+                                           renumber_map_view->get(thread_handle)));
+        {  // Publish both shared outputs under one lock; each spawned thread writes them.
+          std::lock_guard<std::mutex> lock(computed_clusters_lock);
+          computed_clusters_v.push_back(
+            std::make_tuple(std::move(my_vertex_list), std::move(my_clusters)));
+          h_renumber_map = std::move(gathered_renumber_map);
+        }
+      });
+    }
+
+    // Wait for CPU threads to complete
+    std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); });
+    running_threads.resize(0);
+    instance_manager->reset_threads();
+
+    if (multithreaded_usecase.check_correctness) {
+      // Want to compare the results in computed_clusters_v with SG results
+      cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(handle);
+      std::optional<
+        cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
+        sg_edge_weights{std::nullopt};
+
+      for (int i = 0; i < num_gpus; ++i) {
+        running_threads.emplace_back(
+          [&instance_manager, &graph_view, &edge_weights, &sg_graph, &sg_edge_weights]() {
+            auto thread_handle = instance_manager->get_handle();
+
+            if (thread_handle.get_rank() == 0) {
+              std::tie(sg_graph, sg_edge_weights, std::ignore) =
+                cugraph::test::mg_graph_to_sg_graph(
+                  thread_handle.raft_handle(),
+                  graph_view.get(thread_handle),
+                  edge_weights ? std::make_optional(edge_weights->get(thread_handle).view())
+                               : std::nullopt,
+                  std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+                  false);  // create an SG graph with MG graph vertex IDs
+            } else {
+              cugraph::test::mg_graph_to_sg_graph(
+                thread_handle.raft_handle(),
+                graph_view.get(thread_handle),
+                edge_weights ? std::make_optional(edge_weights->get(thread_handle).view())
+                             : std::nullopt,
+                std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+                false);  // create an SG graph with MG graph vertex IDs
+            }
+          });
+      }
+
+      // Wait for CPU threads to complete
+      std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); });
+      running_threads.resize(0);
+      instance_manager->reset_threads();
+
+      rmm::device_uvector<vertex_t> sg_clusters(sg_graph.number_of_vertices(), handle.get_stream());
+      weight_t modularity;
+
+      std::tie(std::ignore, modularity) = cugraph::louvain<vertex_t, edge_t, weight_t, false>(
+        handle,
+        sg_graph.view(),
+        sg_edge_weights ? std::make_optional(sg_edge_weights->view()) : std::nullopt,
+        sg_clusters.data(),
+        max_level,
+        threshold,
+        resolution);
+
+      auto h_sg_clusters = cugraph::test::to_host(handle, sg_clusters);
+      std::map<vertex_t, vertex_t> h_cluster_map;          // mtmg cluster id -> SG cluster id
+      std::map<vertex_t, vertex_t> h_cluster_reverse_map;  // SG cluster id -> mtmg cluster id
+      // Cluster ids may differ between the two runs, so require a one-to-one mapping instead.
+      std::for_each(
+        computed_clusters_v.begin(),
+        computed_clusters_v.end(),
+        [&h_sg_clusters, &h_cluster_map, &h_renumber_map, &h_cluster_reverse_map](auto t1) {
+          std::for_each(
+            thrust::make_zip_iterator(std::get<0>(t1).begin(), std::get<1>(t1).begin()),
+            thrust::make_zip_iterator(std::get<0>(t1).end(), std::get<1>(t1).end()),
+            [&h_sg_clusters, &h_cluster_map, &h_renumber_map, &h_cluster_reverse_map](auto t2) {
+              vertex_t v = thrust::get<0>(t2);
+              vertex_t c = thrust::get<1>(t2);
+              // Find v in the gathered renumber map; its offset indexes the SG result.
+              auto pos    = std::find(h_renumber_map.begin(), h_renumber_map.end(), v);
+              auto offset = std::distance(h_renumber_map.begin(), pos);
+
+              auto cluster_pos = h_cluster_map.find(c);
+              if (cluster_pos == h_cluster_map.end()) {
+                auto reverse_pos = h_cluster_reverse_map.find(h_sg_clusters[offset]);
+                // A newly seen mtmg cluster must pair with an SG cluster not yet claimed.
+                ASSERT_TRUE(reverse_pos == h_cluster_reverse_map.end())
+                  << "two different cluster mappings";
+                h_cluster_map.insert(std::make_pair(c, h_sg_clusters[offset]));
+                h_cluster_reverse_map.insert(std::make_pair(h_sg_clusters[offset], c));
+              } else {
+                ASSERT_EQ(cluster_pos->second, h_sg_clusters[offset])
+                  << "vertex " << v << ", offset = " << offset
+                  << ", SG cluster = " << h_sg_clusters[offset] << ", mtmg cluster = " << c
+                  << ", mapped value = " << cluster_pos->second;
+              }
+            });
+        });
+    }
+  }
+};
+
+using Tests_Multithreaded_File = Tests_Multithreaded<cugraph::test::File_Usecase>;
+using Tests_Multithreaded_Rmat = Tests_Multithreaded<cugraph::test::Rmat_Usecase>;
+
+// FIXME: add tests for type combinations
+TEST_P(Tests_Multithreaded_File, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, float, true>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()), std::vector<int>{{0, 1}});
+}
+
+TEST_P(Tests_Multithreaded_Rmat, CheckInt32Int32FloatFloat)
+{
+  run_current_test<int32_t, int32_t, float, float, true>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), std::vector<int>{{0, 1}});
+}
+
+INSTANTIATE_TEST_SUITE_P(file_test,
+                         Tests_Multithreaded_File,
+                         ::testing::Combine(
+                           // enable correctness checks
+                           ::testing::Values(Multithreaded_Usecase{true, true}),
+                           ::testing::Values(cugraph::test::File_Usecase("karate.csv"),
+                                             cugraph::test::File_Usecase("dolphins.csv"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_test,
+  Tests_Multithreaded_Rmat,
+  ::testing::Combine(
+    // enable correctness checks
+    ::testing::Values(Multithreaded_Usecase{true, true}),
+    //::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+    ::testing::Values(cugraph::test::Rmat_Usecase(5, 8, 0.57, 0.19, 0.19, 0, false, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with
+                          --gtest_filter to select only the file_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one File_Usecase that differ only in filename
+                          (to avoid running same benchmarks more than once) */
+  Tests_Multithreaded_File,
+  ::testing::Combine(
+    // disable correctness checks
+    ::testing::Values(Multithreaded_Usecase{true, false}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_Multithreaded_Rmat,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(Multithreaded_Usecase{true, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu
index 449aa728d87..03bf8ae0ae5 100644
--- a/cpp/tests/prims/mg_count_if_e.cu
+++ b/cpp/tests/prims/mg_count_if_e.cu
@@ -53,8 +53,9 @@
 #include <random>
 
 struct Prims_Usecase {
-  bool check_correctness{true};
   bool test_weighted{false};
+  bool edge_masking{false};  // if true, the test attaches an edge mask to the MG graph view
+  bool check_correctness{true};  // last member: third value in the aggregate initializers
 };
 
 template <typename input_usecase_t>
@@ -102,6 +103,13 @@ class Tests_MGCountIfE
 
     auto mg_graph_view = mg_graph.view();
 
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
     // 2. run MG count_if_e
 
     const int hash_bin_count = 5;
@@ -148,19 +156,19 @@ class Tests_MGCountIfE
                                                               (*mg_renumber_map).size()),
         false);
 
-      auto sg_graph_view = sg_graph.view();
+      if (handle_->get_comms().get_rank() == 0) {
+        auto sg_graph_view = sg_graph.view();
 
-      auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
-        *handle_,
-        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
-        thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
-        hash_bin_count);
-      auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
-        *handle_, sg_graph_view, sg_vertex_prop);
-      auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
-        *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_vertex_prop = cugraph::test::generate<vertex_t, result_t>::vertex_property(
+          *handle_,
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()),
+          thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()),
+          hash_bin_count);
+        auto sg_src_prop = cugraph::test::generate<vertex_t, result_t>::src_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
+        auto sg_dst_prop = cugraph::test::generate<vertex_t, result_t>::dst_property(
+          *handle_, sg_graph_view, sg_vertex_prop);
 
-      if (handle_->get_comms().get_rank() == 0) {
         auto expected_result = count_if_e(
           *handle_,
           sg_graph_view,
@@ -312,7 +320,10 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGCountIfE_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(Prims_Usecase{false, false, true},
+                      Prims_Usecase{false, true, true},
+                      Prims_Usecase{true, false, true},
+                      Prims_Usecase{true, true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -320,7 +331,10 @@ INSTANTIATE_TEST_SUITE_P(
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_MGCountIfE_Rmat,
-                         ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true},
+                                                              Prims_Usecase{false, true, true},
+                                                              Prims_Usecase{true, false, true},
+                                                              Prims_Usecase{true, true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
@@ -332,7 +346,10 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGCountIfE_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{false}),
+    ::testing::Values(Prims_Usecase{false, false, false},
+                      Prims_Usecase{false, true, false},
+                      Prims_Usecase{true, false, false},
+                      Prims_Usecase{true, true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
index a3edb1f6372..ac73c446d89 100644
--- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
+++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "property_generator.cuh"
+
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
 #include <utilities/mg_utilities.hpp>
@@ -116,29 +118,8 @@ class Tests_MGPerVPairTransformDstNbrIntersection
 
     std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
     if (prims_usecase.edge_masking) {
-      cugraph::edge_src_property_t<decltype(mg_graph_view), vertex_t> edge_src_renumber_map(
-        *handle_, mg_graph_view);
-      cugraph::edge_dst_property_t<decltype(mg_graph_view), vertex_t> edge_dst_renumber_map(
-        *handle_, mg_graph_view);
-      cugraph::update_edge_src_property(
-        *handle_, mg_graph_view, (*mg_renumber_map).begin(), edge_src_renumber_map);
-      cugraph::update_edge_dst_property(
-        *handle_, mg_graph_view, (*mg_renumber_map).begin(), edge_dst_renumber_map);
-
-      edge_mask = cugraph::edge_property_t<decltype(mg_graph_view), bool>(*handle_, mg_graph_view);
-
-      cugraph::transform_e(
-        *handle_,
-        mg_graph_view,
-        edge_src_renumber_map.view(),
-        edge_dst_renumber_map.view(),
-        cugraph::edge_dummy_property_t{}.view(),
-        [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) {
-          return ((src_property % 2 == 0) && (dst_property % 2 == 0))
-                   ? false
-                   : true;  // mask out the edges with even unrenumbered src & dst vertex IDs
-        },
-        (*edge_mask).mutable_view());
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
       mg_graph_view.attach_edge_mask((*edge_mask).view());
     }
 
@@ -257,42 +238,6 @@ class Tests_MGPerVPairTransformDstNbrIntersection
       if (handle_->get_comms().get_rank() == 0) {
         auto sg_graph_view = sg_graph.view();
 
-        if (prims_usecase.edge_masking) {
-          rmm::device_uvector<vertex_t> srcs(0, handle_->get_stream());
-          rmm::device_uvector<vertex_t> dsts(0, handle_->get_stream());
-          std::tie(srcs, dsts, std::ignore, std::ignore) =
-            cugraph::decompress_to_edgelist<vertex_t, edge_t, weight_t, false, false>(
-              *handle_, sg_graph_view, std::nullopt, std::nullopt, std::nullopt);
-          auto edge_first = thrust::make_zip_iterator(srcs.begin(), dsts.begin());
-          srcs.resize(thrust::distance(edge_first,
-                                       thrust::remove_if(handle_->get_thrust_policy(),
-                                                         edge_first,
-                                                         edge_first + srcs.size(),
-                                                         [] __device__(auto pair) {
-                                                           return (thrust::get<0>(pair) % 2 == 0) &&
-                                                                  (thrust::get<1>(pair) % 2 == 0);
-                                                         })),
-                      handle_->get_stream());
-          dsts.resize(srcs.size(), handle_->get_stream());
-          rmm::device_uvector<vertex_t> vertices(sg_graph_view.number_of_vertices(),
-                                                 handle_->get_stream());
-          thrust::sequence(
-            handle_->get_thrust_policy(), vertices.begin(), vertices.end(), vertex_t{0});
-          std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph::
-            create_graph_from_edgelist<vertex_t, edge_t, weight_t, edge_t, int32_t, false, false>(
-              *handle_,
-              std::move(vertices),
-              std::move(srcs),
-              std::move(dsts),
-              std::nullopt,
-              std::nullopt,
-              std::nullopt,
-              cugraph::graph_properties_t{sg_graph_view.is_symmetric(),
-                                          sg_graph_view.is_multigraph()},
-              false);
-          sg_graph_view = sg_graph.view();
-        }
-
         auto sg_result_buffer = cugraph::allocate_dataframe_buffer<thrust::tuple<edge_t, edge_t>>(
           cugraph::size_dataframe_buffer(mg_aggregate_vertex_pair_buffer), handle_->get_stream());
         auto sg_out_degrees = sg_graph_view.compute_out_degrees(*handle_);
diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
index eb6a8fd5cb6..80aa34b68ae 100644
--- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
+++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu
@@ -301,8 +301,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
                      sg_graph_view.local_edge_partition_view().offsets().begin(),
                      sg_graph_view.local_edge_partition_view().offsets().end(),
                      sg_offsets.begin());
-        rmm::device_uvector<vertex_t> sg_indices(sg_graph_view.number_of_edges(),
-                                                 handle_->get_stream());
+        rmm::device_uvector<vertex_t> sg_indices(
+          sg_graph_view.local_edge_partition_view().indices().size(), handle_->get_stream());
         thrust::copy(handle_->get_thrust_policy(),
                      sg_graph_view.local_edge_partition_view().indices().begin(),
                      sg_graph_view.local_edge_partition_view().indices().end(),
@@ -324,8 +324,9 @@ class Tests_MGPerVRandomSelectTransformOutgoingE
            with_replacement = prims_usecase.with_replacement,
            invalid_value =
              invalid_value ? thrust::make_optional<result_t>(*invalid_value) : thrust::nullopt,
-           property_transform = cugraph::test::detail::property_transform<vertex_t, property_t>{
-             hash_bin_count}] __device__(size_t i) {
+           property_transform =
+             cugraph::test::detail::vertex_property_transform<vertex_t, property_t>{
+               hash_bin_count}] __device__(size_t i) {
             auto v = *(frontier_vertex_first + i);
 
             // check sample_offsets
diff --git a/cpp/tests/prims/mg_transform_e.cu b/cpp/tests/prims/mg_transform_e.cu
index 24deaad810a..e9be80f1f7d 100644
--- a/cpp/tests/prims/mg_transform_e.cu
+++ b/cpp/tests/prims/mg_transform_e.cu
@@ -52,6 +52,7 @@
 
 struct Prims_Usecase {
   bool use_edgelist{false};
+  bool edge_masking{false};  // if true, the test attaches an edge mask to the MG graph view
   bool check_correctness{true};
 };
 
@@ -100,6 +101,13 @@ class Tests_MGTransformE
 
     auto mg_graph_view = mg_graph.view();
 
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
     // 2. run MG transform_e
 
     const int hash_bin_count = 5;
@@ -439,7 +447,10 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGTransformE_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}),
+    ::testing::Values(Prims_Usecase{false, false, true},
+                      Prims_Usecase{false, true, true},
+                      Prims_Usecase{true, false, true},
+                      Prims_Usecase{true, true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -447,8 +458,10 @@ INSTANTIATE_TEST_SUITE_P(
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_MGTransformE_Rmat,
-                         ::testing::Combine(::testing::Values(Prims_Usecase{false, true},
-                                                              Prims_Usecase{true, true}),
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true},
+                                                              Prims_Usecase{false, true, true},
+                                                              Prims_Usecase{true, false, true},
+                                                              Prims_Usecase{true, true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
@@ -460,7 +473,10 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGTransformE_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}),
+    ::testing::Values(Prims_Usecase{false, false, false},
+                      Prims_Usecase{false, true, false},
+                      Prims_Usecase{true, false, false},
+                      Prims_Usecase{true, true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu
index 79aa3da54df..c4ae11ab7c9 100644
--- a/cpp/tests/prims/mg_transform_reduce_e.cu
+++ b/cpp/tests/prims/mg_transform_reduce_e.cu
@@ -91,8 +91,9 @@ struct result_compare<thrust::tuple<Args...>> {
 };
 
 struct Prims_Usecase {
-  bool check_correctness{true};
   bool test_weighted{false};
+  bool edge_masking{false};  // if true, the test attaches an edge mask to the MG graph view
+  bool check_correctness{true};  // last member: third value in the aggregate initializers
 };
 
 template <typename input_usecase_t>
@@ -141,6 +142,13 @@ class Tests_MGTransformReduceE
 
     auto mg_graph_view = mg_graph.view();
 
+    std::optional<cugraph::edge_property_t<decltype(mg_graph_view), bool>> edge_mask{std::nullopt};
+    if (prims_usecase.edge_masking) {
+      edge_mask =
+        cugraph::test::generate<vertex_t, bool>::edge_property(*handle_, mg_graph_view, 2);
+      mg_graph_view.attach_edge_mask((*edge_mask).view());
+    }
+
     // 2. run MG transform reduce
 
     const int hash_bin_count = 5;
@@ -365,7 +373,10 @@ INSTANTIATE_TEST_SUITE_P(
   file_test,
   Tests_MGTransformReduceE_File,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{true}),
+    ::testing::Values(Prims_Usecase{false, false, true},
+                      Prims_Usecase{false, true, true},
+                      Prims_Usecase{true, false, true},
+                      Prims_Usecase{true, true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -373,7 +384,10 @@ INSTANTIATE_TEST_SUITE_P(
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_MGTransformReduceE_Rmat,
-                         ::testing::Combine(::testing::Values(Prims_Usecase{true}),
+                         ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true},
+                                                              Prims_Usecase{false, true, true},
+                                                              Prims_Usecase{true, false, true},
+                                                              Prims_Usecase{true, true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
@@ -385,7 +399,10 @@ INSTANTIATE_TEST_SUITE_P(
                           factor (to avoid running same benchmarks more than once) */
   Tests_MGTransformReduceE_Rmat,
   ::testing::Combine(
-    ::testing::Values(Prims_Usecase{false}),
+    ::testing::Values(Prims_Usecase{false, false, false},
+                      Prims_Usecase{false, true, false},
+                      Prims_Usecase{true, false, false},
+                      Prims_Usecase{true, true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/prims/property_generator.cuh b/cpp/tests/prims/property_generator.cuh
index e7264cd276f..680455eda79 100644
--- a/cpp/tests/prims/property_generator.cuh
+++ b/cpp/tests/prims/property_generator.cuh
@@ -15,6 +15,7 @@
  */
 
 #pragma once
+#include <prims/transform_e.cuh>
 #include <prims/update_edge_src_dst_property.cuh>
 
 #include <cugraph/edge_src_dst_property.hpp>
@@ -61,7 +62,7 @@ __host__ __device__ auto make_property_value(T val)
 }
 
 template <typename vertex_t, typename property_t>
-struct property_transform {
+struct vertex_property_transform {
   int32_t mod{};
 
   constexpr __device__ property_t operator()(vertex_t v) const
@@ -73,6 +74,20 @@ struct property_transform {
   }
 };
 
+template <typename vertex_t, typename property_t>
+struct edge_property_transform {
+  int32_t mod{};
+  // Hashes (src + dst), then reduces the hash modulo `mod`; the nullopt arguments are ignored.
+  constexpr __device__ property_t operator()(
+    vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const
+  {
+    static_assert(std::is_arithmetic_v<property_t> ||
+                  cugraph::is_thrust_tuple_of_arithmetic<property_t>::value);
+    auto const bin = cuco::detail::MurmurHash3_32<vertex_t>{}(src + dst) % mod;
+    return make_property_value<property_t>(bin);
+  }
+};
+
 }  // namespace detail
 
 template <typename vertex_t, typename property_t>
@@ -96,7 +111,7 @@ struct generate {
                       labels.begin(),
                       labels.end(),
                       cugraph::get_dataframe_buffer_begin(data),
-                      detail::property_transform<vertex_t, property_t>{hash_bin_count});
+                      detail::vertex_property_transform<vertex_t, property_t>{hash_bin_count});
     return data;
   }
 
@@ -111,7 +126,7 @@ struct generate {
                       begin,
                       end,
                       cugraph::get_dataframe_buffer_begin(data),
-                      detail::property_transform<vertex_t, property_t>{hash_bin_count});
+                      detail::vertex_property_transform<vertex_t, property_t>{hash_bin_count});
     return data;
   }
 
@@ -138,6 +153,28 @@ struct generate {
       handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property);
     return output_property;
   }
+
+  // Generates an edge property for `graph_view`: every edge (src, dst) receives the value
+  // returned by detail::edge_property_transform<vertex_t, property_t>{hash_bin_count}, i.e. a
+  // hash of (src + dst) taken modulo `hash_bin_count` and converted to property_t.
+  // The container's value type is property_t (rather than a hard-coded bool) so that it always
+  // matches the functor's return type; generate<vertex_t, bool> callers are unaffected.
+  template <typename graph_view_type>
+  static auto edge_property(raft::handle_t const& handle,
+                            graph_view_type const& graph_view,
+                            int32_t hash_bin_count)
+  {
+    auto output_property =
+      cugraph::edge_property_t<graph_view_type, property_t>(handle, graph_view);
+    cugraph::transform_e(handle,
+                         graph_view,
+                         cugraph::edge_src_dummy_property_t{}.view(),
+                         cugraph::edge_dst_dummy_property_t{}.view(),
+                         cugraph::edge_dummy_property_t{}.view(),
+                         detail::edge_property_transform<vertex_t, property_t>{hash_bin_count},
+                         output_property.mutable_view());
+    return output_property;
+  }
 };
 
 }  // namespace test
diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu
index e5267d75ac2..6be735c3482 100644
--- a/cpp/tests/sampling/sampling_post_processing_test.cu
+++ b/cpp/tests/sampling/sampling_post_processing_test.cu
@@ -38,6 +38,8 @@
 #include <thrust/sort.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 struct SamplingPostProcessing_Usecase {
   size_t num_labels{};
   size_t num_seeds_per_label{};
@@ -318,15 +320,16 @@ bool check_renumber_map_invariants(
 
     auto renumbered_merged_vertex_first = thrust::make_transform_iterator(
       merged_vertices.begin(),
-      [sorted_org_vertices =
-         raft::device_span<vertex_t const>(sorted_org_vertices.data(), sorted_org_vertices.size()),
-       matching_renumbered_vertices = raft::device_span<vertex_t const>(
-         matching_renumbered_vertices.data(),
-         matching_renumbered_vertices.size())] __device__(vertex_t major) {
-        auto it = thrust::lower_bound(
-          thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
-        return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
-      });
+      cuda::proclaim_return_type<vertex_t>(
+        [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(),
+                                                                 sorted_org_vertices.size()),
+         matching_renumbered_vertices = raft::device_span<vertex_t const>(
+           matching_renumbered_vertices.data(),
+           matching_renumbered_vertices.size())] __device__(vertex_t major) {
+          auto it = thrust::lower_bound(
+            thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
+          return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
+        }));
 
     thrust::reduce_by_key(handle.get_thrust_policy(),
                           sort_key_first,
@@ -1020,23 +1023,24 @@ class Tests_SamplingPostProcessing
                  ? this_label_output_edgelist_srcs.begin()
                  : this_label_output_edgelist_dsts.begin()) +
                 old_size,
-              [offsets = raft::device_span<size_t const>(d_offsets.data(), d_offsets.size()),
-               nzd_vertices =
-                 renumbered_and_compressed_nzd_vertices
-                   ? thrust::make_optional<raft::device_span<vertex_t const>>(
-                       (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset,
-                       (offset_end_offset - offset_start_offset) - 1)
-                   : thrust::nullopt,
-               base_v] __device__(size_t i) {
-                auto idx = static_cast<size_t>(thrust::distance(
-                  offsets.begin() + 1,
-                  thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i)));
-                if (nzd_vertices) {
-                  return (*nzd_vertices)[idx];
-                } else {
-                  return base_v + static_cast<vertex_t>(idx);
-                }
-              });
+              cuda::proclaim_return_type<vertex_t>(
+                [offsets = raft::device_span<size_t const>(d_offsets.data(), d_offsets.size()),
+                 nzd_vertices =
+                   renumbered_and_compressed_nzd_vertices
+                     ? thrust::make_optional<raft::device_span<vertex_t const>>(
+                         (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset,
+                         (offset_end_offset - offset_start_offset) - 1)
+                     : thrust::nullopt,
+                 base_v] __device__(size_t i) {
+                  auto idx = static_cast<size_t>(thrust::distance(
+                    offsets.begin() + 1,
+                    thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i)));
+                  if (nzd_vertices) {
+                    return (*nzd_vertices)[idx];
+                  } else {
+                    return base_v + static_cast<vertex_t>(idx);
+                  }
+                }));
             thrust::copy(handle.get_thrust_policy(),
                          renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0],
                          renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(),
diff --git a/cpp/tests/structure/mg_select_random_vertices_test.cpp b/cpp/tests/structure/mg_select_random_vertices_test.cpp
index 79c50301922..8392a6831ca 100644
--- a/cpp/tests/structure/mg_select_random_vertices_test.cpp
+++ b/cpp/tests/structure/mg_select_random_vertices_test.cpp
@@ -79,6 +79,8 @@ class Tests_MGSelectRandomVertices
     //
 
     std::vector<bool> with_replacement_flags = {true, false};
+    std::vector<bool> sort_vertices_flags    = {true, false};
+
     {
       // Generate distributed vertex set to sample from
       std::srand((unsigned)std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -90,7 +92,7 @@ class Tests_MGSelectRandomVertices
       std::iota(
         h_given_set.begin(), h_given_set.end(), mg_graph_view.local_vertex_partition_range_first());
       std::shuffle(h_given_set.begin(), h_given_set.end(), std::mt19937{std::random_device{}()});
-      h_given_set.resize(std::rand() % mg_graph_view.local_vertex_partition_range_size() + 1);
+      h_given_set.resize(std::rand() % (mg_graph_view.local_vertex_partition_range_size() + 1));
 
       // Compute size of the distributed vertex set
       int num_of_elements_in_given_set = static_cast<int>(h_given_set.size());
@@ -105,82 +107,97 @@ class Tests_MGSelectRandomVertices
       size_t select_count =
         num_of_elements_in_given_set > select_random_vertices_usecase.select_count
           ? select_random_vertices_usecase.select_count
-          : std::rand() % num_of_elements_in_given_set + 1;
-
-      for (int idx = 0; idx < with_replacement_flags.size(); idx++) {
-        bool with_replacement = with_replacement_flags[idx];
-        auto d_sampled_vertices =
-          cugraph::select_random_vertices(*handle_,
-                                          mg_graph_view,
-                                          std::make_optional(raft::device_span<vertex_t const>{
-                                            d_given_set.data(), d_given_set.size()}),
-                                          rng_state,
-                                          select_count,
-                                          with_replacement,
-                                          true);
-
-        RAFT_CUDA_TRY(cudaDeviceSynchronize());
-
-        auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices);
-
-        if (select_random_vertices_usecase.check_correctness) {
-          if (!with_replacement) {
-            std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end());
-
-            auto nr_duplicates =
-              std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()),
-                            h_sampled_vertices.end());
-
-            ASSERT_EQ(nr_duplicates, 0);
+          : std::rand() % (num_of_elements_in_given_set + 1);
+
+      for (int i = 0; i < with_replacement_flags.size(); i++) {
+        for (int j = 0; j < sort_vertices_flags.size(); j++) {
+          bool with_replacement = with_replacement_flags[i];
+          bool sort_vertices    = sort_vertices_flags[j];
+
+          auto d_sampled_vertices =
+            cugraph::select_random_vertices(*handle_,
+                                            mg_graph_view,
+                                            std::make_optional(raft::device_span<vertex_t const>{
+                                              d_given_set.data(), d_given_set.size()}),
+                                            rng_state,
+                                            select_count,
+                                            with_replacement,
+                                            sort_vertices);
+
+          RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+          auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices);
+
+          if (select_random_vertices_usecase.check_correctness) {
+            // Check the ordering guarantee first; the duplicate check below sorts the samples.
+            if (sort_vertices) {
+              ASSERT_TRUE(std::is_sorted(h_sampled_vertices.begin(), h_sampled_vertices.end()));
+            }
+
+            if (!with_replacement) {
+              std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end());
+
+              auto nr_duplicates =
+                std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()),
+                              h_sampled_vertices.end());
+
+              ASSERT_EQ(nr_duplicates, 0);
+            }
+
+            std::sort(h_given_set.begin(), h_given_set.end());
+            std::for_each(
+              h_sampled_vertices.begin(), h_sampled_vertices.end(), [&h_given_set](vertex_t v) {
+                ASSERT_TRUE(std::binary_search(h_given_set.begin(), h_given_set.end(), v));
+              });
           }
-
-          std::sort(h_given_set.begin(), h_given_set.end());
-          std::for_each(
-            h_sampled_vertices.begin(), h_sampled_vertices.end(), [&h_given_set](vertex_t v) {
-              ASSERT_TRUE(std::binary_search(h_given_set.begin(), h_given_set.end(), v));
-            });
         }
       }
-    }
 
-    //
-    // Test sampling from [0, V)
-    //
-
-    for (int idx = 0; idx < with_replacement_flags.size(); idx++) {
-      bool with_replacement   = false;
-      auto d_sampled_vertices = cugraph::select_random_vertices(
-        *handle_,
-        mg_graph_view,
-        std::optional<raft::device_span<vertex_t const>>{std::nullopt},
-        rng_state,
-        select_random_vertices_usecase.select_count,
-        with_replacement,
-        true);
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-
-      auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices);
-
-      if (select_random_vertices_usecase.check_correctness) {
-        if (!with_replacement) {
-          std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end());
-
-          auto nr_duplicates =
-            std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()),
-                          h_sampled_vertices.end());
-
-          ASSERT_EQ(nr_duplicates, 0);
+      //
+      // Test sampling from [0, V)
+      //
+
+      for (int i = 0; i < with_replacement_flags.size(); i++) {
+        for (int j = 0; j < sort_vertices_flags.size(); j++) {
+          bool with_replacement = with_replacement_flags[i];
+          bool sort_vertices    = sort_vertices_flags[j];
+
+          auto d_sampled_vertices = cugraph::select_random_vertices(
+            *handle_,
+            mg_graph_view,
+            std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+            rng_state,
+            select_random_vertices_usecase.select_count,
+            with_replacement,
+            sort_vertices);
+
+          RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+          auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices);
+
+          if (select_random_vertices_usecase.check_correctness) {
+            // Check the ordering guarantee first; the duplicate check below sorts the samples.
+            if (sort_vertices) {
+              ASSERT_TRUE(std::is_sorted(h_sampled_vertices.begin(), h_sampled_vertices.end()));
+            }
+            if (!with_replacement) {
+              std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end());
+
+              auto nr_duplicates =
+                std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()),
+                              h_sampled_vertices.end());
+              ASSERT_EQ(nr_duplicates, 0);
+            }
+
+            auto vertex_first = mg_graph_view.local_vertex_partition_range_first();
+            auto vertex_last  = mg_graph_view.local_vertex_partition_range_last();
+            std::for_each(h_sampled_vertices.begin(),
+                          h_sampled_vertices.end(),
+                          [vertex_first, vertex_last](vertex_t v) {
+                            ASSERT_TRUE((v >= vertex_first) && (v < vertex_last));
+                          });
+          }
         }
-
-        auto vertex_first = mg_graph_view.local_vertex_partition_range_first();
-        auto vertex_last  = mg_graph_view.local_vertex_partition_range_last();
-
-        std::for_each(h_sampled_vertices.begin(),
-                      h_sampled_vertices.end(),
-                      [vertex_first, vertex_last](vertex_t v) {
-                        ASSERT_TRUE((v >= vertex_first) && (v < vertex_last));
-                      });
       }
     }
   }
diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp
index 16c9d3ed145..8cc87b26f1d 100644
--- a/cpp/tests/utilities/test_graphs.hpp
+++ b/cpp/tests/utilities/test_graphs.hpp
@@ -621,9 +621,25 @@ construct_graph(raft::handle_t const& handle,
 
   CUGRAPH_EXPECTS(d_src_v.size() <= static_cast<size_t>(std::numeric_limits<edge_t>::max()),
                   "Invalid template parameter: edge_t overflow.");
-  if (drop_self_loops) { remove_self_loops(handle, d_src_v, d_dst_v, d_weights_v); }
+  if (drop_self_loops) {
+    std::tie(d_src_v, d_dst_v, d_weights_v, std::ignore, std::ignore) =
+      cugraph::remove_self_loops<vertex_t, edge_t, weight_t, int32_t>(handle,
+                                                                      std::move(d_src_v),
+                                                                      std::move(d_dst_v),
+                                                                      std::move(d_weights_v),
+                                                                      std::nullopt,
+                                                                      std::nullopt);
+  }
 
-  if (drop_multi_edges) { sort_and_remove_multi_edges(handle, d_src_v, d_dst_v, d_weights_v); }
+  if (drop_multi_edges) {
+    std::tie(d_src_v, d_dst_v, d_weights_v, std::ignore, std::ignore) =
+      cugraph::remove_multi_edges<vertex_t, edge_t, weight_t, int32_t>(handle,
+                                                                       std::move(d_src_v),
+                                                                       std::move(d_dst_v),
+                                                                       std::move(d_weights_v),
+                                                                       std::nullopt,
+                                                                       std::nullopt);
+  }
 
   graph_t<vertex_t, edge_t, store_transposed, multi_gpu> graph(handle);
   std::optional<
diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu
index cb7e6f1bd66..2daf250b4a2 100644
--- a/cpp/tests/utilities/thrust_wrapper.cu
+++ b/cpp/tests/utilities/thrust_wrapper.cu
@@ -206,131 +206,5 @@ template void populate_vertex_ids(raft::handle_t const& handle,
                                   rmm::device_uvector<int64_t>& d_vertices_v,
                                   int64_t vertex_id_offset);
 
-template <typename vertex_t, typename weight_t>
-void remove_self_loops(raft::handle_t const& handle,
-                       rmm::device_uvector<vertex_t>& d_src_v /* [INOUT] */,
-                       rmm::device_uvector<vertex_t>& d_dst_v /* [INOUT] */,
-                       std::optional<rmm::device_uvector<weight_t>>& d_weight_v /* [INOUT] */)
-{
-  if (d_weight_v) {
-    auto edge_first = thrust::make_zip_iterator(
-      thrust::make_tuple(d_src_v.begin(), d_dst_v.begin(), (*d_weight_v).begin()));
-    d_src_v.resize(
-      thrust::distance(edge_first,
-                       thrust::remove_if(
-                         handle.get_thrust_policy(),
-                         edge_first,
-                         edge_first + d_src_v.size(),
-                         [] __device__(auto e) { return thrust::get<0>(e) == thrust::get<1>(e); })),
-      handle.get_stream());
-    d_dst_v.resize(d_src_v.size(), handle.get_stream());
-    (*d_weight_v).resize(d_src_v.size(), handle.get_stream());
-  } else {
-    auto edge_first =
-      thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin()));
-    d_src_v.resize(
-      thrust::distance(edge_first,
-                       thrust::remove_if(
-                         handle.get_thrust_policy(),
-                         edge_first,
-                         edge_first + d_src_v.size(),
-                         [] __device__(auto e) { return thrust::get<0>(e) == thrust::get<1>(e); })),
-      handle.get_stream());
-    d_dst_v.resize(d_src_v.size(), handle.get_stream());
-  }
-
-  d_src_v.shrink_to_fit(handle.get_stream());
-  d_dst_v.shrink_to_fit(handle.get_stream());
-  if (d_weight_v) { (*d_weight_v).shrink_to_fit(handle.get_stream()); }
-}
-
-template void remove_self_loops(
-  raft::handle_t const& handle,
-  rmm::device_uvector<int32_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<int32_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<float>>& d_weight_v /* [INOUT] */);
-
-template void remove_self_loops(
-  raft::handle_t const& handle,
-  rmm::device_uvector<int32_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<int32_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<double>>& d_weight_v /* [INOUT] */);
-
-template void remove_self_loops(
-  raft::handle_t const& handle,
-  rmm::device_uvector<int64_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<int64_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<float>>& d_weight_v /* [INOUT] */);
-
-template void remove_self_loops(
-  raft::handle_t const& handle,
-  rmm::device_uvector<int64_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<int64_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<double>>& d_weight_v /* [INOUT] */);
-
-template <typename vertex_t, typename weight_t>
-void sort_and_remove_multi_edges(
-  raft::handle_t const& handle,
-  rmm::device_uvector<vertex_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<vertex_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<weight_t>>& d_weight_v /* [INOUT] */)
-{
-  if (d_weight_v) {
-    auto edge_first = thrust::make_zip_iterator(
-      thrust::make_tuple(d_src_v.begin(), d_dst_v.begin(), (*d_weight_v).begin()));
-    thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size());
-    d_src_v.resize(
-      thrust::distance(edge_first,
-                       thrust::unique(handle.get_thrust_policy(),
-                                      edge_first,
-                                      edge_first + d_src_v.size(),
-                                      [] __device__(auto lhs, auto rhs) {
-                                        return (thrust::get<0>(lhs) == thrust::get<0>(rhs)) &&
-                                               (thrust::get<1>(lhs) == thrust::get<1>(rhs));
-                                      })),
-      handle.get_stream());
-    d_dst_v.resize(d_src_v.size(), handle.get_stream());
-    (*d_weight_v).resize(d_src_v.size(), handle.get_stream());
-  } else {
-    auto edge_first =
-      thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin()));
-    thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size());
-    d_src_v.resize(
-      thrust::distance(
-        edge_first,
-        thrust::unique(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size())),
-      handle.get_stream());
-    d_dst_v.resize(d_src_v.size(), handle.get_stream());
-  }
-
-  d_src_v.shrink_to_fit(handle.get_stream());
-  d_dst_v.shrink_to_fit(handle.get_stream());
-  if (d_weight_v) { (*d_weight_v).shrink_to_fit(handle.get_stream()); }
-}
-
-template void sort_and_remove_multi_edges(
-  raft::handle_t const& handle,
-  rmm::device_uvector<int32_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<int32_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<float>>& d_weight_v /* [INOUT] */);
-
-template void sort_and_remove_multi_edges(
-  raft::handle_t const& handle,
-  rmm::device_uvector<int32_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<int32_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<double>>& d_weight_v /* [INOUT] */);
-
-template void sort_and_remove_multi_edges(
-  raft::handle_t const& handle,
-  rmm::device_uvector<int64_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<int64_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<float>>& d_weight_v /* [INOUT] */);
-
-template void sort_and_remove_multi_edges(
-  raft::handle_t const& handle,
-  rmm::device_uvector<int64_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<int64_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<double>>& d_weight_v /* [INOUT] */);
-
 }  // namespace test
 }  // namespace cugraph
diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp
index eead4dc268f..fb82d781198 100644
--- a/cpp/tests/utilities/thrust_wrapper.hpp
+++ b/cpp/tests/utilities/thrust_wrapper.hpp
@@ -46,18 +46,5 @@ void populate_vertex_ids(raft::handle_t const& handle,
                          rmm::device_uvector<vertex_t>& d_vertices_v /* [INOUT] */,
                          vertex_t vertex_id_offset);
 
-template <typename vertex_t, typename weight_t>
-void remove_self_loops(raft::handle_t const& handle,
-                       rmm::device_uvector<vertex_t>& d_src_v /* [INOUT] */,
-                       rmm::device_uvector<vertex_t>& d_dst_v /* [INOUT] */,
-                       std::optional<rmm::device_uvector<weight_t>>& d_weight_v /* [INOUT] */);
-
-template <typename vertex_t, typename weight_t>
-void sort_and_remove_multi_edges(
-  raft::handle_t const& handle,
-  rmm::device_uvector<vertex_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<vertex_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<weight_t>>& d_weight_v /* [INOUT] */);
-
 }  // namespace test
 }  // namespace cugraph
diff --git a/dependencies.yaml b/dependencies.yaml
index 2c0918ad117..b5c1fb2fa2d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -297,10 +297,10 @@ dependencies:
           - cxx-compiler
           - gmock>=1.13.0
           - gtest>=1.13.0
-          - libcugraphops==23.12.*
-          - libraft-headers==23.12.*
-          - libraft==23.12.*
-          - librmm==23.12.*
+          - libcugraphops==24.2.*
+          - libraft-headers==24.2.*
+          - libraft==24.2.*
+          - librmm==24.2.*
           - openmpi # Required for building cpp-mgtests (multi-GPU tests)
     specific:
       - output_types: [conda]
@@ -377,16 +377,16 @@ dependencies:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - &dask rapids-dask-dependency==23.12.*
-          - &dask_cuda dask-cuda==23.12.*
+          - &dask rapids-dask-dependency==24.2.*
+          - &dask_cuda dask-cuda==24.2.*
           - &numba numba>=0.57
-          - &numpy numpy>=1.21 
-          - &ucx_py ucx-py==0.35.*
+          - &numpy numpy>=1.21
+          - &ucx_py ucx-py==0.36.*
       - output_types: conda
         packages:
           - aiohttp
           - fsspec>=0.6.0
-          - libcudf==23.12.*
+          - libcudf==24.2.*
           - requests
           - nccl>=2.9.9
           - ucx-proc=*=gpu
@@ -409,7 +409,7 @@ dependencies:
           - *numpy
       - output_types: [pyproject]
         packages:
-          - &cugraph cugraph==23.12.*
+          - &cugraph cugraph==24.2.*
   python_run_cugraph_pyg:
     common:
       - output_types: [conda, pyproject]
@@ -437,7 +437,7 @@ dependencies:
       - output_types: pyproject
         packages:
           - *cugraph
-          - cugraph-service-client==23.12.*
+          - cugraph-service-client==24.2.*
   test_cpp:
     common:
       - output_types: conda
@@ -472,7 +472,7 @@ dependencies:
           - scikit-learn>=0.23.1
       - output_types: [conda]
         packages:
-          - pylibwholegraph==23.12.*
+          - pylibwholegraph==24.2.*
   test_python_pylibcugraph:
     common:
       - output_types: [conda, pyproject]
@@ -489,7 +489,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - cugraph==23.12.*
+          - cugraph==24.2.*
           - pytorch>=2.0
           - pytorch-cuda==11.8
           - dgl>=1.1.0.cu*
@@ -497,7 +497,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - cugraph==23.12.*
+          - cugraph==24.2.*
           - pytorch>=2.0
           - pytorch-cuda==11.8
           - pyg>=2.4.0
@@ -506,22 +506,23 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &rmm_conda rmm==23.12.*
+          - &rmm_conda rmm==24.2.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix: {cuda: "12.2"}
             packages: &rmm_packages_pip_cu12
-              - rmm-cu12==23.12.*
+              - rmm-cu12==24.2.*
           - {matrix: {cuda: "12.1"}, packages: *rmm_packages_pip_cu12}
           - {matrix: {cuda: "12.0"}, packages: *rmm_packages_pip_cu12}
           - matrix: {cuda: "11.8"}
             packages: &rmm_packages_pip_cu11
-              - rmm-cu11==23.12.*
+              - rmm-cu11==24.2.*
           - {matrix: {cuda: "11.5"}, packages: *rmm_packages_pip_cu11}
           - {matrix: {cuda: "11.4"}, packages: *rmm_packages_pip_cu11}
           - {matrix: {cuda: "11.2"}, packages: *rmm_packages_pip_cu11}
@@ -531,22 +532,23 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &cudf_conda cudf==23.12.*
+          - &cudf_conda cudf==24.2.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix: {cuda: "12.2"}
             packages: &cudf_packages_pip_cu12
-              - cudf-cu12==23.12.*
+              - cudf-cu12==24.2.*
           - {matrix: {cuda: "12.1"}, packages: *cudf_packages_pip_cu12}
           - {matrix: {cuda: "12.0"}, packages: *cudf_packages_pip_cu12}
           - matrix: {cuda: "11.8"}
             packages: &cudf_packages_pip_cu11
-              - cudf-cu11==23.12.*
+              - cudf-cu11==24.2.*
           - {matrix: {cuda: "11.5"}, packages: *cudf_packages_pip_cu11}
           - {matrix: {cuda: "11.4"}, packages: *cudf_packages_pip_cu11}
           - {matrix: {cuda: "11.2"}, packages: *cudf_packages_pip_cu11}
@@ -556,22 +558,23 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &dask_cudf_conda dask-cudf==23.12.*
+          - &dask_cudf_conda dask-cudf==24.2.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix: {cuda: "12.2"}
             packages: &dask_cudf_packages_pip_cu12
-              - dask-cudf-cu12==23.12.*
+              - dask-cudf-cu12==24.2.*
           - {matrix: {cuda: "12.1"}, packages: *dask_cudf_packages_pip_cu12}
           - {matrix: {cuda: "12.0"}, packages: *dask_cudf_packages_pip_cu12}
           - matrix: {cuda: "11.8"}
             packages: &dask_cudf_packages_pip_cu11
-              - dask-cudf-cu11==23.12.*
+              - dask-cudf-cu11==24.2.*
           - {matrix: {cuda: "11.5"}, packages: *dask_cudf_packages_pip_cu11}
           - {matrix: {cuda: "11.4"}, packages: *dask_cudf_packages_pip_cu11}
           - {matrix: {cuda: "11.2"}, packages: *dask_cudf_packages_pip_cu11}
@@ -581,22 +584,23 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &pylibraft_conda pylibraft==23.12.*
+          - &pylibraft_conda pylibraft==24.2.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix: {cuda: "12.2"}
             packages: &pylibraft_packages_pip_cu12
-              - pylibraft-cu12==23.12.*
+              - pylibraft-cu12==24.2.*
           - {matrix: {cuda: "12.1"}, packages: *pylibraft_packages_pip_cu12}
           - {matrix: {cuda: "12.0"}, packages: *pylibraft_packages_pip_cu12}
           - matrix: {cuda: "11.8"}
             packages: &pylibraft_packages_pip_cu11
-              - pylibraft-cu11==23.12.*
+              - pylibraft-cu11==24.2.*
           - {matrix: {cuda: "11.5"}, packages: *pylibraft_packages_pip_cu11}
           - {matrix: {cuda: "11.4"}, packages: *pylibraft_packages_pip_cu11}
           - {matrix: {cuda: "11.2"}, packages: *pylibraft_packages_pip_cu11}
@@ -606,22 +610,23 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &raft_dask_conda raft-dask==23.12.*
+          - &raft_dask_conda raft-dask==24.2.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix: {cuda: "12.2"}
             packages: &raft_dask_packages_pip_cu12
-              - raft-dask-cu12==23.12.*
+              - raft-dask-cu12==24.2.*
           - {matrix: {cuda: "12.1"}, packages: *raft_dask_packages_pip_cu12}
           - {matrix: {cuda: "12.0"}, packages: *raft_dask_packages_pip_cu12}
           - matrix: {cuda: "11.8"}
             packages: &raft_dask_packages_pip_cu11
-              - raft-dask-cu11==23.12.*
+              - raft-dask-cu11==24.2.*
           - {matrix: {cuda: "11.5"}, packages: *raft_dask_packages_pip_cu11}
           - {matrix: {cuda: "11.4"}, packages: *raft_dask_packages_pip_cu11}
           - {matrix: {cuda: "11.2"}, packages: *raft_dask_packages_pip_cu11}
@@ -631,22 +636,23 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &pylibcugraph_conda pylibcugraph==23.12.*
+          - &pylibcugraph_conda pylibcugraph==24.2.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix: {cuda: "12.2"}
             packages: &pylibcugraph_packages_pip_cu12
-              - pylibcugraph-cu12==23.12.*
+              - pylibcugraph-cu12==24.2.*
           - {matrix: {cuda: "12.1"}, packages: *pylibcugraph_packages_pip_cu12}
           - {matrix: {cuda: "12.0"}, packages: *pylibcugraph_packages_pip_cu12}
           - matrix: {cuda: "11.8"}
             packages: &pylibcugraph_packages_pip_cu11
-              - pylibcugraph-cu11==23.12.*
+              - pylibcugraph-cu11==24.2.*
           - {matrix: {cuda: "11.5"}, packages: *pylibcugraph_packages_pip_cu11}
           - {matrix: {cuda: "11.4"}, packages: *pylibcugraph_packages_pip_cu11}
           - {matrix: {cuda: "11.2"}, packages: *pylibcugraph_packages_pip_cu11}
@@ -656,22 +662,23 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &pylibcugraphops_conda pylibcugraphops==23.12.*
+          - &pylibcugraphops_conda pylibcugraphops==24.2.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
           - matrix: {cuda: "12.2"}
             packages: &pylibcugraphops_packages_pip_cu12
-              - pylibcugraphops-cu12==23.12.*
+              - pylibcugraphops-cu12==24.2.*
           - {matrix: {cuda: "12.1"}, packages: *pylibcugraphops_packages_pip_cu12}
           - {matrix: {cuda: "12.0"}, packages: *pylibcugraphops_packages_pip_cu12}
           - matrix: {cuda: "11.8"}
             packages: &pylibcugraphops_packages_pip_cu11
-              - pylibcugraphops-cu11==23.12.*
+              - pylibcugraphops-cu11==24.2.*
           - {matrix: {cuda: "11.5"}, packages: *pylibcugraphops_packages_pip_cu11}
           - {matrix: {cuda: "11.4"}, packages: *pylibcugraphops_packages_pip_cu11}
           - {matrix: {cuda: "11.2"}, packages: *pylibcugraphops_packages_pip_cu11}
diff --git a/docs/cugraph/source/conf.py b/docs/cugraph/source/conf.py
index 3f7ef7deb03..cef06a584fc 100644
--- a/docs/cugraph/source/conf.py
+++ b/docs/cugraph/source/conf.py
@@ -77,9 +77,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '23.12'
+version = '24.02'
 # The full version, including alpha/beta/rc tags.
-release = '23.12.00'
+release = '24.02.00'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/cugraph/source/installation/source_build.md b/docs/cugraph/source/installation/source_build.md
index f5ee0741da6..1a129d45295 100644
--- a/docs/cugraph/source/installation/source_build.md
+++ b/docs/cugraph/source/installation/source_build.md
@@ -1,53 +1,46 @@
 # Building from Source
 
-The following instructions are for users wishing to build cuGraph from source code.  These instructions are tested on supported distributions of Linux, CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html) for list of supported environments.  Other operating systems _might be_ compatible, but are not currently tested.
-
-The cuGraph package include both a C/C++ CUDA portion and a python portion.  Both libraries need to be installed in order for cuGraph to operate correctly.
+These instructions are tested on supported versions/distributions of Linux,
+CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html)
+for the list of supported environments.  Other environments _might be_
+compatible, but are not currently tested.
 
 ## Prerequisites
 
-__Compiler:__
+__Compilers:__
 * `gcc`           version 9.3+
-* `nvcc`          version 11.0+
-* `cmake`         version 3.20.1+
+* `nvcc`          version 11.5+
 
 __CUDA:__
-* CUDA 11.0+
+* CUDA 11.2+
 * NVIDIA driver 450.80.02+
 * Pascal architecture or better
 
-You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).
-
-__Packages:__
-* `cmake`         version 3.20.1+
-* `libcugraphops` (version matching source branch version, eg. `23.10`)
-
-You can obtain `libcugraphops` using `conda`/`mamba` from the `nvidia` channel, or using `pip` with the `--extra-index-url=https://pypi.nvidia.com` option.  See the [RAPIDS docs](https://docs.rapids.ai/install#environment) for more details.
-
-## Building cuGraph
-To install cuGraph from source, ensure the dependencies are met.
+Further details and download links for these prerequisites are available on the
+[RAPIDS System Requirements page](https://docs.rapids.ai/install#system-req).
 
+## Setting up the development environment
 
-### Clone Repo and Configure Conda Environment
-__GIT clone a version of the repository__
-
-  ```bash
-  # Set the localtion to cuGraph in an environment variable CUGRAPH_HOME
-  export CUGRAPH_HOME=$(pwd)/cugraph
-
-  # Download the cuGraph repo - if you have a folked version, use that path here instead
-  git clone https://github.com/rapidsai/cugraph.git $CUGRAPH_HOME
+### Clone the repository
+```bash
+CUGRAPH_HOME=$(pwd)/cugraph
+git clone https://github.com/rapidsai/cugraph.git $CUGRAPH_HOME
+cd $CUGRAPH_HOME
+```
 
-  cd $CUGRAPH_HOME
-  ```
+### Create the conda environment
 
-__Create the conda development environment__
+Using conda is the easiest way to install both the build and runtime
+dependencies for cugraph. While it is possible to build and run cugraph without
+conda, the required packages occasionally change, making it difficult to
+document here. The best way to see the current dependencies needed for a build
+and run environment is to examine the list of packages in the [conda
+environment YAML
+files](https://github.com/rapidsai/cugraph/blob/main/conda/environments).
 
 ```bash
-# create the conda environment (assuming in base `cugraph` directory)
-
 # for CUDA 11.x
-conda env create --name cugraph_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
+conda env create --name cugraph_dev --file $CUGRAPH_HOME/conda/environments/all_cuda-118_arch-x86_64.yaml
 
 # activate the environment
 conda activate cugraph_dev
@@ -56,101 +49,53 @@ conda activate cugraph_dev
 conda deactivate
 ```
 
-  - The environment can be updated as development includes/changes the dependencies. To do so, run:
-
+The environment can be updated as cugraph adds/removes/updates its dependencies. To do so, run:
 
 ```bash
-
-# Where XXX is the CUDA 11 version
-conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.XXX.yml
-
+# for CUDA 11.x
+conda env update --name cugraph_dev --file $CUGRAPH_HOME/conda/environments/all_cuda-118_arch-x86_64.yaml
 conda activate cugraph_dev
 ```
 
+### Build and Install
 
-### Build and Install Using the `build.sh` Script
-Using the `build.sh` script make compiling and installing cuGraph a breeze.  To build and install, simply do:
+#### Build and install using `build.sh`
+Using the `build.sh` script, located in the `$CUGRAPH_HOME` directory, is the
+recommended way to build and install the cugraph libraries. By default,
+`build.sh` will build and install a predefined set of targets
+(packages/libraries), but can also accept a list of targets to build.
 
-```bash
-$ cd $CUGRAPH_HOME
-$ ./build.sh clean
-$ ./build.sh libcugraph
-$ ./build.sh cugraph
-```
+For example, to build only the cugraph C++ library (`libcugraph`) and the Python
+libraries (`pylibcugraph` and `cugraph`) without building the C++ test binaries,
+run this command:
 
-There are several other options available on the build script for advanced users.
-`build.sh` options:
 ```bash
-build.sh [<target> ...] [<flag> ...]
- where <target> is:
-   clean                      - remove all existing build artifacts and configuration (start over)
-   uninstall                  - uninstall libcugraph and cugraph from a prior build/install (see also -n)
-   libcugraph                 - build libcugraph.so and SG test binaries
-   libcugraph_etl             - build libcugraph_etl.so and SG test binaries
-   pylibcugraph               - build the pylibcugraph Python package
-   cugraph                    - build the cugraph Python package
-   nx-cugraph                 - build the nx-cugraph Python package
-   cugraph-service            - build the cugraph-service_client and cugraph-service_server Python package
-   cpp-mgtests                - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency.
-   cugraph-dgl                - build the cugraph-dgl extensions for DGL
-   cugraph-pyg                - build the cugraph-dgl extensions for PyG
-   docs                       - build the docs
- and <flag> is:
-   -v                         - verbose build mode
-   -g                         - build for debug
-   -n                         - do not install after a successful build
-   --pydevelop                - use setup.py develop instead of install
-   --allgpuarch               - build for all supported GPU architectures
-   --skip_cpp_tests           - do not build the SG test binaries as part of the libcugraph and libcugraph_etl targets
-   --without_cugraphops       - do not build algos that require cugraph-ops
-   --cmake_default_generator  - use the default cmake generator instead of ninja
-   --clean                    - clean an individual target (note: to do a complete rebuild, use the clean target described above)
-   -h                         - print this text
-
- default action (no args) is to build and install 'libcugraph' then 'libcugraph_etl' then 'pylibcugraph' then 'cugraph' then 'cugraph-service' targets
-
-examples:
-$ ./build.sh clean                        # remove prior build artifacts (start over)
-$ ./build.sh libcugraph -v                # compile and install libcugraph with verbose output
-$ ./build.sh libcugraph -g                # compile and install libcugraph for debug
-$ ./build.sh libcugraph -n                # compile libcugraph but do not install
-
-# make parallelism options can also be defined: Example build jobs using 4 threads (make -j4)
-$ PARALLEL_LEVEL=4 ./build.sh libcugraph
-
-Note that the libraries will be installed to the location set in `$PREFIX` if set (i.e. `export PREFIX=/install/path`), otherwise to `$CONDA_PREFIX`.
+$ cd $CUGRAPH_HOME
+$ ./build.sh libcugraph pylibcugraph cugraph --skip_cpp_tests
 ```
 
+There are several other options available on the build script for advanced
+users. Refer to the output of `./build.sh --help` for details.
 
-## Building each section independently
-#### Build and Install the C++/CUDA `libcugraph` Library
-CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`.
-
-This project uses cmake for building the C/C++ library. To configure cmake, run:
-
-  ```bash
-  # Set the localtion to cuGraph in an environment variable CUGRAPH_HOME
-  export CUGRAPH_HOME=$(pwd)/cugraph
-
-  cd $CUGRAPH_HOME
-  cd cpp                                        # enter cpp directory
-  mkdir build                                   # create build directory
-  cd build                                      # enter the build directory
-  cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
-
-  # now build the code
-  make -j                                       # "-j" starts multiple threads
-  make install                                  # install the libraries
-  ```
-The default installation locations are `$CMAKE_INSTALL_PREFIX/lib` and `$CMAKE_INSTALL_PREFIX/include/cugraph` respectively.
+Note that libraries will be installed to the location set in `$PREFIX` if set
+(e.g. `export PREFIX=/install/path`), otherwise to `$CONDA_PREFIX`.
 
 #### Updating the RAFT branch
 
-`libcugraph` uses the [RAFT](https://github.com/rapidsai/raft) library and there are times when it might be desirable to build against a different RAFT branch, such as when working on new features that might span both RAFT and cuGraph.
+`libcugraph` uses the [RAFT](https://github.com/rapidsai/raft) library and
+there are times when it might be desirable to build against a different RAFT
+branch, such as when working on new features that might span both RAFT and
+cuGraph.
 
-For local development, the `CPM_raft_SOURCE=<path/to/raft/source>` option can be passed to the `cmake` command to enable `libcugraph` to use the local RAFT branch.
+For local development, the `CPM_raft_SOURCE=<path/to/raft/source>` option can
+be passed to the `cmake` command to enable `libcugraph` to use the local RAFT
+branch. The `build.sh` script calls `cmake` to build the C/C++ targets, but
+developers can call `cmake` directly in order to pass it options like those
+described here. Refer to the `build.sh` script to see how to call `cmake` and
+other commands directly.
 
-To have CI test a `cugraph` pull request against a different RAFT branch, modify the bottom of the `cpp/cmake/thirdparty/get_raft.cmake` file as follows:
+To have CI test a `cugraph` pull request against a different RAFT branch,
+modify the bottom of the `cpp/cmake/thirdparty/get_raft.cmake` file as follows:
 
 ```cmake
 # Change pinned tag and fork here to test a commit in CI
@@ -167,24 +112,10 @@ find_and_configure_raft(VERSION    ${CUGRAPH_MIN_VERSION_raft}
                         )
 ```
 
-When the above change is pushed to a pull request, the continuous integration servers will use the specified RAFT branch to run the cuGraph tests. After the changes in the RAFT branch are merged to the release branch, remember to revert the `get_raft.cmake` file back to the original cuGraph branch.
-
-### Building and installing the Python package
-
-2) Install the Python packages to your Python path:
-
-```bash
-cd $CUGRAPH_HOME
-cd python
-cd pylibcugraph
-python setup.py build_ext --inplace
-python setup.py install    # install pylibcugraph
-cd ../cugraph
-python setup.py build_ext --inplace
-python setup.py install    # install cugraph python bindings
-
-```
-
+When the above change is pushed to a pull request, the continuous integration
+servers will use the specified RAFT branch to run the cuGraph tests. After the
+changes in the RAFT branch are merged to the release branch, remember to revert
+the `get_raft.cmake` file back to the original cuGraph branch.
 
 
 ## Run tests
@@ -240,7 +171,10 @@ Note: This conda installation only applies to Linux and Python versions 3.8/3.10
 
 ### (OPTIONAL) Set environment variable on activation
 
-It is possible to configure the conda environment to set environmental variables on activation. Providing instructions to set PATH to include the CUDA toolkit bin directory and LD_LIBRARY_PATH to include the CUDA lib64 directory will be helpful.
+It is possible to configure the conda environment to set environment variables
+on activation. For example, it is helpful to add the CUDA toolkit bin
+directory to `PATH` and the CUDA lib64 directory to
+`LD_LIBRARY_PATH`.
 
 ```bash
 cd  ~/anaconda3/envs/cugraph_dev
@@ -271,7 +205,8 @@ unset LD_LIBRARY_PATH
 
 ## Creating documentation
 
-Python API documentation can be generated from _./docs/cugraph directory_. Or through using "./build.sh docs"
+Python API documentation can be generated from the `./docs/cugraph` directory,
+or by running `./build.sh docs`.
 
 ## Attribution
 Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index 2c1dd855cb5..1f099e8f85f 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake)
-  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake
+  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake
   )
 endif()
diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
index b73ccb0cf9a..c6df338ab9a 100644
--- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
+++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
@@ -10,11 +10,11 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
-- cugraph==23.12.*
+- cugraph==24.2.*
 - dgl>=1.1.0.cu*
 - pandas
 - pre-commit
-- pylibcugraphops==23.12.*
+- pylibcugraphops==24.2.*
 - pytest
 - pytest-benchmark
 - pytest-cov
diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml
index eff7a20f0aa..62fa8ab6368 100644
--- a/python/cugraph-dgl/pyproject.toml
+++ b/python/cugraph-dgl/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cugraph==23.12.*",
+    "cugraph==24.2.*",
     "numba>=0.57",
     "numpy>=1.21",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
index 71d1c7e389c..0c63dc9d80e 100644
--- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
+++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
@@ -10,11 +10,11 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
-- cugraph==23.12.*
+- cugraph==24.2.*
 - pandas
 - pre-commit
 - pyg>=2.4.0
-- pylibcugraphops==23.12.*
+- pylibcugraphops==24.2.*
 - pytest
 - pytest-benchmark
 - pytest-cov
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py
index 9c9dcdb43bb..bef3a023b93 100644
--- a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py
@@ -13,6 +13,7 @@
 
 from .gat_conv import GATConv
 from .gatv2_conv import GATv2Conv
+from .hetero_gat_conv import HeteroGATConv
 from .rgcn_conv import RGCNConv
 from .sage_conv import SAGEConv
 from .transformer_conv import TransformerConv
@@ -20,6 +21,7 @@
 __all__ = [
     "GATConv",
     "GATv2Conv",
+    "HeteroGATConv",
     "RGCNConv",
     "SAGEConv",
     "TransformerConv",
diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py
new file mode 100644
index 00000000000..3b717552a96
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+from collections import defaultdict
+
+from cugraph.utilities.utils import import_optional
+from pylibcugraphops.pytorch.operators import mha_gat_n2n
+
+from .base import BaseConv
+
+torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
+
+
+class HeteroGATConv(BaseConv):
+    r"""The graph attentional operator on heterogeneous graphs, where a separate
+    `GATConv` is applied on the homogeneous graph for each edge type. Compared
+    with directly wrapping `GATConv`s with `HeteroConv`, `HeteroGATConv` fuses
+    all the linear transformations associated with each node type together into
+    1 GEMM call, to improve the performance on GPUs.
+
+    Parameters
+    ----------
+    in_channels : int or Dict[str, int]
+        Size of each input sample of every node type.
+
+    out_channels : int
+        Size of each output sample.
+
+    node_types : List[str]
+        List of node types.
+
+    edge_types : List[Tuple[str, str, str]]
+        List of edge types.
+
+    heads : int, optional (default=1)
+        Number of multi-head-attentions.
+
+    concat : bool, optional (default=True)
+        If set to :obj:`False`, the multi-head attentions are averaged instead
+        of concatenated.
+
+    negative_slope : float, optional (default=0.2)
+        LeakyReLU angle of the negative slope.
+
+    bias : bool, optional (default=True)
+        If set to :obj:`False`, the layer will not learn an additive bias.
+
+    aggr : str, optional (default="sum")
+        The aggregation scheme to use for grouping node embeddings generated by
+        different relations. Choose from "sum", "mean", "min", "max".
+    """
+
+    def __init__(
+        self,
+        in_channels: Union[int, dict[str, int]],
+        out_channels: int,
+        node_types: list[str],
+        edge_types: list[tuple[str, str, str]],
+        heads: int = 1,
+        concat: bool = True,
+        negative_slope: float = 0.2,
+        bias: bool = True,
+        aggr: str = "sum",
+    ):
+        # Compare only (major, minor): the third version field may carry a
+        # non-numeric suffix (e.g. "0rc1"), which int() would reject.
+        pyg_version = tuple(map(int, torch_geometric.__version__.split(".")[:2]))
+        if pyg_version < (2, 4):
+            raise RuntimeError(f"{self.__class__.__name__} requires pyg >= 2.4.0.")
+
+        super().__init__()
+
+        if isinstance(in_channels, int):
+            in_channels = dict.fromkeys(node_types, in_channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.node_types = node_types
+        self.edge_types = edge_types
+        self.num_heads = heads
+        self.concat_heads = concat
+
+        self.negative_slope = negative_slope
+        self.aggr = aggr
+
+        # Maps node type -> (relations using it as source, relations using it
+        # as destination); same-type relations are only listed as source.
+        self.relations_per_ntype = defaultdict(lambda: ([], []))
+
+        lin_weights = dict.fromkeys(self.node_types)
+        attn_weights = dict.fromkeys(self.edge_types)
+        biases = dict.fromkeys(self.edge_types)
+
+        ParameterDict = torch_geometric.nn.parameter_dict.ParameterDict
+
+        for edge_type in self.edge_types:
+            src_type, _, dst_type = edge_type
+            self.relations_per_ntype[src_type][0].append(edge_type)
+            if src_type != dst_type:
+                self.relations_per_ntype[dst_type][1].append(edge_type)
+
+            attn_weights[edge_type] = torch.empty(
+                2 * self.num_heads * self.out_channels
+            )
+
+            if bias and concat:
+                biases[edge_type] = torch.empty(self.num_heads * out_channels)
+            elif bias:
+                biases[edge_type] = torch.empty(out_channels)
+            else:
+                biases[edge_type] = None
+
+        # One fused weight per node type, with a block of rows per relation.
+        for ntype in self.node_types:
+            n_src_rel = len(self.relations_per_ntype[ntype][0])
+            n_dst_rel = len(self.relations_per_ntype[ntype][1])
+            n_rel = n_src_rel + n_dst_rel
+
+            lin_weights[ntype] = torch.empty(
+                (n_rel * self.num_heads * self.out_channels, self.in_channels[ntype])
+            )
+
+        self.lin_weights = ParameterDict(lin_weights)
+        self.attn_weights = ParameterDict(attn_weights)
+
+        if bias:
+            self.bias = ParameterDict(biases)
+        else:
+            self.register_parameter("bias", None)
+
+        self.reset_parameters()
+
+    def split_tensors(
+        self, x_fused_dict: dict[str, torch.Tensor], dim: int
+    ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
+        """Split fused tensors into chunks based on edge types.
+
+        Parameters
+        ----------
+        x_fused_dict : dict[str, torch.Tensor]
+            A dictionary holding the node features of each node type. The key is
+            the node type; the value is a fused tensor that accounts for all
+            relations for that node type.
+
+        dim : int
+            Dimension along which to split the fused tensor.
+
+        Returns
+        -------
+        x_src_dict : dict[str, torch.Tensor]
+            A dictionary holding the source node features of each relation graph.
+
+        x_dst_dict : dict[str, torch.Tensor]
+            A dictionary holding the destination node features of each relation graph.
+        """
+        x_src_dict = dict.fromkeys(self.edge_types)
+        x_dst_dict = dict.fromkeys(self.edge_types)
+
+        for ntype, t in x_fused_dict.items():
+            n_src_rel = len(self.relations_per_ntype[ntype][0])
+            n_dst_rel = len(self.relations_per_ntype[ntype][1])
+            n_rel = n_src_rel + n_dst_rel
+            if n_rel == 0:
+                # Node type takes part in no relation: nothing to split, and
+                # torch.chunk() rejects chunks=0.
+                continue
+            t_list = torch.chunk(t, chunks=n_rel, dim=dim)
+
+            for i, src_rel in enumerate(self.relations_per_ntype[ntype][0]):
+                x_src_dict[src_rel] = t_list[i]
+
+            for i, dst_rel in enumerate(self.relations_per_ntype[ntype][1]):
+                x_dst_dict[dst_rel] = t_list[i + n_src_rel]
+
+        return x_src_dict, x_dst_dict
+
+    def reset_parameters(self, seed: Optional[int] = None):
+        """Re-initialize all learnable parameters in place.
+
+        Parameters
+        ----------
+        seed : int, optional (default=None)
+            If given, passed to ``torch.manual_seed`` before initialization.
+        """
+        if seed is not None:
+            torch.manual_seed(seed)
+
+        w_src, w_dst = self.split_tensors(self.lin_weights, dim=0)
+
+        for edge_type in self.edge_types:
+            src_type, _, dst_type = edge_type
+
+            # lin_src
+            torch_geometric.nn.inits.glorot(w_src[edge_type])
+
+            # lin_dst
+            if src_type != dst_type:
+                torch_geometric.nn.inits.glorot(w_dst[edge_type])
+
+            # attn_weights
+            torch_geometric.nn.inits.glorot(
+                self.attn_weights[edge_type].view(-1, self.num_heads, self.out_channels)
+            )
+
+            # bias
+            if self.bias is not None:
+                torch_geometric.nn.inits.zeros(self.bias[edge_type])
+
+    def forward(
+        self,
+        x_dict: dict[str, torch.Tensor],
+        edge_index_dict: dict[tuple[str, str, str], torch.Tensor],
+    ) -> dict[str, torch.Tensor]:
+        """Run one attention layer per edge type and aggregate per node type.
+
+        Parameters
+        ----------
+        x_dict : dict[str, torch.Tensor]
+            Input node features, keyed by node type.
+
+        edge_index_dict : dict[tuple[str, str, str], torch.Tensor]
+            Edge indices of each relation graph, keyed by edge type.
+
+        Returns
+        -------
+        dict[str, torch.Tensor]
+            Output features for every node type that is the destination of at
+            least one edge type in ``edge_index_dict``.
+        """
+        feat_dict = dict.fromkeys(x_dict.keys())
+
+        for ntype, x in x_dict.items():
+            feat_dict[ntype] = x @ self.lin_weights[ntype].T
+
+        x_src_dict, x_dst_dict = self.split_tensors(feat_dict, dim=1)
+
+        out_dict = defaultdict(list)
+
+        for edge_type, edge_index in edge_index_dict.items():
+            src_type, _, dst_type = edge_type
+
+            csc = BaseConv.to_csc(
+                edge_index, (x_dict[src_type].size(0), x_dict[dst_type].size(0))
+            )
+
+            bipartite = src_type != dst_type
+            graph = self.get_cugraph(csc, bipartite=bipartite)
+            # Bipartite relations need separate source/destination features.
+            if bipartite:
+                feat = (x_src_dict[edge_type], x_dst_dict[edge_type])
+            else:
+                feat = x_src_dict[edge_type]
+
+            out = mha_gat_n2n(
+                feat,
+                self.attn_weights[edge_type],
+                graph,
+                num_heads=self.num_heads,
+                activation="LeakyReLU",
+                negative_slope=self.negative_slope,
+                concat_heads=self.concat_heads,
+            )
+
+            if self.bias is not None:
+                out = out + self.bias[edge_type]
+
+            out_dict[dst_type].append(out)
+
+        # Return a plain dict so that unknown node types raise KeyError
+        # instead of silently yielding the defaultdict's empty list.
+        return {
+            key: torch_geometric.nn.conv.hetero_conv.group(value, self.aggr)
+            for key, value in out_dict.items()
+        }
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
index 1512901822a..30994289f9c 100644
--- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
+++ b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
@@ -284,3 +284,32 @@ def basic_pyg_graph_2():
     )
     size = (10, 10)
     return edge_index, size
+
+
+@pytest.fixture
+def sample_pyg_hetero_data():
+    torch.manual_seed(12345)
+    raw_data_dict = {
+        "v0": torch.randn(6, 3),
+        "v1": torch.randn(7, 2),
+        "v2": torch.randn(5, 4),
+        ("v2", "e0", "v1"): torch.tensor([[0, 2, 2, 4, 4], [4, 3, 6, 0, 1]]),
+        ("v1", "e1", "v1"): torch.tensor(
+            [[0, 2, 2, 2, 3, 5, 5], [4, 0, 4, 5, 3, 0, 1]]
+        ),
+        ("v0", "e2", "v0"): torch.tensor([[0, 2, 2, 3, 5, 5], [1, 1, 5, 1, 1, 2]]),
+        ("v1", "e3", "v2"): torch.tensor(
+            [[0, 1, 1, 2, 4, 5, 6], [1, 2, 3, 1, 2, 2, 2]]
+        ),
+        ("v0", "e4", "v2"): torch.tensor([[1, 1, 3, 3, 4, 4], [1, 4, 1, 4, 0, 3]]),
+    }
+
+    # create a nested dictionary to facilitate PyG's HeteroData construction
+    hetero_data_dict = {}
+    for key, value in raw_data_dict.items():
+        if isinstance(key, tuple):
+            hetero_data_dict[key] = {"edge_index": value}
+        else:
+            hetero_data_dict[key] = {"x": value}
+
+    return hetero_data_dict
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
new file mode 100644
index 00000000000..1c841a17df7
--- /dev/null
+++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from cugraph_pyg.nn import HeteroGATConv as CuGraphHeteroGATConv
+from cugraph.utilities.utils import import_optional, MissingModule
+
+torch = import_optional("torch")
+torch_geometric = import_optional("torch_geometric")
+
+ATOL = 1e-6
+
+
+@pytest.mark.cugraph_ops
+@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available")
+@pytest.mark.skipif(
+    isinstance(torch_geometric, MissingModule), reason="torch_geometric not available"
+)
+@pytest.mark.parametrize("heads", [1, 3, 10])
+@pytest.mark.parametrize("aggr", ["sum", "mean"])
+def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads):
+    major, minor, patch = torch_geometric.__version__.split(".")[:3]
+    pyg_version = tuple(map(int, [major, minor, patch]))
+    if pyg_version < (2, 4, 0):
+        pytest.skip("Skipping HeteroGATConv test")
+
+    from torch_geometric.data import HeteroData
+    from torch_geometric.nn import HeteroConv, GATConv
+
+    device = torch.device("cuda:0")
+    data = HeteroData(sample_pyg_hetero_data).to(device)
+
+    in_channels_dict = {k: v.size(1) for k, v in data.x_dict.items()}
+    out_channels = 2
+
+    convs_dict = {}
+    kwargs1 = dict(heads=heads, add_self_loops=False, bias=False)
+    for edge_type in data.edge_types:
+        src_t, _, dst_t = edge_type
+        in_channels_src, in_channels_dst = data.x_dict[src_t].size(-1), data.x_dict[
+            dst_t
+        ].size(-1)
+        if src_t == dst_t:
+            convs_dict[edge_type] = GATConv(in_channels_src, out_channels, **kwargs1)
+        else:
+            convs_dict[edge_type] = GATConv(
+                (in_channels_src, in_channels_dst), out_channels, **kwargs1
+            )
+
+    conv1 = HeteroConv(convs_dict, aggr=aggr).to(device)
+    kwargs2 = dict(
+        heads=heads,
+        aggr=aggr,
+        node_types=data.node_types,
+        edge_types=data.edge_types,
+        bias=False,
+    )
+    conv2 = CuGraphHeteroGATConv(in_channels_dict, out_channels, **kwargs2).to(device)
+
+    # copy over linear and attention weights
+    w_src, w_dst = conv2.split_tensors(conv2.lin_weights, dim=0)
+    with torch.no_grad():
+        for edge_type in conv2.edge_types:
+            src_t, _, dst_t = edge_type
+            w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :]
+            if w_dst[edge_type] is not None:
+                w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :]
+
+            conv2.attn_weights[edge_type][: heads * out_channels] = conv1.convs[
+                edge_type
+            ].att_src.data.flatten()
+            conv2.attn_weights[edge_type][heads * out_channels :] = conv1.convs[
+                edge_type
+            ].att_dst.data.flatten()
+
+    out1 = conv1(data.x_dict, data.edge_index_dict)
+    out2 = conv2(data.x_dict, data.edge_index_dict)
+
+    for node_type in data.node_types:
+        assert torch.allclose(out1[node_type], out2[node_type], atol=ATOL)
+
+    loss1 = 0
+    loss2 = 0
+    for node_type in data.node_types:
+        loss1 += out1[node_type].mean()
+        loss2 += out2[node_type].mean()
+
+    loss1.backward()
+    loss2.backward()
+
+    # check gradient w.r.t attention weights
+    out_dim = heads * out_channels
+    for edge_type in conv2.edge_types:
+        assert torch.allclose(
+            conv1.convs[edge_type].att_src.grad.flatten(),
+            conv2.attn_weights[edge_type].grad[:out_dim],
+            atol=ATOL,
+        )
+        assert torch.allclose(
+            conv1.convs[edge_type].att_dst.grad.flatten(),
+            conv2.attn_weights[edge_type].grad[out_dim:],
+            atol=ATOL,
+        )
+
+    # check gradient w.r.t linear weights
+    grad_lin_weights_ref = dict.fromkeys(out1.keys())
+    for node_t, (rels_as_src, rels_as_dst) in conv2.relations_per_ntype.items():
+        grad_list = []
+        for rel_t in rels_as_src:
+            grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone())
+        for rel_t in rels_as_dst:
+            grad_list.append(conv1.convs[rel_t].lin_dst.weight.grad.clone())
+        assert len(grad_list) > 0
+        grad_lin_weights_ref[node_t] = torch.vstack(grad_list)
+
+    for node_type in conv2.lin_weights:
+        assert torch.allclose(
+            grad_lin_weights_ref[node_type],
+            conv2.lin_weights[node_type].grad,
+            atol=ATOL,
+        )
diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml
index 95b1fa27402..b0671644982 100644
--- a/python/cugraph-pyg/pyproject.toml
+++ b/python/cugraph-pyg/pyproject.toml
@@ -26,7 +26,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
 ]
 dependencies = [
-    "cugraph==23.12.*",
+    "cugraph==24.2.*",
     "numba>=0.57",
     "numpy>=1.21",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml
index d68f8055ded..4c83ad3905b 100644
--- a/python/cugraph-service/server/pyproject.toml
+++ b/python/cugraph-service/server/pyproject.toml
@@ -19,18 +19,18 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==23.12.*",
-    "cugraph-service-client==23.12.*",
-    "cugraph==23.12.*",
+    "cudf==24.2.*",
+    "cugraph-service-client==24.2.*",
+    "cugraph==24.2.*",
     "cupy-cuda11x>=12.0.0",
-    "dask-cuda==23.12.*",
-    "dask-cudf==23.12.*",
+    "dask-cuda==24.2.*",
+    "dask-cudf==24.2.*",
     "numba>=0.57",
     "numpy>=1.21",
-    "rapids-dask-dependency==23.12.*",
-    "rmm==23.12.*",
+    "rapids-dask-dependency==24.2.*",
+    "rmm==24.2.*",
     "thriftpy2",
-    "ucx-py==0.35.*",
+    "ucx-py==0.36.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt
index 8693c0e9e1f..99936b23a8c 100644
--- a/python/cugraph/CMakeLists.txt
+++ b/python/cugraph/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 
-set(cugraph_version 23.12.00)
+set(cugraph_version 24.02.00)
 
 include(../../fetch_rapids.cmake)
 
@@ -38,7 +38,6 @@ project(
 option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files"
        OFF
 )
-option(CUGRAPH_BUILD_WHEELS "Whether this build is generating a Python wheel." OFF)
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
 
 if(NOT USE_CUGRAPH_OPS)
@@ -59,22 +58,14 @@ if(NOT cugraph_FOUND)
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
-
-  set(_exclude_from_all "")
-  if(CUGRAPH_BUILD_WHEELS)
-    # Statically link dependencies if building wheels
-    set(CUDA_STATIC_RUNTIME ON)
-    set(USE_RAFT_STATIC ON)
-    set(CUGRAPH_COMPILE_RAFT_LIB ON)
-    set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
-    set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
-    set(ALLOW_CLONE_CUGRAPH_OPS ON)
-
-    # Don't install the cuML C++ targets into wheels
-    set(_exclude_from_all EXCLUDE_FROM_ALL)
-  endif()
-
-  add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all})
+  set(CUDA_STATIC_RUNTIME ON)
+  set(USE_RAFT_STATIC ON)
+  set(CUGRAPH_COMPILE_RAFT_LIB ON)
+  set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
+  set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
+  set(ALLOW_CLONE_CUGRAPH_OPS ON)
+
+  add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
   set(cython_lib_dir cugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})
diff --git a/python/cugraph/cugraph/dask/common/mg_utils.py b/python/cugraph/cugraph/dask/common/mg_utils.py
index 6acda48c9da..b04f293dc0e 100644
--- a/python/cugraph/cugraph/dask/common/mg_utils.py
+++ b/python/cugraph/cugraph/dask/common/mg_utils.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 
 import os
-
+import gc
 import numba.cuda
 
 
@@ -68,3 +68,8 @@ def get_visible_devices():
     else:
         visible_devices = _visible_devices.strip().split(",")
     return visible_devices
+
+
+def run_gc_on_dask_cluster(client):
+    gc.collect()
+    client.run(gc.collect)
diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
index f666900b226..319435575cc 100644
--- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
+++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
@@ -37,8 +37,8 @@
 from cugraph.dask.common.part_utils import (
     get_persisted_df_worker_map,
     persist_dask_df_equal_parts_per_worker,
-    _chunk_lst,
 )
+from cugraph.dask.common.mg_utils import run_gc_on_dask_cluster
 from cugraph.dask import get_n_workers
 import cugraph.dask.comms.comms as Comms
 
@@ -171,7 +171,6 @@ def __from_edgelist(
         store_transposed=False,
         legacy_renum_only=False,
     ):
-
         if not isinstance(input_ddf, dask_cudf.DataFrame):
             raise TypeError("input should be a dask_cudf dataFrame")
 
@@ -275,7 +274,6 @@ def __from_edgelist(
             )
             value_col = None
         else:
-
             source_col, dest_col, value_col = symmetrize(
                 input_ddf,
                 source,
@@ -350,9 +348,11 @@ def __from_edgelist(
             is_symmetric=not self.properties.directed,
         )
         ddf = ddf.repartition(npartitions=len(workers) * 2)
-        ddf_keys = ddf.to_delayed()
         workers = _client.scheduler_info()["workers"].keys()
-        ddf_keys_ls = _chunk_lst(ddf_keys, len(workers))
+        persisted_keys_d = persist_dask_df_equal_parts_per_worker(
+            ddf, _client, return_type="dict"
+        )
+        del ddf
 
         delayed_tasks_d = {
             w: delayed(simpleDistributedGraphImpl._make_plc_graph)(
@@ -367,19 +367,19 @@ def __from_edgelist(
                 self.edge_id_type,
                 self.edge_type_id_type,
             )
-            for w, edata in zip(workers, ddf_keys_ls)
+            for w, edata in persisted_keys_d.items()
         }
+        del persisted_keys_d
         self._plc_graph = {
             w: _client.compute(
                 delayed_task, workers=w, allow_other_workers=False, pure=False
             )
             for w, delayed_task in delayed_tasks_d.items()
         }
-        wait(list(self._plc_graph.values()))
-        del ddf_keys
         del delayed_tasks_d
-        gc.collect()
-        _client.run(gc.collect)
+        run_gc_on_dask_cluster(_client)
+        wait(list(self._plc_graph.values()))
+        run_gc_on_dask_cluster(_client)
 
     @property
     def renumbered(self):
@@ -945,7 +945,6 @@ def convert_to_cudf(cp_arrays: cp.ndarray) -> cudf.Series:
         def _call_plc_select_random_vertices(
             mg_graph_x, sID: bytes, random_state: int, num_vertices: int
         ) -> cudf.Series:
-
             cp_arrays = pylibcugraph_select_random_vertices(
                 graph=mg_graph_x,
                 resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
@@ -961,7 +960,6 @@ def _mg_call_plc_select_random_vertices(
             random_state: int,
             num_vertices: int,
         ) -> dask_cudf.Series:
-
             result = [
                 client.submit(
                     _call_plc_select_random_vertices,
diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml
index bd426291c8d..620971bcde1 100644
--- a/python/cugraph/pyproject.toml
+++ b/python/cugraph/pyproject.toml
@@ -6,9 +6,9 @@ requires = [
     "cmake>=3.26.4",
     "cython>=3.0.0",
     "ninja",
-    "pylibcugraph==23.12.*",
-    "pylibraft==23.12.*",
-    "rmm==23.12.*",
+    "pylibcugraph==24.2.*",
+    "pylibraft==24.2.*",
+    "rmm==24.2.*",
     "scikit-build>=0.13.1",
     "setuptools>=61.0.0",
     "wheel",
@@ -29,18 +29,18 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==23.12.*",
+    "cudf==24.2.*",
     "cupy-cuda11x>=12.0.0",
-    "dask-cuda==23.12.*",
-    "dask-cudf==23.12.*",
+    "dask-cuda==24.2.*",
+    "dask-cudf==24.2.*",
     "fsspec[http]>=0.6.0",
     "numba>=0.57",
     "numpy>=1.21",
-    "pylibcugraph==23.12.*",
-    "raft-dask==23.12.*",
-    "rapids-dask-dependency==23.12.*",
-    "rmm==23.12.*",
-    "ucx-py==0.35.*",
+    "pylibcugraph==24.2.*",
+    "raft-dask==24.2.*",
+    "rapids-dask-dependency==24.2.*",
+    "rmm==24.2.*",
+    "ucx-py==0.36.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py
index ef5f8f3fc23..1fd436bb845 100644
--- a/python/nx-cugraph/_nx_cugraph/__init__.py
+++ b/python/nx-cugraph/_nx_cugraph/__init__.py
@@ -24,7 +24,7 @@
     "backend_name": "cugraph",
     "project": "nx-cugraph",
     "package": "nx_cugraph",
-    "url": "https://github.com/rapidsai/cugraph/tree/branch-23.12/python/nx-cugraph",
+    "url": "https://github.com/rapidsai/cugraph/tree/branch-24.02/python/nx-cugraph",
     "short_summary": "GPU-accelerated backend.",
     # "description": "TODO",
     "functions": {
@@ -160,7 +160,7 @@ def get_info():
 
 
 # FIXME: can this use the standard VERSION file and update mechanism?
-__version__ = "23.12.00"
+__version__ = "24.02.00"
 
 if __name__ == "__main__":
     from pathlib import Path
diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml
index a94aa9f0448..de6f20bc439 100644
--- a/python/nx-cugraph/lint.yaml
+++ b/python/nx-cugraph/lint.yaml
@@ -45,12 +45,12 @@ repos:
       - id: pyupgrade
         args: [--py39-plus]
   - repo: https://github.com/psf/black
-    rev: 23.10.1
+    rev: 23.11.0
     hooks:
       - id: black
       # - id: black-jupyter
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.3
+    rev: v0.1.7
     hooks:
       - id: ruff
         args: [--fix-only, --show-fixes]  # --unsafe-fixes]
@@ -62,7 +62,7 @@ repos:
         additional_dependencies: &flake8_dependencies
           # These versions need updated manually
           - flake8==6.1.0
-          - flake8-bugbear==23.9.16
+          - flake8-bugbear==23.12.2
           - flake8-simplify==0.21.0
   - repo: https://github.com/asottile/yesqa
     rev: v1.5.0
@@ -77,7 +77,7 @@ repos:
         additional_dependencies: [tomli]
         files: ^(nx_cugraph|docs)/
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.3
+    rev: v0.1.7
     hooks:
       - id: ruff
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
index 63841b15bd5..e4947491555 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
@@ -15,13 +15,13 @@
     centrality,
     community,
     components,
-    shortest_paths,
     link_analysis,
+    shortest_paths,
 )
 from .bipartite import complete_bipartite_graph
 from .centrality import *
 from .components import *
 from .core import *
 from .isolate import *
-from .shortest_paths import *
 from .link_analysis import *
+from .shortest_paths import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py
index 1d3e762b4fd..25b9b39554b 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py
@@ -17,15 +17,14 @@
 import numpy as np
 
 from nx_cugraph.generators._utils import _create_using_class, _number_and_nodes
-from nx_cugraph.utils import index_dtype, networkx_algorithm, nodes_or_number
+from nx_cugraph.utils import index_dtype, networkx_algorithm
 
 __all__ = [
     "complete_bipartite_graph",
 ]
 
 
-@nodes_or_number([0, 1])
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=[0, 1])
 def complete_bipartite_graph(n1, n2, create_using=None):
     graph_class, inplace = _create_using_class(create_using)
     if graph_class.is_directed():
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py
index 2219388bc58..390598d070e 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/core.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/core.py
@@ -31,7 +31,11 @@ def k_truss(G, k):
     if is_nx := isinstance(G, nx.Graph):
         G = nxcg.from_networkx(G, preserve_all_attrs=True)
     if nxcg.number_of_selfloops(G) > 0:
-        raise nx.NetworkXError(
+        # Slice-compare alone misreads "3.10" as "3.1"; reject a 4th-char digit.
+        nxver = nx.__version__
+        is_nx32_or_less = nxver[:3] <= "3.2" and not nxver[3:4].isdigit()
+        exc_class = nx.NetworkXError if is_nx32_or_less else nx.NetworkXNotImplemented
+        raise exc_class(
             "Input graph has self loops which is not permitted; "
             "Consider using G.remove_edges_from(nx.selfloop_edges(G))."
         )
diff --git a/python/nx-cugraph/nx_cugraph/generators/classic.py b/python/nx-cugraph/nx_cugraph/generators/classic.py
index b196c232320..4213e6dd2a0 100644
--- a/python/nx-cugraph/nx_cugraph/generators/classic.py
+++ b/python/nx-cugraph/nx_cugraph/generators/classic.py
@@ -19,7 +19,7 @@
 
 import nx_cugraph as nxcg
 
-from ..utils import _get_int_dtype, index_dtype, networkx_algorithm, nodes_or_number
+from ..utils import _get_int_dtype, index_dtype, networkx_algorithm
 from ._utils import (
     _IS_NX32_OR_LESS,
     _common_small_graph,
@@ -86,8 +86,7 @@ def circular_ladder_graph(n, create_using=None):
     return _ladder_graph(n, create_using, is_circular=True)
 
 
-@nodes_or_number(0)
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=0)
 def complete_graph(n, create_using=None):
     n, nodes = _number_and_nodes(n)
     if n < 3:
@@ -143,8 +142,7 @@ def complete_multipartite_graph(*subset_sizes):
     )
 
 
-@nodes_or_number(0)
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=0)
 def cycle_graph(n, create_using=None):
     n, nodes = _number_and_nodes(n)
     graph_class, inplace = _create_using_class(create_using)
@@ -174,8 +172,7 @@ def cycle_graph(n, create_using=None):
     return G
 
 
-@nodes_or_number(0)
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=0)
 def empty_graph(n=0, create_using=None, default=nx.Graph):
     n, nodes = _number_and_nodes(n)
     graph_class, inplace = _create_using_class(create_using, default=default)
@@ -242,8 +239,7 @@ def ladder_graph(n, create_using=None):
     return _ladder_graph(n, create_using)
 
 
-@nodes_or_number([0, 1])
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=[0, 1])
 def lollipop_graph(m, n, create_using=None):
     # Like complete_graph then path_graph
     orig_m, unused_nodes_m = m
@@ -283,8 +279,7 @@ def null_graph(create_using=None):
     return _common_small_graph(0, None, create_using)
 
 
-@nodes_or_number(0)
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=0)
 def path_graph(n, create_using=None):
     n, nodes = _number_and_nodes(n)
     graph_class, inplace = _create_using_class(create_using)
@@ -304,8 +299,7 @@ def path_graph(n, create_using=None):
     return G
 
 
-@nodes_or_number(0)
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=0)
 def star_graph(n, create_using=None):
     orig_n, orig_nodes = n
     n, nodes = _number_and_nodes(n)
@@ -329,8 +323,7 @@ def star_graph(n, create_using=None):
     return G
 
 
-@nodes_or_number([0, 1])
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=[0, 1])
 def tadpole_graph(m, n, create_using=None):
     orig_m, unused_nodes_m = m
     orig_n, unused_nodes_n = n
@@ -382,8 +375,7 @@ def turan_graph(n, r):
     return complete_multipartite_graph(*partitions)
 
 
-@nodes_or_number(0)
-@networkx_algorithm
+@networkx_algorithm(nodes_or_number=0)
 def wheel_graph(n, create_using=None):
     n, nodes = _number_and_nodes(n)
     graph_class, inplace = _create_using_class(create_using)
diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py
index 0048aee51bb..a0dbfcec890 100644
--- a/python/nx-cugraph/nx_cugraph/utils/decorators.py
+++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py
@@ -15,6 +15,7 @@
 from functools import partial, update_wrapper
 from textwrap import dedent
 
+import networkx as nx
 from networkx.utils.decorators import nodes_or_number, not_implemented_for
 
 from nx_cugraph.interface import BackendInterface
@@ -47,10 +48,22 @@ def __new__(
         *,
         name: str | None = None,
         extra_params: dict[str, str] | str | None = None,
+        nodes_or_number: list[int] | int | None = None,
     ):
         if func is None:
-            return partial(networkx_algorithm, name=name, extra_params=extra_params)
+            return partial(
+                networkx_algorithm,
+                name=name,
+                extra_params=extra_params,
+                nodes_or_number=nodes_or_number,
+            )
         instance = object.__new__(cls)
+        # A bare 3-char prefix compare would misread "3.10" as "3.1", so a digit in
+        # the fourth position is also taken to mean a release newer than 3.2.
+        nxver = nx.__version__
+        nx_is_32_or_less = nxver[:3] <= "3.2" and not nxver[3:4].isdigit()
+        if nodes_or_number is not None and not nx_is_32_or_less:
+            func = nx.utils.decorators.nodes_or_number(nodes_or_number)(func)
         # update_wrapper sets __wrapped__, which will be used for the signature
         update_wrapper(instance, func)
         instance.__defaults__ = func.__defaults__
@@ -76,6 +89,8 @@ def __new__(
         setattr(BackendInterface, instance.name, instance)
         # Set methods so they are in __dict__
         instance._can_run = instance._can_run
+        if nodes_or_number is not None and nx_is_32_or_less:
+            instance = nx.utils.decorators.nodes_or_number(nodes_or_number)(instance)
         return instance
 
     def _can_run(self, func):
diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml
index f309f4797a7..b29578b036f 100644
--- a/python/nx-cugraph/pyproject.toml
+++ b/python/nx-cugraph/pyproject.toml
@@ -33,7 +33,7 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "networkx>=3.0",
     "numpy>=1.21",
-    "pylibcugraph==23.12.*",
+    "pylibcugraph==24.2.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.optional-dependencies]
diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt
index 057f30ef3ad..e1250cb2edb 100644
--- a/python/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 
-set(pylibcugraph_version 23.12.00)
+set(pylibcugraph_version 24.02.00)
 
 include(../../fetch_rapids.cmake)
 
@@ -38,7 +38,6 @@ project(
 option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files"
        OFF
 )
-option(CUGRAPH_BUILD_WHEELS "Whether we're building a wheel for pypi" OFF)
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
 
 if(NOT USE_CUGRAPH_OPS)
@@ -59,22 +58,14 @@ if (NOT cugraph_FOUND)
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
-
-  set(_exclude_from_all "")
-  if(CUGRAPH_BUILD_WHEELS)
-    # Statically link dependencies if building wheels
-    set(CUDA_STATIC_RUNTIME ON)
-    set(USE_RAFT_STATIC ON)
-    set(CUGRAPH_COMPILE_RAFT_LIB ON)
-    set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
-    set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
-    set(ALLOW_CLONE_CUGRAPH_OPS ON)
-
-    # Don't install the cuML C++ targets into wheels
-    set(_exclude_from_all EXCLUDE_FROM_ALL)
-  endif()
-
-  add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all})
+  set(CUDA_STATIC_RUNTIME ON)
+  set(USE_RAFT_STATIC ON)
+  set(CUGRAPH_COMPILE_RAFT_LIB ON)
+  set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
+  set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
+  set(ALLOW_CLONE_CUGRAPH_OPS ON)
+
+  add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
   set(cython_lib_dir pylibcugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index 96f5ec84efb..0f2d742e7c5 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -6,8 +6,8 @@ requires = [
     "cmake>=3.26.4",
     "cython>=3.0.0",
     "ninja",
-    "pylibraft==23.12.*",
-    "rmm==23.12.*",
+    "pylibraft==24.2.*",
+    "rmm==24.2.*",
     "scikit-build>=0.13.1",
     "setuptools>=61.0.0",
     "wheel",
@@ -28,8 +28,8 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "pylibraft==23.12.*",
-    "rmm==23.12.*",
+    "pylibraft==24.2.*",
+    "rmm==24.2.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -40,7 +40,7 @@ classifiers = [
 
 [project.optional-dependencies]
 test = [
-    "cudf==23.12.*",
+    "cudf==24.2.*",
     "numpy>=1.21",
     "pandas",
     "pytest",