diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 9a0b4155035..2ed5231aef4 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -6,3 +6,4 @@ branch_checker: true label_checker: true release_drafter: true recently_updated: true +forward_merger: true diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 92f70bb2aa7..5ecff1f8a75 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -47,7 +47,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -83,7 +83,7 @@ jobs: wheel-publish-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -93,7 +93,7 @@ jobs: wheel-build-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: wheel-publish-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -116,7 +116,7 @@ jobs: wheel-build-nx-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: 
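Every hunk in these workflow files makes the same one-line substitution: the `uses:` references to rapidsai/shared-workflows move off the temporary `test-cuda-12.2` ref and back onto the `branch-24.04` release branch. A hedged sketch of how such a bulk re-pin could be applied locally (the PR may well have been edited by other means):

```bash
# Illustrative only: rewrite every shared-workflows ref in the Actions files
# from the temporary CUDA 12.2 test branch back to the 24.04 release branch.
sed -i 's|@test-cuda-12.2|@branch-24.04|g' .github/workflows/*.yaml
```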
wheel-publish-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -136,7 +136,7 @@ jobs: wheel-build-cugraph-dgl: needs: wheel-publish-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -146,7 +146,7 @@ jobs: wheel-publish-cugraph-dgl: needs: wheel-build-cugraph-dgl secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -156,7 +156,7 @@ jobs: wheel-build-cugraph-pyg: needs: wheel-publish-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -166,7 +166,7 @@ jobs: wheel-publish-cugraph-pyg: needs: wheel-build-cugraph-pyg secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -175,7 +175,7 @@ jobs: package-name: cugraph-pyg wheel-build-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -185,7 +185,7 @@ jobs: wheel-publish-cugraph-equivariant: needs: wheel-build-cugraph-equivariant secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 36a53df14e1..349d682a59e 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -33,41 +33,41 @@ jobs: - wheel-tests-cugraph-equivariant - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 with: build_type: pull-request node_type: cpu32 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@test-cuda-12.2 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: pull-request conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,7 +77,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -87,7 +87,7 @@ jobs: wheel-build-pylibcugraph: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_pylibcugraph.sh @@ -98,14 +98,14 @@ jobs: wheel-tests-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_pylibcugraph.sh wheel-build-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_cugraph.sh @@ -115,35 +115,35 @@ jobs: wheel-tests-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_cugraph.sh wheel-build-nx-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_nx-cugraph.sh wheel-tests-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_nx-cugraph.sh wheel-build-cugraph-dgl: needs: wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_cugraph-dgl.sh wheel-tests-cugraph-dgl: needs: wheel-build-cugraph-dgl secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_cugraph-dgl.sh @@ -151,35 +151,35 @@ jobs: wheel-build-cugraph-pyg: needs: wheel-tests-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_cugraph-pyg.sh wheel-tests-cugraph-pyg: needs: wheel-build-cugraph-pyg secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_cugraph-pyg.sh matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0")) wheel-build-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_cugraph-equivariant.sh wheel-tests-cugraph-equivariant: needs: wheel-build-cugraph-equivariant secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_cugraph-equivariant.sh matrix_filter: map(select(.ARCH == "amd64")) devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 with: node_type: cpu32 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 7e81354d1b8..f6a04fcc776 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -41,7 +41,7 @@ jobs: script: ci/test_wheel_pylibcugraph.sh wheel-tests-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -50,7 +50,7 @@ jobs: script: ci/test_wheel_cugraph.sh wheel-tests-nx-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -59,7 +59,7 @@ jobs: script: ci/test_wheel_nx-cugraph.sh wheel-tests-cugraph-dgl: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) wheel-tests-cugraph-pyg: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0")) wheel-tests-cugraph-equivariant: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@test-cuda-12.2 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/run_ctests.sh b/ci/run_ctests.sh new file mode 100755 index 00000000000..318c3b183a8 --- /dev/null +++ b/ci/run_ctests.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support customizing the ctests' install location +cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcugraph/" +ctest --output-on-failure --no-tests=error "$@" + +if [ -d "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcugraph_c/" ]; then + # Support customizing the ctests' install location + cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcugraph_c/" + ctest --output-on-failure --no-tests=error "$@" +fi diff --git a/ci/run_cugraph_benchmark_pytests.sh b/ci/run_cugraph_benchmark_pytests.sh new file mode 100755 index 00000000000..ae795ba3468 --- /dev/null +++ b/ci/run_cugraph_benchmark_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cugraph_benchmark_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../benchmarks + +pytest --capture=no --benchmark-disable -m tiny "$@" cugraph/pytest-based/bench_algos.py diff --git a/ci/run_cugraph_dgl_pytests.sh b/ci/run_cugraph_dgl_pytests.sh new file mode 100755 index 00000000000..83c26a57dc0 --- /dev/null +++ b/ci/run_cugraph_dgl_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cugraph_dgl_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-dgl/tests + +pytest --cache-clear --ignore=mg "$@" . diff --git a/ci/run_cugraph_equivariant_pytests.sh b/ci/run_cugraph_equivariant_pytests.sh new file mode 100755 index 00000000000..5d5a5fb05c2 --- /dev/null +++ b/ci/run_cugraph_equivariant_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cugraph_equivariant_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-equivariant/cugraph_equivariant + +pytest --cache-clear "$@" . 
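The new ci/run_*.sh helpers above all share one shape: fail fast with `set -euo pipefail`, `cd` to a directory derived either from the script's own path (via `realpath "${BASH_SOURCE[0]}"`) or, for the gtests, from `${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}`, then forward any caller-supplied arguments through `"$@"`. A minimal sketch of that wrapper pattern; the package path below is a placeholder, not a file added by this PR:

```bash
#!/bin/bash
# Sketch of the wrapper shape shared by the new ci/run_*_pytests.sh scripts.
# "some-package" is a placeholder path used only for illustration.

set -euo pipefail

# Resolve this script's own directory so it can be invoked from anywhere,
# then change into the package's source tree.
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/some-package/some_package

# Forward caller-supplied flags (e.g. --junitxml=..., --cov=...) to pytest.
pytest --cache-clear "$@" tests
```

ci/test_python.sh (further below) then layers the JUnit and coverage options onto each wrapper call instead of repeating pushd/pytest/popd blocks.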
diff --git a/ci/run_cugraph_pyg_pytests.sh b/ci/run_cugraph_pyg_pytests.sh new file mode 100755 index 00000000000..47ed6ba0008 --- /dev/null +++ b/ci/run_cugraph_pyg_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cugraph_pyg_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg + +pytest --cache-clear --ignore=tests/mg "$@" . diff --git a/ci/run_cugraph_pytests.sh b/ci/run_cugraph_pytests.sh new file mode 100755 index 00000000000..c8776c9acd4 --- /dev/null +++ b/ci/run_cugraph_pytests.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cugraph_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph/cugraph + +DASK_WORKER_DEVICES="${DASK_WORKER_DEVICES:-0}" \ +DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL="${DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL:-1000s}" \ +DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="${DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT:-1000s}" \ +DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT="${DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT:-1000s}" \ +pytest --cache-clear --import-mode=append --benchmark-disable \ + -k "not test_property_graph_mg and not test_bulk_sampler_io" \ + "$@" \ + tests diff --git a/ci/run_cugraph_service_pytests.sh b/ci/run_cugraph_service_pytests.sh new file mode 100755 index 00000000000..772168dcc63 --- /dev/null +++ b/ci/run_cugraph_service_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cugraph_service_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-service + +pytest --capture=no --cache-clear --benchmark-disable -k "not mg" "$@" tests diff --git a/ci/run_nx_cugraph_pytests.sh b/ci/run_nx_cugraph_pytests.sh new file mode 100755 index 00000000000..b0caffd0a0f --- /dev/null +++ b/ci/run_nx_cugraph_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_nx_cugraph_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/nx-cugraph/nx_cugraph + +pytest --capture=no --cache-clear --benchmark-disable "$@" tests diff --git a/ci/run_pylibcugraph_pytests.sh b/ci/run_pylibcugraph_pytests.sh new file mode 100755 index 00000000000..ee403bd4a66 --- /dev/null +++ b/ci/run_pylibcugraph_pytests.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_pylibcugraph_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/pylibcugraph/pylibcugraph + +pytest --cache-clear "$@" tests diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 39fff52bd94..2cd7d02670f 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -3,6 +3,9 @@ set -euo pipefail +# Support invoking test_cpp.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ + . 
/opt/conda/etc/profile.d/conda.sh rapids-logger "Generate C++ testing dependencies" @@ -38,21 +41,11 @@ pushd "${RAPIDS_DATASET_ROOT_DIR}" ./get_test_data.sh --subset popd -EXITCODE=0 -trap "EXITCODE=1" ERR -set +e - export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ # Run libcugraph gtests from libcugraph-tests package rapids-logger "Run gtests" -cd "$CONDA_PREFIX"/bin/gtests/libcugraph/ -ctest -j10 --output-on-failure --no-tests=error - -if [ -d "$CONDA_PREFIX"/bin/gtests/libcugraph_c/ ]; then - cd "$CONDA_PREFIX"/bin/gtests/libcugraph_c/ - ctest -j10 --output-on-failure --no-tests=error -fi +./ci/run_ctests.sh -j10 && EXITCODE=$? || EXITCODE=$?; rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_python.sh b/ci/test_python.sh index b070143f076..8fa9a90ae69 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -3,6 +3,9 @@ set -euo pipefail +# Support invoking test_python.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ + . /opt/conda/etc/profile.d/conda.sh rapids-logger "Generate Python testing dependencies" @@ -52,16 +55,14 @@ trap "EXITCODE=1" ERR set +e rapids-logger "pytest pylibcugraph" -pushd python/pylibcugraph/pylibcugraph -pytest \ - --cache-clear \ +./ci/run_pylibcugraph_pytests.sh \ + --verbose \ --junitxml="${RAPIDS_TESTS_DIR}/junit-pylibcugraph.xml" \ --cov-config=../../.coveragerc \ --cov=pylibcugraph \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/pylibcugraph-coverage.xml" \ - --cov-report=term \ - tests -popd + --cov-report=term + # Test runs that include tests that use dask require # --import-mode=append. Those tests start a LocalCUDACluster that inherits @@ -74,49 +75,26 @@ popd # FIXME: TEMPORARILY disable MG PropertyGraph tests (experimental) tests and # bulk sampler IO tests (hangs in CI) rapids-logger "pytest cugraph" -pushd python/cugraph/cugraph -DASK_WORKER_DEVICES="0" \ -DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL="1000s" \ -DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="1000s" \ -DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT="1000s" \ -pytest \ - -v \ - --import-mode=append \ - --benchmark-disable \ - --cache-clear \ +./ci/run_cugraph_pytests.sh \ + --verbose \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph.xml" \ --cov-config=../../.coveragerc \ --cov=cugraph \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-coverage.xml" \ - --cov-report=term \ - -k "not test_property_graph_mg and not test_bulk_sampler_io" \ - tests -popd + --cov-report=term + rapids-logger "pytest cugraph benchmarks (run as tests)" -pushd benchmarks -pytest \ - --capture=no \ - --verbose \ - -m tiny \ - --benchmark-disable \ - cugraph/pytest-based/bench_algos.py -popd +./ci/run_cugraph_benchmark_pytests.sh --verbose rapids-logger "pytest nx-cugraph" -pushd python/nx-cugraph/nx_cugraph -pytest \ - --capture=no \ +./ci/run_nx_cugraph_pytests.sh \ --verbose \ - --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-nx-cugraph.xml" \ --cov-config=../../.coveragerc \ --cov=nx_cugraph \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/nx-cugraph-coverage.xml" \ - --cov-report=term \ - --benchmark-disable \ - tests -popd + --cov-report=term rapids-logger "pytest networkx using nx-cugraph backend" pushd python/nx-cugraph @@ -150,21 +128,14 @@ python -m nx_cugraph.scripts.print_table popd rapids-logger "pytest cugraph-service (single GPU)" -pushd python/cugraph-service -pytest \ - --capture=no \ +./ci/run_cugraph_service_pytests.sh \ --verbose \ - --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-service.xml" \ --cov-config=../.coveragerc \ 
--cov=cugraph_service_client \ --cov=cugraph_service_server \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-service-coverage.xml" \ - --cov-report=term \ - --benchmark-disable \ - -k "not mg" \ - tests -popd + --cov-report=term if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then @@ -195,17 +166,12 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then rapids-print-env rapids-logger "pytest cugraph_dgl (single GPU)" - pushd python/cugraph-dgl/tests - pytest \ - --cache-clear \ - --ignore=mg \ + ./ci/run_cugraph_dgl_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-dgl.xml" \ --cov-config=../../.coveragerc \ --cov=cugraph_dgl \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-dgl-coverage.xml" \ - --cov-report=term \ - . - popd + --cov-report=term # Reactivate the test environment back set +u @@ -252,18 +218,13 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then rapids-print-env rapids-logger "pytest cugraph_pyg (single GPU)" - pushd python/cugraph-pyg/cugraph_pyg # rmat is not tested because of multi-GPU testing - pytest \ - --cache-clear \ - --ignore=tests/mg \ + ./ci/run_cugraph_pyg_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-pyg.xml" \ --cov-config=../../.coveragerc \ --cov=cugraph_pyg \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-pyg-coverage.xml" \ - --cov-report=term \ - . - popd + --cov-report=term # Reactivate the test environment back set +u @@ -296,16 +257,12 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then rapids-print-env rapids-logger "pytest cugraph-equivariant" - pushd python/cugraph-equivariant/cugraph_equivariant - pytest \ - --cache-clear \ + ./ci/run_cugraph_equivariant_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-equivariant.xml" \ --cov-config=../../.coveragerc \ --cov=cugraph_equivariant \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-equivariant-coverage.xml" \ - --cov-report=term \ - . - popd + --cov-report=term # Reactivate the test environment back set +u diff --git a/cpp/src/prims/key_store.cuh b/cpp/src/prims/key_store.cuh index 6d135b4e94e..907ca36ef4a 100644 --- a/cpp/src/prims/key_store.cuh +++ b/cpp/src/prims/key_store.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
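The key_store.cuh and kv_store.cuh hunks that follow track cuco's promotion of `static_set`/`static_map` out of the `cuco::experimental` namespace: the `cuco_store_type` aliases are renamed to `cuco_set_type`, the `experimental::` prefixes on `extent`, `linear_probing`, and the ref operators (`cuco::contains`, `cuco::insert`, `cuco::find`, `cuco::insert_and_find`) are dropped, and a `cuco::storage<1>` window storage type is now passed explicitly as the last template and constructor argument. A sketch of the updated set alias; the angle-bracket arguments are reconstructed from context (this diff view elides them), so treat the exact parameter list as an assumption to verify against the header:

```cpp
// Sketch, not verbatim: post-migration alias shape used by key_cuco_store_t.
// Template arguments are reconstructed from context; verify against the header.
using cuco_storage_type = cuco::storage<1>;  // cuco window storage type

using cuco_set_type =
  cuco::static_set<key_t,
                   cuco::extent<std::size_t>,
                   cuda::thread_scope_device,
                   thrust::equal_to<key_t>,
                   cuco::linear_probing<1,  // CG size
                                        cuco::murmurhash3_32<key_t>>,
                   rmm::mr::stream_allocator_adaptor<rmm::mr::polymorphic_allocator<std::byte>>,
                   cuco_storage_type>;
```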
@@ -35,6 +35,8 @@ namespace cugraph { namespace detail { +using cuco_storage_type = cuco::storage<1>; ///< cuco window storage type + template struct key_binary_search_contains_op_t { using key_type = typename thrust::iterator_traits::value_type; @@ -70,9 +72,8 @@ struct key_binary_search_store_device_view_t { template struct key_cuco_store_contains_device_view_t { - using key_type = typename ViewType::key_type; - using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + using key_type = typename ViewType::key_type; + using cuco_store_device_ref_type = typename ViewType::cuco_set_type::ref_type; static_assert(!ViewType::binary_search); @@ -88,9 +89,8 @@ struct key_cuco_store_contains_device_view_t { template struct key_cuco_store_insert_device_view_t { - using key_type = typename ViewType::key_type; - using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + using key_type = typename ViewType::key_type; + using cuco_store_device_ref_type = typename ViewType::cuco_set_type::ref_type; static_assert(!ViewType::binary_search); @@ -147,16 +147,17 @@ class key_cuco_store_view_t { static constexpr bool binary_search = false; - using cuco_store_type = cuco::experimental::static_set< - key_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_set_type = + cuco::static_set, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; - key_cuco_store_view_t(cuco_store_type const* store) : cuco_store_(store) {} + key_cuco_store_view_t(cuco_set_type const* store) : cuco_store_(store) {} template void contains(QueryKeyIterator key_first, @@ -167,17 +168,14 @@ class key_cuco_store_view_t { cuco_store_->contains(key_first, key_last, value_first, stream); } - auto cuco_store_contains_device_ref() const - { - return cuco_store_->ref(cuco::experimental::contains); - } + auto cuco_store_contains_device_ref() const { return cuco_store_->ref(cuco::contains); } - auto cuco_store_insert_device_ref() const { return cuco_store_->ref(cuco::experimental::insert); } + auto cuco_store_insert_device_ref() const { return cuco_store_->ref(cuco::insert); } key_t invalid_key() const { return cuco_store_->get_empty_key_sentinel(); } private: - cuco_store_type const* cuco_store_{}; + cuco_set_type const* cuco_store_{}; }; template @@ -240,14 +238,15 @@ class key_cuco_store_t { public: using key_type = key_t; - using cuco_store_type = cuco::experimental::static_set< - key_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_set_type = + cuco::static_set, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; key_cuco_store_t(rmm::cuda_stream_view stream) {} @@ -306,7 +305,7 @@ class key_cuco_store_t { return keys; } - cuco_store_type const* cuco_store_ptr() const { return cuco_store_.get(); } + cuco_set_type const* cuco_store_ptr() const { return cuco_store_.get(); } key_t invalid_key() const { return cuco_store_->empty_key_sentinel(); } @@ -324,17 +323,19 @@ class key_cuco_store_t { auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( 
rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); - cuco_store_ = std::make_unique( - cuco_size, - cuco::sentinel::empty_key{invalid_key}, - thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, - stream_adapter, - stream.value()); + cuco_store_ = + std::make_unique(cuco_size, + cuco::sentinel::empty_key{invalid_key}, + thrust::equal_to{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + cuco::thread_scope_device, + cuco_storage_type{}, + stream_adapter, + stream.value()); } - std::unique_ptr cuco_store_{nullptr}; + std::unique_ptr cuco_store_{nullptr}; size_t capacity_{0}; size_t size_{0}; // caching as cuco_store_->size() is expensive (this scans the entire slots to diff --git a/cpp/src/prims/kv_store.cuh b/cpp/src/prims/kv_store.cuh index f17441ad6ab..be4fde2fbff 100644 --- a/cpp/src/prims/kv_store.cuh +++ b/cpp/src/prims/kv_store.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,6 +49,8 @@ namespace cugraph { namespace detail { +using cuco_storage_type = cuco::storage<1>; ///< cuco window storage type + template struct kv_binary_search_find_op_t { using key_type = typename thrust::iterator_traits::value_type; @@ -86,18 +88,19 @@ struct kv_binary_search_contains_op_t { template struct kv_cuco_insert_and_increment_t { - using key_type = typename thrust::iterator_traits::value_type; - using cuco_store_type = cuco::experimental::static_map< - key_type, - size_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using key_type = typename thrust::iterator_traits::value_type; + using cuco_set_type = + cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; + + typename cuco_set_type::ref_type device_ref{}; KeyIterator key_first{}; size_t* counter{nullptr}; size_t invalid_idx{}; @@ -109,7 +112,7 @@ struct kv_cuco_insert_and_increment_t { if (inserted) { cuda::atomic_ref atomic_counter(*counter); auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_set_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(idx, cuda::std::memory_order_relaxed); @@ -122,18 +125,19 @@ struct kv_cuco_insert_and_increment_t { template struct kv_cuco_insert_if_and_increment_t { - using key_type = typename thrust::iterator_traits::value_type; - using cuco_store_type = cuco::experimental::static_map< - key_type, - size_t, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using key_type = typename thrust::iterator_traits::value_type; + using cuco_set_type = + cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; + + typename 
cuco_set_type::ref_type device_ref{}; KeyIterator key_first{}; StencilIterator stencil_first{}; PredOp pred_op{}; @@ -149,7 +153,7 @@ struct kv_cuco_insert_if_and_increment_t { if (inserted) { cuda::atomic_ref atomic_counter(*counter); auto idx = atomic_counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed); - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_set_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(idx, cuda::std::memory_order_relaxed); @@ -162,23 +166,24 @@ struct kv_cuco_insert_if_and_increment_t { template struct kv_cuco_insert_and_assign_t { - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_t, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; - - typename cuco_store_type::ref_type device_ref{}; + using cuco_set_type = + cuco::static_map, value_t, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; + + typename cuco_set_type::ref_type device_ref{}; __device__ void operator()(thrust::tuple pair) { auto [iter, inserted] = device_ref.insert_and_find(pair); if (!inserted) { - using ref_type = typename cuco_store_type::ref_type; + using ref_type = typename cuco_set_type::ref_type; cuda::atomic_ref ref( (*iter).second); ref.store(thrust::get<1>(pair), cuda::std::memory_order_relaxed); @@ -220,10 +225,9 @@ struct kv_binary_search_store_device_view_t { template struct kv_cuco_store_find_device_view_t { - using key_type = typename ViewType::key_type; - using value_type = typename ViewType::value_type; - using cuco_store_device_ref_type = - typename ViewType::cuco_store_type::ref_type; + using key_type = typename ViewType::key_type; + using value_type = typename ViewType::value_type; + using cuco_store_device_ref_type = typename ViewType::cuco_set_type::ref_type; static_assert(!ViewType::binary_search); @@ -336,25 +340,26 @@ class kv_cuco_store_view_t { static constexpr bool binary_search = false; - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_type, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_set_type = + cuco::static_map, value_type, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; template - kv_cuco_store_view_t(cuco_store_type const* store, + kv_cuco_store_view_t(cuco_set_type const* store, std::enable_if_t, int32_t> = 0) : cuco_store_(store) { } template - kv_cuco_store_view_t(cuco_store_type const* store, + kv_cuco_store_view_t(cuco_set_type const* store, ValueIterator value_first, type invalid_value, std::enable_if_t, int32_t> = 0) @@ -392,7 +397,7 @@ class kv_cuco_store_view_t { cuco_store_->contains(key_first, key_last, value_first, stream.value()); } - auto cuco_store_find_device_ref() const { return cuco_store_->ref(cuco::experimental::find); } + auto cuco_store_find_device_ref() const { return cuco_store_->ref(cuco::find); } template std::enable_if_t, ValueIterator> store_value_first() const @@ -412,7 +417,7 @@ class 
kv_cuco_store_view_t { } private: - cuco_store_type const* cuco_store_{}; + cuco_set_type const* cuco_store_{}; std::conditional_t, ValueIterator, std::byte /* dummy */> store_value_first_{}; @@ -531,15 +536,16 @@ class kv_cuco_store_t { std::invoke_result_t), value_buffer_type&>; - using cuco_store_type = cuco::experimental::static_map< - key_t, - std::conditional_t, value_t, size_t>, - cuco::experimental::extent, - cuda::thread_scope_device, - thrust::equal_to, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>, - rmm::mr::stream_allocator_adaptor>>; + using cuco_set_type = + cuco::static_map, value_t, size_t>, + cuco::extent, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>, + rmm::mr::stream_allocator_adaptor>, + cuco_storage_type>; kv_cuco_store_t(rmm::cuda_stream_view stream) {} @@ -588,7 +594,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate( rmm::exec_policy(stream), @@ -629,7 +635,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate(rmm::exec_policy(stream), store_value_offsets.begin(), @@ -665,8 +671,8 @@ class kv_cuco_store_t { if constexpr (std::is_arithmetic_v) { auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(key_first, value_first)); // FIXME: a temporary solution till insert_and_assign is added to - // cuco::experimental::static_map - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + // cuco::static_map + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); thrust::for_each(rmm::exec_policy(stream), pair_first, pair_first + num_keys, @@ -679,7 +685,7 @@ class kv_cuco_store_t { // requires placing the atomic variable on managed memory and this adds additional // complication. 
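Each insert path in this class follows the device-side idiom visible in the functors above: obtain a ref with the `cuco::insert_and_find` operator, call `insert_and_find`, and when the key already exists write the mapped value (or slot index) through a `cuda::atomic_ref` with relaxed ordering, since an `insert_and_assign` member was not yet available on `cuco::static_map` at this point. A reduced sketch of that idiom with illustrative names:

```cpp
// Reduced sketch of the insert-or-overwrite idiom used by the functors above.
// MapRef stands for the type returned by map.ref(cuco::insert_and_find);
// the function name and the cuco::pair payload are illustrative, not verbatim.
template <typename MapRef, typename key_t, typename value_t>
__device__ void insert_or_overwrite(MapRef ref, key_t key, value_t value)
{
  auto [iter, inserted] = ref.insert_and_find(cuco::pair{key, value});
  if (!inserted) {
    // Key already present: publish the new mapped value with a relaxed store,
    // mirroring kv_cuco_insert_and_assign_t.
    cuda::atomic_ref<value_t, cuda::thread_scope_device> mapped((*iter).second);
    mapped.store(value, cuda::std::memory_order_relaxed);
  }
}
```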
rmm::device_scalar counter(old_store_value_size, stream); - auto mutable_device_ref = cuco_store_->ref(cuco::experimental::insert_and_find); + auto mutable_device_ref = cuco_store_->ref(cuco::insert_and_find); rmm::device_uvector store_value_offsets(num_keys, stream); thrust::tabulate( rmm::exec_policy(stream), @@ -731,20 +737,19 @@ class kv_cuco_store_t { })), stream); - thrust::for_each( - rmm::exec_policy(stream), - kv_indices.begin(), - kv_indices.end(), - [key_first, - value_first, - store_value_first = get_dataframe_buffer_begin(store_values_), - device_ref = cuco_store_->ref(cuco::experimental::find)] __device__(auto kv_idx) { - size_t store_value_offset{}; - auto found = device_ref.find(*(key_first + kv_idx)); - assert(found != device_ref.end()); - store_value_offset = (*found).second; - *(store_value_first + store_value_offset) = *(value_first + kv_idx); - }); + thrust::for_each(rmm::exec_policy(stream), + kv_indices.begin(), + kv_indices.end(), + [key_first, + value_first, + store_value_first = get_dataframe_buffer_begin(store_values_), + device_ref = cuco_store_->ref(cuco::find)] __device__(auto kv_idx) { + size_t store_value_offset{}; + auto found = device_ref.find(*(key_first + kv_idx)); + assert(found != device_ref.end()); + store_value_offset = (*found).second; + *(store_value_first + store_value_offset) = *(value_first + kv_idx); + }); } } @@ -783,7 +788,7 @@ class kv_cuco_store_t { return std::make_tuple(std::move(retrieved_keys), std::move(retrieved_values)); } - cuco_store_type const* cuco_store_ptr() const { return cuco_store_.get(); } + cuco_set_type const* cuco_store_ptr() const { return cuco_store_.get(); } template std::enable_if_t, const_value_iterator> store_value_first() const @@ -821,23 +826,25 @@ class kv_cuco_store_t { auto stream_adapter = rmm::mr::make_stream_allocator_adaptor( rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()), stream); if constexpr (std::is_arithmetic_v) { - cuco_store_ = std::make_unique( - cuco_size, - cuco::sentinel::empty_key{invalid_key}, - cuco::sentinel::empty_value{invalid_value}, - thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, - stream_adapter, - stream.value()); + cuco_store_ = + std::make_unique(cuco_size, + cuco::sentinel::empty_key{invalid_key}, + cuco::sentinel::empty_value{invalid_value}, + thrust::equal_to{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, + cuco::thread_scope_device, + cuco_storage_type{}, + stream_adapter, + stream.value()); } else { - cuco_store_ = std::make_unique( + cuco_store_ = std::make_unique( cuco_size, cuco::sentinel::empty_key{invalid_key}, cuco::sentinel::empty_value{std::numeric_limits::max()}, thrust::equal_to{}, - cuco::experimental::linear_probing<1, // CG size - cuco::murmurhash3_32>{}, + cuco::linear_probing<1, // CG size + cuco::murmurhash3_32>{}, stream_adapter, stream); store_values_ = allocate_dataframe_buffer(0, stream); @@ -845,7 +852,7 @@ class kv_cuco_store_t { } } - std::unique_ptr cuco_store_{nullptr}; + std::unique_ptr cuco_store_{nullptr}; std::conditional_t, decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})), std::byte /* dummy */> @@ -857,7 +864,7 @@ class kv_cuco_store_t { size_t size_{ 0}; // caching as cuco_store_->size() is expensive (this scans the entire slots to handle // user inserts through a device reference (and currently this is an upper bound (this - // will become exact once we fully switch to cuco::experimental::static_map and use the + // will become exact 
once we fully switch to cuco::static_map and use the // static_map class's insert_and_assign function; this function will be added soon) };
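For reference, the post-migration construction pattern shown in `allocate()` above now passes the probing scheme, thread scope, and window storage explicitly. A condensed sketch of that call; `map_type` stands for the `cuco_set_type` alias of `kv_cuco_store_t`, the allocator's element type is reconstructed from context, and the required cuco/rmm/thrust headers are assumed to be included:

```cpp
// Condensed sketch of the post-migration static_map construction in allocate().
// map_type, key_t, value_t, capacity, invalid_key, invalid_value, and stream are
// placeholders for the corresponding names in kv_cuco_store_t::allocate().
auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(
  rmm::mr::polymorphic_allocator<std::byte>(rmm::mr::get_current_device_resource()), stream);

auto map = std::make_unique<map_type>(capacity,
                                      cuco::sentinel::empty_key{invalid_key},
                                      cuco::sentinel::empty_value{invalid_value},
                                      thrust::equal_to<key_t>{},
                                      cuco::linear_probing<1,  // CG size
                                                           cuco::murmurhash3_32<key_t>>{},
                                      cuco::thread_scope_device,  // explicit thread scope
                                      cuco::storage<1>{},         // explicit window storage
                                      stream_adapter,
                                      stream.value());
```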