From 5c6c4d3e716c49932ba46642a4d0ad9cb82f0036 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 7 Feb 2024 05:52:06 -0800
Subject: [PATCH 1/7] Exclude tests from builds (#4147)

---
 python/cugraph-dgl/pyproject.toml            |  2 ++
 python/cugraph-equivariant/pyproject.toml    |  1 +
 python/cugraph-equivariant/setup.py          | 20 --------------------
 python/cugraph-pyg/pyproject.toml            |  2 ++
 python/cugraph-service/client/pyproject.toml |  1 +
 python/cugraph-service/server/pyproject.toml |  1 +
 python/cugraph/pyproject.toml                |  1 +
 python/nx-cugraph/pyproject.toml             |  1 +
 python/pylibcugraph/pyproject.toml           |  1 +
 9 files changed, 10 insertions(+), 20 deletions(-)
 delete mode 100644 python/cugraph-equivariant/setup.py

diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml
index 65ee414da44..b575559bee5 100644
--- a/python/cugraph-dgl/pyproject.toml
+++ b/python/cugraph-dgl/pyproject.toml
@@ -53,3 +53,5 @@ version = {file = "cugraph_dgl/VERSION"}
 include = [
     "cugraph_dgl*",
 ]
+exclude = ["*tests*"]
+
diff --git a/python/cugraph-equivariant/pyproject.toml b/python/cugraph-equivariant/pyproject.toml
index f261b0e3535..3cb0183e3a2 100644
--- a/python/cugraph-equivariant/pyproject.toml
+++ b/python/cugraph-equivariant/pyproject.toml
@@ -62,3 +62,4 @@ include = [
     "cugraph_equivariant*",
     "cugraph_equivariant.*",
 ]
+exclude = ["*tests*"]
diff --git a/python/cugraph-equivariant/setup.py b/python/cugraph-equivariant/setup.py
deleted file mode 100644
index acd0df3f717..00000000000
--- a/python/cugraph-equivariant/setup.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from setuptools import find_packages, setup
-
-if __name__ == "__main__":
-    packages = find_packages(include=["cugraph_equivariant*"])
-    setup(
-        package_data={key: ["VERSION"] for key in packages},
-    )
diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml
index b8666c0d806..2284b454888 100644
--- a/python/cugraph-pyg/pyproject.toml
+++ b/python/cugraph-pyg/pyproject.toml
@@ -57,3 +57,5 @@ include = [
     "cugraph_pyg*",
     "cugraph_pyg.*",
 ]
+exclude = ["*tests*"]
+
diff --git a/python/cugraph-service/client/pyproject.toml b/python/cugraph-service/client/pyproject.toml
index 59539693877..ad4a98da507 100644
--- a/python/cugraph-service/client/pyproject.toml
+++ b/python/cugraph-service/client/pyproject.toml
@@ -42,3 +42,4 @@ version = {file = "cugraph_service_client/VERSION"}
 include = [
     "cugraph_service_client",
 ]
+exclude = ["*tests*"]
diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml
index 84c0358668b..2d1c00d027e 100644
--- a/python/cugraph-service/server/pyproject.toml
+++ b/python/cugraph-service/server/pyproject.toml
@@ -71,3 +71,4 @@ include = [
     "cugraph_service_server",
     "cugraph_service_server.*"
 ]
+exclude = ["*tests*"]
diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml
index 6a9d88bf5c8..ff7fc1a8d89 100644
--- a/python/cugraph/pyproject.toml
+++ b/python/cugraph/pyproject.toml
@@ -72,6 +72,7 @@ build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
 cmake.minimum-version = "3.26.4"
 ninja.make-fallback = true
+sdist.exclude = ["*tests*"]
 sdist.reproducible = true
 wheel.packages = ["cugraph"]
 
diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml
index 63ac115918f..d8adeec0b38 100644
--- a/python/nx-cugraph/pyproject.toml
+++ b/python/nx-cugraph/pyproject.toml
@@ -79,6 +79,7 @@ include = [
     "_nx_cugraph*",
     "_nx_cugraph.*",
 ]
+exclude = ["*tests*"]
 
 [tool.black]
 line-length = 88
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index 1d27d952af1..97c94c1b9db 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -59,6 +59,7 @@ build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
 cmake.minimum-version = "3.26.4"
 ninja.make-fallback = true
+sdist.exclude = ["*tests*"]
 sdist.reproducible = true
 wheel.packages = ["pylibcugraph"]
 

From f9e618b3274ee58f28a0a15e5d286c6f1cdd6568 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 7 Feb 2024 13:32:20 -0600
Subject: [PATCH 2/7] Ensure that `ctest` is called with `--no-tests=error`.
 (#4148)

This PR ensures that all calls to `ctest` include the flag `--no-tests=error`. See https://github.com/rapidsai/build-planning/issues/18.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4148
---
 ci/test_cpp.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index b204c1dc59b..39fff52bd94 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -47,11 +47,11 @@ export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/
 # Run libcugraph gtests from libcugraph-tests package
 rapids-logger "Run gtests"
 cd "$CONDA_PREFIX"/bin/gtests/libcugraph/
-ctest -j10 --output-on-failure
+ctest -j10 --output-on-failure --no-tests=error
 
 if [ -d "$CONDA_PREFIX"/bin/gtests/libcugraph_c/ ]; then
   cd "$CONDA_PREFIX"/bin/gtests/libcugraph_c/
-  ctest -j10 --output-on-failure
+  ctest -j10 --output-on-failure --no-tests=error
 fi
 
 rapids-logger "Test script exiting with value: $EXITCODE"

From 9ad7389398a7abdbbee070a220b2d638c29bc876 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 7 Feb 2024 18:46:45 -0600
Subject: [PATCH 3/7] handle more RAPIDS version formats in update-version.sh,
 refactor dependencies.yaml (#4109)

Contributes to https://github.com/rapidsai/build-planning/issues/13.

Updates `update-version.sh` to correctly handle RAPIDS dependencies like `cudf-cu12==24.2.*`.

This also pulls in some dependency refactoring originally added in #4088, which allows greater use of dependencies.yaml globs (and therefore less maintenance effort to support new CUDA versions).

Authors:
  - James Lamb (https://github.com/jameslamb)
  - Brad Rees (https://github.com/BradReesWork)
  - Ray Douglass (https://github.com/raydouglass)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)
  - Bradley Dice (https://github.com/bdice)
  - Brad Rees (https://github.com/BradReesWork)
  - Rick Ratzel (https://github.com/rlratzel)

URL: https://github.com/rapidsai/cugraph/pull/4109
---
 ci/release/update-version.sh |  11 ++--
 dependencies.yaml            | 124 ++++++++++-------------------------
 2 files changed, 38 insertions(+), 97 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index c091bd1ed33..9e284f49b5b 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -96,13 +96,12 @@ DEPENDENCIES=(
 )
 for DEP in "${DEPENDENCIES[@]}"; do
   for FILE in dependencies.yaml conda/environments/*.yaml python/cugraph-{pyg,dgl}/conda/*.yaml; do
-    sed_runner "/-.* ${DEP}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" ${FILE}
-    sed_runner "/-.* ${DEP}-cu[0-9][0-9]==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" ${FILE}
-    sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*/g" ${FILE}
+    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}"
+    sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_VERSION}.*/g" "${FILE}"
   done
   for FILE in python/**/pyproject.toml python/**/**/pyproject.toml; do
-    sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE}
-    sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_VERSION}.*\"/g" ${FILE}
+    sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" "${FILE}"
+    sed_runner "/\"ucx-py==/ s/==.*\"/==${NEXT_UCX_PY_VERSION}.*\"/g" "${FILE}"
   done
 done
 
diff --git a/dependencies.yaml b/dependencies.yaml
index 9d8ba3e5340..cfefe3b9ff9 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -567,17 +567,12 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.2"}
-            packages: &rmm_packages_pip_cu12
+          - matrix: {cuda: "12.*"}
+            packages:
               - rmm-cu12==24.4.*
-          - {matrix: {cuda: "12.1"}, packages: *rmm_packages_pip_cu12}
-          - {matrix: {cuda: "12.0"}, packages: *rmm_packages_pip_cu12}
-          - matrix: {cuda: "11.8"}
-            packages: &rmm_packages_pip_cu11
+          - matrix: {cuda: "11.*"}
+            packages:
               - rmm-cu11==24.4.*
-          - {matrix: {cuda: "11.5"}, packages: *rmm_packages_pip_cu11}
-          - {matrix: {cuda: "11.4"}, packages: *rmm_packages_pip_cu11}
-          - {matrix: {cuda: "11.2"}, packages: *rmm_packages_pip_cu11}
           - {matrix: null, packages: [*rmm_conda]}
 
   depends_on_cudf:
@@ -593,17 +588,12 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.2"}
-            packages: &cudf_packages_pip_cu12
+          - matrix: {cuda: "12.*"}
+            packages:
               - cudf-cu12==24.4.*
-          - {matrix: {cuda: "12.1"}, packages: *cudf_packages_pip_cu12}
-          - {matrix: {cuda: "12.0"}, packages: *cudf_packages_pip_cu12}
-          - matrix: {cuda: "11.8"}
-            packages: &cudf_packages_pip_cu11
+          - matrix: {cuda: "11.*"}
+            packages:
               - cudf-cu11==24.4.*
-          - {matrix: {cuda: "11.5"}, packages: *cudf_packages_pip_cu11}
-          - {matrix: {cuda: "11.4"}, packages: *cudf_packages_pip_cu11}
-          - {matrix: {cuda: "11.2"}, packages: *cudf_packages_pip_cu11}
           - {matrix: null, packages: [*cudf_conda]}
 
   depends_on_dask_cudf:
@@ -619,17 +609,12 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.2"}
-            packages: &dask_cudf_packages_pip_cu12
+          - matrix: {cuda: "12.*"}
+            packages:
               - dask-cudf-cu12==24.4.*
-          - {matrix: {cuda: "12.1"}, packages: *dask_cudf_packages_pip_cu12}
-          - {matrix: {cuda: "12.0"}, packages: *dask_cudf_packages_pip_cu12}
-          - matrix: {cuda: "11.8"}
-            packages: &dask_cudf_packages_pip_cu11
+          - matrix: {cuda: "11.*"}
+            packages:
               - dask-cudf-cu11==24.4.*
-          - {matrix: {cuda: "11.5"}, packages: *dask_cudf_packages_pip_cu11}
-          - {matrix: {cuda: "11.4"}, packages: *dask_cudf_packages_pip_cu11}
-          - {matrix: {cuda: "11.2"}, packages: *dask_cudf_packages_pip_cu11}
           - {matrix: null, packages: [*dask_cudf_conda]}
 
   depends_on_pylibraft:
@@ -645,17 +630,12 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.2"}
-            packages: &pylibraft_packages_pip_cu12
+          - matrix: {cuda: "12.*"}
+            packages:
               - pylibraft-cu12==24.4.*
-          - {matrix: {cuda: "12.1"}, packages: *pylibraft_packages_pip_cu12}
-          - {matrix: {cuda: "12.0"}, packages: *pylibraft_packages_pip_cu12}
-          - matrix: {cuda: "11.8"}
-            packages: &pylibraft_packages_pip_cu11
+          - matrix: {cuda: "11.*"}
+            packages:
               - pylibraft-cu11==24.4.*
-          - {matrix: {cuda: "11.5"}, packages: *pylibraft_packages_pip_cu11}
-          - {matrix: {cuda: "11.4"}, packages: *pylibraft_packages_pip_cu11}
-          - {matrix: {cuda: "11.2"}, packages: *pylibraft_packages_pip_cu11}
           - {matrix: null, packages: [*pylibraft_conda]}
 
   depends_on_raft_dask:
@@ -671,17 +651,12 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.2"}
-            packages: &raft_dask_packages_pip_cu12
+          - matrix: {cuda: "12.*"}
+            packages:
               - raft-dask-cu12==24.4.*
-          - {matrix: {cuda: "12.1"}, packages: *raft_dask_packages_pip_cu12}
-          - {matrix: {cuda: "12.0"}, packages: *raft_dask_packages_pip_cu12}
-          - matrix: {cuda: "11.8"}
-            packages: &raft_dask_packages_pip_cu11
+          - matrix: {cuda: "11.*"}
+            packages:
               - raft-dask-cu11==24.4.*
-          - {matrix: {cuda: "11.5"}, packages: *raft_dask_packages_pip_cu11}
-          - {matrix: {cuda: "11.4"}, packages: *raft_dask_packages_pip_cu11}
-          - {matrix: {cuda: "11.2"}, packages: *raft_dask_packages_pip_cu11}
           - {matrix: null, packages: [*raft_dask_conda]}
 
   depends_on_pylibcugraph:
@@ -697,17 +672,12 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.2"}
-            packages: &pylibcugraph_packages_pip_cu12
+          - matrix: {cuda: "12.*"}
+            packages:
               - pylibcugraph-cu12==24.4.*
-          - {matrix: {cuda: "12.1"}, packages: *pylibcugraph_packages_pip_cu12}
-          - {matrix: {cuda: "12.0"}, packages: *pylibcugraph_packages_pip_cu12}
-          - matrix: {cuda: "11.8"}
-            packages: &pylibcugraph_packages_pip_cu11
+          - matrix: {cuda: "11.*"}
+            packages:
               - pylibcugraph-cu11==24.4.*
-          - {matrix: {cuda: "11.5"}, packages: *pylibcugraph_packages_pip_cu11}
-          - {matrix: {cuda: "11.4"}, packages: *pylibcugraph_packages_pip_cu11}
-          - {matrix: {cuda: "11.2"}, packages: *pylibcugraph_packages_pip_cu11}
           - {matrix: null, packages: [*pylibcugraph_conda]}
 
   depends_on_pylibcugraphops:
@@ -723,17 +693,12 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.2"}
-            packages: &pylibcugraphops_packages_pip_cu12
+          - matrix: {cuda: "12.*"}
+            packages:
               - pylibcugraphops-cu12==24.4.*
-          - {matrix: {cuda: "12.1"}, packages: *pylibcugraphops_packages_pip_cu12}
-          - {matrix: {cuda: "12.0"}, packages: *pylibcugraphops_packages_pip_cu12}
-          - matrix: {cuda: "11.8"}
-            packages: &pylibcugraphops_packages_pip_cu11
+          - matrix: {cuda: "11.*"}
+            packages:
               - pylibcugraphops-cu11==24.4.*
-          - {matrix: {cuda: "11.5"}, packages: *pylibcugraphops_packages_pip_cu11}
-          - {matrix: {cuda: "11.4"}, packages: *pylibcugraphops_packages_pip_cu11}
-          - {matrix: {cuda: "11.2"}, packages: *pylibcugraphops_packages_pip_cu11}
           - {matrix: null, packages: [*pylibcugraphops_conda]}
 
   depends_on_cupy:
@@ -744,33 +709,10 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          # All CUDA 12 + x86_64 versions
-          - matrix: {cuda: "12.2", arch: x86_64}
-            packages: &cupy_packages_cu12_x86_64
+          - matrix: {cuda: "12.*"}
+            packages:
               - cupy-cuda12x>=12.0.0
-          - {matrix: {cuda: "12.1", arch: x86_64}, packages: *cupy_packages_cu12_x86_64}
-          - {matrix: {cuda: "12.0", arch: x86_64}, packages: *cupy_packages_cu12_x86_64}
-
-          # All CUDA 12 + aarch64 versions
-          - matrix: {cuda: "12.2", arch: aarch64}
-            packages: &cupy_packages_cu12_aarch64
-              - cupy-cuda12x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works.
-          - {matrix: {cuda: "12.1", arch: aarch64}, packages: *cupy_packages_cu12_aarch64}
-          - {matrix: {cuda: "12.0", arch: aarch64}, packages: *cupy_packages_cu12_aarch64}
-
-          # All CUDA 11 + x86_64 versions
-          - matrix: {cuda: "11.8", arch: x86_64}
-            packages: &cupy_packages_cu11_x86_64
+          - matrix: {cuda: "11.*"}
+            packages: &cupy_packages_cu11
               - cupy-cuda11x>=12.0.0
-          - {matrix: {cuda: "11.5", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
-          - {matrix: {cuda: "11.4", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
-          - {matrix: {cuda: "11.2", arch: x86_64}, packages: *cupy_packages_cu11_x86_64}
-
-          # All CUDA 11 + aarch64 versions
-          - matrix: {cuda: "11.8", arch: aarch64}
-            packages: &cupy_packages_cu11_aarch64
-              - cupy-cuda11x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works.
-          - {matrix: {cuda: "11.5", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
-          - {matrix: {cuda: "11.4", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
-          - {matrix: {cuda: "11.2", arch: aarch64}, packages: *cupy_packages_cu11_aarch64}
-          - {matrix: null, packages: [cupy-cuda11x>=12.0.0]}
+          - {matrix: null, packages: *cupy_packages_cu11}

From bb5bee24a2e86a7b979a481d65b802e7ed28e1c8 Mon Sep 17 00:00:00 2001
From: Ray Douglass <3107146+raydouglass@users.noreply.github.com>
Date: Thu, 8 Feb 2024 17:15:59 -0500
Subject: [PATCH 4/7] Revert "Exclude tests from builds (#4147)" (#4157)

This reverts commit 5c6c4d3e716c49932ba46642a4d0ad9cb82f0036.
---
 python/cugraph-dgl/pyproject.toml            |  2 --
 python/cugraph-equivariant/pyproject.toml    |  1 -
 python/cugraph-equivariant/setup.py          | 20 ++++++++++++++++++++
 python/cugraph-pyg/pyproject.toml            |  2 --
 python/cugraph-service/client/pyproject.toml |  1 -
 python/cugraph-service/server/pyproject.toml |  1 -
 python/cugraph/pyproject.toml                |  1 -
 python/nx-cugraph/pyproject.toml             |  1 -
 python/pylibcugraph/pyproject.toml           |  1 -
 9 files changed, 20 insertions(+), 10 deletions(-)
 create mode 100644 python/cugraph-equivariant/setup.py

diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml
index b575559bee5..65ee414da44 100644
--- a/python/cugraph-dgl/pyproject.toml
+++ b/python/cugraph-dgl/pyproject.toml
@@ -53,5 +53,3 @@ version = {file = "cugraph_dgl/VERSION"}
 include = [
     "cugraph_dgl*",
 ]
-exclude = ["*tests*"]
-
diff --git a/python/cugraph-equivariant/pyproject.toml b/python/cugraph-equivariant/pyproject.toml
index 3cb0183e3a2..f261b0e3535 100644
--- a/python/cugraph-equivariant/pyproject.toml
+++ b/python/cugraph-equivariant/pyproject.toml
@@ -62,4 +62,3 @@ include = [
     "cugraph_equivariant*",
     "cugraph_equivariant.*",
 ]
-exclude = ["*tests*"]
diff --git a/python/cugraph-equivariant/setup.py b/python/cugraph-equivariant/setup.py
new file mode 100644
index 00000000000..acd0df3f717
--- /dev/null
+++ b/python/cugraph-equivariant/setup.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from setuptools import find_packages, setup
+
+if __name__ == "__main__":
+    packages = find_packages(include=["cugraph_equivariant*"])
+    setup(
+        package_data={key: ["VERSION"] for key in packages},
+    )
diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml
index 2284b454888..b8666c0d806 100644
--- a/python/cugraph-pyg/pyproject.toml
+++ b/python/cugraph-pyg/pyproject.toml
@@ -57,5 +57,3 @@ include = [
     "cugraph_pyg*",
     "cugraph_pyg.*",
 ]
-exclude = ["*tests*"]
-
diff --git a/python/cugraph-service/client/pyproject.toml b/python/cugraph-service/client/pyproject.toml
index ad4a98da507..59539693877 100644
--- a/python/cugraph-service/client/pyproject.toml
+++ b/python/cugraph-service/client/pyproject.toml
@@ -42,4 +42,3 @@ version = {file = "cugraph_service_client/VERSION"}
 include = [
     "cugraph_service_client",
 ]
-exclude = ["*tests*"]
diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml
index 2d1c00d027e..84c0358668b 100644
--- a/python/cugraph-service/server/pyproject.toml
+++ b/python/cugraph-service/server/pyproject.toml
@@ -71,4 +71,3 @@ include = [
     "cugraph_service_server",
     "cugraph_service_server.*"
 ]
-exclude = ["*tests*"]
diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml
index ff7fc1a8d89..6a9d88bf5c8 100644
--- a/python/cugraph/pyproject.toml
+++ b/python/cugraph/pyproject.toml
@@ -72,7 +72,6 @@ build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
 cmake.minimum-version = "3.26.4"
 ninja.make-fallback = true
-sdist.exclude = ["*tests*"]
 sdist.reproducible = true
 wheel.packages = ["cugraph"]
 
diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml
index d8adeec0b38..63ac115918f 100644
--- a/python/nx-cugraph/pyproject.toml
+++ b/python/nx-cugraph/pyproject.toml
@@ -79,7 +79,6 @@ include = [
     "_nx_cugraph*",
     "_nx_cugraph.*",
 ]
-exclude = ["*tests*"]
 
 [tool.black]
 line-length = 88
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index 97c94c1b9db..1d27d952af1 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -59,7 +59,6 @@ build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
 cmake.minimum-version = "3.26.4"
 ninja.make-fallback = true
-sdist.exclude = ["*tests*"]
 sdist.reproducible = true
 wheel.packages = ["pylibcugraph"]
 

From 38a8cdbb369a398a9688acb84c41015b92b7fd0e Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Fri, 9 Feb 2024 13:34:23 -0800
Subject: [PATCH 5/7] Address primitives performance regression with the added
 edge masking support (when edge masking is not in use) (#4119)

This addresses the performance regression caused by the added edge masking support in the case where edge masking is disabled.

We addressed the issue for `per_v_transform_reduce_incoming_e` and `per_v_transform_reduce_outgoing_e` in https://github.com/rapidsai/cugraph/pull/4085.

Using a similar approach, we address the issue for `transform_e`, `transform_reduce_e`, and `detail::nbr_intersection` in this PR.

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Naim (https://github.com/naimnv)

URL: https://github.com/rapidsai/cugraph/pull/4119
---
 cpp/src/prims/detail/nbr_intersection.cuh |  73 +++--
 cpp/src/prims/transform_e.cuh             | 334 +++++++++++++++-------
 cpp/src/prims/transform_reduce_e.cuh      | 252 ++++++++--------
 3 files changed, 408 insertions(+), 251 deletions(-)

diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh
index 8261ec747f9..26b87f21dbb 100644
--- a/cpp/src/prims/detail/nbr_intersection.cuh
+++ b/cpp/src/prims/detail/nbr_intersection.cuh
@@ -319,7 +319,8 @@ struct pick_min_degree_t {
   }
 };
 
-template <typename InputKeyIterator0,
+template <bool check_edge_mask,
+          typename InputKeyIterator0,
           typename InputKeyIterator1,
           typename InputValueIterator0,  // should be void* if invalid
           typename InputValueIterator1,  // should be void* if invalid
@@ -356,10 +357,14 @@ __device__ edge_t set_intersection_by_key_with_mask(InputKeyIterator0 input_key_
   auto output_idx = output_start_offset;
   while ((idx0 < (input_start_offset0 + input_size0)) &&
          (idx1 < (input_start_offset1 + input_size1))) {
-    bool valid0 = apply_mask0 ? check_bit_set(idx0) : true;
-    bool valid1 = apply_mask1 ? check_bit_set(idx1) : true;
-    if (!valid0) { ++idx0; }
-    if (!valid1) { ++idx1; }
+    bool valid0 = true;
+    bool valid1 = true;
+    if constexpr (check_edge_mask) {
+      valid0 = apply_mask0 ? check_bit_set(idx0) : true;
+      valid1 = apply_mask1 ? check_bit_set(idx1) : true;
+      if (!valid0) { ++idx0; }
+      if (!valid1) { ++idx1; }
+    }
 
     if (valid0 && valid1) {
       auto key0 = *(input_key_first0 + idx0);
@@ -522,24 +527,42 @@ struct copy_intersecting_nbrs_and_update_intersection_size_t {
     // vertices in a single warp (better optimize if this becomes a performance
     // bottleneck)
 
-    auto mask_first        = edge_partition_e_mask ? (*edge_partition_e_mask).value_first()
-                                                   : static_cast<uint32_t const*>(nullptr);
-    auto intersection_size = set_intersection_by_key_with_mask(
-      indices0,
-      indices1,
-      edge_property_values0,
-      edge_property_values1,
-      mask_first,
-      nbr_intersection_indices.begin(),
-      nbr_intersection_e_property_values0,
-      nbr_intersection_e_property_values1,
-      local_edge_offset0,
-      local_degree0,
-      (std::is_same_v<FirstElementToIdxMap, void*> && edge_partition_e_mask),
-      local_edge_offset1,
-      local_degree1,
-      (std::is_same_v<SecondElementToIdxMap, void*> && edge_partition_e_mask),
-      nbr_intersection_offsets[i]);
+    edge_t intersection_size{};
+    if (edge_partition_e_mask) {
+      intersection_size =
+        set_intersection_by_key_with_mask<true>(indices0,
+                                                indices1,
+                                                edge_property_values0,
+                                                edge_property_values1,
+                                                (*edge_partition_e_mask).value_first(),
+                                                nbr_intersection_indices.begin(),
+                                                nbr_intersection_e_property_values0,
+                                                nbr_intersection_e_property_values1,
+                                                local_edge_offset0,
+                                                local_degree0,
+                                                std::is_same_v<FirstElementToIdxMap, void*>,
+                                                local_edge_offset1,
+                                                local_degree1,
+                                                std::is_same_v<SecondElementToIdxMap, void*>,
+                                                nbr_intersection_offsets[i]);
+    } else {
+      intersection_size =
+        set_intersection_by_key_with_mask<false>(indices0,
+                                                 indices1,
+                                                 edge_property_values0,
+                                                 edge_property_values1,
+                                                 static_cast<uint32_t const*>(nullptr),
+                                                 nbr_intersection_indices.begin(),
+                                                 nbr_intersection_e_property_values0,
+                                                 nbr_intersection_e_property_values1,
+                                                 local_edge_offset0,
+                                                 local_degree0,
+                                                 false,
+                                                 local_edge_offset1,
+                                                 local_degree1,
+                                                 false,
+                                                 nbr_intersection_offsets[i]);
+    }
 
     thrust::fill(
       thrust::seq,
@@ -714,7 +737,7 @@ nbr_intersection(raft::handle_t const& handle,
   auto edge_mask_view = graph_view.edge_mask_view();
 
   std::optional<std::unique_ptr<kv_store_t<vertex_t, vertex_t, false>>> major_to_idx_map_ptr{
-    std::nullopt};
+    std::nullopt};  // idx to major_nbr_offsets
   std::optional<rmm::device_uvector<edge_t>> major_nbr_offsets{std::nullopt};
   std::optional<rmm::device_uvector<vertex_t>> major_nbr_indices{std::nullopt};
 
@@ -1041,7 +1064,7 @@ nbr_intersection(raft::handle_t const& handle,
   // 3. Collect neighbor list for minors (for the neighbors within the minor range for this GPU)
 
   std::optional<std::unique_ptr<kv_store_t<vertex_t, vertex_t, false>>> minor_to_idx_map_ptr{
-    std::nullopt};
+    std::nullopt};  // idx to minor_nbr_offsets
   std::optional<rmm::device_uvector<size_t>> minor_nbr_offsets{std::nullopt};
   std::optional<rmm::device_uvector<vertex_t>> minor_nbr_indices{std::nullopt};
 
diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh
index 93a2d040b60..5c83e0f1b71 100644
--- a/cpp/src/prims/transform_e.cuh
+++ b/cpp/src/prims/transform_e.cuh
@@ -42,7 +42,8 @@ namespace detail {
 
 int32_t constexpr transform_e_kernel_block_size = 512;
 
-template <typename GraphViewType,
+template <bool check_edge_mask,
+          typename GraphViewType,
           typename EdgePartitionSrcValueInputWrapper,
           typename EdgePartitionDstValueInputWrapper,
           typename EdgePartitionEdgeValueInputWrapper,
@@ -56,7 +57,7 @@ __global__ void transform_e_packed_bool(
   EdgePartitionSrcValueInputWrapper edge_partition_src_value_input,
   EdgePartitionDstValueInputWrapper edge_partition_dst_value_input,
   EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input,
-  thrust::optional<EdgePartitionEdgeMaskWrapper> edge_partition_e_mask,
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask,
   EdgePartitionEdgeValueOutputWrapper edge_partition_e_value_output,
   EdgeOp e_op)
 {
@@ -72,35 +73,44 @@ __global__ void transform_e_packed_bool(
 
   auto num_edges = edge_partition.number_of_edges();
   while (idx < static_cast<edge_t>(packed_bool_size(num_edges))) {
-    auto edge_mask = packed_bool_full_mask();
-    if (edge_partition_e_mask) { edge_mask = *((*edge_partition_e_mask).value_first() + idx); }
+    [[maybe_unused]] auto edge_mask =
+      packed_bool_full_mask();  // relevant only when check_edge_mask is true
+    if constexpr (check_edge_mask) { edge_mask = *(edge_partition_e_mask.value_first() + idx); }
 
     auto local_edge_idx =
       idx * static_cast<edge_t>(packed_bools_per_word()) + static_cast<edge_t>(lane_id);
     int predicate{0};
 
-    if ((local_edge_idx < num_edges) && (edge_mask & packed_bool_mask(lane_id))) {
-      auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(local_edge_idx);
-      auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
-      auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-      auto minor        = *(edge_partition.indices() + local_edge_idx);
-      auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-
-      auto src        = GraphViewType::is_storage_transposed ? minor : major;
-      auto dst        = GraphViewType::is_storage_transposed ? major : minor;
-      auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-      auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-      predicate       = e_op(src,
-                       dst,
-                       edge_partition_src_value_input.get(src_offset),
-                       edge_partition_dst_value_input.get(dst_offset),
-                       edge_partition_e_value_input.get(local_edge_idx))
-                          ? int{1}
-                          : int{0};
+    if (local_edge_idx < num_edges) {
+      bool compute_predicate = true;  // unmasked graphs evaluate e_op on every valid edge
+      if constexpr (check_edge_mask) {  // '!=' binds tighter than '&': parenthesize the bit test
+        compute_predicate = ((edge_mask & packed_bool_mask(lane_id)) != packed_bool_empty_mask());
+      }
+
+      if (compute_predicate) {
+        auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(local_edge_idx);
+        auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
+        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+        auto minor        = *(edge_partition.indices() + local_edge_idx);
+        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+        auto src        = GraphViewType::is_storage_transposed ? minor : major;
+        auto dst        = GraphViewType::is_storage_transposed ? major : minor;
+        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+        predicate       = e_op(src,
+                         dst,
+                         edge_partition_src_value_input.get(src_offset),
+                         edge_partition_dst_value_input.get(dst_offset),
+                         edge_partition_e_value_input.get(local_edge_idx))
+                            ? int{1}
+                            : int{0};
+      }
     }
+
     uint32_t new_val = __ballot_sync(uint32_t{0xffffffff}, predicate);
     if (lane_id == 0) {
-      if (edge_mask == packed_bool_full_mask()) {
+      if constexpr (!check_edge_mask) {  // no mask: all bits are fresh, overwrite
         *(edge_partition_e_value_output.value_first() + idx) = new_val;
       } else {
         auto old_val = *(edge_partition_e_value_output.value_first() + idx);
@@ -112,6 +122,99 @@ __global__ void transform_e_packed_bool(
   }
 }
 
+// Functor that applies e_op to edges of one edge partition and stores each result through
+// edge_partition_e_value_output.set(). If check_edge_mask is true, an edge for which
+// edge_partition_e_mask.get() returns false is skipped (e_op is not called, nothing is stored).
+template <bool check_edge_mask,
+          typename GraphViewType,
+          typename EdgePartitionSrcValueInputWrapper,
+          typename EdgePartitionDstValueInputWrapper,
+          typename EdgePartitionEdgeValueInputWrapper,
+          typename EdgePartitionEdgeMaskWrapper,
+          typename EdgeOp,
+          typename EdgeValueOutputWrapper>
+struct update_e_value_t {
+  edge_partition_device_view_t<typename GraphViewType::vertex_type,
+                               typename GraphViewType::edge_type,
+                               GraphViewType::is_multi_gpu>
+    edge_partition{};
+  EdgePartitionSrcValueInputWrapper edge_partition_src_value_input{};
+  EdgePartitionDstValueInputWrapper edge_partition_dst_value_input{};
+  EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input{};
+  EdgePartitionEdgeMaskWrapper edge_partition_e_mask{};  // read only if check_edge_mask is true
+  EdgeOp e_op{};
+  EdgeValueOutputWrapper edge_partition_e_value_output{};
+
+  // Update every edge stored for the given (major, minor) pair, parallel edges included. The
+  // binary searches presume the neighbor list returned by local_edges() is sorted.
+  __device__ void operator()(thrust::tuple<typename GraphViewType::vertex_type,
+                                           typename GraphViewType::vertex_type> edge) const
+  {
+    using vertex_t = typename GraphViewType::vertex_type;
+    using edge_t   = typename GraphViewType::edge_type;
+
+    auto major = thrust::get<0>(edge);
+    auto minor = thrust::get<1>(edge);
+
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    auto major_idx    = edge_partition.major_idx_from_major_nocheck(major);
+    assert(major_idx);
+
+    auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+    vertex_t const* indices{nullptr};
+    edge_t edge_offset{};
+    edge_t local_degree{};
+    thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(*major_idx);
+    auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
+    auto upper_it = thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor);
+
+    auto src        = GraphViewType::is_storage_transposed ? minor : major;
+    auto dst        = GraphViewType::is_storage_transposed ? major : minor;
+    auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+    auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+
+    for (auto it = lower_it; it != upper_it; ++it) {
+      assert(*it == minor);
+      auto local_edge_idx = edge_offset + thrust::distance(indices, it);
+      if constexpr (check_edge_mask) {
+        if (!edge_partition_e_mask.get(local_edge_idx)) { continue; }  // masked-out edge
+      }
+      auto e_op_result = e_op(src,
+                              dst,
+                              edge_partition_src_value_input.get(src_offset),
+                              edge_partition_dst_value_input.get(dst_offset),
+                              edge_partition_e_value_input.get(local_edge_idx));
+      edge_partition_e_value_output.set(local_edge_idx, e_op_result);
+    }
+  }
+
+  // Update the single edge at local edge index i of this edge partition.
+  __device__ void operator()(typename GraphViewType::edge_type i) const
+  {
+    if constexpr (check_edge_mask) {
+      if (!edge_partition_e_mask.get(i)) { return; }  // masked-out edge: leave untouched
+    }
+
+    auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(i);
+    auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
+    auto minor        = *(edge_partition.indices() + i);
+    auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
+
+    auto src         = GraphViewType::is_storage_transposed ? minor : major;
+    auto dst         = GraphViewType::is_storage_transposed ? major : minor;
+    auto src_offset  = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
+    auto dst_offset  = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
+    auto e_op_result = e_op(src,
+                            dst,
+                            edge_partition_src_value_input.get(src_offset),
+                            edge_partition_dst_value_input.get(dst_offset),
+                            edge_partition_e_value_input.get(i));
+    edge_partition_e_value_output.set(i, e_op_result);
+  }
+};
+
 }  // namespace detail
 
 /**
@@ -228,47 +331,68 @@ void transform_e(raft::handle_t const& handle,
         raft::grid_1d_thread_t update_grid(num_edges,
                                            detail::transform_e_kernel_block_size,
                                            handle.get_device_properties().maxGridSize[0]);
-        detail::transform_e_packed_bool<GraphViewType>
-          <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
-            edge_partition,
-            edge_partition_src_value_input,
-            edge_partition_dst_value_input,
-            edge_partition_e_value_input,
-            edge_partition_e_mask,
-            edge_partition_e_value_output,
-            e_op);
+        if (edge_partition_e_mask) {
+          detail::transform_e_packed_bool<true, GraphViewType>
+            <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+              edge_partition,
+              edge_partition_src_value_input,
+              edge_partition_dst_value_input,
+              edge_partition_e_value_input,
+              *edge_partition_e_mask,
+              edge_partition_e_value_output,
+              e_op);
+        } else {
+          detail::transform_e_packed_bool<false, GraphViewType>
+            <<<update_grid.num_blocks, update_grid.block_size, 0, handle.get_stream()>>>(
+              edge_partition,
+              edge_partition_src_value_input,
+              edge_partition_dst_value_input,
+              edge_partition_e_value_input,
+              std::byte{},  // dummy
+              edge_partition_e_value_output,
+              e_op);
+        }
       }
     } else {
-      thrust::for_each(
-        handle.get_thrust_policy(),
-        thrust::make_counting_iterator(edge_t{0}),
-        thrust::make_counting_iterator(num_edges),
-        [e_op,
-         edge_partition,
-         edge_partition_src_value_input,
-         edge_partition_dst_value_input,
-         edge_partition_e_value_input,
-         edge_partition_e_mask,
-         edge_partition_e_value_output] __device__(edge_t i) {
-          if (!edge_partition_e_mask || (*edge_partition_e_mask).get(i)) {
-            auto major_idx    = edge_partition.major_idx_from_local_edge_idx_nocheck(i);
-            auto major        = edge_partition.major_from_major_idx_nocheck(major_idx);
-            auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-            auto minor        = *(edge_partition.indices() + i);
-            auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-
-            auto src         = GraphViewType::is_storage_transposed ? minor : major;
-            auto dst         = GraphViewType::is_storage_transposed ? major : minor;
-            auto src_offset  = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-            auto dst_offset  = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-            auto e_op_result = e_op(src,
-                                    dst,
-                                    edge_partition_src_value_input.get(src_offset),
-                                    edge_partition_dst_value_input.get(dst_offset),
-                                    edge_partition_e_value_input.get(i));
-            edge_partition_e_value_output.set(i, e_op_result);
-          }
-        });
+      if (edge_partition_e_mask) {
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator(edge_t{0}),
+                         thrust::make_counting_iterator(num_edges),
+                         detail::update_e_value_t<true,
+                                                  GraphViewType,
+                                                  edge_partition_src_input_device_view_t,
+                                                  edge_partition_dst_input_device_view_t,
+                                                  edge_partition_e_input_device_view_t,
+                                                  std::remove_reference_t<decltype(*edge_partition_e_mask)>,
+                                                  EdgeOp,  // mask view is held by value
+                                                  edge_partition_e_output_device_view_t>{
+                           edge_partition,
+                           edge_partition_src_value_input,
+                           edge_partition_dst_value_input,
+                           edge_partition_e_value_input,
+                           *edge_partition_e_mask,
+                           e_op,
+                           edge_partition_e_value_output});
+      } else {
+        thrust::for_each(handle.get_thrust_policy(),
+                         thrust::make_counting_iterator(edge_t{0}),
+                         thrust::make_counting_iterator(num_edges),
+                         detail::update_e_value_t<false,
+                                                  GraphViewType,
+                                                  edge_partition_src_input_device_view_t,
+                                                  edge_partition_dst_input_device_view_t,
+                                                  edge_partition_e_input_device_view_t,
+                                                  std::byte,  // dummy
+                                                  EdgeOp,
+                                                  edge_partition_e_output_device_view_t>{
+                           edge_partition,
+                           edge_partition_src_value_input,
+                           edge_partition_dst_value_input,
+                           edge_partition_e_value_input,
+                           std::byte{},  // dummy
+                           e_op,
+                           edge_partition_e_value_output});
+      }
     }
   }
 }
@@ -467,53 +591,45 @@ void transform_e(raft::handle_t const& handle,
     auto edge_partition_e_value_output =
       edge_partition_e_output_device_view_t(edge_value_output, i);
 
-    thrust::for_each(
-      handle.get_thrust_policy(),
-      edge_first + edge_partition_offsets[i],
-      edge_first + edge_partition_offsets[i + 1],
-      [e_op,
-       edge_partition,
-       edge_partition_src_value_input,
-       edge_partition_dst_value_input,
-       edge_partition_e_value_input,
-       edge_partition_e_mask,
-       edge_partition_e_value_output] __device__(thrust::tuple<vertex_t, vertex_t> edge) {
-        auto major = thrust::get<0>(edge);
-        auto minor = thrust::get<1>(edge);
-
-        auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-        auto major_idx    = edge_partition.major_idx_from_major_nocheck(major);
-        assert(major_idx);
-
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-
-        vertex_t const* indices{nullptr};
-        edge_t edge_offset{};
-        edge_t local_degree{};
-        thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(*major_idx);
-        auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor);
-        auto upper_it = thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor);
-
-        auto src        = GraphViewType::is_storage_transposed ? minor : major;
-        auto dst        = GraphViewType::is_storage_transposed ? major : minor;
-        auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset;
-        auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset;
-
-        for (auto it = lower_it; it != upper_it; ++it) {
-          assert(*it == minor);
-          if (!edge_partition_e_mask ||
-              ((*edge_partition_e_mask).get(edge_offset + thrust::distance(indices, it)))) {
-            auto e_op_result =
-              e_op(src,
-                   dst,
-                   edge_partition_src_value_input.get(src_offset),
-                   edge_partition_dst_value_input.get(dst_offset),
-                   edge_partition_e_value_input.get(edge_offset + thrust::distance(indices, it)));
-            edge_partition_e_value_output.set(edge_offset + thrust::distance(indices, it),
-                                              e_op_result);
-          }
-        }
-      });
+    if (edge_partition_e_mask) {
+      thrust::for_each(handle.get_thrust_policy(),
+                       edge_first + edge_partition_offsets[i],
+                       edge_first + edge_partition_offsets[i + 1],
+                       detail::update_e_value_t<true,
+                                                GraphViewType,
+                                                edge_partition_src_input_device_view_t,
+                                                edge_partition_dst_input_device_view_t,
+                                                edge_partition_e_input_device_view_t,
+                                                std::remove_reference_t<decltype(*edge_partition_e_mask)>,
+                                                EdgeOp,  // mask view is held by value
+                                                edge_partition_e_output_device_view_t>{
+                         edge_partition,
+                         edge_partition_src_value_input,
+                         edge_partition_dst_value_input,
+                         edge_partition_e_value_input,
+                         *edge_partition_e_mask,
+                         e_op,
+                         edge_partition_e_value_output});
+    } else {
+      thrust::for_each(handle.get_thrust_policy(),
+                       edge_first + edge_partition_offsets[i],
+                       edge_first + edge_partition_offsets[i + 1],
+                       detail::update_e_value_t<false,
+                                                GraphViewType,
+                                                edge_partition_src_input_device_view_t,
+                                                edge_partition_dst_input_device_view_t,
+                                                edge_partition_e_input_device_view_t,
+                                                std::byte,  // dummy
+                                                EdgeOp,
+                                                edge_partition_e_output_device_view_t>{
+                         edge_partition,
+                         edge_partition_src_value_input,
+                         edge_partition_dst_value_input,
+                         edge_partition_e_value_input,
+                         std::byte{},  // dummy
+                         e_op,
+                         edge_partition_e_value_output});
+    }
   }
 }
 
diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh
index 483ab64dcd9..7acc7461268 100644
--- a/cpp/src/prims/transform_reduce_e.cuh
+++ b/cpp/src/prims/transform_reduce_e.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <prims/detail/prim_functors.cuh>
 #include <prims/property_op_utils.cuh>
 
 #include <cugraph/edge_partition_device_view.cuh>
@@ -89,48 +90,51 @@ __global__ void transform_reduce_e_hypersparse(
   while (idx < static_cast<size_t>(dcs_nzd_vertex_count)) {
     auto major =
       *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast<vertex_t>(idx)));
+    auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
     auto major_idx =
       major_start_offset + idx;  // major_offset != major_idx in the hypersparse region
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx);
-    auto sum                                        = thrust::transform_reduce(
-      thrust::seq,
-      thrust::make_counting_iterator(edge_t{0}),
-      thrust::make_counting_iterator(local_degree),
-      [&edge_partition,
-       &edge_partition_src_value_input,
-       &edge_partition_dst_value_input,
-       &edge_partition_e_value_input,
-       &edge_partition_e_mask,
-       &e_op,
-       major,
-       indices,
-       edge_offset] __device__(auto i) {
-        if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
-          auto major_offset = edge_partition.major_offset_from_major_nocheck(major);
-          auto minor        = indices[i];
-          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-          auto src          = GraphViewType::is_storage_transposed ? minor : major;
-          auto dst          = GraphViewType::is_storage_transposed ? major : minor;
-          auto src_offset   = GraphViewType::is_storage_transposed
-                                                                       ? minor_offset
-                                                                       : static_cast<vertex_t>(major_offset);
-          auto dst_offset   = GraphViewType::is_storage_transposed
-                                                                       ? static_cast<vertex_t>(major_offset)
-                                                                       : minor_offset;
-          return e_op(src,
-                      dst,
-                      edge_partition_src_value_input.get(src_offset),
-                      edge_partition_dst_value_input.get(dst_offset),
-                      edge_partition_e_value_input.get(edge_offset + i));
-        } else {
-          return e_op_result_t{};
-        }
-      },
-      e_op_result_t{},
-      edge_property_add);
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    e_op_result_t sum{};
+    if (edge_partition_e_mask) {
+      sum = thrust::transform_reduce(
+        thrust::seq,
+        thrust::make_counting_iterator(edge_t{0}),
+        thrust::make_counting_iterator(local_degree),
+        [&edge_partition_e_mask, &call_e_op, edge_offset] __device__(auto i) {
+          if ((*edge_partition_e_mask).get(edge_offset + i)) {
+            return call_e_op(i);
+          } else {
+            return e_op_result_t{};
+          }
+        },
+        e_op_result_t{},
+        edge_property_add);
+    } else {
+      sum = thrust::transform_reduce(thrust::seq,
+                                     thrust::make_counting_iterator(edge_t{0}),
+                                     thrust::make_counting_iterator(local_degree),
+                                     call_e_op,
+                                     e_op_result_t{},
+                                     edge_property_add);
+    }
 
     e_op_result_sum = edge_property_add(e_op_result_sum, sum);
     idx += gridDim.x * blockDim.x;
@@ -175,50 +179,50 @@ __global__ void transform_reduce_e_low_degree(
   property_op<e_op_result_t, thrust::plus> edge_property_add{};
   e_op_result_t e_op_result_sum{};
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
-    auto sum                                        = thrust::transform_reduce(
-      thrust::seq,
-      thrust::make_counting_iterator(edge_t{0}),
-      thrust::make_counting_iterator(local_degree),
-      [&edge_partition,
-       &edge_partition_src_value_input,
-       &edge_partition_dst_value_input,
-       &edge_partition_e_value_input,
-       &edge_partition_e_mask,
-       &e_op,
-       major_offset,
-       indices,
-       edge_offset] __device__(auto i) {
-        if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
-          auto minor        = indices[i];
-          auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-          auto src          = GraphViewType::is_storage_transposed
-                                                                       ? minor
-                                                                       : edge_partition.major_from_major_offset_nocheck(major_offset);
-          auto dst          = GraphViewType::is_storage_transposed
-                                                                       ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                                                                       : minor;
-          auto src_offset   = GraphViewType::is_storage_transposed
-                                                                       ? minor_offset
-                                                                       : static_cast<vertex_t>(major_offset);
-          auto dst_offset   = GraphViewType::is_storage_transposed
-                                                                       ? static_cast<vertex_t>(major_offset)
-                                                                       : minor_offset;
-          return e_op(src,
-                      dst,
-                      edge_partition_src_value_input.get(src_offset),
-                      edge_partition_dst_value_input.get(dst_offset),
-                      edge_partition_e_value_input.get(edge_offset + i));
-        } else {
-          return e_op_result_t{};
-        }
-      },
-      e_op_result_t{},
-      edge_property_add);
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    e_op_result_t sum{};
+    if (edge_partition_e_mask) {
+      sum = thrust::transform_reduce(
+        thrust::seq,
+        thrust::make_counting_iterator(edge_t{0}),
+        thrust::make_counting_iterator(local_degree),
+        [&edge_partition_e_mask, &call_e_op, edge_offset] __device__(auto i) {
+          if ((*edge_partition_e_mask).get(edge_offset + i)) {
+            return call_e_op(i);
+          } else {
+            return e_op_result_t{};
+          }
+        },
+        e_op_result_t{},
+        edge_property_add);
+    } else {
+      sum = thrust::transform_reduce(thrust::seq,
+                                     thrust::make_counting_iterator(edge_t{0}),
+                                     thrust::make_counting_iterator(local_degree),
+                                     call_e_op,
+                                     e_op_result_t{},
+                                     edge_property_add);
+    }
 
     e_op_result_sum = edge_property_add(e_op_result_sum, sum);
     idx += gridDim.x * blockDim.x;
@@ -264,30 +268,37 @@ __global__ void transform_reduce_e_mid_degree(
   property_op<e_op_result_t, thrust::plus> edge_property_add{};
   e_op_result_t e_op_result_sum{};
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
-    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
-      if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
-        auto minor        = indices[i];
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        auto src          = GraphViewType::is_storage_transposed
-                              ? minor
-                              : edge_partition.major_from_major_offset_nocheck(major_offset);
-        auto dst          = GraphViewType::is_storage_transposed
-                              ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                              : minor;
-        auto src_offset =
-          GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-        auto dst_offset =
-          GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-        auto e_op_result = e_op(src,
-                                dst,
-                                edge_partition_src_value_input.get(src_offset),
-                                edge_partition_dst_value_input.get(dst_offset),
-                                edge_partition_e_value_input.get(edge_offset + i));
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    if (edge_partition_e_mask) {
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          auto e_op_result = call_e_op(i);
+          e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
+        }
+      }
+    } else {
+      for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+        auto e_op_result = call_e_op(i);
         e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
       }
     }
@@ -331,30 +342,37 @@ __global__ void transform_reduce_e_high_degree(
   property_op<e_op_result_t, thrust::plus> edge_property_add{};
   e_op_result_t e_op_result_sum{};
   while (idx < static_cast<size_t>(major_range_last - major_range_first)) {
-    auto major_offset = major_start_offset + idx;
+    auto major_offset = static_cast<vertex_t>(major_start_offset + idx);
+    auto major        = edge_partition.major_from_major_offset_nocheck(major_offset);
     vertex_t const* indices{nullptr};
     edge_t edge_offset{};
     edge_t local_degree{};
     thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset);
-    for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
-      if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) {
-        auto minor        = indices[i];
-        auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor);
-        auto src          = GraphViewType::is_storage_transposed
-                              ? minor
-                              : edge_partition.major_from_major_offset_nocheck(major_offset);
-        auto dst          = GraphViewType::is_storage_transposed
-                              ? edge_partition.major_from_major_offset_nocheck(major_offset)
-                              : minor;
-        auto src_offset =
-          GraphViewType::is_storage_transposed ? minor_offset : static_cast<vertex_t>(major_offset);
-        auto dst_offset =
-          GraphViewType::is_storage_transposed ? static_cast<vertex_t>(major_offset) : minor_offset;
-        auto e_op_result = e_op(src,
-                                dst,
-                                edge_partition_src_value_input.get(src_offset),
-                                edge_partition_dst_value_input.get(dst_offset),
-                                edge_partition_e_value_input.get(edge_offset + i));
+
+    auto call_e_op = call_e_op_t<GraphViewType,
+                                 EdgePartitionSrcValueInputWrapper,
+                                 EdgePartitionDstValueInputWrapper,
+                                 EdgePartitionEdgeValueInputWrapper,
+                                 EdgeOp>{edge_partition,
+                                         edge_partition_src_value_input,
+                                         edge_partition_dst_value_input,
+                                         edge_partition_e_value_input,
+                                         e_op,
+                                         major,
+                                         major_offset,
+                                         indices,
+                                         edge_offset};
+
+    if (edge_partition_e_mask) {
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        if ((*edge_partition_e_mask).get(edge_offset + i)) {
+          auto e_op_result = call_e_op(i);
+          e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
+        }
+      }
+    } else {
+      for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+        auto e_op_result = call_e_op(i);
         e_op_result_sum  = edge_property_add(e_op_result_sum, e_op_result);
       }
     }

From f63310c2f4d538c2872de1b355c1735f6de87406 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 12 Feb 2024 11:46:38 -0600
Subject: [PATCH 6/7] Add cuda-nvtx-dev, add missing CUDA library dependencies.
 (#4162)

This PR fixes issues with devcontainer builds where `cuda-nvtx-dev` was missing when building `libcugraph_etl`.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cugraph/pull/4162
---
 conda/environments/all_cuda-118_arch-x86_64.yaml |  1 +
 conda/environments/all_cuda-120_arch-x86_64.yaml |  7 +++++++
 dependencies.yaml                                | 10 +++++++++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 43bc60d91fb..de8db94df1c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -14,6 +14,7 @@ dependencies:
 - breathe
 - c-compiler
 - cmake>=3.26.4
+- cuda-nvtx
 - cuda-version=11.8
 - cudatoolkit
 - cudf==24.4.*
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 1829f3dd860..03dade0ed1f 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -14,7 +14,10 @@ dependencies:
 - breathe
 - c-compiler
 - cmake>=3.26.4
+- cuda-cudart-dev
 - cuda-nvcc
+- cuda-nvtx-dev
+- cuda-profiler-api
 - cuda-version=12.0
 - cudf==24.4.*
 - cupy>=12.0.0
@@ -29,8 +32,12 @@ dependencies:
 - graphviz
 - gtest>=1.13.0
 - ipython
+- libcublas-dev
 - libcudf==24.4.*
 - libcugraphops==24.4.*
+- libcurand-dev
+- libcusolver-dev
+- libcusparse-dev
 - libraft-headers==24.4.*
 - libraft==24.4.*
 - librmm==24.4.*
diff --git a/dependencies.yaml b/dependencies.yaml
index cfefe3b9ff9..58354407bbc 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -327,10 +327,18 @@ dependencies:
           - matrix:
               cuda: "12.*"
             packages:
+              - cuda-cudart-dev
+              - cuda-nvtx-dev
+              - cuda-profiler-api
+              - libcublas-dev
+              - libcurand-dev
+              - libcusolver-dev
+              - libcusparse-dev
           - matrix:
               cuda: "11.*"
             packages:
               - cudatoolkit
+              - cuda-nvtx
   common_build:
     common:
       - output_types: [conda, pyproject]
@@ -345,6 +353,7 @@ dependencies:
           - cxx-compiler
           - gmock>=1.13.0
           - gtest>=1.13.0
+          - libcudf==24.4.*
           - libcugraphops==24.4.*
           - libraft-headers==24.4.*
           - libraft==24.4.*
@@ -438,7 +447,6 @@ dependencies:
         packages:
           - aiohttp
           - fsspec>=0.6.0
-          - libcudf==24.4.*
           - requests
           - nccl>=2.9.9
           - ucx-proc=*=gpu

From eb096ac5ba7831112d6dd39a75be7c4ce9328b21 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Mon, 12 Feb 2024 15:44:12 -0500
Subject: [PATCH 7/7] Update Changelog [skip ci]

---
 CHANGELOG.md | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d165cd7efc4..fe08c8aeb03 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,84 @@
+# cuGraph 24.02.00 (12 Feb 2024)
+
+## 🚨 Breaking Changes
+
+- Remove Experimental Wrappers from GNN Code ([#4070](https://github.com/rapidsai/cugraph/pull/4070)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- Switch to scikit-build-core ([#4053](https://github.com/rapidsai/cugraph/pull/4053)) [@vyasr](https://github.com/vyasr)
+- Update to CCCL 2.2.0. ([#4052](https://github.com/rapidsai/cugraph/pull/4052)) [@bdice](https://github.com/bdice)
+
+## 🐛 Bug Fixes
+
+- Revert &quot;Exclude tests from builds (#4147)&quot; ([#4157](https://github.com/rapidsai/cugraph/pull/4157)) [@raydouglass](https://github.com/raydouglass)
+- Exclude tests from builds ([#4147](https://github.com/rapidsai/cugraph/pull/4147)) [@vyasr](https://github.com/vyasr)
+- Constraint pytorch-dependent wheel test to only run on amd64 ([#4133](https://github.com/rapidsai/cugraph/pull/4133)) [@tingyu66](https://github.com/tingyu66)
+- Removes the `networkx_algorithm` decorator to all SCC functions to disable dispatching to them ([#4120](https://github.com/rapidsai/cugraph/pull/4120)) [@rlratzel](https://github.com/rlratzel)
+- Correct `cugraph-pyg` package name used in wheels and fix test script ([#4083](https://github.com/rapidsai/cugraph/pull/4083)) [@tingyu66](https://github.com/tingyu66)
+- Fix Jaccard hang ([#4080](https://github.com/rapidsai/cugraph/pull/4080)) [@jnke2016](https://github.com/jnke2016)
+- Fix OOB error, BFS C API should validate that the source vertex is a valid vertex ([#4077](https://github.com/rapidsai/cugraph/pull/4077)) [@ChuckHastings](https://github.com/ChuckHastings)
+- [BUG]Fix non-type template parameter to cugraph::relabel ([#4064](https://github.com/rapidsai/cugraph/pull/4064)) [@naimnv](https://github.com/naimnv)
+- Fix MG weighted similarity test failure ([#4054](https://github.com/rapidsai/cugraph/pull/4054)) [@seunghwak](https://github.com/seunghwak)
+- MG C-API test failure fixes ([#4047](https://github.com/rapidsai/cugraph/pull/4047)) [@seunghwak](https://github.com/seunghwak)
+- Add a barrier before cugraph Graph creation ([#4046](https://github.com/rapidsai/cugraph/pull/4046)) [@VibhuJawa](https://github.com/VibhuJawa)
+- Fix % 0 bug in MG_SELECT_RANDOM_VERTICES test ([#4034](https://github.com/rapidsai/cugraph/pull/4034)) [@seunghwak](https://github.com/seunghwak)
+- Branch 24.02 merge branch 23.12 ([#4012](https://github.com/rapidsai/cugraph/pull/4012)) [@vyasr](https://github.com/vyasr)
+
+## 📖 Documentation
+
+- Updates nx-cugraph README.md with latest algos ([#4135](https://github.com/rapidsai/cugraph/pull/4135)) [@rlratzel](https://github.com/rlratzel)
+- corrected links in C API and added groups for support functions ([#4131](https://github.com/rapidsai/cugraph/pull/4131)) [@acostadon](https://github.com/acostadon)
+- Forward-merge branch-23.12 to branch-24.02 ([#4049](https://github.com/rapidsai/cugraph/pull/4049)) [@GPUtester](https://github.com/GPUtester)
+
+## 🚀 New Features
+
+- Implement has_edge() &amp; compute_multiplicity() ([#4096](https://github.com/rapidsai/cugraph/pull/4096)) [@seunghwak](https://github.com/seunghwak)
+- Update per_v_transform_reduce_incoming|outgoing_e to support edge masking ([#4085](https://github.com/rapidsai/cugraph/pull/4085)) [@seunghwak](https://github.com/seunghwak)
+- Remove Experimental Wrappers from GNN Code ([#4070](https://github.com/rapidsai/cugraph/pull/4070)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+- MNMG ECG ([#4030](https://github.com/rapidsai/cugraph/pull/4030)) [@naimnv](https://github.com/naimnv)
+- Replace graph_view.hpp::number_of_edges with compute_number_of_edges ([#4026](https://github.com/rapidsai/cugraph/pull/4026)) [@seunghwak](https://github.com/seunghwak)
+- Update count_if_e, transform_reduce_e, and transform_e to support edge masking ([#4001](https://github.com/rapidsai/cugraph/pull/4001)) [@seunghwak](https://github.com/seunghwak)
+- Sampling Performance Testing ([#3584](https://github.com/rapidsai/cugraph/pull/3584)) [@alexbarghi-nv](https://github.com/alexbarghi-nv)
+
+## 🛠️ Improvements
+
+- Adds option to rapids_cpm_find for raft to disable hnswlib feature, adds updates for pytest 8 compat, temporarily skips IO intensive test in CI ([#4121](https://github.com/rapidsai/cugraph/pull/4121)) [@rlratzel](https://github.com/rlratzel)
+- Adds benchmarks for additional nx-cugraph 24.02 algos ([#4112](https://github.com/rapidsai/cugraph/pull/4112)) [@rlratzel](https://github.com/rlratzel)
+- nx-cugraph: use coverage to ensure all algorithms were run ([#4108](https://github.com/rapidsai/cugraph/pull/4108)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: rename `plc=` to `_plc=` ([#4106](https://github.com/rapidsai/cugraph/pull/4106)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: add `complement` and `reverse` ([#4103](https://github.com/rapidsai/cugraph/pull/4103)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: add `core_number` (undirected graphs only) ([#4100](https://github.com/rapidsai/cugraph/pull/4100)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: add `is_tree`, etc. ([#4097](https://github.com/rapidsai/cugraph/pull/4097)) [@eriknw](https://github.com/eriknw)
+- Optimize the drop-duplicate functionality ([#4095](https://github.com/rapidsai/cugraph/pull/4095)) [@jnke2016](https://github.com/jnke2016)
+- nx-cugraph: add triangles and clustering algorithms ([#4093](https://github.com/rapidsai/cugraph/pull/4093)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: PLC now handles isolated nodes; clean up our workarounds ([#4092](https://github.com/rapidsai/cugraph/pull/4092)) [@eriknw](https://github.com/eriknw)
+- Remove usages of rapids-env-update ([#4090](https://github.com/rapidsai/cugraph/pull/4090)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Provide explicit pool sizes and avoid RMM detail APIs ([#4086](https://github.com/rapidsai/cugraph/pull/4086)) [@harrism](https://github.com/harrism)
+- refactor CUDA versions in dependencies.yaml ([#4084](https://github.com/rapidsai/cugraph/pull/4084)) [@jameslamb](https://github.com/jameslamb)
+- build wheels for `cugraph-dgl` and `cugraph-pyg` ([#4075](https://github.com/rapidsai/cugraph/pull/4075)) [@tingyu66](https://github.com/tingyu66)
+- Match weight-sharing option of GATConv in DGL ([#4074](https://github.com/rapidsai/cugraph/pull/4074)) [@tingyu66](https://github.com/tingyu66)
+- nx-cugraph: add weakly connected components ([#4071](https://github.com/rapidsai/cugraph/pull/4071)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: indicate which plc algorithms are used and version_added ([#4069](https://github.com/rapidsai/cugraph/pull/4069)) [@eriknw](https://github.com/eriknw)
+- Adds `nx-cugraph` benchmarks for 23.12 algos (SSSP, pagerank, hits, katz_centrality, degree_centrality, eigenvector_centrality) ([#4065](https://github.com/rapidsai/cugraph/pull/4065)) [@rlratzel](https://github.com/rlratzel)
+- `nx-cugraph`: add `to_undirected` method; add reciprocity algorithms ([#4063](https://github.com/rapidsai/cugraph/pull/4063)) [@eriknw](https://github.com/eriknw)
+- Switch to scikit-build-core ([#4053](https://github.com/rapidsai/cugraph/pull/4053)) [@vyasr](https://github.com/vyasr)
+- Update to CCCL 2.2.0. ([#4052](https://github.com/rapidsai/cugraph/pull/4052)) [@bdice](https://github.com/bdice)
+- Prevent `actions/labeler` from adding `Label Checker` labels ([#4048](https://github.com/rapidsai/cugraph/pull/4048)) [@ajschmidt8](https://github.com/ajschmidt8)
+- Update dependencies.yaml to new pip index ([#4045](https://github.com/rapidsai/cugraph/pull/4045)) [@vyasr](https://github.com/vyasr)
+- Remove checks for Pascal, no longer supported ([#4044](https://github.com/rapidsai/cugraph/pull/4044)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Fix HITS convergence error. ([#4043](https://github.com/rapidsai/cugraph/pull/4043)) [@seunghwak](https://github.com/seunghwak)
+- Test select_random_vertices for all possible values of flags ([#4042](https://github.com/rapidsai/cugraph/pull/4042)) [@naimnv](https://github.com/naimnv)
+- Remove CUGRAPH_BUILD_WHEELS and standardize Python builds ([#4041](https://github.com/rapidsai/cugraph/pull/4041)) [@vyasr](https://github.com/vyasr)
+- Create `cugraph-equivariant` package ([#4036](https://github.com/rapidsai/cugraph/pull/4036)) [@tingyu66](https://github.com/tingyu66)
+- [FEA]: Add DASK edgelist and graph support to the Dataset API ([#4035](https://github.com/rapidsai/cugraph/pull/4035)) [@huiyuxie](https://github.com/huiyuxie)
+- Add support for Louvain to MTMG ([#4033](https://github.com/rapidsai/cugraph/pull/4033)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Clean up self-loop and multi-edge removal logic ([#4032](https://github.com/rapidsai/cugraph/pull/4032)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Mtmg updates for rmm ([#4031](https://github.com/rapidsai/cugraph/pull/4031)) [@ChuckHastings](https://github.com/ChuckHastings)
+- nx-cugraph: adds `ancestors`, `descendants`, and BFS algos ([#4029](https://github.com/rapidsai/cugraph/pull/4029)) [@eriknw](https://github.com/eriknw)
+- nx-cugraph: update usage of `nodes_or_number` for nx compat ([#4028](https://github.com/rapidsai/cugraph/pull/4028)) [@eriknw](https://github.com/eriknw)
+- Removes unsupported `setup.py` calls, cleans up text ([#4024](https://github.com/rapidsai/cugraph/pull/4024)) [@rlratzel](https://github.com/rlratzel)
+- Resolves conflicts from forward-merging branch-23.12 into branch-24.02 ([#4020](https://github.com/rapidsai/cugraph/pull/4020)) [@rlratzel](https://github.com/rlratzel)
+- Add `HeteroGATConv` to `cugraph-pyg` ([#3914](https://github.com/rapidsai/cugraph/pull/3914)) [@tingyu66](https://github.com/tingyu66)
+- Update for CCCL 2.x ([#3862](https://github.com/rapidsai/cugraph/pull/3862)) [@seunghwak](https://github.com/seunghwak)
+
 # cuGraph 23.12.00 (6 Dec 2023)
 
 ## 🚨 Breaking Changes