From 65df1a271011b65c039eac354259250b1e96b5d1 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:35:17 -0800
Subject: [PATCH 1/4] Fix HITS convergence error. (#4043)

We set epsilon in hits call to 1e-6 or 1e-8 in our tests. HITS internally uses max. norm to normalize HITS values after each iteration before computing HITS value changes in two consecutive iterations. Sum of HITS values tends to grow with the number of vertices. Using a fixed epsilon leads to convergence failure in large graphs.

This PR updates HITS to compare sum of HITS value changes in two consecutive iterations with `epsilon` * graph_view.number_of_vertices() following networkx documentation (https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.hits_alg.hits.html).

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Joseph Nke (https://github.com/jnke2016)
  - Naim (https://github.com/naimnv)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/4043
---
 cpp/src/link_analysis/hits_impl.cuh      |  3 ++-
 cpp/tests/c_api/hits_test.c              |  6 ++---
 cpp/tests/c_api/mg_hits_test.c           |  4 ++--
 cpp/tests/link_analysis/hits_test.cpp    | 28 ++++++++++++++----------
 cpp/tests/link_analysis/mg_hits_test.cpp | 18 +++++++--------
 5 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/cpp/src/link_analysis/hits_impl.cuh b/cpp/src/link_analysis/hits_impl.cuh
index 674046745b1..5cdf1b9dc6a 100644
--- a/cpp/src/link_analysis/hits_impl.cuh
+++ b/cpp/src/link_analysis/hits_impl.cuh
@@ -80,6 +80,7 @@ std::tuple<result_t, size_t> hits(raft::handle_t const& handle,
   if (num_vertices == 0) { return std::make_tuple(diff_sum, final_iteration_count); }
 
   CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative.");
+  auto tolerance = static_cast<result_t>(graph_view.number_of_vertices()) * epsilon;
 
   // Check validity of initial guess if supplied
   if (has_initial_hubs_guess && do_expensive_check) {
@@ -171,7 +172,7 @@ std::tuple<result_t, size_t> hits(raft::handle_t const& handle,
     std::swap(prev_hubs, curr_hubs);
     iter++;
 
-    if (diff_sum < epsilon) {
+    if (diff_sum < tolerance) {
       break;
     } else if (iter >= max_iterations) {
       CUGRAPH_FAIL("HITS failed to converge.");
diff --git a/cpp/tests/c_api/hits_test.c b/cpp/tests/c_api/hits_test.c
index c275d883d11..1ebd4f82a51 100644
--- a/cpp/tests/c_api/hits_test.c
+++ b/cpp/tests/c_api/hits_test.c
@@ -163,7 +163,7 @@ int test_hits()
   weight_t h_hubs[]        = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0};
   weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   // hits wants store_transposed = TRUE
@@ -195,7 +195,7 @@ int test_hits_with_transpose()
   weight_t h_hubs[]        = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0};
   weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   // Hits wants store_transposed = TRUE
@@ -232,7 +232,7 @@ int test_hits_with_initial()
   vertex_t h_initial_vertices[] = {0, 1, 2, 3, 4};
   weight_t h_initial_hubs[]     = {0.347296, 0.532089, 1, 0.00003608, 0.00003608};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   return generic_hits_test(h_src,
diff --git a/cpp/tests/c_api/mg_hits_test.c b/cpp/tests/c_api/mg_hits_test.c
index 87371093613..3e10bfc05d6 100644
--- a/cpp/tests/c_api/mg_hits_test.c
+++ b/cpp/tests/c_api/mg_hits_test.c
@@ -171,7 +171,7 @@ int test_hits(const cugraph_resource_handle_t* handle)
   weight_t h_hubs[]        = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0};
   weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   // hits wants store_transposed = TRUE
@@ -203,7 +203,7 @@ int test_hits_with_transpose(const cugraph_resource_handle_t* handle)
   weight_t h_hubs[]        = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0};
   weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136};
 
-  double epsilon        = 0.0001;
+  double epsilon        = 0.00002;
   size_t max_iterations = 20;
 
   // Hits wants store_transposed = TRUE
diff --git a/cpp/tests/link_analysis/hits_test.cpp b/cpp/tests/link_analysis/hits_test.cpp
index d0e77769034..6796761e212 100644
--- a/cpp/tests/link_analysis/hits_test.cpp
+++ b/cpp/tests/link_analysis/hits_test.cpp
@@ -52,9 +52,11 @@ std::tuple<std::vector<result_t>, std::vector<result_t>, double, size_t> hits_re
   size_t max_iterations,
   std::optional<result_t const*> starting_hub_values,
   bool normalized,
-  double tolerance)
+  double epsilon)
 {
   CUGRAPH_EXPECTS(num_vertices > 1, "number of vertices expected to be non-zero");
+  auto tolerance = static_cast<result_t>(num_vertices) * epsilon;
+
   std::vector<result_t> prev_hubs(num_vertices, result_t{1.0} / num_vertices);
   std::vector<result_t> prev_authorities(num_vertices, result_t{1.0} / num_vertices);
   std::vector<result_t> curr_hubs(num_vertices);
@@ -127,8 +129,8 @@ std::tuple<std::vector<result_t>, std::vector<result_t>, double, size_t> hits_re
 }
 
 struct Hits_Usecase {
-  bool check_correctness{true};
   bool check_initial_input{false};
+  bool check_correctness{true};
 };
 
 template <typename input_usecase_t>
@@ -175,8 +177,8 @@ class Tests_Hits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, inpu
     // 3. run hits
 
     auto graph_view         = graph.view();
-    auto maximum_iterations = 500;
-    weight_t tolerance      = 1e-5;
+    auto maximum_iterations = 200;
+    weight_t epsilon        = 1e-7;
     rmm::device_uvector<weight_t> d_hubs(graph_view.local_vertex_partition_range_size(),
                                          handle.get_stream());
 
@@ -201,7 +203,7 @@ class Tests_Hits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, inpu
                                 graph_view,
                                 d_hubs.data(),
                                 d_authorities.data(),
-                                tolerance,
+                                epsilon,
                                 maximum_iterations,
                                 hits_usecase.check_initial_input,
                                 true,
@@ -232,7 +234,7 @@ class Tests_Hits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, inpu
         (hits_usecase.check_initial_input) ? std::make_optional(initial_random_hubs.data())
                                            : std::nullopt,
         true,
-        tolerance);
+        epsilon);
 
       std::vector<weight_t> h_cugraph_hits{};
       if (renumber) {
@@ -246,8 +248,7 @@ class Tests_Hits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, inpu
       handle.sync_stream();
       auto threshold_ratio = 1e-3;
       auto threshold_magnitude =
-        (1.0 / static_cast<weight_t>(graph_view.number_of_vertices())) *
-        threshold_ratio;  // skip comparison for low hits vertices (lowly ranked vertices)
+        1e-6;  // skip comparison for low hits vertices (lowly ranked vertices)
       auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
         return std::abs(lhs - rhs) <=
                std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
@@ -294,14 +295,17 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_Hits_File,
   ::testing::Combine(
     // enable correctness checks
-    ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}),
+    ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
+                      cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"),
                       cugraph::test::File_Usecase("test/datasets/dolphins.mtx"))));
 
 INSTANTIATE_TEST_SUITE_P(rmat_small_test,
                          Tests_Hits_Rmat,
                          // enable correctness checks
-                         ::testing::Combine(::testing::Values(Hits_Usecase{true, false},
+                         ::testing::Combine(::testing::Values(Hits_Usecase{false, true},
                                                               Hits_Usecase{true, true}),
                                             ::testing::Values(cugraph::test::Rmat_Usecase(
                                               10, 16, 0.57, 0.19, 0.19, 0, false, false))));
@@ -315,7 +319,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_Hits_File,
   ::testing::Combine(
     // disable correctness checks
-    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}),
+    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
 
 INSTANTIATE_TEST_SUITE_P(
@@ -327,7 +331,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_Hits_Rmat,
   // disable correctness checks for large graphs
   ::testing::Combine(
-    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}),
+    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp
index cf95d03681d..5c89bafd08e 100644
--- a/cpp/tests/link_analysis/mg_hits_test.cpp
+++ b/cpp/tests/link_analysis/mg_hits_test.cpp
@@ -33,8 +33,8 @@
 #include <gtest/gtest.h>
 
 struct Hits_Usecase {
-  bool check_correctness{true};
   bool check_initial_input{false};
+  bool check_correctness{true};
 };
 
 template <typename input_usecase_t>
@@ -81,7 +81,7 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
     auto mg_graph_view = mg_graph.view();
 
     auto maximum_iterations = 200;
-    weight_t tolerance      = 1e-8;
+    weight_t epsilon        = 1e-7;
     rmm::device_uvector<weight_t> d_mg_hubs(mg_graph_view.local_vertex_partition_range_size(),
                                             handle_->get_stream());
 
@@ -110,7 +110,7 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
                                 mg_graph_view,
                                 d_mg_hubs.data(),
                                 d_mg_authorities.data(),
-                                tolerance,
+                                epsilon,
                                 maximum_iterations,
                                 hits_usecase.check_initial_input,
                                 true,
@@ -205,7 +205,7 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
                                     sg_graph_view,
                                     d_sg_hubs.data(),
                                     d_sg_authorities.data(),
-                                    tolerance,
+                                    epsilon,
                                     maximum_iterations,
                                     hits_usecase.check_initial_input,
                                     true,
@@ -218,9 +218,7 @@ class Tests_MGHits : public ::testing::TestWithParam<std::tuple<Hits_Usecase, in
 
         auto threshold_ratio = 1e-3;
         auto threshold_magnitude =
-          (1.0 / static_cast<result_t>(mg_graph_view.number_of_vertices())) *
-          threshold_ratio;  // skip comparison for low Hits verties (lowly ranked
-                            // vertices)
+          1e-6;  // skip comparison for low Hits verties (lowly ranked vertices)
         auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
           return std::abs(lhs - rhs) <
                  std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
@@ -274,7 +272,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_File,
   ::testing::Combine(
     // enable correctness checks
-    ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}),
+    ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}),
     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                       cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                       cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -285,7 +283,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_Rmat,
   ::testing::Combine(
     // enable correctness checks
-    ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}),
+    ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}),
     ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
 INSTANTIATE_TEST_SUITE_P(
@@ -297,7 +295,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_Rmat,
   ::testing::Combine(
     // disable correctness checks for large graphs
-    ::testing::Values(Hits_Usecase{false, false}),
+    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}),
     ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()

From c709fc9c4e8d1a434a567b3b2c407f3d51ea9030 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 6 Dec 2023 12:52:59 -0800
Subject: [PATCH 2/4] Remove CUGRAPH_BUILD_WHEELS and standardize Python builds
 (#4041)

Some minor simplification in advance of the scikit-build-core migration to better align wheel and non-wheel Python builds.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Brad Rees (https://github.com/BradReesWork)

URL: https://github.com/rapidsai/cugraph/pull/4041
---
 ci/build_wheel_cugraph.sh          |  2 +-
 ci/build_wheel_pylibcugraph.sh     |  2 +-
 python/cugraph/CMakeLists.txt      | 25 ++++++++-----------------
 python/pylibcugraph/CMakeLists.txt | 25 ++++++++-----------------
 4 files changed, 18 insertions(+), 36 deletions(-)

diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh
index 5b5061f67c2..0a722c88c3e 100755
--- a/ci/build_wheel_cugraph.sh
+++ b/ci/build_wheel_cugraph.sh
@@ -12,6 +12,6 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME=pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibcugraph
 export PIP_FIND_LINKS=$(pwd)/local-pylibcugraph
 
-export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/"
+export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/"
 
 ./ci/build_wheel.sh cugraph python/cugraph
diff --git a/ci/build_wheel_pylibcugraph.sh b/ci/build_wheel_pylibcugraph.sh
index 8d365bc250b..9e236c145ce 100755
--- a/ci/build_wheel_pylibcugraph.sh
+++ b/ci/build_wheel_pylibcugraph.sh
@@ -3,6 +3,6 @@
 
 set -euo pipefail
 
-export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/"
+export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/"
 
 ./ci/build_wheel.sh pylibcugraph python/pylibcugraph
diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt
index a1ec12c6e07..99936b23a8c 100644
--- a/python/cugraph/CMakeLists.txt
+++ b/python/cugraph/CMakeLists.txt
@@ -38,7 +38,6 @@ project(
 option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files"
        OFF
 )
-option(CUGRAPH_BUILD_WHEELS "Whether this build is generating a Python wheel." OFF)
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
 
 if(NOT USE_CUGRAPH_OPS)
@@ -59,22 +58,14 @@ if(NOT cugraph_FOUND)
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
-
-  set(_exclude_from_all "")
-  if(CUGRAPH_BUILD_WHEELS)
-    # Statically link dependencies if building wheels
-    set(CUDA_STATIC_RUNTIME ON)
-    set(USE_RAFT_STATIC ON)
-    set(CUGRAPH_COMPILE_RAFT_LIB ON)
-    set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
-    set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
-    set(ALLOW_CLONE_CUGRAPH_OPS ON)
-
-    # Don't install the cuML C++ targets into wheels
-    set(_exclude_from_all EXCLUDE_FROM_ALL)
-  endif()
-
-  add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all})
+  set(CUDA_STATIC_RUNTIME ON)
+  set(USE_RAFT_STATIC ON)
+  set(CUGRAPH_COMPILE_RAFT_LIB ON)
+  set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
+  set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
+  set(ALLOW_CLONE_CUGRAPH_OPS ON)
+
+  add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
   set(cython_lib_dir cugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})
diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt
index 7d5dc790ad0..e1250cb2edb 100644
--- a/python/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/CMakeLists.txt
@@ -38,7 +38,6 @@ project(
 option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files"
        OFF
 )
-option(CUGRAPH_BUILD_WHEELS "Whether we're building a wheel for pypi" OFF)
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
 
 if(NOT USE_CUGRAPH_OPS)
@@ -59,22 +58,14 @@ if (NOT cugraph_FOUND)
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
-
-  set(_exclude_from_all "")
-  if(CUGRAPH_BUILD_WHEELS)
-    # Statically link dependencies if building wheels
-    set(CUDA_STATIC_RUNTIME ON)
-    set(USE_RAFT_STATIC ON)
-    set(CUGRAPH_COMPILE_RAFT_LIB ON)
-    set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
-    set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
-    set(ALLOW_CLONE_CUGRAPH_OPS ON)
-
-    # Don't install the cuML C++ targets into wheels
-    set(_exclude_from_all EXCLUDE_FROM_ALL)
-  endif()
-
-  add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all})
+  set(CUDA_STATIC_RUNTIME ON)
+  set(USE_RAFT_STATIC ON)
+  set(CUGRAPH_COMPILE_RAFT_LIB ON)
+  set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
+  set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
+  set(ALLOW_CLONE_CUGRAPH_OPS ON)
+
+  add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
   set(cython_lib_dir pylibcugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})

From 1df62176c3d5a8addaad2a910c489c7426e8f6a4 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 6 Dec 2023 16:12:44 -0800
Subject: [PATCH 3/4] Update dependencies.yaml to new pip index (#4045)

This PR changes all references to pypi.nvidia.com to pypi.anaconda.org/rapidsai-wheels-nightly.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cugraph/pull/4045
---
 dependencies.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/dependencies.yaml b/dependencies.yaml
index baa08c37413..b5c1fb2fa2d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -511,6 +511,7 @@ dependencies:
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
@@ -536,6 +537,7 @@ dependencies:
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
@@ -561,6 +563,7 @@ dependencies:
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
@@ -586,6 +589,7 @@ dependencies:
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
@@ -611,6 +615,7 @@ dependencies:
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
@@ -636,6 +641,7 @@ dependencies:
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:
@@ -661,6 +667,7 @@ dependencies:
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
           - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
     specific:
       - output_types: [requirements, pyproject]
         matrices:

From 18ab76b73680e9269be89728613cb55081f9833e Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Fri, 8 Dec 2023 13:41:24 -0800
Subject: [PATCH 4/4] Update for CCCL 2.x (#3862)

This PR adds `cuda::proclaim_return_type` to device lambdas used in `thrust::transform` and `thrust::make_transform_iterator`.

This PR requires libcudacxx 2.1.0, which was provided by https://github.com/rapidsai/rapids-cmake/pull/464.

Closes #3863.

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/3862
---
 cpp/include/cugraph/utilities/misc_utils.cuh  |  5 +-
 .../cugraph/utilities/shuffle_comm.cuh        | 48 ++++++-----
 cpp/src/community/detail/mis_impl.cuh         | 16 ++--
 cpp/src/community/detail/refine_impl.cuh      | 39 +++++----
 cpp/src/detail/collect_local_vertex_values.cu |  5 +-
 cpp/src/generators/erdos_renyi_generator.cu   | 24 +++---
 cpp/src/generators/simple_generators.cu       | 39 +++++----
 cpp/src/mtmg/vertex_result.cu                 | 10 ++-
 ...r_v_random_select_transform_outgoing_e.cuh | 12 ++-
 ...v_transform_reduce_incoming_outgoing_e.cuh | 25 +++---
 .../prims/update_edge_src_dst_property.cuh    | 82 +++++++++++--------
 cpp/src/sampling/random_walks.cuh             | 23 ++++--
 .../sampling_post_processing_impl.cuh         | 12 ++-
 cpp/src/structure/graph_view_impl.cuh         |  6 +-
 .../sampling/sampling_post_processing_test.cu | 56 +++++++------
 15 files changed, 235 insertions(+), 167 deletions(-)

diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh
index 28e2853727f..04aeac49c9d 100644
--- a/cpp/include/cugraph/utilities/misc_utils.cuh
+++ b/cpp/include/cugraph/utilities/misc_utils.cuh
@@ -26,6 +26,8 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/optional.h>
 
+#include <cuda/functional>
+
 #include <optional>
 #include <tuple>
 #include <vector>
@@ -44,7 +46,8 @@ std::tuple<std::vector<vertex_t>, std::vector<edge_t>> compute_offset_aligned_ed
 {
   auto search_offset_first = thrust::make_transform_iterator(
     thrust::make_counting_iterator(size_t{1}),
-    [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; });
+    cuda::proclaim_return_type<size_t>(
+      [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; }));
   auto num_chunks = (num_edges + approx_edge_chunk_size - 1) / approx_edge_chunk_size;
 
   if (num_chunks > 1) {
diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index ab6a54cc1c0..414d9b36992 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -40,6 +40,8 @@
 #include <thrust/tabulate.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <numeric>
 #include <vector>
@@ -197,12 +199,13 @@ void multi_partition(ValueIterator value_first,
     value_last,
     thrust::make_zip_iterator(
       thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
-    [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) {
-      auto group_id = value_to_group_id_op(value);
-      cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
-      return thrust::make_tuple(group_id,
-                                counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
-    });
+    cuda::proclaim_return_type<thrust::tuple<int, size_t>>(
+      [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) {
+        auto group_id = value_to_group_id_op(value);
+        cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
+        return thrust::make_tuple(group_id,
+                                  counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
+      }));
 
   rmm::device_uvector<size_t> displacements(num_groups, stream_view);
   thrust::exclusive_scan(
@@ -245,17 +248,19 @@ void multi_partition(KeyIterator key_first,
   rmm::device_uvector<int> group_ids(num_keys, stream_view);
   rmm::device_uvector<size_t> intra_partition_offsets(num_keys, stream_view);
   thrust::fill(rmm::exec_policy(stream_view), counts.begin(), counts.end(), size_t{0});
-  thrust::transform(rmm::exec_policy(stream_view),
-                    key_first,
-                    key_last,
-                    thrust::make_zip_iterator(
-                      thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
-                    [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) {
-                      auto group_id = key_to_group_id_op(key);
-                      cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
-                      return thrust::make_tuple(
-                        group_id, counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
-                    });
+  thrust::transform(
+    rmm::exec_policy(stream_view),
+    key_first,
+    key_last,
+    thrust::make_zip_iterator(
+      thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())),
+    cuda::proclaim_return_type<thrust::tuple<int, size_t>>(
+      [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) {
+        auto group_id = key_to_group_id_op(key);
+        cuda::std::atomic_ref<size_t> counter(counts[group_id - group_first]);
+        return thrust::make_tuple(group_id,
+                                  counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed));
+      }));
 
   rmm::device_uvector<size_t> displacements(num_groups, stream_view);
   thrust::exclusive_scan(
@@ -761,8 +766,9 @@ rmm::device_uvector<size_t> groupby_and_count(ValueIterator tx_value_first /* [I
                              stream_view);
 
   auto group_id_first = thrust::make_transform_iterator(
-    tx_value_first,
-    [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); });
+    tx_value_first, cuda::proclaim_return_type<int>([value_to_group_id_op] __device__(auto value) {
+      return value_to_group_id_op(value);
+    }));
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
   auto rank_count_pair_first = thrust::make_zip_iterator(
@@ -795,7 +801,9 @@ rmm::device_uvector<size_t> groupby_and_count(VertexIterator tx_key_first /* [IN
                              stream_view);
 
   auto group_id_first = thrust::make_transform_iterator(
-    tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); });
+    tx_key_first, cuda::proclaim_return_type<int>([key_to_group_id_op] __device__(auto key) {
+      return key_to_group_id_op(key);
+    }));
   rmm::device_uvector<int> d_tx_dst_ranks(num_groups, stream_view);
   rmm::device_uvector<size_t> d_tx_value_counts(d_tx_dst_ranks.size(), stream_view);
   auto rank_count_pair_first = thrust::make_zip_iterator(
diff --git a/cpp/src/community/detail/mis_impl.cuh b/cpp/src/community/detail/mis_impl.cuh
index bcd71af5a08..2659a982183 100644
--- a/cpp/src/community/detail/mis_impl.cuh
+++ b/cpp/src/community/detail/mis_impl.cuh
@@ -37,6 +37,8 @@
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 
+#include <cuda/functional>
+
 #include <cmath>
 
 namespace cugraph {
@@ -78,13 +80,13 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
   thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
 
   // Set ranks of zero out-degree vetices to std::numeric_limits<vertex_t>::lowest()
-  thrust::transform_if(
-    handle.get_thrust_policy(),
-    out_degrees.begin(),
-    out_degrees.end(),
-    ranks.begin(),
-    [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); },
-    [] __device__(auto deg) { return deg == 0; });
+  thrust::transform_if(handle.get_thrust_policy(),
+                       out_degrees.begin(),
+                       out_degrees.end(),
+                       ranks.begin(),
+                       cuda::proclaim_return_type<vertex_t>(
+                         [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
+                       [] __device__(auto deg) { return deg == 0; });
 
   out_degrees.resize(0, handle.get_stream());
   out_degrees.shrink_to_fit(handle.get_stream());
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index ebaae498d04..eb874657f01 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -46,6 +46,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 CUCO_DECLARE_BITWISE_COMPARABLE(float)
 CUCO_DECLARE_BITWISE_COMPARABLE(double)
 // FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched.
@@ -213,16 +215,17 @@ refine_clustering(
                                 : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
                                     louvain_assignment_of_vertices.data(), vertex_t{0}),
     *edge_weight_view,
-    [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) {
-      weight_t weighted_cut_contribution{0};
+    cuda::proclaim_return_type<weight_t>(
+      [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) {
+        weight_t weighted_cut_contribution{0};
 
-      if (src == dst)  // self loop
-        weighted_cut_contribution = 0;
-      else if (src_cluster == dst_cluster)
-        weighted_cut_contribution = wt;
+        if (src == dst)  // self loop
+          weighted_cut_contribution = 0;
+        else if (src_cluster == dst_cluster)
+          weighted_cut_contribution = wt;
 
-      return weighted_cut_contribution;
-    },
+        return weighted_cut_contribution;
+      }),
     weight_t{0},
     cugraph::reduce_op::plus<weight_t>{},
     weighted_cut_of_vertices_to_louvain.begin());
@@ -243,13 +246,14 @@ refine_clustering(
                     wcut_deg_and_cluster_vol_triple_begin,
                     wcut_deg_and_cluster_vol_triple_end,
                     singleton_and_connected_flags.begin(),
-                    [resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) {
+                    cuda::proclaim_return_type<uint8_t>([resolution, total_edge_weight] __device__(
+                                                          auto wcut_wdeg_and_louvain_volume) {
                       auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
                       auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
                       auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
-                      return wcut >
-                             (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight);
-                    });
+                      return static_cast<uint8_t>(
+                        wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight));
+                    }));
 
   edge_src_property_t<GraphViewType, weight_t> src_louvain_cluster_weight_cache(handle);
   edge_src_property_t<GraphViewType, weight_t> src_cut_to_louvain_cache(handle);
@@ -718,11 +722,12 @@ refine_clustering(
       vertices_in_mis.begin(),
       vertices_in_mis.end(),
       dst_vertices.begin(),
-      [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
-       v_first   = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
-        auto dst = *(dst_first + v - v_first);
-        return dst;
-      });
+      cuda::proclaim_return_type<vertex_t>(
+        [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()),
+         v_first   = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) {
+          auto dst = *(dst_first + v - v_first);
+          return dst;
+        }));
 
     cugraph::resize_dataframe_buffer(gain_and_dst_output_pairs, 0, handle.get_stream());
     cugraph::shrink_to_fit_dataframe_buffer(gain_and_dst_output_pairs, handle.get_stream());
diff --git a/cpp/src/detail/collect_local_vertex_values.cu b/cpp/src/detail/collect_local_vertex_values.cu
index 9d5d2cb553b..795902dfd87 100644
--- a/cpp/src/detail/collect_local_vertex_values.cu
+++ b/cpp/src/detail/collect_local_vertex_values.cu
@@ -19,6 +19,8 @@
 #include <cugraph/graph_functions.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
+#include <cuda/functional>
+
 namespace cugraph {
 namespace detail {
 
@@ -64,7 +66,8 @@ rmm::device_uvector<value_t> collect_local_vertex_values_from_ext_vertex_value_p
 
   auto vertex_iterator = thrust::make_transform_iterator(
     d_vertices.begin(),
-    [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; });
+    cuda::proclaim_return_type<vertex_t>(
+      [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; }));
 
   d_local_values.resize(local_vertex_last - local_vertex_first, handle.get_stream());
   thrust::fill(
diff --git a/cpp/src/generators/erdos_renyi_generator.cu b/cpp/src/generators/erdos_renyi_generator.cu
index 6d847ae0bde..8448eeaf960 100644
--- a/cpp/src/generators/erdos_renyi_generator.cu
+++ b/cpp/src/generators/erdos_renyi_generator.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,6 +28,8 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 namespace cugraph {
 
 template <typename vertex_t>
@@ -42,12 +44,13 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle,
                   "Implementation cannot support specified value");
 
   auto random_iterator = thrust::make_transform_iterator(
-    thrust::make_counting_iterator<size_t>(0), [seed] __device__(size_t index) {
+    thrust::make_counting_iterator<size_t>(0),
+    cuda::proclaim_return_type<float>([seed] __device__(size_t index) {
       thrust::default_random_engine rng(seed);
       thrust::uniform_real_distribution<float> dist(0.0, 1.0);
       rng.discard(index);
       return dist(rng);
-    });
+    }));
 
   size_t count = thrust::count_if(handle.get_thrust_policy(),
                                   random_iterator,
@@ -69,13 +72,14 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle,
                     indices_v.begin(),
                     indices_v.end(),
                     thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), src_v.end())),
-                    [num_vertices] __device__(size_t index) {
-                      size_t src = index / num_vertices;
-                      size_t dst = index % num_vertices;
-
-                      return thrust::make_tuple(static_cast<vertex_t>(src),
-                                                static_cast<vertex_t>(dst));
-                    });
+                    cuda::proclaim_return_type<thrust::tuple<vertex_t, vertex_t>>(
+                      [num_vertices] __device__(size_t index) {
+                        size_t src = index / num_vertices;
+                        size_t dst = index % num_vertices;
+
+                        return thrust::make_tuple(static_cast<vertex_t>(src),
+                                                  static_cast<vertex_t>(dst));
+                      }));
 
   handle.sync_stream();
 
diff --git a/cpp/src/generators/simple_generators.cu b/cpp/src/generators/simple_generators.cu
index 6dba63909c3..65647be5de0 100644
--- a/cpp/src/generators/simple_generators.cu
+++ b/cpp/src/generators/simple_generators.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,8 @@
 #include <thrust/sequence.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <numeric>
 
 namespace cugraph {
@@ -264,23 +266,24 @@ generate_complete_graph_edgelist(
 
     auto transform_iter = thrust::make_transform_iterator(
       thrust::make_counting_iterator<size_t>(0),
-      [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
-        size_t graph_index = index / (num_vertices * num_vertices);
-        size_t local_index = index % (num_vertices * num_vertices);
-
-        vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
-        vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);
-
-        if (src == dst) {
-          src = invalid_vertex;
-          dst = invalid_vertex;
-        } else {
-          src += (graph_index * num_vertices);
-          dst += (graph_index * num_vertices);
-        }
-
-        return thrust::make_tuple(src, dst);
-      });
+      cuda::proclaim_return_type<thrust::tuple<vertex_t, vertex_t>>(
+        [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) {
+          size_t graph_index = index / (num_vertices * num_vertices);
+          size_t local_index = index % (num_vertices * num_vertices);
+
+          vertex_t src = base_vertex_id + static_cast<vertex_t>(local_index / num_vertices);
+          vertex_t dst = base_vertex_id + static_cast<vertex_t>(local_index % num_vertices);
+
+          if (src == dst) {
+            src = invalid_vertex;
+            dst = invalid_vertex;
+          } else {
+            src += (graph_index * num_vertices);
+            dst += (graph_index * num_vertices);
+          }
+
+          return thrust::make_tuple(src, dst);
+        }));
 
     output_iterator = thrust::copy_if(handle.get_thrust_policy(),
                                       transform_iter,
diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu
index 5b1825656ff..414f1bdfa88 100644
--- a/cpp/src/mtmg/vertex_result.cu
+++ b/cpp/src/mtmg/vertex_result.cu
@@ -21,6 +21,7 @@
 
 #include <detail/graph_partition_utils.cuh>
 
+#include <thrust/functional.h>
 #include <thrust/gather.h>
 
 namespace cugraph {
@@ -91,10 +92,11 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
   auto vertex_partition =
     vertex_partition_device_view_t<vertex_t, multi_gpu>(vertex_partition_view);
 
-  auto iter =
-    thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) {
+  auto iter = thrust::make_transform_iterator(
+    local_vertices.begin(),
+    cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
       return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-    });
+    }));
 
   thrust::gather(handle.get_thrust_policy(),
                  iter,
@@ -111,7 +113,7 @@ rmm::device_uvector<result_t> vertex_result_view_t<result_t>::gather(
       vertex_gpu_ids.begin(),
       vertex_gpu_ids.end(),
       thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()),
-      [] __device__(int gpu) { return gpu; },
+      thrust::identity{},
       handle.get_stream());
 
   //
diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
index 5fee97790f1..4c5c43c7d1e 100644
--- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
@@ -44,6 +44,8 @@
 #include <thrust/tuple.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 #include <optional>
 #include <tuple>
 
@@ -596,8 +598,9 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
                                         multiplier_t<size_t>{high_partition_over_sampling_K}),
         thrust::make_transform_iterator(
           thrust::make_counting_iterator(size_t{0}),
-          [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
-            size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }),
+          cuda::proclaim_return_type<size_t>(
+            [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
+              size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
         handle.get_stream());
       if (tmp_storage_bytes > d_tmp_storage.size()) {
         d_tmp_storage = rmm::device_uvector<std::byte>(tmp_storage_bytes, handle.get_stream());
@@ -615,8 +618,9 @@ rmm::device_uvector<edge_t> get_sampling_index_without_replacement(
                                         multiplier_t<size_t>{high_partition_over_sampling_K}),
         thrust::make_transform_iterator(
           thrust::make_counting_iterator(size_t{0}),
-          [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
-            size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }),
+          cuda::proclaim_return_type<size_t>(
+            [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__(
+              size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })),
         handle.get_stream());
 
       // copy the neighbor indices back to sample_nbr_indices
diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
index 0b6c6a554bb..5de2dbc18e2 100644
--- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh
@@ -51,6 +51,8 @@
 #include <thrust/tuple.h>
 #include <thrust/type_traits/integer_sequence.h>
 
+#include <cuda/functional>
+
 #include <numeric>
 #include <type_traits>
 #include <utility>
@@ -940,16 +942,19 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
                      minor_init);
         auto value_first = thrust::make_transform_iterator(
           view.value_first(),
-          [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); });
-        thrust::scatter(
-          handle.get_thrust_policy(),
-          value_first + (*minor_key_offsets)[i],
-          value_first + (*minor_key_offsets)[i + 1],
-          thrust::make_transform_iterator(
-            (*(view.keys())).begin() + (*minor_key_offsets)[i],
-            [key_first = graph_view.vertex_partition_range_first(
-               this_segment_vertex_partition_id)] __device__(auto key) { return key - key_first; }),
-          tx_buffer_first);
+          cuda::proclaim_return_type<T>(
+            [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); }));
+        thrust::scatter(handle.get_thrust_policy(),
+                        value_first + (*minor_key_offsets)[i],
+                        value_first + (*minor_key_offsets)[i + 1],
+                        thrust::make_transform_iterator(
+                          (*(view.keys())).begin() + (*minor_key_offsets)[i],
+                          cuda::proclaim_return_type<vertex_t>(
+                            [key_first = graph_view.vertex_partition_range_first(
+                               this_segment_vertex_partition_id)] __device__(auto key) {
+                              return key - key_first;
+                            })),
+                        tx_buffer_first);
         device_reduce(major_comm,
                       tx_buffer_first,
                       vertex_value_output_first,
diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh
index b8621e122c6..18bdf5bcf2d 100644
--- a/cpp/src/prims/update_edge_src_dst_property.cuh
+++ b/cpp/src/prims/update_edge_src_dst_property.cuh
@@ -44,6 +44,8 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/scatter.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <numeric>
 #include <type_traits>
@@ -181,13 +183,14 @@ void update_edge_major_property(raft::handle_t const& handle,
                        handle.get_stream());
           auto bool_first = thrust::make_transform_iterator(
             (*edge_partition_keys)[i].begin(),
-            [rx_value_first,
-             v_first = graph_view.vertex_partition_range_first(
-               major_range_vertex_partition_id)] __device__(auto v) {
-              auto v_offset = v - v_first;
-              return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
-                                       packed_bool_mask(v_offset));
-            });
+            cuda::proclaim_return_type<bool>(
+              [rx_value_first,
+               v_first = graph_view.vertex_partition_range_first(
+                 major_range_vertex_partition_id)] __device__(auto v) {
+                auto v_offset = v - v_first;
+                return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                         packed_bool_mask(v_offset));
+              }));
           pack_bools(handle,
                      bool_first,
                      bool_first + (*edge_partition_keys)[i].size(),
@@ -202,8 +205,9 @@ void update_edge_major_property(raft::handle_t const& handle,
 
           auto v_offset_first = thrust::make_transform_iterator(
             (*edge_partition_keys)[i].begin(),
-            [v_first = graph_view.vertex_partition_range_first(
-               major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
+            cuda::proclaim_return_type<vertex_t>(
+              [v_first = graph_view.vertex_partition_range_first(
+                 major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }));
           thrust::gather(handle.get_thrust_policy(),
                          v_offset_first,
                          v_offset_first + (*edge_partition_keys)[i].size(),
@@ -312,21 +316,24 @@ void update_edge_major_property(raft::handle_t const& handle,
             graph_view.local_vertex_partition_view());
         if constexpr (packed_bool) {
           auto bool_first = thrust::make_transform_iterator(
-            vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) {
+            vertex_first,
+            cuda::proclaim_return_type<bool>([vertex_property_input_first,
+                                              vertex_partition] __device__(auto v) {
               auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
               return static_cast<bool>(
                 *(vertex_property_input_first + packed_bool_offset(v_offset)) &
                 packed_bool_mask(v_offset));
-            });
+            }));
           pack_bools(handle,
                      bool_first,
                      bool_first + thrust::distance(vertex_first, vertex_last),
                      rx_value_first);
         } else {
-          auto map_first =
-            thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+          auto map_first = thrust::make_transform_iterator(
+            vertex_first,
+            cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
               return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-            });
+            }));
           // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
           // permutation iterator (and directly gathers to the internal buffer)
           thrust::gather(handle.get_thrust_policy(),
@@ -391,9 +398,10 @@ void update_edge_major_property(raft::handle_t const& handle,
             });
         } else {
           auto map_first = thrust::make_transform_iterator(
-            rx_vertices.begin(), [edge_partition] __device__(auto v) {
+            rx_vertices.begin(),
+            cuda::proclaim_return_type<vertex_t>([edge_partition] __device__(auto v) {
               return edge_partition.major_offset_from_major_nocheck(v);
-            });
+            }));
           // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
           // directly scatters from the internal buffer)
           thrust::scatter(handle.get_thrust_policy(),
@@ -593,13 +601,14 @@ void update_edge_minor_property(raft::handle_t const& handle,
 
                 auto bool_first = thrust::make_transform_iterator(
                   (*edge_partition_keys).begin() + key_offsets[j],
-                  [rx_value_first,
-                   v_first = graph_view.vertex_partition_range_first(
-                     minor_range_vertex_partition_id)] __device__(auto v) {
-                    auto v_offset = v - v_first;
-                    return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
-                                             packed_bool_mask(v_offset));
-                  });
+                  cuda::proclaim_return_type<bool>(
+                    [rx_value_first,
+                     v_first = graph_view.vertex_partition_range_first(
+                       minor_range_vertex_partition_id)] __device__(auto v) {
+                      auto v_offset = v - v_first;
+                      return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
+                                               packed_bool_mask(v_offset));
+                    }));
                 pack_unaligned_bools(
                   handle,
                   bool_first,
@@ -611,10 +620,10 @@ void update_edge_minor_property(raft::handle_t const& handle,
                   std::get<std::vector<size_t>>(key_offsets_or_rx_displacements);
                 auto bool_first = thrust::make_transform_iterator(
                   thrust::make_counting_iterator(vertex_t{0}),
-                  [rx_value_first] __device__(vertex_t v_offset) {
+                  cuda::proclaim_return_type<bool>([rx_value_first] __device__(vertex_t v_offset) {
                     return static_cast<bool>(*(rx_value_first + packed_bool_offset(v_offset)) &
                                              packed_bool_mask(v_offset));
-                  });
+                  }));
                 pack_unaligned_bools(
                   handle,
                   bool_first,
@@ -630,8 +639,9 @@ void update_edge_minor_property(raft::handle_t const& handle,
 
               auto v_offset_first = thrust::make_transform_iterator(
                 (*edge_partition_keys).begin() + key_offsets[j],
-                [v_first = graph_view.vertex_partition_range_first(
-                   minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; });
+                cuda::proclaim_return_type<vertex_t>(
+                  [v_first = graph_view.vertex_partition_range_first(
+                     minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }));
               thrust::gather(handle.get_thrust_policy(),
                              v_offset_first,
                              v_offset_first + (key_offsets[j + 1] - key_offsets[j]),
@@ -718,21 +728,24 @@ void update_edge_minor_property(raft::handle_t const& handle,
             graph_view.local_vertex_partition_view());
         if constexpr (packed_bool) {
           auto bool_first = thrust::make_transform_iterator(
-            vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) {
+            vertex_first,
+            cuda::proclaim_return_type<bool>([vertex_property_input_first,
+                                              vertex_partition] __device__(auto v) {
               auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
               return static_cast<bool>(
                 *(vertex_property_input_first + packed_bool_offset(v_offset)) &
                 packed_bool_mask(v_offset));
-            });
+            }));
           pack_bools(handle,
                      bool_first,
                      bool_first + thrust::distance(vertex_first, vertex_last),
                      rx_value_first);
         } else {
-          auto map_first =
-            thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) {
+          auto map_first = thrust::make_transform_iterator(
+            vertex_first,
+            cuda::proclaim_return_type<vertex_t>([vertex_partition] __device__(auto v) {
               return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v);
-            });
+            }));
           // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a
           // permutation iterator (and directly gathers to the internal buffer)
           thrust::gather(handle.get_thrust_policy(),
@@ -799,9 +812,10 @@ void update_edge_minor_property(raft::handle_t const& handle,
             });
         } else {
           auto map_first = thrust::make_transform_iterator(
-            rx_vertices.begin(), [edge_partition] __device__(auto v) {
+            rx_vertices.begin(),
+            cuda::proclaim_return_type<vertex_t>([edge_partition] __device__(auto v) {
               return edge_partition.minor_offset_from_minor_nocheck(v);
-            });
+            }));
           // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and
           // directly scatters from the internal buffer)
           thrust::scatter(handle.get_thrust_policy(),
diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 6a7334e9f1a..5a9ded02009 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -52,6 +52,8 @@
 #include <thrust/transform_scan.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <cassert>
 #include <cstdlib>  // FIXME: requirement for temporary std::getenv()
 #include <limits>
@@ -378,7 +380,8 @@ struct random_walker_t {
 
     // scatter d_src_init_v to coalesced vertex vector:
     //
-    auto dlambda = [stride = max_depth_] __device__(auto indx) { return indx * stride; };
+    auto dlambda = cuda::proclaim_return_type<index_t>(
+      [stride = max_depth_] __device__(auto indx) { return indx * stride; });
 
     // use the transform iterator as map:
     //
@@ -539,10 +542,11 @@ struct random_walker_t {
 
     // delta = ptr_d_sizes[indx] - 1
     //
-    auto dlambda = [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) {
-      auto delta = ptr_d_sizes[indx] - 1;
-      return ptr_d_coalesced[indx * stride + delta];
-    };
+    auto dlambda = cuda::proclaim_return_type<vertex_t>(
+      [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) {
+        auto delta = ptr_d_sizes[indx] - 1;
+        return ptr_d_coalesced[indx * stride + delta];
+      });
 
     // use the transform iterator as map:
     //
@@ -587,10 +591,11 @@ struct random_walker_t {
   {
     index_t const* ptr_d_sizes = original::raw_const_ptr(d_sizes);
 
-    auto dlambda = [stride, adjust, ptr_d_sizes] __device__(auto indx) {
-      auto delta = ptr_d_sizes[indx] - adjust - 1;
-      return indx * stride + delta;
-    };
+    auto dlambda =
+      cuda::proclaim_return_type<index_t>([stride, adjust, ptr_d_sizes] __device__(auto indx) {
+        auto delta = ptr_d_sizes[indx] - adjust - 1;
+        return indx * stride + delta;
+      });
 
     // use the transform iterator as map:
     //
diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh
index 77d4f2d865f..852d82e78ab 100644
--- a/cpp/src/sampling/sampling_post_processing_impl.cuh
+++ b/cpp/src/sampling/sampling_post_processing_impl.cuh
@@ -40,6 +40,8 @@
 #include <thrust/tuple.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 #include <optional>
 
 namespace cugraph {
@@ -1229,10 +1231,12 @@ renumber_and_compress_sampled_edgelist(
           auto pair_first       = thrust::make_zip_iterator((*compressed_label_indices).begin(),
                                                       (*compressed_hops).begin());
           auto value_pair_first = thrust::make_transform_iterator(
-            thrust::make_counting_iterator(size_t{0}), [num_hops] __device__(size_t i) {
-              return thrust::make_tuple(static_cast<label_index_t>(i / num_hops),
-                                        static_cast<int32_t>(i % num_hops));
-            });
+            thrust::make_counting_iterator(size_t{0}),
+            cuda::proclaim_return_type<thrust::tuple<label_index_t, int32_t>>(
+              [num_hops] __device__(size_t i) {
+                return thrust::make_tuple(static_cast<label_index_t>(i / num_hops),
+                                          static_cast<int32_t>(i % num_hops));
+              }));
           thrust::upper_bound(handle.get_thrust_policy(),
                               pair_first,
                               pair_first + (*compressed_label_indices).size(),
diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh
index 37a553dcdbd..a40747f13f7 100644
--- a/cpp/src/structure/graph_view_impl.cuh
+++ b/cpp/src/structure/graph_view_impl.cuh
@@ -51,6 +51,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
+#include <cuda/functional>
+
 #include <algorithm>
 #include <cstdint>
 #include <type_traits>
@@ -159,14 +161,14 @@ rmm::device_uvector<edge_t> compute_major_degrees(
                       thrust::make_counting_iterator(vertex_t{0}),
                       thrust::make_counting_iterator(major_hypersparse_first - major_range_first),
                       local_degrees.begin(),
-                      [offsets, masks] __device__(auto i) {
+                      cuda::proclaim_return_type<edge_t>([offsets, masks] __device__(auto i) {
                         auto local_degree = offsets[i + 1] - offsets[i];
                         if (masks) {
                           local_degree = static_cast<edge_t>(
                             detail::count_set_bits(*masks, offsets[i], local_degree));
                         }
                         return local_degree;
-                      });
+                      }));
     if (use_dcs) {
       auto dcs_nzd_vertices     = (*edge_partition_dcs_nzd_vertices)[i];
       auto dcs_nzd_vertex_count = (*edge_partition_dcs_nzd_vertex_counts)[i];
diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu
index e5267d75ac2..6be735c3482 100644
--- a/cpp/tests/sampling/sampling_post_processing_test.cu
+++ b/cpp/tests/sampling/sampling_post_processing_test.cu
@@ -38,6 +38,8 @@
 #include <thrust/sort.h>
 #include <thrust/unique.h>
 
+#include <cuda/functional>
+
 struct SamplingPostProcessing_Usecase {
   size_t num_labels{};
   size_t num_seeds_per_label{};
@@ -318,15 +320,16 @@ bool check_renumber_map_invariants(
 
     auto renumbered_merged_vertex_first = thrust::make_transform_iterator(
       merged_vertices.begin(),
-      [sorted_org_vertices =
-         raft::device_span<vertex_t const>(sorted_org_vertices.data(), sorted_org_vertices.size()),
-       matching_renumbered_vertices = raft::device_span<vertex_t const>(
-         matching_renumbered_vertices.data(),
-         matching_renumbered_vertices.size())] __device__(vertex_t major) {
-        auto it = thrust::lower_bound(
-          thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
-        return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
-      });
+      cuda::proclaim_return_type<vertex_t>(
+        [sorted_org_vertices = raft::device_span<vertex_t const>(sorted_org_vertices.data(),
+                                                                 sorted_org_vertices.size()),
+         matching_renumbered_vertices = raft::device_span<vertex_t const>(
+           matching_renumbered_vertices.data(),
+           matching_renumbered_vertices.size())] __device__(vertex_t major) {
+          auto it = thrust::lower_bound(
+            thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major);
+          return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)];
+        }));
 
     thrust::reduce_by_key(handle.get_thrust_policy(),
                           sort_key_first,
@@ -1020,23 +1023,24 @@ class Tests_SamplingPostProcessing
                  ? this_label_output_edgelist_srcs.begin()
                  : this_label_output_edgelist_dsts.begin()) +
                 old_size,
-              [offsets = raft::device_span<size_t const>(d_offsets.data(), d_offsets.size()),
-               nzd_vertices =
-                 renumbered_and_compressed_nzd_vertices
-                   ? thrust::make_optional<raft::device_span<vertex_t const>>(
-                       (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset,
-                       (offset_end_offset - offset_start_offset) - 1)
-                   : thrust::nullopt,
-               base_v] __device__(size_t i) {
-                auto idx = static_cast<size_t>(thrust::distance(
-                  offsets.begin() + 1,
-                  thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i)));
-                if (nzd_vertices) {
-                  return (*nzd_vertices)[idx];
-                } else {
-                  return base_v + static_cast<vertex_t>(idx);
-                }
-              });
+              cuda::proclaim_return_type<vertex_t>(
+                [offsets = raft::device_span<size_t const>(d_offsets.data(), d_offsets.size()),
+                 nzd_vertices =
+                   renumbered_and_compressed_nzd_vertices
+                     ? thrust::make_optional<raft::device_span<vertex_t const>>(
+                         (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset,
+                         (offset_end_offset - offset_start_offset) - 1)
+                     : thrust::nullopt,
+                 base_v] __device__(size_t i) {
+                  auto idx = static_cast<size_t>(thrust::distance(
+                    offsets.begin() + 1,
+                    thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i)));
+                  if (nzd_vertices) {
+                    return (*nzd_vertices)[idx];
+                  } else {
+                    return base_v + static_cast<vertex_t>(idx);
+                  }
+                }));
             thrust::copy(handle.get_thrust_policy(),
                          renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0],
                          renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(),