diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh index 5b5061f67c2..0a722c88c3e 100755 --- a/ci/build_wheel_cugraph.sh +++ b/ci/build_wheel_cugraph.sh @@ -12,6 +12,6 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME=pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibcugraph export PIP_FIND_LINKS=$(pwd)/local-pylibcugraph -export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" ./ci/build_wheel.sh cugraph python/cugraph diff --git a/ci/build_wheel_pylibcugraph.sh b/ci/build_wheel_pylibcugraph.sh index 8d365bc250b..9e236c145ce 100755 --- a/ci/build_wheel_pylibcugraph.sh +++ b/ci/build_wheel_pylibcugraph.sh @@ -3,6 +3,6 @@ set -euo pipefail -export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" ./ci/build_wheel.sh pylibcugraph python/pylibcugraph diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 67bf1c25b09..a772651625d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -423,7 +423,7 @@ add_library(cugraph_c src/c_api/core_result.cpp src/c_api/extract_ego.cpp src/c_api/k_core.cpp - src/c_api/hierarchical_clustering_result.cpp + src/c_api/hierarchical_clustering_result.cpp src/c_api/induced_subgraph.cpp src/c_api/capi_helper.cu src/c_api/legacy_spectral.cpp diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 60b9f1a4054..a723fde24df 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -90,24 +90,25 @@ class graph_t meta, bool do_expensive_check = false); + edge_t number_of_edges() const { return this->number_of_edges_; } + graph_view_t view() const { - std::vector offsets(edge_partition_offsets_.size(), nullptr); - std::vector indices(edge_partition_indices_.size(), nullptr); - auto dcs_nzd_vertices = edge_partition_dcs_nzd_vertices_ - ? std::make_optional>( - (*edge_partition_dcs_nzd_vertices_).size(), nullptr) - : std::nullopt; - auto dcs_nzd_vertex_counts = edge_partition_dcs_nzd_vertex_counts_ - ? std::make_optional>( - (*edge_partition_dcs_nzd_vertex_counts_).size(), vertex_t{0}) - : std::nullopt; + std::vector> offsets(edge_partition_offsets_.size()); + std::vector> indices(edge_partition_indices_.size()); + auto dcs_nzd_vertices = edge_partition_dcs_nzd_vertices_ + ? 
std::make_optional>>( + (*edge_partition_dcs_nzd_vertices_).size()) + : std::nullopt; for (size_t i = 0; i < offsets.size(); ++i) { - offsets[i] = edge_partition_offsets_[i].data(); - indices[i] = edge_partition_indices_[i].data(); + offsets[i] = raft::device_span(edge_partition_offsets_[i].data(), + edge_partition_offsets_[i].size()); + indices[i] = raft::device_span(edge_partition_indices_[i].data(), + edge_partition_indices_[i].size()); if (dcs_nzd_vertices) { - (*dcs_nzd_vertices)[i] = (*edge_partition_dcs_nzd_vertices_)[i].data(); - (*dcs_nzd_vertex_counts)[i] = (*edge_partition_dcs_nzd_vertex_counts_)[i]; + (*dcs_nzd_vertices)[i] = + raft::device_span((*edge_partition_dcs_nzd_vertices_)[i].data(), + (*edge_partition_dcs_nzd_vertices_)[i].size()); } } @@ -196,15 +197,13 @@ class graph_t( - *(this->handle_ptr()), offsets, indices, dcs_nzd_vertices, - dcs_nzd_vertex_counts, graph_view_meta_t{ this->number_of_vertices(), this->number_of_edges(), - this->graph_properties(), + this->properties_, partition_, edge_partition_segment_offsets_, local_sorted_unique_edge_srcs, @@ -224,7 +223,6 @@ class graph_t>> edge_partition_dcs_nzd_vertices_{ std::nullopt}; - std::optional> edge_partition_dcs_nzd_vertex_counts_{std::nullopt}; partition_t partition_{}; // segment offsets within the vertex partition based on vertex degree @@ -283,16 +281,15 @@ class graph_t meta, bool do_expensive_check = false); + edge_t number_of_edges() const { return this->number_of_edges_; } + graph_view_t view() const { return graph_view_t( - *(this->handle_ptr()), - offsets_.data(), - indices_.data(), - graph_view_meta_t{this->number_of_vertices(), - this->number_of_edges(), - this->graph_properties(), - segment_offsets_}); + raft::device_span(offsets_.data(), offsets_.size()), + raft::device_span(indices_.data(), indices_.size()), + graph_view_meta_t{ + this->number_of_vertices(), this->number_of_edges(), this->properties_, segment_offsets_}); } private: diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index d79d4635c54..53c66c6483e 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -258,21 +258,12 @@ class graph_base_t { public: graph_base_t() = default; - graph_base_t(raft::handle_t const& handle, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties) - : handle_ptr_(&handle), - number_of_vertices_(number_of_vertices), + graph_base_t(vertex_t number_of_vertices, edge_t number_of_edges, graph_properties_t properties) + : number_of_vertices_(number_of_vertices), number_of_edges_(number_of_edges), properties_(properties){}; vertex_t number_of_vertices() const { return number_of_vertices_; } - edge_t number_of_edges() const - { - CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); - return number_of_edges_; - } template std::enable_if_t::value, bool> is_valid_vertex(vertex_type v) const @@ -289,33 +280,12 @@ class graph_base_t { bool is_symmetric() const { return properties_.is_symmetric; } bool is_multigraph() const { return properties_.is_multigraph; } - void attach_edge_mask(edge_property_view_t edge_mask_view) - { - edge_mask_view_ = edge_mask_view; - } - - void clear_edge_mask() { edge_mask_view_ = std::nullopt; } - - bool has_edge_mask() const { return edge_mask_view_.has_value(); } - - std::optional> edge_mask_view() const - { - return edge_mask_view_; - } - protected: - raft::handle_t const* handle_ptr() const { return handle_ptr_; }; - graph_properties_t graph_properties() const { return 
properties_; } - - private: - raft::handle_t const* handle_ptr_{nullptr}; - - vertex_t number_of_vertices_{0}; edge_t number_of_edges_{0}; - graph_properties_t properties_{}; - std::optional> edge_mask_view_{std::nullopt}; + private: + vertex_t number_of_vertices_{0}; }; } // namespace detail @@ -405,11 +375,10 @@ class graph_view_t const& edge_partition_offsets, - std::vector const& edge_partition_indices, - std::optional> const& edge_partition_dcs_nzd_vertices, - std::optional> const& edge_partition_dcs_nzd_vertex_counts, + graph_view_t(std::vector> const& edge_partition_offsets, + std::vector> const& edge_partition_indices, + std::optional>> const& + edge_partition_dcs_nzd_vertices, graph_view_meta_t meta); std::vector vertex_partition_range_offsets() const @@ -624,25 +593,16 @@ class graph_view_tlocal_edge_partition_src_value_start_offset(partition_idx); } std::optional major_hypersparse_first{std::nullopt}; - vertex_t offset_size = (major_range_last - major_range_first) + 1; if (this->use_dcs()) { major_hypersparse_first = major_range_first + (*(this->local_edge_partition_segment_offsets( partition_idx)))[detail::num_sparse_segments_per_vertex_partition]; - offset_size = ((*major_hypersparse_first) - major_range_first) + - (*edge_partition_dcs_nzd_vertex_counts_)[partition_idx] + 1; } return edge_partition_view_t( - raft::device_span(edge_partition_offsets_[partition_idx], - edge_partition_offsets_[partition_idx] + offset_size), - raft::device_span( - edge_partition_indices_[partition_idx], - edge_partition_indices_[partition_idx] + edge_partition_number_of_edges_[partition_idx]), + edge_partition_offsets_[partition_idx], + edge_partition_indices_[partition_idx], edge_partition_dcs_nzd_vertices_ - ? std::make_optional>( - (*edge_partition_dcs_nzd_vertices_)[partition_idx], - (*edge_partition_dcs_nzd_vertices_)[partition_idx] + - (*edge_partition_dcs_nzd_vertex_counts_)[partition_idx]) + ? 
std::make_optional((*edge_partition_dcs_nzd_vertices_)[partition_idx]) : std::nullopt, major_hypersparse_first, major_range_first, @@ -652,6 +612,16 @@ class graph_view_thas_edge_mask()), "unimplemented."); + return this->number_of_edges_; + } + + edge_t compute_number_of_edges(raft::handle_t const& handle) const; + rmm::device_uvector compute_in_degrees(raft::handle_t const& handle) const; rmm::device_uvector compute_out_degrees(raft::handle_t const& handle) const; @@ -751,15 +721,26 @@ class graph_view_t edge_mask_view) + { + edge_mask_view_ = edge_mask_view; + } + + void clear_edge_mask() { edge_mask_view_ = std::nullopt; } + + bool has_edge_mask() const { return edge_mask_view_.has_value(); } + + std::optional> edge_mask_view() const + { + return edge_mask_view_; + } + private: - std::vector edge_partition_offsets_{}; - std::vector edge_partition_indices_{}; + std::vector> edge_partition_offsets_{}; + std::vector> edge_partition_indices_{}; // relevant only if we use the CSR + DCSR (or CSC + DCSC) hybrid format - std::optional> edge_partition_dcs_nzd_vertices_{}; - std::optional> edge_partition_dcs_nzd_vertex_counts_{}; - - std::vector edge_partition_number_of_edges_{}; + std::optional>> edge_partition_dcs_nzd_vertices_{}; partition_t partition_{}; @@ -796,6 +777,8 @@ class graph_view_t>, std::optional /* dummy */> local_sorted_unique_edge_dst_vertex_partition_offsets_{std::nullopt}; + + std::optional> edge_mask_view_{std::nullopt}; }; // single-GPU version @@ -808,9 +791,8 @@ class graph_view_t offsets, + raft::device_span indices, graph_view_meta_t meta); std::vector vertex_partition_range_offsets() const @@ -924,11 +906,19 @@ class graph_view_t( - raft::device_span(offsets_, offsets_ + (this->number_of_vertices() + 1)), - raft::device_span(indices_, indices_ + this->number_of_edges()), - this->number_of_vertices()); + offsets_, indices_, this->number_of_vertices()); + } + + // FIXME: deprecated, replaced with copmute_number_of_edges (which works with or without edge + // masking) + edge_t number_of_edges() const + { + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); + return this->number_of_edges_; } + edge_t compute_number_of_edges(raft::handle_t const& handle) const; + rmm::device_uvector compute_in_degrees(raft::handle_t const& handle) const; rmm::device_uvector compute_out_degrees(raft::handle_t const& handle) const; @@ -1016,12 +1006,28 @@ class graph_view_t edge_mask_view) + { + edge_mask_view_ = edge_mask_view; + } + + void clear_edge_mask() { edge_mask_view_ = std::nullopt; } + + bool has_edge_mask() const { return edge_mask_view_.has_value(); } + + std::optional> edge_mask_view() const + { + return edge_mask_view_; + } + private: - edge_t const* offsets_{nullptr}; - vertex_t const* indices_{nullptr}; + raft::device_span offsets_{}; + raft::device_span indices_{}; // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered std::optional> segment_offsets_{std::nullopt}; + + std::optional> edge_mask_view_{std::nullopt}; }; } // namespace cugraph diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh index 28e2853727f..04aeac49c9d 100644 --- a/cpp/include/cugraph/utilities/misc_utils.cuh +++ b/cpp/include/cugraph/utilities/misc_utils.cuh @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include @@ -44,7 +46,8 @@ std::tuple, std::vector> compute_offset_aligned_ed { auto search_offset_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{1}), - 
[approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; }); + cuda::proclaim_return_type( + [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; })); auto num_chunks = (num_edges + approx_edge_chunk_size - 1) / approx_edge_chunk_size; if (num_chunks > 1) { diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index ab6a54cc1c0..414d9b36992 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -40,6 +40,8 @@ #include #include +#include + #include #include #include @@ -197,12 +199,13 @@ void multi_partition(ValueIterator value_first, value_last, thrust::make_zip_iterator( thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())), - [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) { - auto group_id = value_to_group_id_op(value); - cuda::std::atomic_ref counter(counts[group_id - group_first]); - return thrust::make_tuple(group_id, - counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed)); - }); + cuda::proclaim_return_type>( + [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) { + auto group_id = value_to_group_id_op(value); + cuda::std::atomic_ref counter(counts[group_id - group_first]); + return thrust::make_tuple(group_id, + counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed)); + })); rmm::device_uvector displacements(num_groups, stream_view); thrust::exclusive_scan( @@ -245,17 +248,19 @@ void multi_partition(KeyIterator key_first, rmm::device_uvector group_ids(num_keys, stream_view); rmm::device_uvector intra_partition_offsets(num_keys, stream_view); thrust::fill(rmm::exec_policy(stream_view), counts.begin(), counts.end(), size_t{0}); - thrust::transform(rmm::exec_policy(stream_view), - key_first, - key_last, - thrust::make_zip_iterator( - thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())), - [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) { - auto group_id = key_to_group_id_op(key); - cuda::std::atomic_ref counter(counts[group_id - group_first]); - return thrust::make_tuple( - group_id, counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed)); - }); + thrust::transform( + rmm::exec_policy(stream_view), + key_first, + key_last, + thrust::make_zip_iterator( + thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())), + cuda::proclaim_return_type>( + [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) { + auto group_id = key_to_group_id_op(key); + cuda::std::atomic_ref counter(counts[group_id - group_first]); + return thrust::make_tuple(group_id, + counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed)); + })); rmm::device_uvector displacements(num_groups, stream_view); thrust::exclusive_scan( @@ -761,8 +766,9 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I stream_view); auto group_id_first = thrust::make_transform_iterator( - tx_value_first, - [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); }); + tx_value_first, cuda::proclaim_return_type([value_to_group_id_op] __device__(auto value) { + return value_to_group_id_op(value); + })); rmm::device_uvector d_tx_dst_ranks(num_groups, stream_view); rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream_view); auto rank_count_pair_first = thrust::make_zip_iterator( @@ -795,7 +801,9 @@ rmm::device_uvector 
groupby_and_count(VertexIterator tx_key_first /* [IN stream_view); auto group_id_first = thrust::make_transform_iterator( - tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); + tx_key_first, cuda::proclaim_return_type([key_to_group_id_op] __device__(auto key) { + return key_to_group_id_op(key); + })); rmm::device_uvector d_tx_dst_ranks(num_groups, stream_view); rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream_view); auto rank_count_pair_first = thrust::make_zip_iterator( diff --git a/cpp/src/c_api/capi_helper.cu b/cpp/src/c_api/capi_helper.cu index 0ee49f87265..f08af4137db 100644 --- a/cpp/src/c_api/capi_helper.cu +++ b/cpp/src/c_api/capi_helper.cu @@ -74,6 +74,104 @@ template void sort_by_key(raft::handle_t const& handle, raft::device_span keys, raft::device_span values); +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights) +{ + rmm::device_uvector sort_indices(edge_srcs.size(), handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + sort_indices.begin(), + sort_indices.end(), + [offset_lasts = raft::device_span(offsets.begin() + 1, offsets.end()), + source_indices = raft::device_span(source_indices.data(), + source_indices.size())] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + offset_lasts.begin(), + thrust::upper_bound(thrust::seq, offset_lasts.begin(), offset_lasts.end(), i))); + return source_indices[idx]; + }); + source_indices.resize(0, handle.get_stream()); + source_indices.shrink_to_fit(handle.get_stream()); + + auto triplet_first = + thrust::make_zip_iterator(sort_indices.begin(), edge_srcs.begin(), edge_dsts.begin()); + if (edge_weights) { + thrust::sort_by_key(handle.get_thrust_policy(), + triplet_first, + triplet_first + sort_indices.size(), + (*edge_weights).begin()); + } else { + thrust::sort(handle.get_thrust_policy(), triplet_first, triplet_first + sort_indices.size()); + } + + thrust::tabulate( + handle.get_thrust_policy(), + offsets.begin() + 1, + offsets.end(), + [sort_indices = raft::device_span(sort_indices.data(), + sort_indices.size())] __device__(size_t i) { + return static_cast(thrust::distance( + sort_indices.begin(), + thrust::upper_bound(thrust::seq, sort_indices.begin(), sort_indices.end(), i))); + }); + + return std::make_tuple( + std::move(offsets), std::move(edge_srcs), std::move(edge_dsts), std::move(edge_weights)); +} + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& 
edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + } // namespace detail } // namespace c_api } // namespace cugraph diff --git a/cpp/src/c_api/capi_helper.hpp b/cpp/src/c_api/capi_helper.hpp index ce08e8d90d3..56401606477 100644 --- a/cpp/src/c_api/capi_helper.hpp +++ b/cpp/src/c_api/capi_helper.hpp @@ -36,6 +36,18 @@ void sort_by_key(raft::handle_t const& handle, raft::device_span keys, raft::device_span values); +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + } // namespace detail } // namespace c_api } // namespace cugraph diff --git a/cpp/src/c_api/extract_ego.cpp b/cpp/src/c_api/extract_ego.cpp index 931d58b5185..cbe07af2e77 100644 --- a/cpp/src/c_api/extract_ego.cpp +++ b/cpp/src/c_api/extract_ego.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -26,7 +27,10 @@ #include #include #include +#include +#include +#include #include namespace { @@ -91,9 +95,22 @@ struct extract_ego_functor : public cugraph::c_api::abstract_functor { source_vertices.size(), handle_.get_stream()); + std::optional> source_indices{std::nullopt}; + if constexpr (multi_gpu) { - source_vertices = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( - handle_, std::move(source_vertices)); + auto displacements = cugraph::host_scalar_allgather( + handle_.get_comms(), source_vertices.size(), handle_.get_stream()); + std::exclusive_scan( + displacements.begin(), displacements.end(), displacements.begin(), size_t{0}); + source_indices = rmm::device_uvector(source_vertices.size(), handle_.get_stream()); + cugraph::detail::sequence_fill(handle_.get_stream(), + (*source_indices).data(), + (*source_indices).size(), + displacements[handle_.get_comms().get_rank()]); + + std::tie(source_vertices, source_indices) = + cugraph::detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + handle_, std::move(source_vertices), std::move(*source_indices)); } cugraph::renumber_ext_vertices( @@ -130,6 +147,31 @@ struct extract_ego_functor : public cugraph::c_api::abstract_functor { graph_view.vertex_partition_range_lasts(), do_expensive_check_); + if constexpr (multi_gpu) { + auto recvcounts = cugraph::host_scalar_allgather( + handle_.get_comms(), (*source_indices).size(), handle_.get_stream()); + std::vector displacements(recvcounts.size()); + std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); + rmm::device_uvector allgathered_indices(displacements.back() + recvcounts.back(), + handle_.get_stream()); + cugraph::device_allgatherv(handle_.get_comms(), + (*source_indices).begin(), + allgathered_indices.begin(), + recvcounts, + displacements, + handle_.get_stream()); + source_indices = std::move(allgathered_indices); + + std::tie(edge_offsets, src, dst, wgt) = + cugraph::c_api::detail::reorder_extracted_egonets( + handle_, + std::move(*source_indices), + std::move(edge_offsets), + std::move(src), + 
std::move(dst), + std::move(wgt)); + } + result_ = new cugraph::c_api::cugraph_induced_subgraph_result_t{ new cugraph::c_api::cugraph_type_erased_device_array_t(src, graph_->vertex_type_), new cugraph::c_api::cugraph_type_erased_device_array_t(dst, graph_->vertex_type_), diff --git a/cpp/src/community/detail/mis_impl.cuh b/cpp/src/community/detail/mis_impl.cuh index bcd71af5a08..2659a982183 100644 --- a/cpp/src/community/detail/mis_impl.cuh +++ b/cpp/src/community/detail/mis_impl.cuh @@ -37,6 +37,8 @@ #include #include +#include + #include namespace cugraph { @@ -78,13 +80,13 @@ rmm::device_uvector maximal_independent_set( thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin()); // Set ranks of zero out-degree vetices to std::numeric_limits::lowest() - thrust::transform_if( - handle.get_thrust_policy(), - out_degrees.begin(), - out_degrees.end(), - ranks.begin(), - [] __device__(auto) { return std::numeric_limits::lowest(); }, - [] __device__(auto deg) { return deg == 0; }); + thrust::transform_if(handle.get_thrust_policy(), + out_degrees.begin(), + out_degrees.end(), + ranks.begin(), + cuda::proclaim_return_type( + [] __device__(auto) { return std::numeric_limits::lowest(); }), + [] __device__(auto deg) { return deg == 0; }); out_degrees.resize(0, handle.get_stream()); out_degrees.shrink_to_fit(handle.get_stream()); diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index ebaae498d04..eb874657f01 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -46,6 +46,8 @@ #include #include +#include + CUCO_DECLARE_BITWISE_COMPARABLE(float) CUCO_DECLARE_BITWISE_COMPARABLE(double) // FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched. 
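// ---- editor's illustrative sketch, not part of the patch above ----
// A recurring change throughout this patch wraps device lambdas handed to Thrust (transform
// iterators, thrust::transform, thrust::transform_if, ...) in cuda::proclaim_return_type<T>(...),
// which states the lambda's return type explicitly instead of having Thrust deduce it from an
// extended __device__ lambda. A minimal stand-alone example of the pattern (generic illustration,
// not code from cugraph; compile as a .cu file with nvcc and --extended-lambda):
#include <cuda/functional>  // cuda::proclaim_return_type
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

int main()
{
  // iterator producing 1*1, 2*2, 3*3, 4*4 on the fly on the device
  auto squared_first = thrust::make_transform_iterator(
    thrust::make_counting_iterator(1),
    cuda::proclaim_return_type<int>([] __device__(int x) { return x * x; }));
  int sum = thrust::reduce(thrust::device, squared_first, squared_first + 4, 0);
  return sum == 30 ? 0 : 1;  // 1 + 4 + 9 + 16
}
// The wrapper changes no behaviour; it only makes the return type explicit for Thrust/libcu++.
// ---- end of editor's sketch ----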
@@ -213,16 +215,17 @@ refine_clustering( : detail::edge_minor_property_view_t( louvain_assignment_of_vertices.data(), vertex_t{0}), *edge_weight_view, - [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) { - weight_t weighted_cut_contribution{0}; + cuda::proclaim_return_type( + [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) { + weight_t weighted_cut_contribution{0}; - if (src == dst) // self loop - weighted_cut_contribution = 0; - else if (src_cluster == dst_cluster) - weighted_cut_contribution = wt; + if (src == dst) // self loop + weighted_cut_contribution = 0; + else if (src_cluster == dst_cluster) + weighted_cut_contribution = wt; - return weighted_cut_contribution; - }, + return weighted_cut_contribution; + }), weight_t{0}, cugraph::reduce_op::plus{}, weighted_cut_of_vertices_to_louvain.begin()); @@ -243,13 +246,14 @@ refine_clustering( wcut_deg_and_cluster_vol_triple_begin, wcut_deg_and_cluster_vol_triple_end, singleton_and_connected_flags.begin(), - [resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) { + cuda::proclaim_return_type([resolution, total_edge_weight] __device__( + auto wcut_wdeg_and_louvain_volume) { auto wcut = thrust::get<0>(wcut_wdeg_and_louvain_volume); auto wdeg = thrust::get<1>(wcut_wdeg_and_louvain_volume); auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume); - return wcut > - (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight); - }); + return static_cast( + wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight)); + })); edge_src_property_t src_louvain_cluster_weight_cache(handle); edge_src_property_t src_cut_to_louvain_cache(handle); @@ -718,11 +722,12 @@ refine_clustering( vertices_in_mis.begin(), vertices_in_mis.end(), dst_vertices.begin(), - [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()), - v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { - auto dst = *(dst_first + v - v_first); - return dst; - }); + cuda::proclaim_return_type( + [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { + auto dst = *(dst_first + v - v_first); + return dst; + })); cugraph::resize_dataframe_buffer(gain_and_dst_output_pairs, 0, handle.get_stream()); cugraph::shrink_to_fit_dataframe_buffer(gain_and_dst_output_pairs, handle.get_stream()); diff --git a/cpp/src/community/leiden_impl.cuh b/cpp/src/community/leiden_impl.cuh index b6e20272de9..1e2b8f2ad44 100644 --- a/cpp/src/community/leiden_impl.cuh +++ b/cpp/src/community/leiden_impl.cuh @@ -568,17 +568,17 @@ void flatten_leiden_dendrogram(raft::handle_t const& handle, leiden_partition_at_level( handle, dendrogram, clustering, dendrogram.num_levels()); - rmm::device_uvector unique_cluster_ids(graph_view.number_of_vertices(), + rmm::device_uvector unique_cluster_ids(graph_view.local_vertex_partition_range_size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), clustering, - clustering + graph_view.number_of_vertices(), + clustering + graph_view.local_vertex_partition_range_size(), unique_cluster_ids.begin()); remove_duplicates(handle, unique_cluster_ids); relabel_cluster_ids( - handle, unique_cluster_ids, clustering, graph_view.number_of_vertices()); + handle, unique_cluster_ids, clustering, graph_view.local_vertex_partition_range_size()); } } // namespace detail diff --git a/cpp/src/detail/collect_local_vertex_values.cu 
b/cpp/src/detail/collect_local_vertex_values.cu index 9d5d2cb553b..795902dfd87 100644 --- a/cpp/src/detail/collect_local_vertex_values.cu +++ b/cpp/src/detail/collect_local_vertex_values.cu @@ -19,6 +19,8 @@ #include #include +#include + namespace cugraph { namespace detail { @@ -64,7 +66,8 @@ rmm::device_uvector collect_local_vertex_values_from_ext_vertex_value_p auto vertex_iterator = thrust::make_transform_iterator( d_vertices.begin(), - [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; }); + cuda::proclaim_return_type( + [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; })); d_local_values.resize(local_vertex_last - local_vertex_first, handle.get_stream()); thrust::fill( diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/detail/shuffle_vertices.cu index bc450ce3bbf..94729a770f7 100644 --- a/cpp/src/detail/shuffle_vertices.cu +++ b/cpp/src/detail/shuffle_vertices.cu @@ -200,6 +200,12 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( rmm::device_uvector&& vertices, rmm::device_uvector&& values); +template std::tuple, rmm::device_uvector> +shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& values); + template std::tuple, rmm::device_uvector> shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( raft::handle_t const& handle, @@ -224,6 +230,12 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( rmm::device_uvector&& vertices, rmm::device_uvector&& values); +template std::tuple, rmm::device_uvector> +shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& values); + template std::tuple, rmm::device_uvector> shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( raft::handle_t const& handle, diff --git a/cpp/src/generators/erdos_renyi_generator.cu b/cpp/src/generators/erdos_renyi_generator.cu index 6d847ae0bde..8448eeaf960 100644 --- a/cpp/src/generators/erdos_renyi_generator.cu +++ b/cpp/src/generators/erdos_renyi_generator.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,6 +28,8 @@ #include #include +#include + namespace cugraph { template @@ -42,12 +44,13 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, "Implementation cannot support specified value"); auto random_iterator = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [seed] __device__(size_t index) { + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([seed] __device__(size_t index) { thrust::default_random_engine rng(seed); thrust::uniform_real_distribution dist(0.0, 1.0); rng.discard(index); return dist(rng); - }); + })); size_t count = thrust::count_if(handle.get_thrust_policy(), random_iterator, @@ -69,13 +72,14 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, indices_v.begin(), indices_v.end(), thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), src_v.end())), - [num_vertices] __device__(size_t index) { - size_t src = index / num_vertices; - size_t dst = index % num_vertices; - - return thrust::make_tuple(static_cast(src), - static_cast(dst)); - }); + cuda::proclaim_return_type>( + [num_vertices] __device__(size_t index) { + size_t src = index / num_vertices; + size_t dst = index % num_vertices; + + return thrust::make_tuple(static_cast(src), + static_cast(dst)); + })); handle.sync_stream(); diff --git a/cpp/src/generators/simple_generators.cu b/cpp/src/generators/simple_generators.cu index 6dba63909c3..65647be5de0 100644 --- a/cpp/src/generators/simple_generators.cu +++ b/cpp/src/generators/simple_generators.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
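// ---- editor's illustrative sketch, not part of the patch above ----
// The generator lambdas above return a thrust::tuple, so cuda::proclaim_return_type has to spell
// out the full tuple type. A minimal fragment of that shape (num_vertices is a hypothetical value,
// not a variable from the patch; assumes the same includes as the earlier sketch plus
// <thrust/tuple.h>):
//
//   size_t num_vertices = 4;
//   auto src_dst_first  = thrust::make_transform_iterator(
//     thrust::make_counting_iterator(size_t{0}),
//     cuda::proclaim_return_type<thrust::tuple<int, int>>(
//       [num_vertices] __device__(size_t i) {
//         return thrust::make_tuple(static_cast<int>(i / num_vertices),
//                                   static_cast<int>(i % num_vertices));
//       }));
//   // e.g. src_dst_first[5] is (1, 1) when num_vertices == 4
// ---- end of editor's sketch ----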
@@ -27,6 +27,8 @@ #include #include +#include + #include namespace cugraph { @@ -264,23 +266,24 @@ generate_complete_graph_edgelist( auto transform_iter = thrust::make_transform_iterator( thrust::make_counting_iterator(0), - [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) { - size_t graph_index = index / (num_vertices * num_vertices); - size_t local_index = index % (num_vertices * num_vertices); - - vertex_t src = base_vertex_id + static_cast(local_index / num_vertices); - vertex_t dst = base_vertex_id + static_cast(local_index % num_vertices); - - if (src == dst) { - src = invalid_vertex; - dst = invalid_vertex; - } else { - src += (graph_index * num_vertices); - dst += (graph_index * num_vertices); - } - - return thrust::make_tuple(src, dst); - }); + cuda::proclaim_return_type>( + [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) { + size_t graph_index = index / (num_vertices * num_vertices); + size_t local_index = index % (num_vertices * num_vertices); + + vertex_t src = base_vertex_id + static_cast(local_index / num_vertices); + vertex_t dst = base_vertex_id + static_cast(local_index % num_vertices); + + if (src == dst) { + src = invalid_vertex; + dst = invalid_vertex; + } else { + src += (graph_index * num_vertices); + dst += (graph_index * num_vertices); + } + + return thrust::make_tuple(src, dst); + })); output_iterator = thrust::copy_if(handle.get_thrust_policy(), transform_iter, diff --git a/cpp/src/link_analysis/hits_impl.cuh b/cpp/src/link_analysis/hits_impl.cuh index 674046745b1..5cdf1b9dc6a 100644 --- a/cpp/src/link_analysis/hits_impl.cuh +++ b/cpp/src/link_analysis/hits_impl.cuh @@ -80,6 +80,7 @@ std::tuple hits(raft::handle_t const& handle, if (num_vertices == 0) { return std::make_tuple(diff_sum, final_iteration_count); } CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative."); + auto tolerance = static_cast(graph_view.number_of_vertices()) * epsilon; // Check validity of initial guess if supplied if (has_initial_hubs_guess && do_expensive_check) { @@ -171,7 +172,7 @@ std::tuple hits(raft::handle_t const& handle, std::swap(prev_hubs, curr_hubs); iter++; - if (diff_sum < epsilon) { + if (diff_sum < tolerance) { break; } else if (iter >= max_iterations) { CUGRAPH_FAIL("HITS failed to converge."); diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu index 5b1825656ff..414f1bdfa88 100644 --- a/cpp/src/mtmg/vertex_result.cu +++ b/cpp/src/mtmg/vertex_result.cu @@ -21,6 +21,7 @@ #include +#include #include namespace cugraph { @@ -91,10 +92,11 @@ rmm::device_uvector vertex_result_view_t::gather( auto vertex_partition = vertex_partition_device_view_t(vertex_partition_view); - auto iter = - thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) { + auto iter = thrust::make_transform_iterator( + local_vertices.begin(), + cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - }); + })); thrust::gather(handle.get_thrust_policy(), iter, @@ -111,7 +113,7 @@ rmm::device_uvector vertex_result_view_t::gather( vertex_gpu_ids.begin(), vertex_gpu_ids.end(), thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()), - [] __device__(int gpu) { return gpu; }, + thrust::identity{}, handle.get_stream()); // diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh 
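// ---- editor's illustrative sketch, not part of the patch above ----
// The hits_impl.cuh hunk above changes the convergence test from `diff_sum < epsilon` to
// `diff_sum < tolerance`, with tolerance = number_of_vertices * epsilon, so the threshold scales
// with graph size rather than tightening as the vertex count grows. A worked check with made-up
// numbers:
bool hits_converged_sketch()
{
  double epsilon      = 1e-5;
  double num_vertices = 1'000'000;
  double tolerance    = num_vertices * epsilon;  // 10.0
  double diff_sum     = 4.2;  // hypothetical sum of per-vertex |h_new - h_old|
  return diff_sum < tolerance;  // true under the new test; the old `diff_sum < epsilon` was false
}
// ---- end of editor's sketch ----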
index 5fee97790f1..4c5c43c7d1e 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -44,6 +44,8 @@ #include #include +#include + #include #include @@ -596,8 +598,9 @@ rmm::device_uvector get_sampling_index_without_replacement( multiplier_t{high_partition_over_sampling_K}), thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), - [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__( - size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }), + cuda::proclaim_return_type( + [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__( + size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })), handle.get_stream()); if (tmp_storage_bytes > d_tmp_storage.size()) { d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); @@ -615,8 +618,9 @@ rmm::device_uvector get_sampling_index_without_replacement( multiplier_t{high_partition_over_sampling_K}), thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), - [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__( - size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }), + cuda::proclaim_return_type( + [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__( + size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })), handle.get_stream()); // copy the neighbor indices back to sample_nbr_indices diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 0b6c6a554bb..1a7fc0130c4 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -51,6 +51,8 @@ #include #include +#include + #include #include #include @@ -591,10 +593,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, value_size = sizeof(T); } - auto avg_vertex_degree = graph_view.number_of_vertices() > 0 - ? (static_cast(graph_view.number_of_edges()) / - static_cast(graph_view.number_of_vertices())) - : double{0.0}; + auto avg_vertex_degree = + graph_view.number_of_vertices() > 0 + ? 
(static_cast(graph_view.compute_number_of_edges(handle)) / + static_cast(graph_view.number_of_vertices())) + : double{0.0}; num_streams = std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / @@ -940,16 +943,19 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, minor_init); auto value_first = thrust::make_transform_iterator( view.value_first(), - [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); }); - thrust::scatter( - handle.get_thrust_policy(), - value_first + (*minor_key_offsets)[i], - value_first + (*minor_key_offsets)[i + 1], - thrust::make_transform_iterator( - (*(view.keys())).begin() + (*minor_key_offsets)[i], - [key_first = graph_view.vertex_partition_range_first( - this_segment_vertex_partition_id)] __device__(auto key) { return key - key_first; }), - tx_buffer_first); + cuda::proclaim_return_type( + [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); + thrust::scatter(handle.get_thrust_policy(), + value_first + (*minor_key_offsets)[i], + value_first + (*minor_key_offsets)[i + 1], + thrust::make_transform_iterator( + (*(view.keys())).begin() + (*minor_key_offsets)[i], + cuda::proclaim_return_type( + [key_first = graph_view.vertex_partition_range_first( + this_segment_vertex_partition_id)] __device__(auto key) { + return key - key_first; + })), + tx_buffer_first); device_reduce(major_comm, tx_buffer_first, vertex_value_output_first, diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index b8621e122c6..0c7058cccb4 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -44,6 +44,8 @@ #include #include +#include + #include #include #include @@ -181,13 +183,14 @@ void update_edge_major_property(raft::handle_t const& handle, handle.get_stream()); auto bool_first = thrust::make_transform_iterator( (*edge_partition_keys)[i].begin(), - [rx_value_first, - v_first = graph_view.vertex_partition_range_first( - major_range_vertex_partition_id)] __device__(auto v) { - auto v_offset = v - v_first; - return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & - packed_bool_mask(v_offset)); - }); + cuda::proclaim_return_type( + [rx_value_first, + v_first = graph_view.vertex_partition_range_first( + major_range_vertex_partition_id)] __device__(auto v) { + auto v_offset = v - v_first; + return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & + packed_bool_mask(v_offset)); + })); pack_bools(handle, bool_first, bool_first + (*edge_partition_keys)[i].size(), @@ -202,8 +205,9 @@ void update_edge_major_property(raft::handle_t const& handle, auto v_offset_first = thrust::make_transform_iterator( (*edge_partition_keys)[i].begin(), - [v_first = graph_view.vertex_partition_range_first( - major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }); + cuda::proclaim_return_type( + [v_first = graph_view.vertex_partition_range_first( + major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; })); thrust::gather(handle.get_thrust_policy(), v_offset_first, v_offset_first + (*edge_partition_keys)[i].size(), @@ -312,21 +316,24 @@ void update_edge_major_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (packed_bool) { auto bool_first = thrust::make_transform_iterator( - vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) { + vertex_first, + 
cuda::proclaim_return_type([vertex_property_input_first, + vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); return static_cast( *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); - }); + })); pack_bools(handle, bool_first, bool_first + thrust::distance(vertex_first, vertex_last), rx_value_first); } else { - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + auto map_first = thrust::make_transform_iterator( + vertex_first, + cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - }); + })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) thrust::gather(handle.get_thrust_policy(), @@ -391,9 +398,10 @@ void update_edge_major_property(raft::handle_t const& handle, }); } else { auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [edge_partition] __device__(auto v) { + rx_vertices.begin(), + cuda::proclaim_return_type([edge_partition] __device__(auto v) { return edge_partition.major_offset_from_major_nocheck(v); - }); + })); // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), @@ -471,7 +479,8 @@ void update_edge_minor_property(raft::handle_t const& handle, bcast_size *= sizeof(typename EdgeMinorPropertyOutputWrapper::value_type); } auto num_concurrent_bcasts = - (static_cast(graph_view.number_of_edges() / comm_size) * sizeof(vertex_t)) / + (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * + sizeof(vertex_t)) / std::max(bcast_size, size_t{1}); num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1}); num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast(major_comm_size)); @@ -593,13 +602,14 @@ void update_edge_minor_property(raft::handle_t const& handle, auto bool_first = thrust::make_transform_iterator( (*edge_partition_keys).begin() + key_offsets[j], - [rx_value_first, - v_first = graph_view.vertex_partition_range_first( - minor_range_vertex_partition_id)] __device__(auto v) { - auto v_offset = v - v_first; - return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & - packed_bool_mask(v_offset)); - }); + cuda::proclaim_return_type( + [rx_value_first, + v_first = graph_view.vertex_partition_range_first( + minor_range_vertex_partition_id)] __device__(auto v) { + auto v_offset = v - v_first; + return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & + packed_bool_mask(v_offset)); + })); pack_unaligned_bools( handle, bool_first, @@ -611,10 +621,10 @@ void update_edge_minor_property(raft::handle_t const& handle, std::get>(key_offsets_or_rx_displacements); auto bool_first = thrust::make_transform_iterator( thrust::make_counting_iterator(vertex_t{0}), - [rx_value_first] __device__(vertex_t v_offset) { + cuda::proclaim_return_type([rx_value_first] __device__(vertex_t v_offset) { return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); - }); + })); pack_unaligned_bools( handle, bool_first, @@ -630,8 +640,9 @@ void update_edge_minor_property(raft::handle_t const& handle, auto v_offset_first = thrust::make_transform_iterator( (*edge_partition_keys).begin() + key_offsets[j], - 
[v_first = graph_view.vertex_partition_range_first( - minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }); + cuda::proclaim_return_type( + [v_first = graph_view.vertex_partition_range_first( + minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; })); thrust::gather(handle.get_thrust_policy(), v_offset_first, v_offset_first + (key_offsets[j + 1] - key_offsets[j]), @@ -718,21 +729,24 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (packed_bool) { auto bool_first = thrust::make_transform_iterator( - vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) { + vertex_first, + cuda::proclaim_return_type([vertex_property_input_first, + vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); return static_cast( *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); - }); + })); pack_bools(handle, bool_first, bool_first + thrust::distance(vertex_first, vertex_last), rx_value_first); } else { - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + auto map_first = thrust::make_transform_iterator( + vertex_first, + cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - }); + })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) thrust::gather(handle.get_thrust_policy(), @@ -799,9 +813,10 @@ void update_edge_minor_property(raft::handle_t const& handle, }); } else { auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [edge_partition] __device__(auto v) { + rx_vertices.begin(), + cuda::proclaim_return_type([edge_partition] __device__(auto v) { return edge_partition.minor_offset_from_minor_nocheck(v); - }); + })); // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 6a7334e9f1a..5a9ded02009 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -52,6 +52,8 @@ #include #include +#include + #include #include // FIXME: requirement for temporary std::getenv() #include @@ -378,7 +380,8 @@ struct random_walker_t { // scatter d_src_init_v to coalesced vertex vector: // - auto dlambda = [stride = max_depth_] __device__(auto indx) { return indx * stride; }; + auto dlambda = cuda::proclaim_return_type( + [stride = max_depth_] __device__(auto indx) { return indx * stride; }); // use the transform iterator as map: // @@ -539,10 +542,11 @@ struct random_walker_t { // delta = ptr_d_sizes[indx] - 1 // - auto dlambda = [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) { - auto delta = ptr_d_sizes[indx] - 1; - return ptr_d_coalesced[indx * stride + delta]; - }; + auto dlambda = cuda::proclaim_return_type( + [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) { + auto delta = ptr_d_sizes[indx] - 1; + return ptr_d_coalesced[indx * stride + delta]; + }); // use the transform iterator as map: // @@ -587,10 +591,11 @@ struct random_walker_t { { index_t const* ptr_d_sizes = original::raw_const_ptr(d_sizes); - auto dlambda = [stride, adjust, 
ptr_d_sizes] __device__(auto indx) { - auto delta = ptr_d_sizes[indx] - adjust - 1; - return indx * stride + delta; - }; + auto dlambda = + cuda::proclaim_return_type([stride, adjust, ptr_d_sizes] __device__(auto indx) { + auto delta = ptr_d_sizes[indx] - adjust - 1; + return indx * stride + delta; + }); // use the transform iterator as map: // diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 77d4f2d865f..852d82e78ab 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -40,6 +40,8 @@ #include #include +#include + #include namespace cugraph { @@ -1229,10 +1231,12 @@ renumber_and_compress_sampled_edgelist( auto pair_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), (*compressed_hops).begin()); auto value_pair_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), [num_hops] __device__(size_t i) { - return thrust::make_tuple(static_cast(i / num_hops), - static_cast(i % num_hops)); - }); + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type>( + [num_hops] __device__(size_t i) { + return thrust::make_tuple(static_cast(i / num_hops), + static_cast(i % num_hops)); + })); thrust::upper_bound(handle.get_thrust_policy(), pair_first, pair_first + (*compressed_label_indices).size(), diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 75862266789..6568b5e3b9e 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -133,8 +133,7 @@ update_local_sorted_unique_edge_majors_minors( graph_meta_t const& meta, std::vector> const& edge_partition_offsets, std::vector> const& edge_partition_indices, - std::optional>> const& edge_partition_dcs_nzd_vertices, - std::optional> const& edge_partition_dcs_nzd_vertex_counts) + std::optional>> const& edge_partition_dcs_nzd_vertices) { auto& comm = handle.get_comms(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); @@ -341,8 +340,7 @@ update_local_sorted_unique_edge_majors_minors( if (use_dcs) { thrust::copy(handle.get_thrust_policy(), (*edge_partition_dcs_nzd_vertices)[i].begin(), - (*edge_partition_dcs_nzd_vertices)[i].begin() + - (*edge_partition_dcs_nzd_vertex_counts)[i], + (*edge_partition_dcs_nzd_vertices)[i].end(), unique_edge_majors.begin() + cur_size); } @@ -390,7 +388,7 @@ graph_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + meta.number_of_vertices, meta.number_of_edges, meta.properties), partition_(meta.partition) { CUGRAPH_EXPECTS( @@ -408,14 +406,6 @@ graph_t((*edge_partition_dcs_nzd_vertices_).size()); - for (size_t i = 0; i < (*edge_partition_dcs_nzd_vertex_counts_).size(); ++i) { - (*edge_partition_dcs_nzd_vertex_counts_)[i] = - static_cast((*edge_partition_dcs_nzd_vertices_)[i].size()); - } - } // update local sorted unique edge sources/destinations (only if key, value pair will be used) @@ -432,8 +422,7 @@ graph_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, meta.number_of_vertices, static_cast(indices.size()), meta.properties), + meta.number_of_vertices, static_cast(indices.size()), meta.properties), offsets_(std::move(offsets)), indices_(std::move(indices)), segment_offsets_(meta.segment_offsets) diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 37a553dcdbd..da0ecc991df 100644 --- 
a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -51,6 +51,8 @@ #include #include +#include + #include #include #include @@ -70,44 +72,15 @@ struct out_of_range_t { __device__ bool operator()(vertex_t v) const { return (v < min) || (v >= max); } }; -template -std::vector update_edge_partition_edge_counts( - std::vector const& edge_partition_offsets, - std::optional> const& edge_partition_dcs_nzd_vertex_counts, - partition_t const& partition, - std::vector const& edge_partition_segment_offsets, - cudaStream_t stream) -{ - std::vector edge_partition_edge_counts(partition.number_of_local_edge_partitions(), 0); - auto use_dcs = edge_partition_dcs_nzd_vertex_counts.has_value(); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - auto [major_range_first, major_range_last] = partition.local_edge_partition_major_range(i); - auto segment_offset_size_per_partition = - edge_partition_segment_offsets.size() / edge_partition_offsets.size(); - raft::update_host( - &(edge_partition_edge_counts[i]), - edge_partition_offsets[i] + - (use_dcs - ? (edge_partition_segment_offsets[segment_offset_size_per_partition * i + - detail::num_sparse_segments_per_vertex_partition] + - (*edge_partition_dcs_nzd_vertex_counts)[i]) - : (major_range_last - major_range_first)), - 1, - stream); - } - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - return edge_partition_edge_counts; -} - // compute out-degrees (if we are internally storing edges in the sparse 2D matrix using sources as // major indices) or in-degrees (otherwise) template rmm::device_uvector compute_major_degrees( raft::handle_t const& handle, - std::vector const& edge_partition_offsets, - std::optional> const& edge_partition_dcs_nzd_vertices, - std::optional> const& edge_partition_dcs_nzd_vertex_counts, - std::optional> const& edge_partition_masks, + std::vector> const& edge_partition_offsets, + std::optional>> const& + edge_partition_dcs_nzd_vertices, + std::optional>> const& edge_partition_masks, partition_t const& partition, std::vector const& edge_partition_segment_offsets) { @@ -159,39 +132,39 @@ rmm::device_uvector compute_major_degrees( thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(major_hypersparse_first - major_range_first), local_degrees.begin(), - [offsets, masks] __device__(auto i) { + cuda::proclaim_return_type([offsets, masks] __device__(auto i) { auto local_degree = offsets[i + 1] - offsets[i]; if (masks) { local_degree = static_cast( - detail::count_set_bits(*masks, offsets[i], local_degree)); + detail::count_set_bits((*masks).begin(), offsets[i], local_degree)); } return local_degree; - }); + })); if (use_dcs) { - auto dcs_nzd_vertices = (*edge_partition_dcs_nzd_vertices)[i]; - auto dcs_nzd_vertex_count = (*edge_partition_dcs_nzd_vertex_counts)[i]; + auto dcs_nzd_vertices = (*edge_partition_dcs_nzd_vertices)[i]; thrust::fill(execution_policy, local_degrees.begin() + (major_hypersparse_first - major_range_first), local_degrees.begin() + (major_range_last - major_range_first), edge_t{0}); - thrust::for_each(execution_policy, - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(dcs_nzd_vertex_count), - [offsets, - dcs_nzd_vertices, - masks, - major_range_first, - major_hypersparse_first, - local_degrees = local_degrees.data()] __device__(auto i) { - auto major_idx = (major_hypersparse_first - major_range_first) + i; - auto local_degree = offsets[major_idx + 1] - offsets[major_idx]; - if (masks) { - local_degree = static_cast( - 
detail::count_set_bits(*masks, offsets[major_idx], local_degree)); - } - auto v = dcs_nzd_vertices[i]; - local_degrees[v - major_range_first] = local_degree; - }); + thrust::for_each( + execution_policy, + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(dcs_nzd_vertices.size())), + [offsets, + dcs_nzd_vertices, + masks, + major_range_first, + major_hypersparse_first, + local_degrees = local_degrees.data()] __device__(auto i) { + auto major_idx = (major_hypersparse_first - major_range_first) + i; + auto local_degree = offsets[major_idx + 1] - offsets[major_idx]; + if (masks) { + local_degree = static_cast( + detail::count_set_bits((*masks).begin(), offsets[major_idx], local_degree)); + } + auto v = dcs_nzd_vertices[i]; + local_degrees[v - major_range_first] = local_degree; + }); } minor_comm.reduce(local_degrees.data(), i == minor_comm_rank ? degrees.data() : static_cast(nullptr), @@ -207,10 +180,11 @@ rmm::device_uvector compute_major_degrees( // compute out-degrees (if we are internally storing edges in the sparse 2D matrix using sources as // major indices) or in-degrees (otherwise) template -rmm::device_uvector compute_major_degrees(raft::handle_t const& handle, - edge_t const* offsets, - std::optional masks, - vertex_t number_of_vertices) +rmm::device_uvector compute_major_degrees( + raft::handle_t const& handle, + raft::device_span offsets, + std::optional> masks, + vertex_t number_of_vertices) { rmm::device_uvector degrees(number_of_vertices, handle.get_stream()); thrust::tabulate( @@ -221,7 +195,7 @@ rmm::device_uvector compute_major_degrees(raft::handle_t const& handle, auto local_degree = offsets[i + 1] - offsets[i]; if (masks) { local_degree = - static_cast(detail::count_set_bits(*masks, offsets[i], local_degree)); + static_cast(detail::count_set_bits((*masks).begin(), offsets[i], local_degree)); } return local_degree; }); @@ -444,24 +418,16 @@ edge_t count_edge_partition_multi_edges( template graph_view_t>:: - graph_view_t(raft::handle_t const& handle, - std::vector const& edge_partition_offsets, - std::vector const& edge_partition_indices, - std::optional> const& edge_partition_dcs_nzd_vertices, - std::optional> const& edge_partition_dcs_nzd_vertex_counts, + graph_view_t(std::vector> const& edge_partition_offsets, + std::vector> const& edge_partition_indices, + std::optional>> const& + edge_partition_dcs_nzd_vertices, graph_view_meta_t meta) : detail::graph_base_t( - handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + meta.number_of_vertices, meta.number_of_edges, meta.properties), edge_partition_offsets_(edge_partition_offsets), edge_partition_indices_(edge_partition_indices), edge_partition_dcs_nzd_vertices_(edge_partition_dcs_nzd_vertices), - edge_partition_dcs_nzd_vertex_counts_(edge_partition_dcs_nzd_vertex_counts), - edge_partition_number_of_edges_( - update_edge_partition_edge_counts(edge_partition_offsets, - edge_partition_dcs_nzd_vertex_counts, - meta.partition, - meta.edge_partition_segment_offsets, - handle.get_stream())), partition_(meta.partition), edge_partition_segment_offsets_(meta.edge_partition_segment_offsets), local_sorted_unique_edge_srcs_(meta.local_sorted_unique_edge_srcs), @@ -479,51 +445,42 @@ graph_view_thandle_ptr()->get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size(); - auto use_dcs = edge_partition_dcs_nzd_vertices.has_value(); CUGRAPH_EXPECTS(edge_partition_offsets.size() == edge_partition_indices.size(), "Internal Error: edge_partition_offsets.size() and " 
"edge_partition_indices.size() should coincide."); - CUGRAPH_EXPECTS(edge_partition_dcs_nzd_vertex_counts.has_value() == use_dcs, - "edge_partition_dcs_nzd_vertices.has_value() and " - "edge_partition_dcs_nzd_vertex_counts.has_value() should coincide"); - CUGRAPH_EXPECTS(!use_dcs || ((*edge_partition_dcs_nzd_vertices).size() == - (*edge_partition_dcs_nzd_vertex_counts).size()), - "Internal Error: edge_partition_dcs_nzd_vertices.size() and " - "edge_partition_dcs_nzd_vertex_counts.size() should coincide (if used)."); CUGRAPH_EXPECTS( !use_dcs || ((*edge_partition_dcs_nzd_vertices).size() == edge_partition_offsets.size()), "Internal Error: edge_partition_dcs_nzd_vertices.size() should coincide " "with edge_partition_offsets.size() (if used)."); - CUGRAPH_EXPECTS(edge_partition_offsets.size() == static_cast(minor_comm_size), - "Internal Error: erroneous edge_partition_offsets.size()."); - - CUGRAPH_EXPECTS( - meta.edge_partition_segment_offsets.size() == - minor_comm_size * (detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 3 : 2)), - "Internal Error: invalid edge_partition_segment_offsets.size()."); + CUGRAPH_EXPECTS(meta.edge_partition_segment_offsets.size() == + edge_partition_offsets.size() * + (detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 3 : 2)), + "Internal Error: invalid edge_partition_segment_offsets.size()."); // skip expensive error checks as this function is only called by graph_t } template graph_view_t>:: - graph_view_t(raft::handle_t const& handle, - edge_t const* offsets, - vertex_t const* indices, + graph_view_t(raft::device_span offsets, + raft::device_span indices, graph_view_meta_t meta) : detail::graph_base_t( - handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + meta.number_of_vertices, meta.number_of_edges, meta.properties), offsets_(offsets), indices_(indices), segment_offsets_(meta.segment_offsets) { // cheap error checks + CUGRAPH_EXPECTS(offsets.size() == static_cast(meta.number_of_vertices + 1), + "Internal Error: offsets.size() returns an invalid value."); + CUGRAPH_EXPECTS(indices.size() == static_cast(meta.number_of_edges), + "Internal Error: indices.size() returns an invalid value."); + CUGRAPH_EXPECTS( !(meta.segment_offsets).has_value() || ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), @@ -532,19 +489,62 @@ graph_view_t +edge_t graph_view_t>:: + compute_number_of_edges(raft::handle_t const& handle) const +{ + if (this->has_edge_mask()) { + edge_t ret{}; + auto value_firsts = (*(this->edge_mask_view())).value_firsts(); + auto edge_counts = (*(this->edge_mask_view())).edge_counts(); + for (size_t i = 0; i < value_firsts.size(); ++i) { + ret += static_cast(detail::count_set_bits(handle, value_firsts[i], edge_counts[i])); + } + ret = + host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream()); + return ret; + } else { + return this->number_of_edges_; + } +} + +template +edge_t graph_view_t>:: + compute_number_of_edges(raft::handle_t const& handle) const +{ + if (this->has_edge_mask()) { + auto value_firsts = (*(this->edge_mask_view())).value_firsts(); + auto edge_counts = (*(this->edge_mask_view())).edge_counts(); + assert(value_firsts.size() == 0); + assert(edge_counts.size() == 0); + return static_cast(detail::count_set_bits(handle, value_firsts[0], edge_counts[0])); + } else { + return this->number_of_edges_; + } +} + template rmm::device_uvector graph_view_t>:: compute_in_degrees(raft::handle_t const& handle) const { if 
(store_transposed) { + std::optional>> edge_partition_masks{ + std::nullopt}; + if (this->has_edge_mask()) { + edge_partition_masks = + std::vector>(this->edge_partition_offsets_.size()); + auto value_firsts = (*(this->edge_mask_view())).value_firsts(); + auto edge_counts = (*(this->edge_mask_view())).edge_counts(); + for (size_t i = 0; i < (*edge_partition_masks).size(); ++i) { + (*edge_partition_masks)[i] = + raft::device_span(value_firsts[i], edge_counts[i]); + } + } return compute_major_degrees(handle, this->edge_partition_offsets_, this->edge_partition_dcs_nzd_vertices_, - this->edge_partition_dcs_nzd_vertex_counts_, - this->has_edge_mask() - ? std::make_optional((*(this->edge_mask_view())).value_firsts()) - : std::nullopt, + edge_partition_masks, this->partition_, this->edge_partition_segment_offsets_); } else { @@ -559,12 +559,14 @@ graph_view_toffsets_, - this->has_edge_mask() ? std::make_optional((*(this->edge_mask_view())).value_firsts()[0]) - : std::nullopt, - this->local_vertex_partition_range_size()); + return compute_major_degrees(handle, + this->offsets_, + this->has_edge_mask() + ? std::make_optional(raft::device_span( + (*(this->edge_mask_view())).value_firsts()[0], + (*(this->edge_mask_view())).edge_counts()[0])) + : std::nullopt, + this->local_vertex_partition_range_size()); } else { CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); return compute_minor_degrees(handle, *this); @@ -580,13 +582,22 @@ graph_view_thas_edge_mask()), "unimplemented."); return compute_minor_degrees(handle, *this); } else { + std::optional>> edge_partition_masks{ + std::nullopt}; + if (this->has_edge_mask()) { + edge_partition_masks = + std::vector>(this->edge_partition_offsets_.size()); + auto value_firsts = (*(this->edge_mask_view())).value_firsts(); + auto edge_counts = (*(this->edge_mask_view())).edge_counts(); + for (size_t i = 0; i < (*edge_partition_masks).size(); ++i) { + (*edge_partition_masks)[i] = + raft::device_span(value_firsts[i], edge_counts[i]); + } + } return compute_major_degrees(handle, this->edge_partition_offsets_, this->edge_partition_dcs_nzd_vertices_, - this->edge_partition_dcs_nzd_vertex_counts_, - this->has_edge_mask() - ? std::make_optional((*(this->edge_mask_view())).value_firsts()) - : std::nullopt, + edge_partition_masks, this->partition_, this->edge_partition_segment_offsets_); } @@ -601,12 +612,14 @@ graph_view_thas_edge_mask()), "unimplemented."); return compute_minor_degrees(handle, *this); } else { - return compute_major_degrees( - handle, - this->offsets_, - this->has_edge_mask() ? std::make_optional((*(this->edge_mask_view())).value_firsts()[0]) - : std::nullopt, - this->local_vertex_partition_range_size()); + return compute_major_degrees(handle, + this->offsets_, + this->has_edge_mask() + ? std::make_optional(raft::device_span( + (*(this->edge_mask_view())).value_firsts()[0], + (*(this->edge_mask_view())).edge_counts()[0])) + : std::nullopt, + this->local_vertex_partition_range_size()); } } diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 6a0c5a4a675..cc69cb5f67f 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -432,7 +432,7 @@ rmm::device_uvector od_shortest_distances( // 1. 
check input arguments auto const num_vertices = graph_view.number_of_vertices(); - auto const num_edges = graph_view.number_of_edges(); + auto const num_edges = graph_view.compute_number_of_edges(handle); CUGRAPH_EXPECTS(num_vertices != 0 || (origins.size() == 0 && destinations.size() == 0), "Invalid input argument: the input graph is empty but origins.size() > 0 or " @@ -1049,7 +1049,7 @@ rmm::device_uvector od_shortest_distances( CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); auto const num_vertices = graph_view.number_of_vertices(); - auto const num_edges = graph_view.number_of_edges(); + auto const num_edges = graph_view.compute_number_of_edges(handle); weight_t average_vertex_degree = static_cast(num_edges) / static_cast(num_vertices); diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index c78fa3839e2..5a6d536c6f5 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -93,7 +93,7 @@ void sssp(raft::handle_t const& handle, "GraphViewType should support the push model."); auto const num_vertices = push_graph_view.number_of_vertices(); - auto const num_edges = push_graph_view.number_of_edges(); + auto const num_edges = push_graph_view.compute_number_of_edges(handle); if (num_vertices == 0) { return; } // implements the Near-Far Pile method in diff --git a/cpp/src/utilities/cugraph_ops_utils.hpp b/cpp/src/utilities/cugraph_ops_utils.hpp index 9aea4183866..880a2c8d104 100644 --- a/cpp/src/utilities/cugraph_ops_utils.hpp +++ b/cpp/src/utilities/cugraph_ops_utils.hpp @@ -30,7 +30,7 @@ ops::graph::csc get_graph( ops::graph::csc graph; graph.n_src_nodes = gview.number_of_vertices(); graph.n_dst_nodes = gview.number_of_vertices(); - graph.n_indices = gview.number_of_edges(); + graph.n_indices = gview.local_edge_partition_view().number_of_edges(); // FIXME this is sufficient for now, but if there is a fast (cached) way // of getting max degree, use that instead graph.dst_max_in_degree = std::numeric_limits::max(); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9c88bc179e..e9c6dc446af 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -257,7 +257,7 @@ ConfigureTest(BALANCED_TEST community/balanced_edge_test.cpp) ################################################################################################### # - EGO tests ------------------------------------------------------------------------------------- -ConfigureTest(EGO_TEST community/egonet_test.cpp GPUS 1 PERCENT 75) +ConfigureTest(EGONET_TEST community/egonet_test.cpp GPUS 1 PERCENT 75) ################################################################################################### # - FORCE ATLAS 2 tests -------------------------------------------------------------------------- @@ -531,7 +531,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ############################################################################################### # - MG LOUVAIN tests -------------------------------------------------------------------------- - ConfigureTestMG(MG_EGO_TEST community/mg_egonet_test.cu) + ConfigureTestMG(MG_EGONET_TEST community/mg_egonet_test.cu) ############################################################################################### # - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------ diff --git a/cpp/tests/c_api/hits_test.c b/cpp/tests/c_api/hits_test.c index c275d883d11..1ebd4f82a51 100644 --- a/cpp/tests/c_api/hits_test.c +++ b/cpp/tests/c_api/hits_test.c @@ -163,7 +163,7 
@@ int test_hits() weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // hits wants store_transposed = TRUE @@ -195,7 +195,7 @@ int test_hits_with_transpose() weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // Hits wants store_transposed = TRUE @@ -232,7 +232,7 @@ int test_hits_with_initial() vertex_t h_initial_vertices[] = {0, 1, 2, 3, 4}; weight_t h_initial_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; return generic_hits_test(h_src, diff --git a/cpp/tests/c_api/mg_create_graph_test.c b/cpp/tests/c_api/mg_create_graph_test.c index fec319d1881..7156647b025 100644 --- a/cpp/tests/c_api/mg_create_graph_test.c +++ b/cpp/tests/c_api/mg_create_graph_test.c @@ -175,18 +175,18 @@ int test_create_mg_graph_multiple_edge_lists(const cugraph_resource_handle_t* ha int my_rank = cugraph_resource_handle_get_rank(handle); int comm_size = cugraph_resource_handle_get_comm_size(handle); - size_t local_num_vertices = (num_vertices + comm_size - 1) / comm_size; + size_t local_num_vertices = num_vertices / comm_size; size_t local_start_vertex = my_rank * local_num_vertices; - size_t local_num_edges = (num_edges + comm_size - 1) / comm_size; + size_t local_num_edges = num_edges / comm_size; size_t local_start_edge = my_rank * local_num_edges; - local_num_edges = (local_num_edges < (num_edges - local_start_edge)) ? local_num_edges : (num_edges - local_start_edge); - local_num_vertices = (local_num_vertices < (num_vertices - local_start_vertex)) ? local_num_vertices : (num_vertices - local_start_vertex); + local_num_edges = (my_rank != (comm_size - 1)) ? local_num_edges : (num_edges - local_start_edge); + local_num_vertices = (my_rank != (comm_size - 1)) ? local_num_vertices : (num_vertices - local_start_vertex); for (size_t i = 0 ; i < num_local_arrays ; ++i) { - size_t vertex_count = (local_num_vertices + num_local_arrays - 1) / num_local_arrays; + size_t vertex_count = local_num_vertices / num_local_arrays; size_t vertex_start = i * vertex_count; - vertex_count = (vertex_count < (local_num_vertices - vertex_start)) ? vertex_count : (local_num_vertices - vertex_start); + vertex_count = (i != (num_local_arrays - 1)) ? vertex_count : (local_num_vertices - vertex_start); ret_code = cugraph_type_erased_device_array_create(handle, vertex_count, vertex_tid, vertices + i, &ret_error); @@ -363,18 +363,18 @@ int test_create_mg_graph_multiple_edge_lists_multi_edge(const cugraph_resource_h int my_rank = cugraph_resource_handle_get_rank(handle); int comm_size = cugraph_resource_handle_get_comm_size(handle); - size_t local_num_vertices = (num_vertices + comm_size - 1) / comm_size; + size_t local_num_vertices = num_vertices / comm_size; size_t local_start_vertex = my_rank * local_num_vertices; - size_t local_num_edges = (num_edges + comm_size - 1) / comm_size; + size_t local_num_edges = num_edges / comm_size; size_t local_start_edge = my_rank * local_num_edges; - local_num_edges = (local_num_edges < (num_edges - local_start_edge)) ? local_num_edges : (num_edges - local_start_edge); - local_num_vertices = (local_num_vertices < (num_vertices - local_start_vertex)) ? 
local_num_vertices : (num_vertices - local_start_vertex); + local_num_edges = (my_rank != (comm_size - 1)) ? local_num_edges : (num_edges - local_start_edge); + local_num_vertices = (my_rank != (comm_size - 1)) ? local_num_vertices : (num_vertices - local_start_vertex); for (size_t i = 0 ; i < num_local_arrays ; ++i) { size_t vertex_count = (local_num_vertices + num_local_arrays - 1) / num_local_arrays; size_t vertex_start = i * vertex_count; - vertex_count = (vertex_count < (local_num_vertices - vertex_start)) ? vertex_count : (local_num_vertices - vertex_start); + vertex_count = (i != (num_local_arrays - 1)) ? vertex_count : (local_num_vertices - vertex_start); ret_code = cugraph_type_erased_device_array_create(handle, vertex_count, vertex_tid, vertices + i, &ret_error); diff --git a/cpp/tests/c_api/mg_hits_test.c b/cpp/tests/c_api/mg_hits_test.c index 87371093613..3e10bfc05d6 100644 --- a/cpp/tests/c_api/mg_hits_test.c +++ b/cpp/tests/c_api/mg_hits_test.c @@ -171,7 +171,7 @@ int test_hits(const cugraph_resource_handle_t* handle) weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // hits wants store_transposed = TRUE @@ -203,7 +203,7 @@ int test_hits_with_transpose(const cugraph_resource_handle_t* handle) weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // Hits wants store_transposed = TRUE diff --git a/cpp/tests/link_analysis/hits_test.cpp b/cpp/tests/link_analysis/hits_test.cpp index d0e77769034..cf35356bb76 100644 --- a/cpp/tests/link_analysis/hits_test.cpp +++ b/cpp/tests/link_analysis/hits_test.cpp @@ -52,9 +52,11 @@ std::tuple, std::vector, double, size_t> hits_re size_t max_iterations, std::optional starting_hub_values, bool normalized, - double tolerance) + double epsilon) { CUGRAPH_EXPECTS(num_vertices > 1, "number of vertices expected to be non-zero"); + auto tolerance = static_cast(num_vertices) * epsilon; + std::vector prev_hubs(num_vertices, result_t{1.0} / num_vertices); std::vector prev_authorities(num_vertices, result_t{1.0} / num_vertices); std::vector curr_hubs(num_vertices); @@ -127,8 +129,8 @@ std::tuple, std::vector, double, size_t> hits_re } struct Hits_Usecase { - bool check_correctness{true}; bool check_initial_input{false}; + bool check_correctness{true}; }; template @@ -175,8 +177,8 @@ class Tests_Hits : public ::testing::TestWithParam d_hubs(graph_view.local_vertex_partition_range_size(), handle.get_stream()); @@ -201,7 +203,7 @@ class Tests_Hits : public ::testing::TestWithParam h_cugraph_hits{}; if (renumber) { @@ -246,8 +248,7 @@ class Tests_Hits : public ::testing::TestWithParam(graph_view.number_of_vertices())) * - threshold_ratio; // skip comparison for low hits vertices (lowly ranked vertices) + 1e-6; // skip comparison for low hits vertices (lowly ranked vertices) auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { return std::abs(lhs - rhs) <= std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); @@ -294,14 +295,17 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_File, ::testing::Combine( // enable correctness checks - ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}), + ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, 
true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"), cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_Hits_Rmat, // enable correctness checks - ::testing::Combine(::testing::Values(Hits_Usecase{true, false}, + ::testing::Combine(::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -315,7 +319,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_File, ::testing::Combine( // disable correctness checks - ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}), + ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( @@ -327,7 +331,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_Rmat, // disable correctness checks for large graphs ::testing::Combine( - ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}), + ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp index cf95d03681d..5c89bafd08e 100644 --- a/cpp/tests/link_analysis/mg_hits_test.cpp +++ b/cpp/tests/link_analysis/mg_hits_test.cpp @@ -33,8 +33,8 @@ #include struct Hits_Usecase { - bool check_correctness{true}; bool check_initial_input{false}; + bool check_correctness{true}; }; template @@ -81,7 +81,7 @@ class Tests_MGHits : public ::testing::TestWithParam d_mg_hubs(mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream()); @@ -110,7 +110,7 @@ class Tests_MGHits : public ::testing::TestWithParam(mg_graph_view.number_of_vertices())) * - threshold_ratio; // skip comparison for low Hits verties (lowly ranked - // vertices) + 1e-6; // skip comparison for low Hits verties (lowly ranked vertices) auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { return std::abs(lhs - rhs) < std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); @@ -274,7 +272,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGHits_File, ::testing::Combine( // enable correctness checks - ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}), + ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -285,7 +283,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGHits_Rmat, ::testing::Combine( // enable correctness checks - ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}), + ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( @@ -297,7 +295,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_MGHits_Rmat, ::testing::Combine( // disable correctness checks for large graphs - ::testing::Values(Hits_Usecase{false, false}), + ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}), 
::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu index 2b9e9aafa3f..80aa34b68ae 100644 --- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu @@ -301,8 +301,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE sg_graph_view.local_edge_partition_view().offsets().begin(), sg_graph_view.local_edge_partition_view().offsets().end(), sg_offsets.begin()); - rmm::device_uvector sg_indices(sg_graph_view.number_of_edges(), - handle_->get_stream()); + rmm::device_uvector sg_indices( + sg_graph_view.local_edge_partition_view().indices().size(), handle_->get_stream()); thrust::copy(handle_->get_thrust_policy(), sg_graph_view.local_edge_partition_view().indices().begin(), sg_graph_view.local_edge_partition_view().indices().end(), diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu index e5267d75ac2..6be735c3482 100644 --- a/cpp/tests/sampling/sampling_post_processing_test.cu +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -38,6 +38,8 @@ #include #include +#include + struct SamplingPostProcessing_Usecase { size_t num_labels{}; size_t num_seeds_per_label{}; @@ -318,15 +320,16 @@ bool check_renumber_map_invariants( auto renumbered_merged_vertex_first = thrust::make_transform_iterator( merged_vertices.begin(), - [sorted_org_vertices = - raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t major) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; - }); + cuda::proclaim_return_type( + [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), + sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + })); thrust::reduce_by_key(handle.get_thrust_policy(), sort_key_first, @@ -1020,23 +1023,24 @@ class Tests_SamplingPostProcessing ? this_label_output_edgelist_srcs.begin() : this_label_output_edgelist_dsts.begin()) + old_size, - [offsets = raft::device_span(d_offsets.data(), d_offsets.size()), - nzd_vertices = - renumbered_and_compressed_nzd_vertices - ? thrust::make_optional>( - (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, - (offset_end_offset - offset_start_offset) - 1) - : thrust::nullopt, - base_v] __device__(size_t i) { - auto idx = static_cast(thrust::distance( - offsets.begin() + 1, - thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); - if (nzd_vertices) { - return (*nzd_vertices)[idx]; - } else { - return base_v + static_cast(idx); - } - }); + cuda::proclaim_return_type( + [offsets = raft::device_span(d_offsets.data(), d_offsets.size()), + nzd_vertices = + renumbered_and_compressed_nzd_vertices + ? 
thrust::make_optional>( + (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, + (offset_end_offset - offset_start_offset) - 1) + : thrust::nullopt, + base_v] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); + if (nzd_vertices) { + return (*nzd_vertices)[idx]; + } else { + return base_v + static_cast(idx); + } + })); thrust::copy(handle.get_thrust_policy(), renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0], renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(), diff --git a/cpp/tests/structure/mg_select_random_vertices_test.cpp b/cpp/tests/structure/mg_select_random_vertices_test.cpp index e49e1ebcb99..8392a6831ca 100644 --- a/cpp/tests/structure/mg_select_random_vertices_test.cpp +++ b/cpp/tests/structure/mg_select_random_vertices_test.cpp @@ -79,6 +79,8 @@ class Tests_MGSelectRandomVertices // std::vector with_replacement_flags = {true, false}; + std::vector sort_vertices_flags = {true, false}; + { // Generate distributed vertex set to sample from std::srand((unsigned)std::chrono::duration_cast( @@ -107,80 +109,95 @@ class Tests_MGSelectRandomVertices ? select_random_vertices_usecase.select_count : std::rand() % (num_of_elements_in_given_set + 1); - for (int idx = 0; idx < with_replacement_flags.size(); idx++) { - bool with_replacement = with_replacement_flags[idx]; - auto d_sampled_vertices = - cugraph::select_random_vertices(*handle_, - mg_graph_view, - std::make_optional(raft::device_span{ - d_given_set.data(), d_given_set.size()}), - rng_state, - select_count, - with_replacement, - true); - - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - - auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices); - - if (select_random_vertices_usecase.check_correctness) { - if (!with_replacement) { - std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); - - auto nr_duplicates = - std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()), - h_sampled_vertices.end()); - - ASSERT_EQ(nr_duplicates, 0); + for (int i = 0; i < with_replacement_flags.size(); i++) { + for (int j = 0; j < sort_vertices_flags.size(); j++) { + bool with_replacement = with_replacement_flags[i]; + bool sort_vertices = sort_vertices_flags[j]; + + auto d_sampled_vertices = + cugraph::select_random_vertices(*handle_, + mg_graph_view, + std::make_optional(raft::device_span{ + d_given_set.data(), d_given_set.size()}), + rng_state, + select_count, + with_replacement, + sort_vertices); + + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices); + + if (select_random_vertices_usecase.check_correctness) { + if (!with_replacement) { + std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); + + auto nr_duplicates = + std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()), + h_sampled_vertices.end()); + + ASSERT_EQ(nr_duplicates, 0); + } + + std::sort(h_given_set.begin(), h_given_set.end()); + if (sort_vertices) { + assert(std::is_sorted(h_sampled_vertices.begin(), h_sampled_vertices.end())); + } else { + std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); + } + std::for_each( + h_sampled_vertices.begin(), h_sampled_vertices.end(), [&h_given_set](vertex_t v) { + ASSERT_TRUE(std::binary_search(h_given_set.begin(), h_given_set.end(), v)); + }); } - - std::sort(h_given_set.begin(), h_given_set.end()); - std::for_each( - 
h_sampled_vertices.begin(), h_sampled_vertices.end(), [&h_given_set](vertex_t v) { - ASSERT_TRUE(std::binary_search(h_given_set.begin(), h_given_set.end(), v)); - }); } } - } - - // - // Test sampling from [0, V) - // - - for (int idx = 0; idx < with_replacement_flags.size(); idx++) { - bool with_replacement = false; - auto d_sampled_vertices = cugraph::select_random_vertices( - *handle_, - mg_graph_view, - std::optional>{std::nullopt}, - rng_state, - select_random_vertices_usecase.select_count, - with_replacement, - true); - - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - - auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices); - if (select_random_vertices_usecase.check_correctness) { - if (!with_replacement) { - std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); - - auto nr_duplicates = - std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()), - h_sampled_vertices.end()); - - ASSERT_EQ(nr_duplicates, 0); + // + // Test sampling from [0, V) + // + + for (int i = 0; i < with_replacement_flags.size(); i++) { + for (int j = 0; j < sort_vertices_flags.size(); j++) { + bool with_replacement = with_replacement_flags[i]; + bool sort_vertices = sort_vertices_flags[j]; + + auto d_sampled_vertices = cugraph::select_random_vertices( + *handle_, + mg_graph_view, + std::optional>{std::nullopt}, + rng_state, + select_random_vertices_usecase.select_count, + with_replacement, + sort_vertices); + + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices); + + if (select_random_vertices_usecase.check_correctness) { + if (!with_replacement) { + std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); + + auto nr_duplicates = + std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()), + h_sampled_vertices.end()); + + ASSERT_EQ(nr_duplicates, 0); + } + if (sort_vertices) { + assert(std::is_sorted(h_sampled_vertices.begin(), h_sampled_vertices.end())); + } + + auto vertex_first = mg_graph_view.local_vertex_partition_range_first(); + auto vertex_last = mg_graph_view.local_vertex_partition_range_last(); + std::for_each(h_sampled_vertices.begin(), + h_sampled_vertices.end(), + [vertex_first, vertex_last](vertex_t v) { + ASSERT_TRUE((v >= vertex_first) && (v < vertex_last)); + }); + } } - - auto vertex_first = mg_graph_view.local_vertex_partition_range_first(); - auto vertex_last = mg_graph_view.local_vertex_partition_range_last(); - - std::for_each(h_sampled_vertices.begin(), - h_sampled_vertices.end(), - [vertex_first, vertex_last](vertex_t v) { - ASSERT_TRUE((v >= vertex_first) && (v < vertex_last)); - }); } } } diff --git a/dependencies.yaml b/dependencies.yaml index baa08c37413..b5c1fb2fa2d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -511,6 +511,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -536,6 +537,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -561,6 +563,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt 
file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -586,6 +589,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -611,6 +615,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -636,6 +641,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: @@ -661,6 +667,7 @@ dependencies: packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py index 9c9dcdb43bb..bef3a023b93 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py @@ -13,6 +13,7 @@ from .gat_conv import GATConv from .gatv2_conv import GATv2Conv +from .hetero_gat_conv import HeteroGATConv from .rgcn_conv import RGCNConv from .sage_conv import SAGEConv from .transformer_conv import TransformerConv @@ -20,6 +21,7 @@ __all__ = [ "GATConv", "GATv2Conv", + "HeteroGATConv", "RGCNConv", "SAGEConv", "TransformerConv", diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py new file mode 100644 index 00000000000..3b717552a96 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py @@ -0,0 +1,265 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union +from collections import defaultdict + +from cugraph.utilities.utils import import_optional +from pylibcugraphops.pytorch.operators import mha_gat_n2n + +from .base import BaseConv + +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") + + +class HeteroGATConv(BaseConv): + r"""The graph attentional operator on heterogeneous graphs, where a separate + `GATConv` is applied on the homogeneous graph for each edge type. 
Compared + with directly wrapping `GATConv`s with `HeteroConv`, `HeteroGATConv` fuses + all the linear transformation associated with each node type together into 1 + GEMM call, to improve the performance on GPUs. + + Parameters + ---------- + in_channels : int or Dict[str, int]) + Size of each input sample of every node type. + + out_channels : int + Size of each output sample. + + node_types : List[str] + List of Node types. + + edge_types : List[Tuple[str, str, str]] + List of Edge types. + + heads : int, optional (default=1) + Number of multi-head-attentions. + + concat : bool, optional (default=True): + If set to :obj:`False`, the multi-head attentions are averaged instead + of concatenated. + + negative_slope : float, optional (default=0.2) + LeakyReLU angle of the negative slope. + + bias : bool, optional (default=True) + If set to :obj:`False`, the layer will not learn an additive bias. + + aggr : str, optional (default="sum") + The aggregation scheme to use for grouping node embeddings generated by + different relations. Choose from "sum", "mean", "min", "max". + """ + + def __init__( + self, + in_channels: Union[int, dict[str, int]], + out_channels: int, + node_types: list[str], + edge_types: list[tuple[str, str, str]], + heads: int = 1, + concat: bool = True, + negative_slope: float = 0.2, + bias: bool = True, + aggr: str = "sum", + ): + major, minor, patch = torch_geometric.__version__.split(".")[:3] + pyg_version = tuple(map(int, [major, minor, patch])) + if pyg_version < (2, 4, 0): + raise RuntimeError(f"{self.__class__.__name__} requires pyg >= 2.4.0.") + + super().__init__() + + if isinstance(in_channels, int): + in_channels = dict.fromkeys(node_types, in_channels) + self.in_channels = in_channels + self.out_channels = out_channels + + self.node_types = node_types + self.edge_types = edge_types + self.num_heads = heads + self.concat_heads = concat + + self.negative_slope = negative_slope + self.aggr = aggr + + self.relations_per_ntype = defaultdict(lambda: ([], [])) + + lin_weights = dict.fromkeys(self.node_types) + attn_weights = dict.fromkeys(self.edge_types) + biases = dict.fromkeys(self.edge_types) + + ParameterDict = torch_geometric.nn.parameter_dict.ParameterDict + + for edge_type in self.edge_types: + src_type, _, dst_type = edge_type + self.relations_per_ntype[src_type][0].append(edge_type) + if src_type != dst_type: + self.relations_per_ntype[dst_type][1].append(edge_type) + + attn_weights[edge_type] = torch.empty( + 2 * self.num_heads * self.out_channels + ) + + if bias and concat: + biases[edge_type] = torch.empty(self.num_heads * out_channels) + elif bias: + biases[edge_type] = torch.empty(out_channels) + else: + biases[edge_type] = None + + for ntype in self.node_types: + n_src_rel = len(self.relations_per_ntype[ntype][0]) + n_dst_rel = len(self.relations_per_ntype[ntype][1]) + n_rel = n_src_rel + n_dst_rel + + lin_weights[ntype] = torch.empty( + (n_rel * self.num_heads * self.out_channels, self.in_channels[ntype]) + ) + + self.lin_weights = ParameterDict(lin_weights) + self.attn_weights = ParameterDict(attn_weights) + + if bias: + self.bias = ParameterDict(biases) + else: + self.register_parameter("bias", None) + + self.reset_parameters() + + def split_tensors( + self, x_fused_dict: dict[str, torch.Tensor], dim: int + ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: + """Split fused tensors into chunks based on edge types. + + Parameters + ---------- + x_fused_dict : dict[str, torch.Tensor] + A dictionary to hold node feature for each node type. 
The key is + node type; the value is a fused tensor that account for all + relations for that node type. + + dim : int + Dimension along which to split the fused tensor. + + Returns + ------- + x_src_dict : dict[str, torch.Tensor] + A dictionary to hold source node feature for each relation graph. + + x_dst_dict : dict[str, torch.Tensor] + A dictionary to hold destination node feature for each relation graph. + """ + x_src_dict = dict.fromkeys(self.edge_types) + x_dst_dict = dict.fromkeys(self.edge_types) + + for ntype, t in x_fused_dict.items(): + n_src_rel = len(self.relations_per_ntype[ntype][0]) + n_dst_rel = len(self.relations_per_ntype[ntype][1]) + n_rel = n_src_rel + n_dst_rel + t_list = torch.chunk(t, chunks=n_rel, dim=dim) + + for i, src_rel in enumerate(self.relations_per_ntype[ntype][0]): + x_src_dict[src_rel] = t_list[i] + + for i, dst_rel in enumerate(self.relations_per_ntype[ntype][1]): + x_dst_dict[dst_rel] = t_list[i + n_src_rel] + + return x_src_dict, x_dst_dict + + def reset_parameters(self, seed: Optional[int] = None): + if seed is not None: + torch.manual_seed(seed) + + w_src, w_dst = self.split_tensors(self.lin_weights, dim=0) + + for edge_type in self.edge_types: + src_type, _, dst_type = edge_type + + # lin_src + torch_geometric.nn.inits.glorot(w_src[edge_type]) + + # lin_dst + if src_type != dst_type: + torch_geometric.nn.inits.glorot(w_dst[edge_type]) + + # attn_weights + torch_geometric.nn.inits.glorot( + self.attn_weights[edge_type].view(-1, self.num_heads, self.out_channels) + ) + + # bias + if self.bias is not None: + torch_geometric.nn.inits.zeros(self.bias[edge_type]) + + def forward( + self, + x_dict: dict[str, torch.Tensor], + edge_index_dict: dict[tuple[str, str, str], torch.Tensor], + ) -> dict[str, torch.Tensor]: + feat_dict = dict.fromkeys(x_dict.keys()) + + for ntype, x in x_dict.items(): + feat_dict[ntype] = x @ self.lin_weights[ntype].T + + x_src_dict, x_dst_dict = self.split_tensors(feat_dict, dim=1) + + out_dict = defaultdict(list) + + for edge_type, edge_index in edge_index_dict.items(): + src_type, _, dst_type = edge_type + + csc = BaseConv.to_csc( + edge_index, (x_dict[src_type].size(0), x_dict[dst_type].size(0)) + ) + + if src_type == dst_type: + graph = self.get_cugraph( + csc, + bipartite=False, + ) + out = mha_gat_n2n( + x_src_dict[edge_type], + self.attn_weights[edge_type], + graph, + num_heads=self.num_heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat_heads, + ) + + else: + graph = self.get_cugraph( + csc, + bipartite=True, + ) + out = mha_gat_n2n( + (x_src_dict[edge_type], x_dst_dict[edge_type]), + self.attn_weights[edge_type], + graph, + num_heads=self.num_heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat_heads, + ) + + if self.bias is not None: + out = out + self.bias[edge_type] + + out_dict[dst_type].append(out) + + for key, value in out_dict.items(): + out_dict[key] = torch_geometric.nn.conv.hetero_conv.group(value, self.aggr) + + return out_dict diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py index 1512901822a..30994289f9c 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py @@ -284,3 +284,32 @@ def basic_pyg_graph_2(): ) size = (10, 10) return edge_index, size + + +@pytest.fixture +def sample_pyg_hetero_data(): + torch.manual_seed(12345) + raw_data_dict = { + "v0": torch.randn(6, 3), + "v1": torch.randn(7, 2), + 
"v2": torch.randn(5, 4), + ("v2", "e0", "v1"): torch.tensor([[0, 2, 2, 4, 4], [4, 3, 6, 0, 1]]), + ("v1", "e1", "v1"): torch.tensor( + [[0, 2, 2, 2, 3, 5, 5], [4, 0, 4, 5, 3, 0, 1]] + ), + ("v0", "e2", "v0"): torch.tensor([[0, 2, 2, 3, 5, 5], [1, 1, 5, 1, 1, 2]]), + ("v1", "e3", "v2"): torch.tensor( + [[0, 1, 1, 2, 4, 5, 6], [1, 2, 3, 1, 2, 2, 2]] + ), + ("v0", "e4", "v2"): torch.tensor([[1, 1, 3, 3, 4, 4], [1, 4, 1, 4, 0, 3]]), + } + + # create a nested dictionary to facilitate PyG's HeteroData construction + hetero_data_dict = {} + for key, value in raw_data_dict.items(): + if isinstance(key, tuple): + hetero_data_dict[key] = {"edge_index": value} + else: + hetero_data_dict[key] = {"x": value} + + return hetero_data_dict diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py new file mode 100644 index 00000000000..1c841a17df7 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py @@ -0,0 +1,132 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from cugraph_pyg.nn import HeteroGATConv as CuGraphHeteroGATConv +from cugraph.utilities.utils import import_optional, MissingModule + +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") + +ATOL = 1e-6 + + +@pytest.mark.cugraph_ops +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif( + isinstance(torch_geometric, MissingModule), reason="torch_geometric not available" +) +@pytest.mark.parametrize("heads", [1, 3, 10]) +@pytest.mark.parametrize("aggr", ["sum", "mean"]) +def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): + major, minor, patch = torch_geometric.__version__.split(".")[:3] + pyg_version = tuple(map(int, [major, minor, patch])) + if pyg_version < (2, 4, 0): + pytest.skip("Skipping HeteroGATConv test") + + from torch_geometric.data import HeteroData + from torch_geometric.nn import HeteroConv, GATConv + + device = torch.device("cuda:0") + data = HeteroData(sample_pyg_hetero_data).to(device) + + in_channels_dict = {k: v.size(1) for k, v in data.x_dict.items()} + out_channels = 2 + + convs_dict = {} + kwargs1 = dict(heads=heads, add_self_loops=False, bias=False) + for edge_type in data.edge_types: + src_t, _, dst_t = edge_type + in_channels_src, in_channels_dst = data.x_dict[src_t].size(-1), data.x_dict[ + dst_t + ].size(-1) + if src_t == dst_t: + convs_dict[edge_type] = GATConv(in_channels_src, out_channels, **kwargs1) + else: + convs_dict[edge_type] = GATConv( + (in_channels_src, in_channels_dst), out_channels, **kwargs1 + ) + + conv1 = HeteroConv(convs_dict, aggr=aggr).to(device) + kwargs2 = dict( + heads=heads, + aggr=aggr, + node_types=data.node_types, + edge_types=data.edge_types, + bias=False, + ) + conv2 = CuGraphHeteroGATConv(in_channels_dict, out_channels, **kwargs2).to(device) + + # copy over linear and attention weights + w_src, w_dst = 
conv2.split_tensors(conv2.lin_weights, dim=0) + with torch.no_grad(): + for edge_type in conv2.edge_types: + src_t, _, dst_t = edge_type + w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] + if w_dst[edge_type] is not None: + w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] + + conv2.attn_weights[edge_type][: heads * out_channels] = conv1.convs[ + edge_type + ].att_src.data.flatten() + conv2.attn_weights[edge_type][heads * out_channels :] = conv1.convs[ + edge_type + ].att_dst.data.flatten() + + out1 = conv1(data.x_dict, data.edge_index_dict) + out2 = conv2(data.x_dict, data.edge_index_dict) + + for node_type in data.node_types: + assert torch.allclose(out1[node_type], out2[node_type], atol=ATOL) + + loss1 = 0 + loss2 = 0 + for node_type in data.node_types: + loss1 += out1[node_type].mean() + loss2 += out2[node_type].mean() + + loss1.backward() + loss2.backward() + + # check gradient w.r.t attention weights + out_dim = heads * out_channels + for edge_type in conv2.edge_types: + assert torch.allclose( + conv1.convs[edge_type].att_src.grad.flatten(), + conv2.attn_weights[edge_type].grad[:out_dim], + atol=ATOL, + ) + assert torch.allclose( + conv1.convs[edge_type].att_dst.grad.flatten(), + conv2.attn_weights[edge_type].grad[out_dim:], + atol=ATOL, + ) + + # check gradient w.r.t linear weights + grad_lin_weights_ref = dict.fromkeys(out1.keys()) + for node_t, (rels_as_src, rels_as_dst) in conv2.relations_per_ntype.items(): + grad_list = [] + for rel_t in rels_as_src: + grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone()) + for rel_t in rels_as_dst: + grad_list.append(conv1.convs[rel_t].lin_dst.weight.grad.clone()) + assert len(grad_list) > 0 + grad_lin_weights_ref[node_t] = torch.vstack(grad_list) + + for node_type in conv2.lin_weights: + assert torch.allclose( + grad_lin_weights_ref[node_type], + conv2.lin_weights[node_type].grad, + atol=ATOL, + ) diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index a1ec12c6e07..99936b23a8c 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -38,7 +38,6 @@ project( option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files" OFF ) -option(CUGRAPH_BUILD_WHEELS "Whether this build is generating a Python wheel." 
OFF) option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON) if(NOT USE_CUGRAPH_OPS) @@ -59,22 +58,14 @@ if(NOT cugraph_FOUND) set(BUILD_TESTS OFF) set(BUILD_CUGRAPH_MG_TESTS OFF) set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF) - - set(_exclude_from_all "") - if(CUGRAPH_BUILD_WHEELS) - # Statically link dependencies if building wheels - set(CUDA_STATIC_RUNTIME ON) - set(USE_RAFT_STATIC ON) - set(CUGRAPH_COMPILE_RAFT_LIB ON) - set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) - set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) - set(ALLOW_CLONE_CUGRAPH_OPS ON) - - # Don't install the cuML C++ targets into wheels - set(_exclude_from_all EXCLUDE_FROM_ALL) - endif() - - add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all}) + set(CUDA_STATIC_RUNTIME ON) + set(USE_RAFT_STATIC ON) + set(CUGRAPH_COMPILE_RAFT_LIB ON) + set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) + set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) + set(ALLOW_CLONE_CUGRAPH_OPS ON) + + add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL) set(cython_lib_dir cugraph) install(TARGETS cugraph DESTINATION ${cython_lib_dir}) diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index a94aa9f0448..de6f20bc439 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -45,12 +45,12 @@ repos: - id: pyupgrade args: [--py39-plus] - repo: https://github.com/psf/black - rev: 23.10.1 + rev: 23.11.0 hooks: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.3 + rev: v0.1.7 hooks: - id: ruff args: [--fix-only, --show-fixes] # --unsafe-fixes] @@ -62,7 +62,7 @@ repos: additional_dependencies: &flake8_dependencies # These versions need updated manually - flake8==6.1.0 - - flake8-bugbear==23.9.16 + - flake8-bugbear==23.12.2 - flake8-simplify==0.21.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 @@ -77,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.3 + rev: v0.1.7 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py index 63841b15bd5..e4947491555 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py @@ -15,13 +15,13 @@ centrality, community, components, - shortest_paths, link_analysis, + shortest_paths, ) from .bipartite import complete_bipartite_graph from .centrality import * from .components import * from .core import * from .isolate import * -from .shortest_paths import * from .link_analysis import * +from .shortest_paths import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py index 1d3e762b4fd..25b9b39554b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py @@ -17,15 +17,14 @@ import numpy as np from nx_cugraph.generators._utils import _create_using_class, _number_and_nodes -from nx_cugraph.utils import index_dtype, networkx_algorithm, nodes_or_number +from nx_cugraph.utils import index_dtype, networkx_algorithm __all__ = [ "complete_bipartite_graph", ] -@nodes_or_number([0, 1]) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=[0, 1]) def complete_bipartite_graph(n1, n2, create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): 
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py index 2219388bc58..390598d070e 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/core.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/core.py @@ -31,7 +31,11 @@ def k_truss(G, k): if is_nx := isinstance(G, nx.Graph): G = nxcg.from_networkx(G, preserve_all_attrs=True) if nxcg.number_of_selfloops(G) > 0: - raise nx.NetworkXError( + if nx.__version__[:3] <= "3.2": + exc_class = nx.NetworkXError + else: + exc_class = nx.NetworkXNotImplemented + raise exc_class( "Input graph has self loops which is not permitted; " "Consider using G.remove_edges_from(nx.selfloop_edges(G))." ) diff --git a/python/nx-cugraph/nx_cugraph/generators/classic.py b/python/nx-cugraph/nx_cugraph/generators/classic.py index b196c232320..4213e6dd2a0 100644 --- a/python/nx-cugraph/nx_cugraph/generators/classic.py +++ b/python/nx-cugraph/nx_cugraph/generators/classic.py @@ -19,7 +19,7 @@ import nx_cugraph as nxcg -from ..utils import _get_int_dtype, index_dtype, networkx_algorithm, nodes_or_number +from ..utils import _get_int_dtype, index_dtype, networkx_algorithm from ._utils import ( _IS_NX32_OR_LESS, _common_small_graph, @@ -86,8 +86,7 @@ def circular_ladder_graph(n, create_using=None): return _ladder_graph(n, create_using, is_circular=True) -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0) def complete_graph(n, create_using=None): n, nodes = _number_and_nodes(n) if n < 3: @@ -143,8 +142,7 @@ def complete_multipartite_graph(*subset_sizes): ) -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0) def cycle_graph(n, create_using=None): n, nodes = _number_and_nodes(n) graph_class, inplace = _create_using_class(create_using) @@ -174,8 +172,7 @@ def cycle_graph(n, create_using=None): return G -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0) def empty_graph(n=0, create_using=None, default=nx.Graph): n, nodes = _number_and_nodes(n) graph_class, inplace = _create_using_class(create_using, default=default) @@ -242,8 +239,7 @@ def ladder_graph(n, create_using=None): return _ladder_graph(n, create_using) -@nodes_or_number([0, 1]) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=[0, 1]) def lollipop_graph(m, n, create_using=None): # Like complete_graph then path_graph orig_m, unused_nodes_m = m @@ -283,8 +279,7 @@ def null_graph(create_using=None): return _common_small_graph(0, None, create_using) -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0) def path_graph(n, create_using=None): n, nodes = _number_and_nodes(n) graph_class, inplace = _create_using_class(create_using) @@ -304,8 +299,7 @@ def path_graph(n, create_using=None): return G -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0) def star_graph(n, create_using=None): orig_n, orig_nodes = n n, nodes = _number_and_nodes(n) @@ -329,8 +323,7 @@ def star_graph(n, create_using=None): return G -@nodes_or_number([0, 1]) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=[0, 1]) def tadpole_graph(m, n, create_using=None): orig_m, unused_nodes_m = m orig_n, unused_nodes_n = n @@ -382,8 +375,7 @@ def turan_graph(n, r): return complete_multipartite_graph(*partitions) -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0) def wheel_graph(n, create_using=None): n, nodes = _number_and_nodes(n) graph_class, inplace = _create_using_class(create_using) diff --git 
a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py index 0048aee51bb..a0dbfcec890 100644 --- a/python/nx-cugraph/nx_cugraph/utils/decorators.py +++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py @@ -15,6 +15,7 @@ from functools import partial, update_wrapper from textwrap import dedent +import networkx as nx from networkx.utils.decorators import nodes_or_number, not_implemented_for from nx_cugraph.interface import BackendInterface @@ -47,10 +48,18 @@ def __new__( *, name: str | None = None, extra_params: dict[str, str] | str | None = None, + nodes_or_number: list[int] | int | None = None, ): if func is None: - return partial(networkx_algorithm, name=name, extra_params=extra_params) + return partial( + networkx_algorithm, + name=name, + extra_params=extra_params, + nodes_or_number=nodes_or_number, + ) instance = object.__new__(cls) + if nodes_or_number is not None and nx.__version__[:3] > "3.2": + func = nx.utils.decorators.nodes_or_number(nodes_or_number)(func) # update_wrapper sets __wrapped__, which will be used for the signature update_wrapper(instance, func) instance.__defaults__ = func.__defaults__ @@ -76,6 +85,8 @@ def __new__( setattr(BackendInterface, instance.name, instance) # Set methods so they are in __dict__ instance._can_run = instance._can_run + if nodes_or_number is not None and nx.__version__[:3] <= "3.2": + instance = nx.utils.decorators.nodes_or_number(nodes_or_number)(instance) return instance def _can_run(self, func): diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt index 7d5dc790ad0..e1250cb2edb 100644 --- a/python/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/CMakeLists.txt @@ -38,7 +38,6 @@ project( option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files" OFF ) -option(CUGRAPH_BUILD_WHEELS "Whether we're building a wheel for pypi" OFF) option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON) if(NOT USE_CUGRAPH_OPS) @@ -59,22 +58,14 @@ if (NOT cugraph_FOUND) set(BUILD_TESTS OFF) set(BUILD_CUGRAPH_MG_TESTS OFF) set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF) - - set(_exclude_from_all "") - if(CUGRAPH_BUILD_WHEELS) - # Statically link dependencies if building wheels - set(CUDA_STATIC_RUNTIME ON) - set(USE_RAFT_STATIC ON) - set(CUGRAPH_COMPILE_RAFT_LIB ON) - set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) - set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) - set(ALLOW_CLONE_CUGRAPH_OPS ON) - - # Don't install the cuML C++ targets into wheels - set(_exclude_from_all EXCLUDE_FROM_ALL) - endif() - - add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all}) + set(CUDA_STATIC_RUNTIME ON) + set(USE_RAFT_STATIC ON) + set(CUGRAPH_COMPILE_RAFT_LIB ON) + set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) + set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) + set(ALLOW_CLONE_CUGRAPH_OPS ON) + + add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL) set(cython_lib_dir pylibcugraph) install(TARGETS cugraph DESTINATION ${cython_lib_dir})
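
The graph_view changes earlier in this diff drop the cached per-partition edge counts and the base-class number_of_edges() accessor in favor of compute_number_of_edges(handle), which, when an edge mask is attached, derives the edge count by counting the set bits of the mask rather than returning the stored total. A minimal NumPy sketch of that idea, assuming a little-endian host bitmap packed into 32-bit words with one bit per edge; count_masked_edges is a hypothetical helper for illustration, not the cuGraph device implementation (which uses detail::count_set_bits on the GPU):

import numpy as np

def count_masked_edges(mask_words: np.ndarray, num_edges: int) -> int:
    """Count edges whose bit is set in a packed edge-mask bitmap."""
    # Expand the 32-bit words into individual bits and keep only the first
    # num_edges bits; anything beyond that is padding in the last word.
    bits = np.unpackbits(mask_words.view(np.uint8), bitorder="little")[:num_edges]
    return int(bits.sum())

# 40 edges stored in two 32-bit words, all kept except edges 3 and 17.
mask = np.full(2, 0xFFFFFFFF, dtype=np.uint32)
mask[0] &= ~np.uint32(1 << 3)
mask[0] &= ~np.uint32(1 << 17)
print(count_masked_edges(mask, 40))  # 38

The new HeteroGATConv layer added under python/cugraph-pyg stacks the per-relation linear projections of each node type into one fused weight matrix, so a single GEMM per node type produces every relation's projected features before one mha_gat_n2n call is dispatched per edge type. A minimal usage sketch, assuming a CUDA device and a cugraph-pyg installation with pylibcugraphops available; the node types, feature sizes, and edge indices below are made up for illustration and only mirror the shape of the sample_pyg_hetero_data fixture:

import torch
from cugraph_pyg.nn import HeteroGATConv

device = torch.device("cuda:0")

node_types = ["v0", "v1"]
edge_types = [("v0", "e0", "v1"), ("v1", "e1", "v0")]

# Toy features: four "v0" nodes with 3 features, five "v1" nodes with 2 features.
x_dict = {
    "v0": torch.randn(4, 3, device=device),
    "v1": torch.randn(5, 2, device=device),
}
# COO edge indices per relation (row 0: source nodes, row 1: destination nodes).
edge_index_dict = {
    ("v0", "e0", "v1"): torch.tensor([[0, 1, 3], [2, 0, 4]], device=device),
    ("v1", "e1", "v0"): torch.tensor([[0, 2, 4], [1, 3, 0]], device=device),
}

conv = HeteroGATConv(
    in_channels={"v0": 3, "v1": 2},
    out_channels=2,
    node_types=node_types,
    edge_types=edge_types,
    heads=3,
    aggr="sum",
).to(device)

out_dict = conv(x_dict, edge_index_dict)
# With concat=True (the default), each destination node type gets
# heads * out_channels = 6 output features.
print({k: v.shape for k, v in out_dict.items()})

Because the fused weight matrix of a node type concatenates the slices of all relations that touch it, the projection is computed once and then carved back out per edge type by split_tensors; this is what avoids the many small GEMMs a plain HeteroConv-of-GATConv stack would launch.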