Primitives & BFS performance improvements (#4751)

This PR includes multiple updates to cut peak memory usage in graph creation and improve performance of BFS on scale-free graphs. * Add a bitmap for non-zero local degree vertices in the hypersparse region; this information can be used to quickly filter out locally zero degree vertices which don't need to be processed in multiple instances. * Store (global-)degree offsets for vertices in the hypersparse region; this information can used to quickly identify the vertices with a certain global degree (e.g. for global degree 1 vertices, we can skip inter-GPU reduction as we know each vertex has only one neighbor). * Skip kernel invocations in computing edge counts if the vertex list is empty. * Add asynchronous functions to compute edge counts. This helps in preventing unnecessary serialization when we can process multiple such functions concurrently. * Replace rmm::exec_policy with rmm::exec_policy_nosync in multiple places; the former enforces stream synchronization at the end. The latter does not. * Enforce cache line alignment in NCCL communication in multiple places (NCCL communication performance is significantly affected by cache line alignment, often leading to 30-40% or more differences). * For primitives working on a subset of vertices, broadcast a vertex list using a bitmap if the vertex frontier size is large. If the vertex frontier size is small (in case vertex_t is 8B and the local vertex partition range can fit into 4B), use vertex offsets instead of vertices to cut communication volume. * Merge multiple host scalar communication function calls to a single one. * Increase multi-stream concurrency in detail::extract_transform_e & detail::per_v_transform_reduce_e * Multiple optimizations in template specialization (for update_major == true && reduce_op == any && key type is vertex && working on a subset of vertices) in detail::per_v_transform_reduce_e (this includes pre-processing vertices with non-zero local degrees; so we don't need to process such vertices using multiple GPUs, pre-filtering of zero local degree vertices, allreduce communication to reduce shuffle communication volumes, and special treatment of global degree 1 vertices, and so on). * Multiple optimizations & specializations in detail::fill_edge_minor_property that works on a subset of vertices (this includes kernel fusion, specialization for bitmap properties including direct broadcast to the property buffer and special treatments for vertex partition boundaries, and so on). * Added multiple optimizations & specializations in transform_reduce_v_frontier_outgoing_e (especially for reduce_op::any and to cut communication volumes and to filter out (key, value) pairs that won't contribute to the final results). * Multiple low-level optimizations in direction optimizing BFS (including approximations in determining between bottom -up and top-down). * Multiple optimizations to cut peak memory usage in graph creation. Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) URL: #4751
rapidsai · Nov 25, 2024 · d714243 · d714243
1 parent 3478fb5
commit d714243
Show file tree

Hide file tree

Showing 57 changed files with 11,526 additions and 4,057 deletions.
diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh
@@ -204,6 +204,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   edge_partition_device_view_t(edge_partition_view_t<vertex_t, edge_t, multi_gpu> view)
     : detail::edge_partition_device_view_base_t<vertex_t, edge_t>(view.offsets(), view.indices()),
       dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())),
+      dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())),
       major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())),
       major_range_first_(view.major_range_first()),
       major_range_last_(view.major_range_last()),
@@ -218,6 +219,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                                           MajorIterator major_last,
                                           rmm::cuda_stream_view stream) const
   {
+    if (thrust::distance(major_first, major_last) == 0) return size_t{0};
     return dcs_nzd_vertices_ ? thrust::transform_reduce(
                                  rmm::exec_policy(stream),
                                  major_first,
@@ -250,12 +252,72 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                                  thrust::plus<size_t>());
   }
 
+  template <typename MajorIterator>
+  __host__ void compute_number_of_edges_async(MajorIterator major_first,
+                                              MajorIterator major_last,
+                                              raft::device_span<size_t> count /* size = 1 */,
+                                              rmm::cuda_stream_view stream) const
+  {
+    if (thrust::distance(major_first, major_last) == 0) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
+    }
+
+    rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
+    size_t tmp_storage_bytes{0};
+
+    if (dcs_nzd_vertices_) {
+      auto local_degree_first = thrust::make_transform_iterator(
+        major_first,
+        detail::local_degree_op_t<vertex_t,
+                                  edge_t,
+                                  size_t /* no limit on majors.size(), so edge_t can overflow */,
+                                  multi_gpu,
+                                  true>{
+          this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_});
+      cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
+                             tmp_storage_bytes,
+                             local_degree_first,
+                             count.data(),
+                             thrust::distance(major_first, major_last),
+                             stream);
+      d_tmp_storage.resize(tmp_storage_bytes, stream);
+      cub::DeviceReduce::Sum(d_tmp_storage.data(),
+                             tmp_storage_bytes,
+                             local_degree_first,
+                             count.data(),
+                             thrust::distance(major_first, major_last),
+                             stream);
+    } else {
+      auto local_degree_first = thrust::make_transform_iterator(
+        major_first,
+        detail::local_degree_op_t<vertex_t,
+                                  edge_t,
+                                  size_t /* no limit on majors.size(), so edge_t can overflow */,
+                                  multi_gpu,
+                                  false>{
+          this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */});
+      cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
+                             tmp_storage_bytes,
+                             local_degree_first,
+                             count.data(),
+                             thrust::distance(major_first, major_last),
+                             stream);
+      d_tmp_storage.resize(tmp_storage_bytes, stream);
+      cub::DeviceReduce::Sum(d_tmp_storage.data(),
+                             tmp_storage_bytes,
+                             local_degree_first,
+                             count.data(),
+                             thrust::distance(major_first, major_last),
+                             stream);
+    }
+  }
+
   __host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
     if (dcs_nzd_vertices_) {
       assert(major_hypersparse_first_);
-      thrust::transform(rmm::exec_policy(stream),
+      thrust::transform(rmm::exec_policy_nosync(stream),
                         thrust::make_counting_iterator(this->major_range_first()),
                         thrust::make_counting_iterator(this->major_range_last()),
                         local_degrees.begin(),
@@ -266,7 +328,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                           major_hypersparse_first_.value_or(vertex_t{0})});
     } else {
       thrust::transform(
-        rmm::exec_policy(stream),
+        rmm::exec_policy_nosync(stream),
         thrust::make_counting_iterator(this->major_range_first()),
         thrust::make_counting_iterator(this->major_range_last()),
         local_degrees.begin(),
@@ -284,7 +346,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
     rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
     if (dcs_nzd_vertices_) {
       assert(major_hypersparse_first_);
-      thrust::transform(rmm::exec_policy(stream),
+      thrust::transform(rmm::exec_policy_nosync(stream),
                         major_first,
                         major_last,
                         local_degrees.begin(),
@@ -295,7 +357,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                           major_hypersparse_first_.value_or(vertex_t{0})});
     } else {
       thrust::transform(
-        rmm::exec_policy(stream),
+        rmm::exec_policy_nosync(stream),
         major_first,
         major_last,
         local_degrees.begin(),
@@ -311,6 +373,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                                                     MajorIterator major_last,
                                                     rmm::cuda_stream_view stream) const
   {
+    if (thrust::distance(major_first, major_last) == 0) return size_t{0};
     return dcs_nzd_vertices_ ? thrust::transform_reduce(
                                  rmm::exec_policy(stream),
                                  major_first,
@@ -355,7 +418,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
     if (dcs_nzd_vertices_) {
       assert(major_hypersparse_first_);
       thrust::transform(
-        rmm::exec_policy(stream),
+        rmm::exec_policy_nosync(stream),
         thrust::make_counting_iterator(this->major_range_first()),
         thrust::make_counting_iterator(this->major_range_last()),
         local_degrees.begin(),
@@ -368,7 +431,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
             mask_first});
     } else {
       thrust::transform(
-        rmm::exec_policy(stream),
+        rmm::exec_policy_nosync(stream),
         thrust::make_counting_iterator(this->major_range_first()),
         thrust::make_counting_iterator(this->major_range_last()),
         local_degrees.begin(),
@@ -394,7 +457,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
     if (dcs_nzd_vertices_) {
       assert(major_hypersparse_first_);
       thrust::transform(
-        rmm::exec_policy(stream),
+        rmm::exec_policy_nosync(stream),
         major_first,
         major_last,
         local_degrees.begin(),
@@ -407,7 +470,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
             mask_first});
     } else {
       thrust::transform(
-        rmm::exec_policy(stream),
+        rmm::exec_policy_nosync(stream),
         major_first,
         major_last,
         local_degrees.begin(),
@@ -515,6 +578,8 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
     return minor_range_first_ + minor_offset;
   }
 
+  // FIxME: better return thrust::optional<raft::device_span<vertex_t const>> for consistency (see
+  // dcs_nzd_range_bitmap())
   __host__ __device__ thrust::optional<vertex_t const*> dcs_nzd_vertices() const
   {
     return dcs_nzd_vertices_ ? thrust::optional<vertex_t const*>{(*dcs_nzd_vertices_).data()}
@@ -528,10 +593,20 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
              : thrust::nullopt;
   }
 
+  __host__ __device__ thrust::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap()
+    const
+  {
+    return dcs_nzd_range_bitmap_
+             ? thrust::make_optional<raft::device_span<uint32_t const>>(
+                 (*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size())
+             : thrust::nullopt;
+  }
+
  private:
   // should be trivially copyable to device
 
   thrust::optional<raft::device_span<vertex_t const>> dcs_nzd_vertices_{thrust::nullopt};
+  thrust::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap_{thrust::nullopt};
   thrust::optional<vertex_t> major_hypersparse_first_{thrust::nullopt};
 
   vertex_t major_range_first_{0};
@@ -558,6 +633,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                                           MajorIterator major_last,
                                           rmm::cuda_stream_view stream) const
   {
+    if (thrust::distance(major_first, major_last) == 0) return size_t{0};
     return thrust::transform_reduce(
       rmm::exec_policy(stream),
       major_first,
@@ -574,10 +650,48 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
       thrust::plus<size_t>());
   }
 
+  template <typename MajorIterator>
+  __host__ void compute_number_of_edges_async(MajorIterator major_first,
+                                              MajorIterator major_last,
+                                              raft::device_span<size_t> count /* size = 1 */,
+                                              rmm::cuda_stream_view stream) const
+  {
+    if (thrust::distance(major_first, major_last) == 0) {
+      RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
+    }
+
+    rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
+    size_t tmp_storage_bytes{0};
+
+    auto local_degree_first = thrust::make_transform_iterator(
+      major_first,
+      detail::local_degree_op_t<vertex_t,
+                                edge_t,
+                                size_t /* no limit on majors.size(), so edge_t can overflow */,
+                                multi_gpu,
+                                false>{this->offsets_,
+                                       std::byte{0} /* dummy */,
+                                       std::byte{0} /* dummy */,
+                                       std::byte{0} /* dummy */});
+    cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
+                           tmp_storage_bytes,
+                           local_degree_first,
+                           count.data(),
+                           thrust::distance(major_first, major_last),
+                           stream);
+    d_tmp_storage.resize(tmp_storage_bytes, stream);
+    cub::DeviceReduce::Sum(d_tmp_storage.data(),
+                           tmp_storage_bytes,
+                           local_degree_first,
+                           count.data(),
+                           thrust::distance(major_first, major_last),
+                           stream);
+  }
+
   __host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
-    thrust::transform(rmm::exec_policy(stream),
+    thrust::transform(rmm::exec_policy_nosync(stream),
                       thrust::make_counting_iterator(this->major_range_first()),
                       thrust::make_counting_iterator(this->major_range_last()),
                       local_degrees.begin(),
@@ -595,7 +709,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                                                              rmm::cuda_stream_view stream) const
   {
     rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
-    thrust::transform(rmm::exec_policy(stream),
+    thrust::transform(rmm::exec_policy_nosync(stream),
                       major_first,
                       major_last,
                       local_degrees.begin(),
@@ -613,6 +727,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
                                                     MajorIterator major_last,
                                                     rmm::cuda_stream_view stream) const
   {
+    if (thrust::distance(major_first, major_last) == 0) return size_t{0};
     return thrust::transform_reduce(
       rmm::exec_policy(stream),
       major_first,
@@ -638,7 +753,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   {
     rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
     thrust::transform(
-      rmm::exec_policy(stream),
+      rmm::exec_policy_nosync(stream),
       thrust::make_counting_iterator(this->major_range_first()),
       thrust::make_counting_iterator(this->major_range_last()),
       local_degrees.begin(),
@@ -660,7 +775,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
   {
     rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
     thrust::transform(
-      rmm::exec_policy(stream),
+      rmm::exec_policy_nosync(stream),
       major_first,
       major_last,
       local_degrees.begin(),

diff --git a/cpp/include/cugraph/edge_partition_view.hpp b/cpp/include/cugraph/edge_partition_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,6 +56,7 @@ class edge_partition_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<multi_
   edge_partition_view_t(raft::device_span<edge_t const> offsets,
                         raft::device_span<vertex_t const> indices,
                         std::optional<raft::device_span<vertex_t const>> dcs_nzd_vertices,
+                        std::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap,
                         std::optional<vertex_t> major_hypersparse_first,
                         vertex_t major_range_first,
                         vertex_t major_range_last,
@@ -64,6 +65,7 @@ class edge_partition_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<multi_
                         vertex_t major_value_start_offset)
     : detail::edge_partition_view_base_t<vertex_t, edge_t>(offsets, indices),
       dcs_nzd_vertices_(dcs_nzd_vertices),
+      dcs_nzd_range_bitmap_(dcs_nzd_range_bitmap),
       major_hypersparse_first_(major_hypersparse_first),
       major_range_first_(major_range_first),
       major_range_last_(major_range_last),
@@ -78,6 +80,11 @@ class edge_partition_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<multi_
     return dcs_nzd_vertices_;
   }
 
+  std::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap() const
+  {
+    return dcs_nzd_range_bitmap_;
+  }
+
   std::optional<vertex_t> major_hypersparse_first() const { return major_hypersparse_first_; }
 
   vertex_t major_range_first() const { return major_range_first_; }
@@ -90,6 +97,7 @@ class edge_partition_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<multi_
  private:
   // relevant only if we use the CSR + DCSR (or CSC + DCSC) hybrid format
   std::optional<raft::device_span<vertex_t const>> dcs_nzd_vertices_{std::nullopt};
+  std::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap_{std::nullopt};
   std::optional<vertex_t> major_hypersparse_first_{std::nullopt};
 
   vertex_t major_range_first_{0};