Skip to content

Commit

Permalink
Primitives & BFS performance improvements (#4751)
Browse files Browse the repository at this point in the history
This PR includes multiple updates to cut peak memory usage in graph creation and improve performance of BFS on scale-free graphs.

* Add a bitmap for non-zero local degree vertices in the hypersparse region; this information can be used to quickly filter out locally zero degree vertices which don't need to be processed in multiple instances.
* Store (global-)degree offsets for vertices in the hypersparse region; this information can used to quickly identify the vertices with a certain global degree (e.g. for global degree 1 vertices, we can skip inter-GPU reduction as we know each vertex has only one neighbor).
* Skip kernel invocations in computing edge counts if the vertex list is empty.
* Add asynchronous functions to compute edge counts. This helps in preventing unnecessary serialization when we can process multiple such functions concurrently.
* Replace rmm::exec_policy with rmm::exec_policy_nosync in multiple places; the former enforces stream synchronization at the end. The latter does not.
* Enforce cache line alignment in NCCL communication in multiple places (NCCL communication performance is significantly affected by cache line alignment, often leading to 30-40% or more differences).
* For primitives working on a subset of vertices, broadcast a vertex list using a bitmap if the vertex frontier size is large. If the vertex frontier size is small (in case vertex_t is 8B and the local vertex partition range can fit into 4B), use vertex offsets instead of vertices to cut communication volume.
* Merge multiple host scalar communication function calls to a single one.
* Increase multi-stream concurrency in detail::extract_transform_e & detail::per_v_transform_reduce_e
* Multiple optimizations in template specialization (for update_major == true && reduce_op == any && key type is vertex && working on a subset of vertices) in detail::per_v_transform_reduce_e (this includes pre-processing vertices with non-zero local degrees; so we don't need to process such vertices using multiple GPUs, pre-filtering of zero local degree vertices, allreduce communication to reduce shuffle communication volumes, and special treatment of global degree 1 vertices, and so on).
* Multiple optimizations & specializations in detail::fill_edge_minor_property that works on a subset of vertices (this includes kernel fusion, specialization for bitmap properties including direct broadcast to the property buffer and special treatments for vertex partition boundaries, and so on).
* Added multiple optimizations & specializations in transform_reduce_v_frontier_outgoing_e (especially for reduce_op::any and to cut communication volumes and to filter out (key, value) pairs that won't contribute to the final results).
* Multiple low-level optimizations in direction optimizing BFS (including approximations in determining between bottom -up and top-down).
* Multiple optimizations to cut peak memory usage in graph creation.

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: #4751
  • Loading branch information
seunghwak authored Nov 25, 2024
1 parent 3478fb5 commit d714243
Show file tree
Hide file tree
Showing 57 changed files with 11,526 additions and 4,057 deletions.
139 changes: 127 additions & 12 deletions cpp/include/cugraph/edge_partition_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
edge_partition_device_view_t(edge_partition_view_t<vertex_t, edge_t, multi_gpu> view)
: detail::edge_partition_device_view_base_t<vertex_t, edge_t>(view.offsets(), view.indices()),
dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())),
dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())),
major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())),
major_range_first_(view.major_range_first()),
major_range_last_(view.major_range_last()),
Expand All @@ -218,6 +219,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand Down Expand Up @@ -250,12 +252,72 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
thrust::plus<size_t>());
}

template <typename MajorIterator>
__host__ void compute_number_of_edges_async(MajorIterator major_first,
MajorIterator major_last,
raft::device_span<size_t> count /* size = 1 */,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) {
RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
}

rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
size_t tmp_storage_bytes{0};

if (dcs_nzd_vertices_) {
auto local_degree_first = thrust::make_transform_iterator(
major_first,
detail::local_degree_op_t<vertex_t,
edge_t,
size_t /* no limit on majors.size(), so edge_t can overflow */,
multi_gpu,
true>{
this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_});
cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
d_tmp_storage.resize(tmp_storage_bytes, stream);
cub::DeviceReduce::Sum(d_tmp_storage.data(),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
} else {
auto local_degree_first = thrust::make_transform_iterator(
major_first,
detail::local_degree_op_t<vertex_t,
edge_t,
size_t /* no limit on majors.size(), so edge_t can overflow */,
multi_gpu,
false>{
this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */});
cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
d_tmp_storage.resize(tmp_storage_bytes, stream);
cub::DeviceReduce::Sum(d_tmp_storage.data(),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
}
}

__host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -266,7 +328,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
major_hypersparse_first_.value_or(vertex_t{0})});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -284,7 +346,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand All @@ -295,7 +357,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
major_hypersparse_first_.value_or(vertex_t{0})});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand All @@ -311,6 +373,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand Down Expand Up @@ -355,7 +418,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -368,7 +431,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
mask_first});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -394,7 +457,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand All @@ -407,7 +470,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
mask_first});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand Down Expand Up @@ -515,6 +578,8 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
return minor_range_first_ + minor_offset;
}

// FIxME: better return thrust::optional<raft::device_span<vertex_t const>> for consistency (see
// dcs_nzd_range_bitmap())
__host__ __device__ thrust::optional<vertex_t const*> dcs_nzd_vertices() const
{
return dcs_nzd_vertices_ ? thrust::optional<vertex_t const*>{(*dcs_nzd_vertices_).data()}
Expand All @@ -528,10 +593,20 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
: thrust::nullopt;
}

__host__ __device__ thrust::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap()
const
{
return dcs_nzd_range_bitmap_
? thrust::make_optional<raft::device_span<uint32_t const>>(
(*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size())
: thrust::nullopt;
}

private:
// should be trivially copyable to device

thrust::optional<raft::device_span<vertex_t const>> dcs_nzd_vertices_{thrust::nullopt};
thrust::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap_{thrust::nullopt};
thrust::optional<vertex_t> major_hypersparse_first_{thrust::nullopt};

vertex_t major_range_first_{0};
Expand All @@ -558,6 +633,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand All @@ -574,10 +650,48 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
thrust::plus<size_t>());
}

template <typename MajorIterator>
__host__ void compute_number_of_edges_async(MajorIterator major_first,
MajorIterator major_last,
raft::device_span<size_t> count /* size = 1 */,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) {
RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
}

rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
size_t tmp_storage_bytes{0};

auto local_degree_first = thrust::make_transform_iterator(
major_first,
detail::local_degree_op_t<vertex_t,
edge_t,
size_t /* no limit on majors.size(), so edge_t can overflow */,
multi_gpu,
false>{this->offsets_,
std::byte{0} /* dummy */,
std::byte{0} /* dummy */,
std::byte{0} /* dummy */});
cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
d_tmp_storage.resize(tmp_storage_bytes, stream);
cub::DeviceReduce::Sum(d_tmp_storage.data(),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
}

__host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -595,7 +709,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand All @@ -613,6 +727,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand All @@ -638,7 +753,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -660,7 +775,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
{
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand Down
10 changes: 9 additions & 1 deletion cpp/include/cugraph/edge_partition_view.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -56,6 +56,7 @@ class edge_partition_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<multi_
edge_partition_view_t(raft::device_span<edge_t const> offsets,
raft::device_span<vertex_t const> indices,
std::optional<raft::device_span<vertex_t const>> dcs_nzd_vertices,
std::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap,
std::optional<vertex_t> major_hypersparse_first,
vertex_t major_range_first,
vertex_t major_range_last,
Expand All @@ -64,6 +65,7 @@ class edge_partition_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<multi_
vertex_t major_value_start_offset)
: detail::edge_partition_view_base_t<vertex_t, edge_t>(offsets, indices),
dcs_nzd_vertices_(dcs_nzd_vertices),
dcs_nzd_range_bitmap_(dcs_nzd_range_bitmap),
major_hypersparse_first_(major_hypersparse_first),
major_range_first_(major_range_first),
major_range_last_(major_range_last),
Expand All @@ -78,6 +80,11 @@ class edge_partition_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<multi_
return dcs_nzd_vertices_;
}

std::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap() const
{
return dcs_nzd_range_bitmap_;
}

std::optional<vertex_t> major_hypersparse_first() const { return major_hypersparse_first_; }

vertex_t major_range_first() const { return major_range_first_; }
Expand All @@ -90,6 +97,7 @@ class edge_partition_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t<multi_
private:
// relevant only if we use the CSR + DCSR (or CSC + DCSC) hybrid format
std::optional<raft::device_span<vertex_t const>> dcs_nzd_vertices_{std::nullopt};
std::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap_{std::nullopt};
std::optional<vertex_t> major_hypersparse_first_{std::nullopt};

vertex_t major_range_first_{0};
Expand Down
Loading

0 comments on commit d714243

Please sign in to comment.