Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward-merge branch-24.12 into branch-25.02 #4782

Merged
merged 7 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ concurrency:
cancel-in-progress: true

jobs:
# Please keep pr-builder as the top job here
pr-builder:
needs:
- changed-files
Expand All @@ -25,14 +26,24 @@ jobs:
- wheel-tests-pylibcugraph
- wheel-build-cugraph
- wheel-tests-cugraph
- telemetry-setup
- devcontainer
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
if: always()
with:
needs: ${{ toJSON(needs) }}
# Telemetry bootstrap job. The changed-files, checks and devcontainer jobs list it under `needs`.
telemetry-setup:
runs-on: ubuntu-latest
# A failure in this job is tolerated rather than failing the workflow run.
continue-on-error: true
env:
# Service name made available to the telemetry action below.
OTEL_SERVICE_NAME: "pr-cugraph"
steps:
- name: Telemetry setup
# NOTE(review): presumably stashes the base telemetry env vars for later jobs — confirm against
# the rapidsai/shared-actions repository.
uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main
changed-files:
secrets: inherit
needs: telemetry-setup
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
files_yaml: |
Expand Down Expand Up @@ -63,9 +74,11 @@ jobs:
- '!notebooks/**'
checks:
secrets: inherit
needs: telemetry-setup
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
enable_check_generated_files: false
ignored_pr_jobs: telemetry-summarize
conda-cpp-build:
needs: checks
secrets: inherit
Expand Down Expand Up @@ -161,6 +174,7 @@ jobs:
script: ci/test_wheel_cugraph.sh
devcontainer:
secrets: inherit
needs: telemetry-setup
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
arch: '["amd64"]'
Expand All @@ -171,3 +185,17 @@ jobs:
sccache -z;
build-all --verbose -j$(nproc --ignore=1) -DBUILD_CUGRAPH_MG_TESTS=ON;
sccache -s;
# Telemetry reporting job. It depends on pr-builder and, because of `if: always()`, is not skipped
# when pr-builder fails.
telemetry-summarize:
runs-on: ubuntu-latest
needs: pr-builder
if: always()
# A failure in this job is tolerated rather than failing the workflow run.
continue-on-error: true
steps:
- name: Load stashed telemetry env vars
uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main
with:
load_service_name: true
- name: Telemetry summarize
uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main
with:
# CA certificate, client certificate and client key secrets joined with ';' into one input.
cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}"
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ repos:
meta[.]yaml$|
setup[.]cfg$
- id: verify-alpha-spec
- repo: https://github.com/sphinx-contrib/sphinx-lint
rev: v1.0.0
hooks:
- id: sphinx-lint
args: ["--enable=all", "--disable=line-too-long"]
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.16.0
hooks:
Expand Down
10 changes: 0 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,6 @@

</div>

-----
## News

___NEW!___ _[nx-cugraph](https://rapids.ai/nx-cugraph/)_, a NetworkX backend that provides GPU acceleration to NetworkX with zero code change.
```
> pip install nx-cugraph-cu11 --extra-index-url https://pypi.nvidia.com
> export NETWORKX_AUTOMATIC_BACKENDS=cugraph
```
That's it. NetworkX now leverages cuGraph for accelerated graph algorithms.

-----

## Table of contents
Expand Down
139 changes: 127 additions & 12 deletions cpp/include/cugraph/edge_partition_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
edge_partition_device_view_t(edge_partition_view_t<vertex_t, edge_t, multi_gpu> view)
: detail::edge_partition_device_view_base_t<vertex_t, edge_t>(view.offsets(), view.indices()),
dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())),
dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())),
major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())),
major_range_first_(view.major_range_first()),
major_range_last_(view.major_range_last()),
Expand All @@ -218,6 +219,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand Down Expand Up @@ -250,12 +252,72 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
thrust::plus<size_t>());
}

// Sums the local degrees of the majors in [major_first, major_last) and writes the total to
// count[0] (count must hold exactly one element).
//
// All device work (memset, temporary-storage allocation, reduction) is issued on `stream`, and no
// explicit stream synchronization is performed here; the caller must order any read of `count`
// after this call.
//
// The per-major degree is accumulated as size_t because the number of majors is unbounded and
// edge_t could overflow.
template <typename MajorIterator>
__host__ void compute_number_of_edges_async(MajorIterator major_first,
                                            MajorIterator major_last,
                                            raft::device_span<size_t> count /* size = 1 */,
                                            rmm::cuda_stream_view stream) const
{
  auto const num_majors = thrust::distance(major_first, major_last);
  if (num_majors == 0) {
    // Nothing to reduce: zero the output and skip the temporary-storage allocation and the
    // reduction launch entirely.
    RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
    return;
  }

  rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
  size_t tmp_storage_bytes{0};

  if (dcs_nzd_vertices_) {
    // Hypersparse (DCS) region present: the degree functor also needs the DCS vertex list and the
    // first hypersparse major.
    auto local_degree_first = thrust::make_transform_iterator(
      major_first,
      detail::local_degree_op_t<vertex_t,
                                edge_t,
                                size_t /* no limit on majors.size(), so edge_t can overflow */,
                                multi_gpu,
                                true>{
        this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_});
    // First call (null storage pointer) only queries the required temporary storage size.
    cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
                           tmp_storage_bytes,
                           local_degree_first,
                           count.data(),
                           num_majors,
                           stream);
    d_tmp_storage.resize(tmp_storage_bytes, stream);
    // Second call performs the actual reduction.
    cub::DeviceReduce::Sum(d_tmp_storage.data(),
                           tmp_storage_bytes,
                           local_degree_first,
                           count.data(),
                           num_majors,
                           stream);
  } else {
    auto local_degree_first = thrust::make_transform_iterator(
      major_first,
      detail::local_degree_op_t<vertex_t,
                                edge_t,
                                size_t /* no limit on majors.size(), so edge_t can overflow */,
                                multi_gpu,
                                false>{
        this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */});
    // First call (null storage pointer) only queries the required temporary storage size.
    cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
                           tmp_storage_bytes,
                           local_degree_first,
                           count.data(),
                           num_majors,
                           stream);
    d_tmp_storage.resize(tmp_storage_bytes, stream);
    // Second call performs the actual reduction.
    cub::DeviceReduce::Sum(d_tmp_storage.data(),
                           tmp_storage_bytes,
                           local_degree_first,
                           count.data(),
                           num_majors,
                           stream);
  }
}

__host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -266,7 +328,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
major_hypersparse_first_.value_or(vertex_t{0})});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -284,7 +346,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand All @@ -295,7 +357,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
major_hypersparse_first_.value_or(vertex_t{0})});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand All @@ -311,6 +373,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand Down Expand Up @@ -355,7 +418,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -368,7 +431,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
mask_first});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -394,7 +457,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand All @@ -407,7 +470,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
mask_first});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand Down Expand Up @@ -515,6 +578,8 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
return minor_range_first_ + minor_offset;
}

// FIXME: better return thrust::optional<raft::device_span<vertex_t const>> for consistency (see
// dcs_nzd_range_bitmap())
__host__ __device__ thrust::optional<vertex_t const*> dcs_nzd_vertices() const
{
return dcs_nzd_vertices_ ? thrust::optional<vertex_t const*>{(*dcs_nzd_vertices_).data()}
Expand All @@ -528,10 +593,20 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
: thrust::nullopt;
}

// Returns a read-only span over the DCS non-zero-degree range bitmap words, or thrust::nullopt
// when this partition view holds no such bitmap.
__host__ __device__ thrust::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap()
  const
{
  if (!dcs_nzd_range_bitmap_) { return thrust::nullopt; }

  auto const& bitmap_words = *dcs_nzd_range_bitmap_;
  return thrust::make_optional<raft::device_span<uint32_t const>>(bitmap_words.data(),
                                                                  bitmap_words.size());
}

private:
// should be trivially copyable to device

thrust::optional<raft::device_span<vertex_t const>> dcs_nzd_vertices_{thrust::nullopt};
thrust::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap_{thrust::nullopt};
thrust::optional<vertex_t> major_hypersparse_first_{thrust::nullopt};

vertex_t major_range_first_{0};
Expand All @@ -558,6 +633,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand All @@ -574,10 +650,48 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
thrust::plus<size_t>());
}

// Sums the local degrees of the majors in [major_first, major_last) and writes the total to
// count[0] (count must hold exactly one element).
//
// All device work (memset, temporary-storage allocation, reduction) is issued on `stream`, and no
// explicit stream synchronization is performed here; the caller must order any read of `count`
// after this call.
//
// The per-major degree is accumulated as size_t because the number of majors is unbounded and
// edge_t could overflow.
template <typename MajorIterator>
__host__ void compute_number_of_edges_async(MajorIterator major_first,
                                            MajorIterator major_last,
                                            raft::device_span<size_t> count /* size = 1 */,
                                            rmm::cuda_stream_view stream) const
{
  auto const num_majors = thrust::distance(major_first, major_last);
  if (num_majors == 0) {
    // Nothing to reduce: zero the output and skip the temporary-storage allocation and the
    // reduction launch entirely.
    RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
    return;
  }

  rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
  size_t tmp_storage_bytes{0};

  // This specialization only passes the offsets to the degree functor; the remaining constructor
  // arguments are placeholders.
  auto local_degree_first = thrust::make_transform_iterator(
    major_first,
    detail::local_degree_op_t<vertex_t,
                              edge_t,
                              size_t /* no limit on majors.size(), so edge_t can overflow */,
                              multi_gpu,
                              false>{this->offsets_,
                                     std::byte{0} /* dummy */,
                                     std::byte{0} /* dummy */,
                                     std::byte{0} /* dummy */});
  // First call (null storage pointer) only queries the required temporary storage size.
  cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
                         tmp_storage_bytes,
                         local_degree_first,
                         count.data(),
                         num_majors,
                         stream);
  d_tmp_storage.resize(tmp_storage_bytes, stream);
  // Second call performs the actual reduction.
  cub::DeviceReduce::Sum(d_tmp_storage.data(),
                         tmp_storage_bytes,
                         local_degree_first,
                         count.data(),
                         num_majors,
                         stream);
}

__host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -595,7 +709,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand All @@ -613,6 +727,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
Expand All @@ -638,7 +753,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
Expand All @@ -660,7 +775,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
{
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
Expand Down
Loading
Loading