
Commit

merge
jameslamb committed Nov 26, 2024
2 parents 871939c + e419c72 commit 1929945
Showing 75 changed files with 11,658 additions and 4,168 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/pr.yaml
@@ -10,6 +10,7 @@ concurrency:
cancel-in-progress: true

jobs:
# Please keep pr-builder as the top job here
pr-builder:
needs:
- changed-files
@@ -25,14 +26,24 @@ jobs:
- wheel-tests-pylibcugraph
- wheel-build-cugraph
- wheel-tests-cugraph
- telemetry-setup
- devcontainer
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
if: always()
with:
needs: ${{ toJSON(needs) }}
telemetry-setup:
runs-on: ubuntu-latest
continue-on-error: true
env:
OTEL_SERVICE_NAME: "pr-cugraph"
steps:
- name: Telemetry setup
uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main
changed-files:
secrets: inherit
needs: telemetry-setup
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
files_yaml: |
@@ -63,9 +74,11 @@ jobs:
- '!notebooks/**'
checks:
secrets: inherit
needs: telemetry-setup
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
enable_check_generated_files: false
ignored_pr_jobs: telemetry-summarize
conda-cpp-build:
needs: checks
secrets: inherit
@@ -155,6 +168,7 @@ jobs:
script: ci/test_wheel_cugraph.sh
devcontainer:
secrets: inherit
needs: telemetry-setup
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
arch: '["amd64"]'
@@ -164,3 +178,17 @@ jobs:
sccache -z;
build-all --verbose -j$(nproc --ignore=1) -DBUILD_CUGRAPH_MG_TESTS=ON;
sccache -s;
telemetry-summarize:
runs-on: ubuntu-latest
needs: pr-builder
if: always()
continue-on-error: true
steps:
- name: Load stashed telemetry env vars
uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main
with:
load_service_name: true
- name: Telemetry summarize
uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main
with:
cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}"
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -53,6 +53,11 @@ repos:
meta[.]yaml$|
setup[.]cfg$
- id: verify-alpha-spec
- repo: https://github.com/sphinx-contrib/sphinx-lint
rev: v1.0.0
hooks:
- id: sphinx-lint
args: ["--enable=all", "--disable=line-too-long"]
- repo: https://github.com/rapidsai/dependency-file-generator
rev: v1.16.0
hooks:
10 changes: 0 additions & 10 deletions README.md
@@ -34,16 +34,6 @@

</div>

-----
## News

___NEW!___ _[nx-cugraph](https://rapids.ai/nx-cugraph/)_, a NetworkX backend that provides GPU acceleration to NetworkX with zero code change.
```
> pip install nx-cugraph-cu11 --extra-index-url https://pypi.nvidia.com
> export NETWORKX_AUTOMATIC_BACKENDS=cugraph
```
That's it. NetworkX now leverages cuGraph for accelerated graph algorithms.

-----

## Table of contents
139 changes: 127 additions & 12 deletions cpp/include/cugraph/edge_partition_device_view.cuh
@@ -204,6 +204,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
edge_partition_device_view_t(edge_partition_view_t<vertex_t, edge_t, multi_gpu> view)
: detail::edge_partition_device_view_base_t<vertex_t, edge_t>(view.offsets(), view.indices()),
dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())),
dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())),
major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())),
major_range_first_(view.major_range_first()),
major_range_last_(view.major_range_last()),
@@ -218,6 +219,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
@@ -250,12 +252,72 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
thrust::plus<size_t>());
}

template <typename MajorIterator>
__host__ void compute_number_of_edges_async(MajorIterator major_first,
MajorIterator major_last,
raft::device_span<size_t> count /* size = 1 */,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) {
RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
}

rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
size_t tmp_storage_bytes{0};

if (dcs_nzd_vertices_) {
auto local_degree_first = thrust::make_transform_iterator(
major_first,
detail::local_degree_op_t<vertex_t,
edge_t,
size_t /* no limit on majors.size(), so edge_t can overflow */,
multi_gpu,
true>{
this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_});
cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
d_tmp_storage.resize(tmp_storage_bytes, stream);
cub::DeviceReduce::Sum(d_tmp_storage.data(),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
} else {
auto local_degree_first = thrust::make_transform_iterator(
major_first,
detail::local_degree_op_t<vertex_t,
edge_t,
size_t /* no limit on majors.size(), so edge_t can overflow */,
multi_gpu,
false>{
this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */});
cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
d_tmp_storage.resize(tmp_storage_bytes, stream);
cub::DeviceReduce::Sum(d_tmp_storage.data(),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
}
}

__host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
@@ -266,7 +328,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
major_hypersparse_first_.value_or(vertex_t{0})});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
@@ -284,7 +346,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
@@ -295,7 +357,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
major_hypersparse_first_.value_or(vertex_t{0})});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
@@ -311,6 +373,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return dcs_nzd_vertices_ ? thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
@@ -355,7 +418,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
@@ -368,7 +431,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
mask_first});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
@@ -394,7 +457,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
if (dcs_nzd_vertices_) {
assert(major_hypersparse_first_);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
@@ -407,7 +470,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
mask_first});
} else {
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
@@ -515,6 +578,8 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
return minor_range_first_ + minor_offset;
}

// FIXME: better to return thrust::optional<raft::device_span<vertex_t const>> for consistency (see
// dcs_nzd_range_bitmap())
__host__ __device__ thrust::optional<vertex_t const*> dcs_nzd_vertices() const
{
return dcs_nzd_vertices_ ? thrust::optional<vertex_t const*>{(*dcs_nzd_vertices_).data()}
@@ -528,10 +593,20 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
: thrust::nullopt;
}

__host__ __device__ thrust::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap()
const
{
return dcs_nzd_range_bitmap_
? thrust::make_optional<raft::device_span<uint32_t const>>(
(*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size())
: thrust::nullopt;
}

private:
// should be trivially copyable to device

thrust::optional<raft::device_span<vertex_t const>> dcs_nzd_vertices_{thrust::nullopt};
thrust::optional<raft::device_span<uint32_t const>> dcs_nzd_range_bitmap_{thrust::nullopt};
thrust::optional<vertex_t> major_hypersparse_first_{thrust::nullopt};

vertex_t major_range_first_{0};
@@ -558,6 +633,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
@@ -574,10 +650,48 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
thrust::plus<size_t>());
}

template <typename MajorIterator>
__host__ void compute_number_of_edges_async(MajorIterator major_first,
MajorIterator major_last,
raft::device_span<size_t> count /* size = 1 */,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) {
RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream));
}

rmm::device_uvector<std::byte> d_tmp_storage(0, stream);
size_t tmp_storage_bytes{0};

auto local_degree_first = thrust::make_transform_iterator(
major_first,
detail::local_degree_op_t<vertex_t,
edge_t,
size_t /* no limit on majors.size(), so edge_t can overflow */,
multi_gpu,
false>{this->offsets_,
std::byte{0} /* dummy */,
std::byte{0} /* dummy */,
std::byte{0} /* dummy */});
cub::DeviceReduce::Sum(static_cast<void*>(nullptr),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
d_tmp_storage.resize(tmp_storage_bytes, stream);
cub::DeviceReduce::Sum(d_tmp_storage.data(),
tmp_storage_bytes,
local_degree_first,
count.data(),
thrust::distance(major_first, major_last),
stream);
}

__host__ rmm::device_uvector<edge_t> compute_local_degrees(rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
@@ -595,7 +709,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
rmm::cuda_stream_view stream) const
{
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
thrust::transform(rmm::exec_policy(stream),
thrust::transform(rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
@@ -613,6 +727,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
MajorIterator major_last,
rmm::cuda_stream_view stream) const
{
if (thrust::distance(major_first, major_last) == 0) return size_t{0};
return thrust::transform_reduce(
rmm::exec_policy(stream),
major_first,
@@ -638,7 +753,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
{
rmm::device_uvector<edge_t> local_degrees(this->major_range_size(), stream);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(this->major_range_first()),
thrust::make_counting_iterator(this->major_range_last()),
local_degrees.begin(),
@@ -660,7 +775,7 @@ class edge_partition_device_view_t<vertex_t, edge_t, multi_gpu, std::enable_if_t
{
rmm::device_uvector<edge_t> local_degrees(thrust::distance(major_first, major_last), stream);
thrust::transform(
rmm::exec_policy(stream),
rmm::exec_policy_nosync(stream),
major_first,
major_last,
local_degrees.begin(),
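The most substantial C++ change above is the pair of `compute_number_of_edges_async` overloads, which write the reduced edge count into a one-element device span via `cub::DeviceReduce::Sum` instead of returning a host `size_t`, keeping the reduction on the stream. A minimal caller-side sketch, assuming hypothetical names (`EdgePartitionView`, `d_majors`, `count_edges_async`) that are not part of this diff:

```cpp
#include <raft/core/device_span.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

// Hypothetical helper for illustration only: counts the edges incident to the given
// majors without forcing a host synchronization. EdgePartitionView stands in for an
// instantiation of cugraph::edge_partition_device_view_t.
template <typename EdgePartitionView, typename vertex_t>
rmm::device_uvector<size_t> count_edges_async(EdgePartitionView const& edge_partition,
                                              rmm::device_uvector<vertex_t> const& d_majors,
                                              rmm::cuda_stream_view stream)
{
  rmm::device_uvector<size_t> d_count(1, stream);
  edge_partition.compute_number_of_edges_async(
    d_majors.begin(),
    d_majors.end(),
    raft::device_span<size_t>{d_count.data(), d_count.size()},
    stream);
  // The result stays on the device; reading it back (e.g. d_count.element(0, stream))
  // is where the caller finally pays for a synchronization.
  return d_count;
}
```

The diff also exposes a `dcs_nzd_range_bitmap()` view as a `raft::device_span<uint32_t const>`. A device-side sketch of how such a bitmap is typically probed; the one-bit-per-vertex-offset, 32-offsets-per-word layout is an assumption here, not something the hunks shown spell out:

```cpp
// Assumed layout: bit i of the bitmap marks the i-th vertex offset within the
// hypersparse major range; each uint32_t word covers 32 offsets.
// (Uses raft::device_span from <raft/core/device_span.hpp>, as in the sketch above.)
__device__ inline bool bit_is_set(raft::device_span<uint32_t const> bitmap, size_t i)
{
  return (bitmap[i / 32] & (uint32_t{1} << (i % 32))) != 0u;
}
```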