diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 3d0ac075be3..190003dd7af 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -7,6 +7,11 @@ FROM ${BASE} as pip-base ENV DEFAULT_VIRTUAL_ENV=rapids +RUN apt update -y \ + && DEBIAN_FRONTEND=noninteractive apt install -y \ + libblas-dev liblapack-dev \ + && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*; + FROM ${BASE} as conda-base ENV DEFAULT_CONDA_ENV=rapids diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 4bf09f5cab9..55c4707631f 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -8,6 +8,11 @@ "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index f9504b10b32..b55b0feaf34 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -8,6 +8,11 @@ "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { @@ -23,7 +28,6 @@ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/ucx", "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json index 768be8bf3b1..2d71dba6e74 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -8,6 +8,11 @@ "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda" + ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json index 83845ca6261..cacaf027ae4 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -8,6 +8,11 @@ "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04" } }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip" + ], "hostRequirements": {"gpu": "optional"}, "features": { "ghcr.io/rapidsai/devcontainers/features/ucx:24.8": { @@ -23,7 +28,6 @@ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} }, "overrideFeatureInstallOrder": [ - "ghcr.io/rapidsai/devcontainers/features/ucx", "ghcr.io/rapidsai/devcontainers/features/cuda", "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 91279062092..4b5d0c26d0b 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -196,5 +196,5 @@ jobs: extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY build_command: | sccache -z; - build-all --verbose -j$(nproc --ignore=1); + build-all --verbose -j$(nproc --ignore=1) -DBUILD_CUGRAPH_MG_TESTS=ON; sccache -s; diff --git a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py index 95e1afcb28b..578e2520765 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py @@ -344,7 +344,7 @@ def generate_rmat_dataset( del label_df gc.collect() - dask_label_df = dask_cudf.from_dask_dataframe(dask_label_df) + dask_label_df = dask_label_df.to_backend("cudf") node_offsets = {"paper": 0} edge_offsets = {("paper", "cites", "paper"): 0} diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py index 3b085a9bfdb..d40b5130827 100644 --- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py +++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py @@ -848,6 +848,23 @@ def bench_weakly_connected_components(benchmark, graph_obj, backend_wrapper): assert type(result) is list +def bench_ego_graph(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.ego_graph), + args=(G,), + kwargs=dict( + n=node, + radius=100, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert isinstance(result, (nx.Graph, nxcg.Graph)) + + @pytest.mark.skip(reason="benchmark not implemented") def bench_complete_bipartite_graph(benchmark, graph_obj, backend_wrapper): pass diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 587c5fb38e7..c980ed320dc 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -40,7 +40,7 @@ if ! rapids-is-release-build; then alpha_spec=',>=0.0.0a0' fi -for dep in rmm cudf cugraph raft-dask pylibcugraph pylibcugraphops pylibraft ucx-py; do +for dep in rmm cudf cugraph raft-dask pylibcugraph pylibcugraphops pylibwholegraph pylibraft ucx-py; do sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} done diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 9a7324fb330..f5c14e8d315 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -105,6 +105,7 @@ find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}" sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" + sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}" done sed_runner "s/:[0-9][0-9]\.[0-9][0-9]/:${NEXT_SHORT_TAG}/" ./notebooks/README.md diff --git a/ci/run_cugraph_pyg_pytests.sh b/ci/run_cugraph_pyg_pytests.sh index 88642e6ceb6..fb27f16d79e 100755 --- a/ci/run_cugraph_pyg_pytests.sh +++ b/ci/run_cugraph_pyg_pytests.sh @@ -6,7 +6,10 @@ set -euo pipefail # Support invoking run_cugraph_pyg_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg -pytest --cache-clear --ignore=tests/mg "$@" . +pytest --cache-clear --benchmark-disable "$@" . + +# Used to skip certain examples in CI due to memory limitations +export CI_RUN=1 # Test examples for e in "$(pwd)"/examples/*.py; do diff --git a/ci/test.sh b/ci/test.sh index f20fc40f85a..884ed7ac881 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -103,7 +103,7 @@ if hasArg "--run-python-tests"; then conda list cd ${CUGRAPH_ROOT}/python/cugraph-pyg/cugraph_pyg # rmat is not tested because of MG testing - pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph_pyg --cov-report=xml:${WORKSPACE}/python/cugraph_pyg/cugraph-coverage.xml --cov-report term --ignore=raft --ignore=tests/mg --ignore=tests/int --ignore=tests/generators --benchmark-disable + pytest -sv -m sg --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph-pytests.xml -v --cov-config=.coveragerc --cov=cugraph_pyg --cov-report=xml:${WORKSPACE}/python/cugraph_pyg/cugraph-coverage.xml --cov-report term --ignore=raft --benchmark-disable echo "Ran Python pytest for cugraph_pyg : return code was: $?, test script exit code is now: $EXITCODE" echo "Python pytest for cugraph-service (single-GPU only)..." diff --git a/ci/test_python.sh b/ci/test_python.sh index 25b99dcf377..fdcf88d692a 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -3,10 +3,6 @@ set -euo pipefail -# TODO: Enable dask query planning (by default) once some bugs are fixed. -# xref: https://github.com/rapidsai/cudf/issues/15027 -export DASK_DATAFRAME__QUERY_PLANNING=False - # Support invoking test_python.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -217,13 +213,14 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then # Install pyg dependencies (which requires pip) - pip install ogb + pip install \ + ogb \ + tensordict + pip install \ pyg_lib \ torch_scatter \ torch_sparse \ - torch_cluster \ - torch_spline_conv \ -f ${PYG_URL} rapids-print-env diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index cda40d92c74..158704e08d1 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -3,10 +3,6 @@ set -eoxu pipefail -# TODO: Enable dask query planning (by default) once some bugs are fixed. -# xref: https://github.com/rapidsai/cudf/issues/15027 -export DASK_DATAFRAME__QUERY_PLANNING=False - package_name=$1 package_dir=$2 diff --git a/ci/test_wheel_cugraph-dgl.sh b/ci/test_wheel_cugraph-dgl.sh index 827ad487115..564b46cb07e 100755 --- a/ci/test_wheel_cugraph-dgl.sh +++ b/ci/test_wheel_cugraph-dgl.sh @@ -32,8 +32,18 @@ fi PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}" DGL_URL="https://data.dgl.ai/wheels/cu${PYTORCH_CUDA_VER}/repo.html" +# Starting from 2.2, PyTorch wheels depend on nvidia-nccl-cuxx>=2.19 wheel and +# dynamically link to NCCL. RAPIDS CUDA 11 CI images have an older NCCL version that +# might shadow the newer NCCL required by PyTorch during import (when importing +# `cupy` before `torch`). +if [[ "${NCCL_VERSION}" < "2.19" ]]; then + PYTORCH_VER="2.1.0" +else + PYTORCH_VER="2.3.0" +fi + rapids-logger "Installing PyTorch and DGL" -rapids-retry python -m pip install torch --index-url ${PYTORCH_URL} +rapids-retry python -m pip install "torch==${PYTORCH_VER}" --index-url ${PYTORCH_URL} rapids-retry python -m pip install dgl==2.0.0 --find-links ${DGL_URL} python -m pytest python/cugraph-dgl/tests diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index f45112dd80b..1004063cc38 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -24,6 +24,9 @@ python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] # RAPIDS_DATASET_ROOT_DIR is used by test scripts export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" +# Used to skip certain examples in CI due to memory limitations +export CI_RUN=1 + if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then PYTORCH_URL="https://download.pytorch.org/whl/cu118" PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html" @@ -39,15 +42,14 @@ rapids-retry python -m pip install \ pyg_lib \ torch_scatter \ torch_sparse \ - torch_cluster \ - torch_spline_conv \ + tensordict \ -f ${PYG_URL} rapids-logger "pytest cugraph-pyg (single GPU)" pushd python/cugraph-pyg/cugraph_pyg python -m pytest \ --cache-clear \ - --ignore=tests/mg \ + --benchmark-disable \ tests # Test examples for e in "$(pwd)"/examples/*.py; do diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index c02e8391eb2..64091ff4782 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -34,6 +34,7 @@ requirements: - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} + - tensordict >=0.1.2 - pyg >=2.5,<2.6 tests: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d3dfdbd068c..7dca3d983a5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -92,14 +92,14 @@ set(CUGRAPH_CXX_FLAGS "") set(CUGRAPH_CUDA_FLAGS "") if(CMAKE_COMPILER_IS_GNUCXX) - list(APPEND CUGRAPH_CXX_FLAGS -Werror -Wno-error=deprecated-declarations) + list(APPEND CUGRAPH_CXX_FLAGS -Werror -Wno-error=deprecated-declarations -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS) endif(CMAKE_COMPILER_IS_GNUCXX) message("-- Building for GPU_ARCHS = ${CMAKE_CUDA_ARCHITECTURES}") list(APPEND CUGRAPH_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) -list(APPEND CUGRAPH_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -Xptxas=--disable-warnings) +list(APPEND CUGRAPH_CUDA_FLAGS -Werror=cross-execution-space-call -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS -Xptxas=--disable-warnings) list(APPEND CUGRAPH_CUDA_FLAGS -Xcompiler=-Wall,-Wno-error=sign-compare,-Wno-error=unused-but-set-variable) list(APPEND CUGRAPH_CUDA_FLAGS -Xfatbin=-compress-all) @@ -180,6 +180,7 @@ set(CUGRAPH_SOURCES src/community/detail/refine_sg.cu src/community/detail/refine_mg.cu src/community/edge_triangle_count_sg.cu + src/community/edge_triangle_count_mg.cu src/community/detail/maximal_independent_moves_sg.cu src/community/detail/maximal_independent_moves_mg.cu src/detail/utility_wrappers.cu @@ -288,6 +289,8 @@ set(CUGRAPH_SOURCES src/structure/symmetrize_edgelist_mg.cu src/community/triangle_count_sg.cu src/community/triangle_count_mg.cu + src/community/approx_weighted_matching_sg.cu + src/community/approx_weighted_matching_mg.cu src/traversal/k_hop_nbrs_sg.cu src/traversal/k_hop_nbrs_mg.cu src/mtmg/vertex_result.cu diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 0caa151daac..cc42399f091 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -2007,6 +2007,24 @@ void triangle_count(raft::handle_t const& handle, raft::device_span counts, bool do_expensive_check = false); +/* + * @brief Compute edge triangle counts. + * + * Compute edge triangle counts for the entire set of edges. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object. + * + * @return edge_property_t containing the edge triangle count + */ +template +edge_property_t, edge_t> edge_triangle_count( + raft::handle_t const& handle, graph_view_t const& graph_view); + /* * @brief Compute K-Truss. * @@ -2368,6 +2386,32 @@ rmm::device_uvector vertex_coloring( graph_view_t const& graph_view, raft::random::RngState& rng_state); +/* + * @brief Approximate Weighted Matching + * + * A matching in an undirected graph G = (V, E) is a pairing of adjacent vertices + * such that each vertex is matched with at most one other vertex, the objective + * being to match as many vertices as possible or to maximise the sum of the + * weights of the matched edges. Here we provide an implementation of an + * approximation algorithm to the weighted Maximum matching. See + * https://web.archive.org/web/20081031230449id_/http://www.ii.uib.no/~fredrikm/fredrik/papers/CP75.pdf + * for further information. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, + * and handles to various CUDA libraries) to run graph algorithms. + * @param[in] graph_view Graph view object. + * @param[in] edge_weight_view View object holding edge weights for @p graph_view. + * @return A tuple of device vector of matched vertex ids and sum of the weights of the matched + * edges. + */ +template +std::tuple, weight_t> approximate_weighted_matching( + raft::handle_t const& handle, + graph_view_t const& graph_view, + edge_property_view_t edge_weight_view); } // namespace cugraph /** diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index fc19a8f68dd..583b0a37214 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -214,9 +214,9 @@ class edge_partition_device_view_t - size_t compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ size_t compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const { return dcs_nzd_vertices_ ? thrust::transform_reduce( rmm::exec_policy(stream), @@ -250,7 +250,7 @@ class edge_partition_device_view_t()); } - rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); if (dcs_nzd_vertices_) { @@ -277,9 +277,9 @@ class edge_partition_device_view_t - rmm::device_uvector compute_local_degrees(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(thrust::distance(major_first, major_last), stream); if (dcs_nzd_vertices_) { @@ -306,10 +306,10 @@ class edge_partition_device_view_t - size_t compute_number_of_edges_with_mask(MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const { return dcs_nzd_vertices_ ? thrust::transform_reduce( rmm::exec_policy(stream), @@ -348,8 +348,8 @@ class edge_partition_device_view_t - rmm::device_uvector compute_local_degrees_with_mask(MaskIterator mask_first, - rmm::cuda_stream_view stream) const + __host__ rmm::device_uvector compute_local_degrees_with_mask( + MaskIterator mask_first, rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); if (dcs_nzd_vertices_) { @@ -384,10 +384,11 @@ class edge_partition_device_view_t - rmm::device_uvector compute_local_degrees_with_mask(MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ rmm::device_uvector compute_local_degrees_with_mask( + MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(thrust::distance(major_first, major_last), stream); if (dcs_nzd_vertices_) { @@ -553,9 +554,9 @@ class edge_partition_device_view_t - size_t compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ size_t compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const { return thrust::transform_reduce( rmm::exec_policy(stream), @@ -573,7 +574,7 @@ class edge_partition_device_view_t()); } - rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); thrust::transform(rmm::exec_policy(stream), @@ -589,9 +590,9 @@ class edge_partition_device_view_t - rmm::device_uvector compute_local_degrees(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(thrust::distance(major_first, major_last), stream); thrust::transform(rmm::exec_policy(stream), @@ -607,10 +608,10 @@ class edge_partition_device_view_t - size_t compute_number_of_edges_with_mask(MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const { return thrust::transform_reduce( rmm::exec_policy(stream), @@ -632,8 +633,8 @@ class edge_partition_device_view_t - rmm::device_uvector compute_local_degrees_with_mask(MaskIterator mask_first, - rmm::cuda_stream_view stream) const + __host__ rmm::device_uvector compute_local_degrees_with_mask( + MaskIterator mask_first, rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); thrust::transform( @@ -651,10 +652,11 @@ class edge_partition_device_view_t - rmm::device_uvector compute_local_degrees_with_mask(MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ rmm::device_uvector compute_local_degrees_with_mask( + MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(thrust::distance(major_first, major_last), stream); thrust::transform( diff --git a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp index 7fd5bb726e6..63d7fd9685e 100644 --- a/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp +++ b/cpp/include/cugraph/mtmg/detail/per_device_edgelist.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,17 +62,17 @@ class per_device_edgelist_t { /** * @brief Construct a new per device edgelist t object * - * @param handle MTMG resource handle - used to identify GPU resources * @param device_buffer_size Number of edges to store in each device buffer * @param use_weight Whether or not the edgelist will have weights * @param use_edge_id Whether or not the edgelist will have edge ids * @param use_edge_type Whether or not the edgelist will have edge types + * @param stream_view CUDA stream view */ - per_device_edgelist_t(cugraph::mtmg::handle_t const& handle, - size_t device_buffer_size, + per_device_edgelist_t(size_t device_buffer_size, bool use_weight, bool use_edge_id, - bool use_edge_type) + bool use_edge_type, + rmm::cuda_stream_view stream_view) : device_buffer_size_{device_buffer_size}, current_pos_{0}, src_{}, @@ -89,7 +89,7 @@ class per_device_edgelist_t { edge_type_ = std::make_optional(std::vector>()); } - create_new_buffers(handle); + create_new_buffers(stream_view); } /** @@ -111,19 +111,19 @@ class per_device_edgelist_t { /** * @brief Append a list of edges to the edge list * - * @param handle The resource handle - * @param src Source vertex id - * @param dst Destination vertex id - * @param wgt Edge weight - * @param edge_id Edge id - * @param edge_type Edge type + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + * @param stream_view CUDA stream view */ - void append(handle_t const& handle, - raft::host_span src, + void append(raft::host_span src, raft::host_span dst, std::optional> wgt, std::optional> edge_id, - std::optional> edge_type) + std::optional> edge_type, + rmm::cuda_stream_view stream_view) { std::vector> copy_positions; @@ -142,13 +142,13 @@ class per_device_edgelist_t { pos += copy_count; current_pos_ += copy_count; - if (current_pos_ == src_.back().size()) { create_new_buffers(handle); } + if (current_pos_ == src_.back().size()) { create_new_buffers(stream_view); } } } std::for_each(copy_positions.begin(), copy_positions.end(), - [&handle, + [&stream_view, &this_src = src_, &src, &this_dst = dst_, @@ -164,47 +164,45 @@ class per_device_edgelist_t { raft::update_device(this_src[buffer_idx].begin() + buffer_pos, src.begin() + input_pos, copy_count, - handle.get_stream()); + stream_view); raft::update_device(this_dst[buffer_idx].begin() + buffer_pos, dst.begin() + input_pos, copy_count, - handle.get_stream()); + stream_view); if (this_wgt) raft::update_device((*this_wgt)[buffer_idx].begin() + buffer_pos, wgt->begin() + input_pos, copy_count, - handle.get_stream()); + stream_view); if (this_edge_id) raft::update_device((*this_edge_id)[buffer_idx].begin() + buffer_pos, edge_id->begin() + input_pos, copy_count, - handle.get_stream()); + stream_view); if (this_edge_type) raft::update_device((*this_edge_type)[buffer_idx].begin() + buffer_pos, edge_type->begin() + input_pos, copy_count, - handle.get_stream()); + stream_view); }); - - handle.sync_stream(); } /** * @brief Mark the edgelist as ready for reading (all writes are complete) * - * @param handle The resource handle + * @param stream_view CUDA stream view */ - void finalize_buffer(handle_t const& handle) + void finalize_buffer(rmm::cuda_stream_view stream_view) { - src_.back().resize(current_pos_, handle.get_stream()); - dst_.back().resize(current_pos_, handle.get_stream()); - if (wgt_) wgt_->back().resize(current_pos_, handle.get_stream()); - if (edge_id_) edge_id_->back().resize(current_pos_, handle.get_stream()); - if (edge_type_) edge_type_->back().resize(current_pos_, handle.get_stream()); + src_.back().resize(current_pos_, stream_view); + dst_.back().resize(current_pos_, stream_view); + if (wgt_) wgt_->back().resize(current_pos_, stream_view); + if (edge_id_) edge_id_->back().resize(current_pos_, stream_view); + if (edge_type_) edge_type_->back().resize(current_pos_, stream_view); } bool use_weight() const { return wgt_.has_value(); } @@ -230,16 +228,18 @@ class per_device_edgelist_t { void consolidate_and_shuffle(cugraph::mtmg::handle_t const& handle, bool store_transposed) { if (src_.size() > 1) { + auto stream = handle.raft_handle().get_stream(); + size_t total_size = std::transform_reduce( src_.begin(), src_.end(), size_t{0}, std::plus(), [](auto& d_vector) { return d_vector.size(); }); - resize_and_copy_buffers(handle.get_stream(), src_, total_size); - resize_and_copy_buffers(handle.get_stream(), dst_, total_size); - if (wgt_) resize_and_copy_buffers(handle.get_stream(), *wgt_, total_size); - if (edge_id_) resize_and_copy_buffers(handle.get_stream(), *edge_id_, total_size); - if (edge_type_) resize_and_copy_buffers(handle.get_stream(), *edge_type_, total_size); + resize_and_copy_buffers(src_, total_size, stream); + resize_and_copy_buffers(dst_, total_size, stream); + if (wgt_) resize_and_copy_buffers(*wgt_, total_size, stream); + if (edge_id_) resize_and_copy_buffers(*edge_id_, total_size, stream); + if (edge_type_) resize_and_copy_buffers(*edge_type_, total_size, stream); } auto tmp_wgt = wgt_ ? std::make_optional(std::move((*wgt_)[0])) : std::nullopt; @@ -267,9 +267,9 @@ class per_device_edgelist_t { private: template - void resize_and_copy_buffers(rmm::cuda_stream_view stream, - std::vector>& buffer, - size_t total_size) + void resize_and_copy_buffers(std::vector>& buffer, + size_t total_size, + rmm::cuda_stream_view stream) { size_t pos = buffer[0].size(); buffer[0].resize(total_size, stream); @@ -286,16 +286,16 @@ class per_device_edgelist_t { buffer = std::move(new_buffer); } - void create_new_buffers(cugraph::mtmg::handle_t const& handle) + void create_new_buffers(rmm::cuda_stream_view stream_view) { - src_.emplace_back(device_buffer_size_, handle.get_stream()); - dst_.emplace_back(device_buffer_size_, handle.get_stream()); + src_.emplace_back(device_buffer_size_, stream_view); + dst_.emplace_back(device_buffer_size_, stream_view); - if (wgt_) { wgt_->emplace_back(device_buffer_size_, handle.get_stream()); } + if (wgt_) { wgt_->emplace_back(device_buffer_size_, stream_view); } - if (edge_id_) { edge_id_->emplace_back(device_buffer_size_, handle.get_stream()); } + if (edge_id_) { edge_id_->emplace_back(device_buffer_size_, stream_view); } - if (edge_type_) { edge_type_->emplace_back(device_buffer_size_, handle.get_stream()); } + if (edge_type_) { edge_type_->emplace_back(device_buffer_size_, stream_view); } current_pos_ = 0; } diff --git a/cpp/include/cugraph/mtmg/edge_property.hpp b/cpp/include/cugraph/mtmg/edge_property.hpp index afa72492b9a..0b27ca85e46 100644 --- a/cpp/include/cugraph/mtmg/edge_property.hpp +++ b/cpp/include/cugraph/mtmg/edge_property.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,6 @@ #include #include -#include namespace cugraph { namespace mtmg { diff --git a/cpp/include/cugraph/mtmg/edge_property_view.hpp b/cpp/include/cugraph/mtmg/edge_property_view.hpp index c84a6458e1d..6416ea382ef 100644 --- a/cpp/include/cugraph/mtmg/edge_property_view.hpp +++ b/cpp/include/cugraph/mtmg/edge_property_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #pragma once #include -#include namespace cugraph { namespace mtmg { diff --git a/cpp/include/cugraph/mtmg/edgelist.hpp b/cpp/include/cugraph/mtmg/edgelist.hpp index 90c53dfbb64..d5d2bd2bca7 100644 --- a/cpp/include/cugraph/mtmg/edgelist.hpp +++ b/cpp/include/cugraph/mtmg/edgelist.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,7 +39,7 @@ class edgelist_t : public detail::device_shared_wrapper_t< bool use_edge_type) { detail::per_device_edgelist_t tmp( - handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + device_buffer_size, use_weight, use_edge_id, use_edge_type, handle.get_stream()); detail::device_shared_wrapper_t< detail::per_device_edgelist_t>::set(handle, @@ -49,7 +49,11 @@ class edgelist_t : public detail::device_shared_wrapper_t< /** * @brief Stop inserting edges into this edgelist so we can use the edges */ - void finalize_buffer(handle_t const& handle) { this->get(handle).finalize_buffer(handle); } + void finalize_buffer(handle_t const& handle) + { + handle.sync_stream_pool(); + this->get(handle).finalize_buffer(handle.get_stream()); + } /** * @brief Consolidate for the edgelist edges into a single edgelist and then diff --git a/cpp/include/cugraph/mtmg/handle.hpp b/cpp/include/cugraph/mtmg/handle.hpp index 0b02091a3cc..26c283f6acf 100644 --- a/cpp/include/cugraph/mtmg/handle.hpp +++ b/cpp/include/cugraph/mtmg/handle.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -79,6 +79,11 @@ class handle_t { */ void sync_stream() const { sync_stream(get_stream()); } + /** + * @brief Sync all streams in the stream pool + */ + void sync_stream_pool() const { raft::resource::sync_stream_pool(raft_handle_); } + /** * @brief get thrust policy for the stream * diff --git a/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp index b672db48719..73d69fdd5a7 100644 --- a/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp +++ b/cpp/include/cugraph/mtmg/per_thread_edgelist.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,6 @@ #include #include -#include namespace cugraph { namespace mtmg { @@ -70,21 +69,21 @@ class per_thread_edgelist_t { /** * @brief Append an edge to the edge list * - * @param handle The resource handle - * @param src Source vertex id - * @param dst Destination vertex id - * @param wgt Edge weight - * @param edge_id Edge id - * @param edge_type Edge type + * @param src Source vertex id + * @param dst Destination vertex id + * @param wgt Edge weight + * @param edge_id Edge id + * @param edge_type Edge type + * @param stream_view The cuda stream */ - void append(handle_t const& handle, - vertex_t src, + void append(vertex_t src, vertex_t dst, std::optional wgt, std::optional edge_id, - std::optional edge_type) + std::optional edge_type, + rmm::cuda_stream_view stream_view) { - if (current_pos_ == src_.size()) { flush(handle); } + if (current_pos_ == src_.size()) { flush(stream_view); } src_[current_pos_] = src; dst_[current_pos_] = dst; @@ -98,19 +97,19 @@ class per_thread_edgelist_t { /** * @brief Append a list of edges to the edge list * - * @param handle The resource handle * @param src Source vertex id * @param dst Destination vertex id * @param wgt Edge weight * @param edge_id Edge id * @param edge_type Edge type + * @param stream_view The cuda stream */ - void append(handle_t const& handle, - raft::host_span src, + void append(raft::host_span src, raft::host_span dst, std::optional> wgt, std::optional> edge_id, - std::optional> edge_type) + std::optional> edge_type, + rmm::cuda_stream_view stream_view) { size_t count = src.size(); size_t pos = 0; @@ -131,7 +130,7 @@ class per_thread_edgelist_t { edge_type.begin() + pos + copy_count, edge_type_->begin() + current_pos_); - if (current_pos_ == src_.size()) { flush(handle); } + if (current_pos_ == src_.size()) { flush(stream_view); } count -= copy_count; pos += copy_count; @@ -141,12 +140,13 @@ class per_thread_edgelist_t { /** * @brief Flush thread data from host to GPU memory * - * @param handle The resource handle + * @param stream_view The cuda stream + * @param sync If true, synchronize the asynchronous copy of data; + * defaults to false. */ - void flush(handle_t const& handle) + void flush(rmm::cuda_stream_view stream_view, bool sync = false) { edgelist_.append( - handle, raft::host_span{src_.data(), current_pos_}, raft::host_span{dst_.data(), current_pos_}, wgt_ ? std::make_optional(raft::host_span{wgt_->data(), current_pos_}) @@ -155,9 +155,12 @@ class per_thread_edgelist_t { : std::nullopt, edge_type_ ? std::make_optional(raft::host_span{edge_type_->data(), current_pos_}) - : std::nullopt); + : std::nullopt, + stream_view); current_pos_ = 0; + + if (sync) stream_view.synchronize(); } private: diff --git a/cpp/include/cugraph/utilities/device_functors.cuh b/cpp/include/cugraph/utilities/device_functors.cuh index 3af8ed1dd19..20cf98f7e6d 100644 --- a/cpp/include/cugraph/utilities/device_functors.cuh +++ b/cpp/include/cugraph/utilities/device_functors.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,13 +78,14 @@ struct indirection_t { template struct indirection_if_idx_valid_t { + using value_type = typename thrust::iterator_traits::value_type; Iterator first{}; index_t invalid_idx{}; - typename thrust::iterator_traits::value_type invalid_value{}; + value_type invalid_value{}; - __device__ typename thrust::iterator_traits::value_type operator()(index_t i) const + __device__ value_type operator()(index_t i) const { - return (i != invalid_idx) ? *(first + i) : invalid_value; + return (i != invalid_idx) ? static_cast(*(first + i)) : invalid_value; } }; diff --git a/cpp/include/cugraph/utilities/mask_utils.cuh b/cpp/include/cugraph/utilities/mask_utils.cuh index 7b69ea3fe3a..1d86eef0ed1 100644 --- a/cpp/include/cugraph/utilities/mask_utils.cuh +++ b/cpp/include/cugraph/utilities/mask_utils.cuh @@ -20,6 +20,7 @@ #include +#include #include #include #include @@ -160,13 +161,13 @@ size_t count_set_bits(raft::handle_t const& handle, MaskIterator mask_first, siz handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), thrust::make_counting_iterator(packed_bool_size(num_bits)), - [mask_first, num_bits] __device__(size_t i) { + cuda::proclaim_return_type([mask_first, num_bits] __device__(size_t i) -> size_t { auto word = *(mask_first + i); if ((i + 1) * packed_bools_per_word() > num_bits) { word &= packed_bool_partial_mask(num_bits % packed_bools_per_word()); } return static_cast(__popc(word)); - }, + }), size_t{0}, thrust::plus{}); } diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp index d98754f51d1..304a5b94bd6 100644 --- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp +++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include +#include #include #include @@ -30,7 +31,7 @@ template struct is_thrust_tuple_of_arithemetic_impl { constexpr bool evaluate() const { - if (!std::is_arithmetic::type>::value) { + if (!std::is_arithmetic_v::type>) { return false; } else { return is_thrust_tuple_of_arithemetic_impl().evaluate(); @@ -123,19 +124,19 @@ struct is_arithmetic_vector : std::false_type {}; template