From 119816cfe04be421bf96baa8075d342f72cbbfc4 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Fri, 17 Nov 2023 09:16:31 -0500 Subject: [PATCH 1/3] [BUG] Fix Incorrect Edge Index, Directory Selection in cuGraph-PyG Loader (#3978) Fixes three major bugs: 1. Edge index is set to [dst, dst] instead of [dst, src] in some cases 2. The sample directory is always set to a new temporary directory rather than the path given 3. The version of `pylibcugraphops` in `meta.yaml` is wrong and causes the wrong packages to be resolved This PR also simplifies `ci/test_python.sh` by doing only a single conda install when creating the `test_cugraph_pyg` environment. Closes #3959 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) - Naim (https://github.com/naimnv) Approvers: - Brad Rees (https://github.com/BradReesWork) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cugraph/pull/3978 --- ci/test_python.sh | 31 ++++++----- conda/recipes/cugraph-pyg/meta.yaml | 2 +- dependencies.yaml | 4 +- .../conda/cugraph_pyg_dev_cuda-118.yaml | 4 +- .../cugraph_pyg/data/cugraph_store.py | 12 +++-- .../cugraph_pyg/loader/cugraph_node_loader.py | 48 ++++++++++------- .../tests/mg/test_mg_cugraph_store.py | 2 +- .../cugraph_pyg/tests/test_cugraph_loader.py | 54 +++++++++++++++++-- .../cugraph_pyg/tests/test_cugraph_store.py | 2 +- 9 files changed, 112 insertions(+), 47 deletions(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 1690ce2f15b..273d3c93482 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -197,27 +197,26 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then conda activate test_cugraph_pyg set -u - # Install pytorch + # Will automatically install built dependencies of cuGraph-PyG rapids-mamba-retry install \ - --force-reinstall \ - --channel pyg \ + --channel "${CPP_CHANNEL}" \ + --channel "${PYTHON_CHANNEL}" \ --channel pytorch \ --channel nvidia \ - 'pyg=2.3' \ - 'pytorch=2.0.0' \ - 'pytorch-cuda=11.8' + --channel pyg \ + --channel rapidsai-nightly \ + "cugraph-pyg" \ + "pytorch>=2.0,<2.1" \ + "pytorch-cuda=11.8" # Install pyg dependencies (which requires pip) - pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.0.0+cu118.html - - rapids-mamba-retry install \ - --channel "${CPP_CHANNEL}" \ - --channel "${PYTHON_CHANNEL}" \ - libcugraph \ - pylibcugraph \ - pylibcugraphops \ - cugraph \ - cugraph-pyg + pip install \ + pyg_lib \ + torch_scatter \ + torch_sparse \ + torch_cluster \ + torch_spline_conv \ + -f https://data.pyg.org/whl/torch-2.0.0+cu118.html rapids-print-env diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index 07caf07daab..a2a02a1d9f6 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -34,7 +34,7 @@ requirements: - cupy >=12.0.0 - cugraph ={{ version }} - pylibcugraphops ={{ minor_version }} - - pyg >=2.3,<2.4 + - pyg >=2.3,<2.5 tests: imports: diff --git a/dependencies.yaml b/dependencies.yaml index 13f100610cf..a89acd9288b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -497,9 +497,9 @@ dependencies: - output_types: [conda] packages: - cugraph==23.12.* - - pytorch==2.0 + - pytorch>=2.0 - pytorch-cuda==11.8 - - pyg=2.3.1=*torch_2.0.0*cu118* + - pyg>=2.4.0 depends_on_rmm: common: diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index f98eab430ba..71d1c7e389c 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -13,13 +13,13 @@ dependencies: - cugraph==23.12.* - pandas - pre-commit -- pyg=2.3.1=*torch_2.0.0*cu118* +- pyg>=2.4.0 - pylibcugraphops==23.12.* - pytest - pytest-benchmark - pytest-cov - pytest-xdist - pytorch-cuda==11.8 -- pytorch==2.0 +- pytorch>=2.0 - scipy name: cugraph_pyg_dev_cuda-118 diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py index d1b24543956..edeeface4c4 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py @@ -210,7 +210,10 @@ class EXPERIMENTAL__CuGraphStore: def __init__( self, F: cugraph.gnn.FeatureStore, - G: Union[Dict[str, Tuple[TensorType]], Dict[str, int]], + G: Union[ + Dict[Tuple[str, str, str], Tuple[TensorType]], + Dict[Tuple[str, str, str], int], + ], num_nodes_dict: Dict[str, int], *, multi_gpu: bool = False, @@ -744,7 +747,7 @@ def _subgraph(self, edge_types: List[tuple] = None) -> cugraph.MultiGraph: def _get_vertex_groups_from_sample( self, nodes_of_interest: TensorType, is_sorted: bool = False - ) -> dict: + ) -> Dict[str, torch.Tensor]: """ Given a tensor of nodes of interest, this method a single dictionary, noi_index. @@ -808,7 +811,10 @@ def _get_sample_from_vertex_groups( def _get_renumbered_edge_groups_from_sample( self, sampling_results: cudf.DataFrame, noi_index: dict - ) -> Tuple[dict, dict]: + ) -> Tuple[ + Dict[Tuple[str, str, str], torch.Tensor], + Tuple[Dict[Tuple[str, str, str], torch.Tensor]], + ]: """ Given a cudf (NOT dask_cudf) DataFrame of sampling results and a dictionary of non-renumbered vertex ids grouped by vertex type, this method diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py index 8552e7412e0..ad8d22e255e 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py @@ -15,6 +15,7 @@ import os import re +import warnings import cupy import cudf @@ -159,23 +160,34 @@ def __init__( if batch_size is None or batch_size < 1: raise ValueError("Batch size must be >= 1") - self.__directory = tempfile.TemporaryDirectory(dir=directory) + self.__directory = ( + tempfile.TemporaryDirectory() if directory is None else directory + ) if isinstance(num_neighbors, dict): raise ValueError("num_neighbors dict is currently unsupported!") - renumber = ( - True - if ( - (len(self.__graph_store.node_types) == 1) - and (len(self.__graph_store.edge_types) == 1) + if "renumber" in kwargs: + warnings.warn( + "Setting renumbering manually could result in invalid output," + " please ensure you intended to do this." + ) + renumber = kwargs.pop("renumber") + else: + renumber = ( + True + if ( + (len(self.__graph_store.node_types) == 1) + and (len(self.__graph_store.edge_types) == 1) + ) + else False ) - else False - ) bulk_sampler = BulkSampler( batch_size, - self.__directory.name, + self.__directory + if isinstance(self.__directory, str) + else self.__directory.name, self.__graph_store._subgraph(edge_types), fanout_vals=num_neighbors, with_replacement=replace, @@ -219,7 +231,13 @@ def __init__( ) bulk_sampler.flush() - self.__input_files = iter(os.listdir(self.__directory.name)) + self.__input_files = iter( + os.listdir( + self.__directory + if isinstance(self.__directory, str) + else self.__directory.name + ) + ) def __next__(self): from time import perf_counter @@ -423,9 +441,6 @@ def __next__(self): sampler_output.edge, ) else: - if self.__graph_store.order == "CSR": - raise ValueError("CSR format incompatible with CSC output") - out = filter_cugraph_store_csc( self.__feature_store, self.__graph_store, @@ -437,11 +452,8 @@ def __next__(self): # Account for CSR format in cuGraph vs. CSC format in PyG if self.__coo and self.__graph_store.order == "CSC": - for node_type in out.edge_index_dict: - out[node_type].edge_index[0], out[node_type].edge_index[1] = ( - out[node_type].edge_index[1], - out[node_type].edge_index[0], - ) + for edge_type in out.edge_index_dict: + out[edge_type].edge_index = out[edge_type].edge_index.flip(dims=[0]) out.set_value_dict("num_sampled_nodes", sampler_output.num_sampled_nodes) out.set_value_dict("num_sampled_edges", sampler_output.num_sampled_edges) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index ed7f70034e2..13c9c90c7c2 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -120,7 +120,7 @@ def test_get_edge_index(graph, edge_index_type, dask_client): G[et][0] = dask_cudf.from_cudf(cudf.Series(G[et][0]), npartitions=1) G[et][1] = dask_cudf.from_cudf(cudf.Series(G[et][1]), npartitions=1) - cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) + cugraph_store = CuGraphStore(F, G, N, order="CSC", multi_gpu=True) for pyg_can_edge_type in G: src, dst = cugraph_store.get_edge_index( diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py index 853836dc2a6..27b73bf7d35 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_loader.py @@ -18,6 +18,7 @@ import cudf import cupy +import numpy as np from cugraph_pyg.loader import CuGraphNeighborLoader from cugraph_pyg.loader import BulkSampleLoader @@ -27,6 +28,8 @@ from cugraph.gnn import FeatureStore from cugraph.utilities.utils import import_optional, MissingModule +from typing import Dict, Tuple + torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") trim_to_layer = import_optional("torch_geometric.utils.trim_to_layer") @@ -40,7 +43,11 @@ @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -def test_cugraph_loader_basic(karate_gnn): +def test_cugraph_loader_basic( + karate_gnn: Tuple[ + FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] + ] +): F, G, N = karate_gnn cugraph_store = CuGraphStore(F, G, N, order="CSR") loader = CuGraphNeighborLoader( @@ -66,7 +73,11 @@ def test_cugraph_loader_basic(karate_gnn): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") -def test_cugraph_loader_hetero(karate_gnn): +def test_cugraph_loader_hetero( + karate_gnn: Tuple[ + FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] + ] +): F, G, N = karate_gnn cugraph_store = CuGraphStore(F, G, N, order="CSR") loader = CuGraphNeighborLoader( @@ -342,7 +353,7 @@ def test_cugraph_loader_e2e_coo(): @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") @pytest.mark.skipif(not HAS_TORCH_SPARSE, reason="torch-sparse not available") @pytest.mark.parametrize("framework", ["pyg", "cugraph-ops"]) -def test_cugraph_loader_e2e_csc(framework): +def test_cugraph_loader_e2e_csc(framework: str): m = [2, 9, 99, 82, 9, 3, 18, 1, 12] x = torch.randint(3000, (256, 256)).to(torch.float32) F = FeatureStore() @@ -442,3 +453,40 @@ def test_cugraph_loader_e2e_csc(framework): x = x.narrow(dim=0, start=0, length=s - num_sampled_nodes[1]) assert list(x.shape) == [1, 1] + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.parametrize("directory", ["local", "temp"]) +def test_load_directory( + karate_gnn: Tuple[ + FeatureStore, Dict[Tuple[str, str, str], np.ndarray], Dict[str, int] + ], + directory: str, +): + if directory == "local": + local_dir = tempfile.TemporaryDirectory(dir=".") + + cugraph_store = CuGraphStore(*karate_gnn) + cugraph_loader = CuGraphNeighborLoader( + (cugraph_store, cugraph_store), + torch.arange(8, dtype=torch.int64), + 2, + num_neighbors=[8, 4, 2], + random_state=62, + replace=False, + directory=None if directory == "temp" else local_dir.name, + batches_per_partition=1, + ) + + it = iter(cugraph_loader) + next_batch = next(it) + assert next_batch is not None + + if directory == "local": + assert len(os.listdir(local_dir.name)) == 4 + + count = 1 + while next(it, None) is not None: + count += 1 + + assert count == 4 diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py index da3043760d4..b39ebad8254 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py @@ -113,7 +113,7 @@ def test_get_edge_index(graph, edge_index_type): G[et][0] = cudf.Series(G[et][0]) G[et][1] = cudf.Series(G[et][1]) - cugraph_store = CuGraphStore(F, G, N) + cugraph_store = CuGraphStore(F, G, N, order="CSC") for pyg_can_edge_type in G: src, dst = cugraph_store.get_edge_index( From 06f082b43aab84408d63a907327b9524f1cd0229 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 17 Nov 2023 10:36:35 -0600 Subject: [PATCH 2/3] Enable build concurrency for nightly and merge triggers. (#4009) --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ccfdb826812..0f490283795 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,7 +22,7 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: From 5d43f1463c1829322d2a92684f2bb11269730af3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 17 Nov 2023 18:17:47 -0800 Subject: [PATCH 3/3] Find rmm before cuco (#4011) RAPIDS currently relies on copies of CCCL headers bundled into rmm. This dependency is centralized by virtue of rmm installing these into the package and everything else finding those installed packages. To do this, however, rmm must be loaded first so that the libcudacxx install location is patched into CMake's search paths. cugraph also uses cuco, which requires libcudacxx but does not bundle its own, so rmm must be found first so that cuco can find libcudacxx where rmm installed it. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4011 --- cpp/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 360165e688d..3e867643041 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -153,6 +153,11 @@ rapids_cpm_init() # lags behind. ### +# Need to make sure rmm is found before cuco so that rmm patches the libcudacxx +# directory to be found by cuco. +include(${rapids-cmake-dir}/cpm/rmm.cmake) +rapids_cpm_rmm(BUILD_EXPORT_SET cugraph-exports + INSTALL_EXPORT_SET cugraph-exports) # Putting this before raft to override RAFT from pulling them in. include(cmake/thirdparty/get_libcudacxx.cmake) include(${rapids-cmake-dir}/cpm/cuco.cmake)