From 5d43f1463c1829322d2a92684f2bb11269730af3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 17 Nov 2023 18:17:47 -0800 Subject: [PATCH 1/3] Find rmm before cuco (#4011) RAPIDS currently relies on copies of CCCL headers bundled into rmm. This dependency is centralized by virtue of rmm installing these into the package and everything else finding those installed packages. To do this, however, rmm must be loaded first so that the libcudacxx install location is patched into CMake's search paths. cugraph also uses cuco, which requires libcudacxx but does not bundle its own, so rmm must be found first so that cuco can find libcudacxx where rmm installed it. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4011 --- cpp/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 360165e688d..3e867643041 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -153,6 +153,11 @@ rapids_cpm_init() # lags behind. ### +# Need to make sure rmm is found before cuco so that rmm patches the libcudacxx +# directory to be found by cuco. +include(${rapids-cmake-dir}/cpm/rmm.cmake) +rapids_cpm_rmm(BUILD_EXPORT_SET cugraph-exports + INSTALL_EXPORT_SET cugraph-exports) # Putting this before raft to override RAFT from pulling them in. include(cmake/thirdparty/get_libcudacxx.cmake) include(${rapids-cmake-dir}/cpm/cuco.cmake) From 0684f9de2015c4b7d45f28605f2bb2f9bf523359 Mon Sep 17 00:00:00 2001 From: Ralph Liu <137829296+nv-rliu@users.noreply.github.com> Date: Sun, 19 Nov 2023 07:09:32 -0500 Subject: [PATCH 2/3] `Resultset` and `Dataset` Refactors (#3957) This PR replaces and is a continuation of #3857 (by @betochimas) > This PR primarily adds testing for the `Resultset` class, introduced earlier in 23.10. The tests take a similar approach to test_dataset, creating a temporary directory to test downloading all result files. To align `Resultset` and `Dataset`, the setter and getter for each download directory is moved into `DefaultDownloadDir`, so that each class shares an instance of `DefaultDownloadDir` and can be configured independently, although their default locations are still both dependent on the RAPIDS_DATASET_ROOT_DIR_PATH environment variable. The old patterns are present but commented-out, so this change would be breaking. This PR also removes the deprecated `experimental.datasets` package due to it being promoted to stable for >=1 release. Authors: - Ralph Liu (https://github.com/nv-rliu) - Dylan Chima-Sanchez (https://github.com/betochimas) - Rick Ratzel (https://github.com/rlratzel) - Brad Rees (https://github.com/BradReesWork) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3957 --- datasets/README.md | 4 + .../cugraph-pyg/cugraph_pyg/tests/conftest.py | 2 +- .../testing/benchmark_server_extension.py | 2 +- .../cugraph/cugraph/dask/community/leiden.py | 2 +- .../cugraph/cugraph/dask/community/louvain.py | 2 +- python/cugraph/cugraph/datasets/__init__.py | 10 + python/cugraph/cugraph/datasets/dataset.py | 86 +++-- .../datasets/metadata/cit-patents.yaml | 22 ++ .../cugraph/datasets/metadata/europe_osm.yaml | 21 ++ .../cugraph/datasets/metadata/hollywood.yaml | 26 ++ .../datasets/metadata/soc-livejournal1.yaml | 22 ++ .../datasets/metadata/soc-twitter-2010.yaml | 22 ++ .../cugraph/experimental/datasets/__init__.py | 79 ----- .../cugraph/experimental/datasets/dataset.py | 312 ------------------ .../datasets/datasets_config.yaml | 5 - .../datasets/metadata/__init__.py | 13 - .../experimental/datasets/metadata/cyber.yaml | 22 -- .../datasets/metadata/dolphins.yaml | 25 -- .../datasets/metadata/email-Eu-core.yaml | 22 -- .../datasets/metadata/karate-disjoint.yaml | 22 -- .../datasets/metadata/karate.yaml | 24 -- .../datasets/metadata/karate_asymmetric.yaml | 24 -- .../datasets/metadata/karate_data.yaml | 22 -- .../datasets/metadata/karate_undirected.yaml | 22 -- .../datasets/metadata/ktruss_polbooks.yaml | 23 -- .../datasets/metadata/netscience.yaml | 22 -- .../datasets/metadata/polbooks.yaml | 22 -- .../datasets/metadata/small_line.yaml | 22 -- .../datasets/metadata/small_tree.yaml | 22 -- .../datasets/metadata/toy_graph.yaml | 22 -- .../metadata/toy_graph_undirected.yaml | 22 -- python/cugraph/cugraph/testing/__init__.py | 8 +- .../cugraph/testing/generate_resultsets.py | 9 +- python/cugraph/cugraph/testing/resultset.py | 90 ++--- .../test_edge_betweenness_centrality_mg.py | 4 +- .../cugraph/tests/nx/test_compat_pr.py | 2 +- .../cugraph/tests/utils/test_dataset.py | 109 +++--- .../cugraph/tests/utils/test_resultset.py | 71 ++++ 38 files changed, 379 insertions(+), 882 deletions(-) create mode 100644 python/cugraph/cugraph/datasets/metadata/cit-patents.yaml create mode 100644 python/cugraph/cugraph/datasets/metadata/europe_osm.yaml create mode 100644 python/cugraph/cugraph/datasets/metadata/hollywood.yaml create mode 100644 python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml create mode 100644 python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/__init__.py delete mode 100644 python/cugraph/cugraph/experimental/datasets/dataset.py delete mode 100644 python/cugraph/cugraph/experimental/datasets/datasets_config.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/__init__.py delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml delete mode 100644 python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml create mode 100644 python/cugraph/cugraph/tests/utils/test_resultset.py diff --git a/datasets/README.md b/datasets/README.md index e42413fc996..a23dc644081 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -120,9 +120,13 @@ The benchmark datasets are described below: | soc-twitter-2010 | 21,297,772 | 265,025,809 | No | No | **cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations. + **soc-LiveJournal** : A graph of the LiveJournal social network. + **europe_osm** : A graph of OpenStreetMap data for Europe. + **hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together. + **soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. _NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._ diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py index 083c4a2b37b..1512901822a 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py @@ -24,7 +24,7 @@ import torch import numpy as np from cugraph.gnn import FeatureStore -from cugraph.experimental.datasets import karate +from cugraph.datasets import karate import tempfile diff --git a/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py b/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py index 5f9eac6b2a3..361226c8071 100644 --- a/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py +++ b/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py @@ -17,7 +17,7 @@ import cugraph from cugraph.experimental import PropertyGraph, MGPropertyGraph -from cugraph.experimental import datasets +from cugraph import datasets from cugraph.generators import rmat diff --git a/python/cugraph/cugraph/dask/community/leiden.py b/python/cugraph/cugraph/dask/community/leiden.py index 75582fa48f7..67bd0876ce6 100644 --- a/python/cugraph/cugraph/dask/community/leiden.py +++ b/python/cugraph/cugraph/dask/community/leiden.py @@ -125,7 +125,7 @@ def leiden( Examples -------- - >>> from cugraph.experimental.datasets import karate + >>> from cugraph.datasets import karate >>> G = karate.get_graph(fetch=True) >>> parts, modularity_score = cugraph.leiden(G) diff --git a/python/cugraph/cugraph/dask/community/louvain.py b/python/cugraph/cugraph/dask/community/louvain.py index 8efbbafaf7b..1b091817a1a 100644 --- a/python/cugraph/cugraph/dask/community/louvain.py +++ b/python/cugraph/cugraph/dask/community/louvain.py @@ -129,7 +129,7 @@ def louvain( Examples -------- - >>> from cugraph.experimental.datasets import karate + >>> from cugraph.datasets import karate >>> G = karate.get_graph(fetch=True) >>> parts = cugraph.louvain(G) diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index 65a820f108b..ac18274d354 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -39,3 +39,13 @@ small_tree = Dataset(meta_path / "small_tree.yaml") toy_graph = Dataset(meta_path / "toy_graph.yaml") toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml") + +# Benchmarking datasets: be mindful of memory usage +# 250 MB +soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml") +# 965 MB +cit_patents = Dataset(meta_path / "cit-patents.yaml") +# 1.8 GB +europe_osm = Dataset(meta_path / "europe_osm.yaml") +# 1.5 GB +hollywood = Dataset(meta_path / "hollywood.yaml") diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index 877eade7708..dd7aa0df00a 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -14,44 +14,45 @@ import cudf import yaml import os +import pandas as pd from pathlib import Path from cugraph.structure.graph_classes import Graph class DefaultDownloadDir: """ - Maintains the path to the download directory used by Dataset instances. + Maintains a path to be used as a default download directory. + + All DefaultDownloadDir instances are based on RAPIDS_DATASET_ROOT_DIR if + set, or _default_base_dir if not set. + Instances of this class are typically shared by several Dataset instances in order to allow for the download directory to be defined and updated by a single object. """ - def __init__(self): - self._path = Path( - os.environ.get("RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets") - ) + _default_base_dir = Path.home() / ".cugraph/datasets" - @property - def path(self): + def __init__(self, *, subdir=""): """ - If `path` is not set, set it to the environment variable - RAPIDS_DATASET_ROOT_DIR. If the variable is not set, default to the - user's home directory. + subdir can be specified to provide a specialized dir under the base dir. """ - if self._path is None: - self._path = Path( - os.environ.get( - "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets" - ) - ) - return self._path + self._subdir = Path(subdir) + self.reset() + + @property + def path(self): + return self._path.absolute() @path.setter def path(self, new): self._path = Path(new) - def clear(self): - self._path = None + def reset(self): + self._basedir = Path( + os.environ.get("RAPIDS_DATASET_ROOT_DIR", self._default_base_dir) + ) + self._path = self._basedir / self._subdir default_download_dir = DefaultDownloadDir() @@ -159,7 +160,7 @@ def unload(self): """ self._edgelist = None - def get_edgelist(self, download=False): + def get_edgelist(self, download=False, reader="cudf"): """ Return an Edgelist @@ -168,6 +169,9 @@ def get_edgelist(self, download=False): download : Boolean (default=False) Automatically download the dataset from the 'url' location within the YAML file. + + reader : 'cudf' or 'pandas' (default='cudf') + The library used to read a CSV and return an edgelist DataFrame. """ if self._edgelist is None: full_path = self.get_path() @@ -180,14 +184,29 @@ def get_edgelist(self, download=False): " exist. Try setting download=True" " to download the datafile" ) + header = None if isinstance(self.metadata["header"], int): header = self.metadata["header"] - self._edgelist = cudf.read_csv( - full_path, + + if reader == "cudf": + self.__reader = cudf.read_csv + elif reader == "pandas": + self.__reader = pd.read_csv + else: + raise ValueError( + "reader must be a module with a read_csv function compatible with \ + cudf.read_csv" + ) + + self._edgelist = self.__reader( + filepath_or_buffer=full_path, delimiter=self.metadata["delim"], names=self.metadata["col_names"], - dtype=self.metadata["col_types"], + dtype={ + self.metadata["col_names"][i]: self.metadata["col_types"][i] + for i in range(len(self.metadata["col_types"])) + }, header=header, ) @@ -219,6 +238,10 @@ def get_graph( dataset -if present- will be applied to the Graph. If the dataset does not contain weights, the Graph returned will be unweighted regardless of ignore_weights. + + store_transposed: Boolean (default=False) + If True, stores the transpose of the adjacency matrix. Required + for certain algorithms, such as pagerank. """ if self._edgelist is None: self.get_edgelist(download) @@ -237,20 +260,19 @@ def get_graph( "(or subclass) type or instance, got: " f"{type(create_using)}" ) - if len(self.metadata["col_names"]) > 2 and not (ignore_weights): G.from_cudf_edgelist( self._edgelist, - source="src", - destination="dst", - edge_attr="wgt", + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + edge_attr=self.metadata["col_names"][2], store_transposed=store_transposed, ) else: G.from_cudf_edgelist( self._edgelist, - source="src", - destination="dst", + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], store_transposed=store_transposed, ) return G @@ -331,7 +353,7 @@ def download_all(force=False): def set_download_dir(path): """ - Set the download location fors datasets + Set the download location for datasets Parameters ---------- @@ -339,10 +361,10 @@ def set_download_dir(path): Location used to store datafiles """ if path is None: - default_download_dir.clear() + default_download_dir.reset() else: default_download_dir.path = path def get_download_dir(): - return default_download_dir.path.absolute() + return default_download_dir.path diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml new file mode 100644 index 00000000000..d5c4cf195bd --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml @@ -0,0 +1,22 @@ +name: cit-Patents +file_type: .csv +description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations. +author: NBER +refs: + J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time Densification Laws, Shrinking Diameters and Possible Explanations. + ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 16518948 +number_of_nodes: 3774768 +url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml new file mode 100644 index 00000000000..fe0e42a4b86 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml @@ -0,0 +1,21 @@ +name: europe_osm +file_type: .csv +description: A graph of OpenStreetMap data for Europe. +author: M. Kobitzsh / Geofabrik GmbH +refs: + Rossi, Ryan. Ahmed, Nesreen. The Network Data Respoistory with Interactive Graph Analytics and Visualization. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 54054660 +number_of_nodes: 50912018 +url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml new file mode 100644 index 00000000000..2f09cf7679b --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml @@ -0,0 +1,26 @@ +name: hollywood +file_type: .csv +description: + A graph of movie actors where vertices are actors, and two actors are + joined by an edge whenever they appeared in a movie together. +author: Laboratory for Web Algorithmics (LAW) +refs: + The WebGraph Framework I Compression Techniques, Paolo Boldi + and Sebastiano Vigna, Proc. of the Thirteenth International + World Wide Web Conference (WWW 2004), 2004, Manhattan, USA, + pp. 595--601, ACM Press. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 57515616 +number_of_nodes: 1139905 +url: https://data.rapids.ai/cugraph/datasets/hollywood.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml new file mode 100644 index 00000000000..fafc68acb9b --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml @@ -0,0 +1,22 @@ +name: soc-LiveJournal1 +file_type: .csv +description: A graph of the LiveJournal social network. +author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan +refs: + L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in + Large Social Networks Membership, Growth, and Evolution. KDD, 2006. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 68993773 +number_of_nodes: 4847571 +url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml new file mode 100644 index 00000000000..df5df5735af --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml @@ -0,0 +1,22 @@ +name: soc-twitter-2010 +file_type: .csv +description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. +author: H. Kwak, C. Lee, H. Park, S. Moon +refs: + J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl. + Conf. on Web Search and Data Mining (WSDM '11), 2011. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: false +number_of_edges: 530051354 +number_of_nodes: 21297772 +url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/experimental/datasets/__init__.py b/python/cugraph/cugraph/experimental/datasets/__init__.py deleted file mode 100644 index 18220243df1..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/__init__.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from cugraph.experimental.datasets.dataset import ( - Dataset, - load_all, - set_download_dir, - get_download_dir, - default_download_dir, -) -from cugraph.experimental.datasets import metadata -from pathlib import Path - -from cugraph.utilities.api_tools import promoted_experimental_warning_wrapper - - -Dataset = promoted_experimental_warning_wrapper(Dataset) -load_all = promoted_experimental_warning_wrapper(load_all) -set_download_dir = promoted_experimental_warning_wrapper(set_download_dir) -get_download_dir = promoted_experimental_warning_wrapper(get_download_dir) - -meta_path = Path(__file__).parent / "metadata" - - -# individual dataset objects -karate = Dataset(meta_path / "karate.yaml") -karate_data = Dataset(meta_path / "karate_data.yaml") -karate_undirected = Dataset(meta_path / "karate_undirected.yaml") -karate_asymmetric = Dataset(meta_path / "karate_asymmetric.yaml") -karate_disjoint = Dataset(meta_path / "karate-disjoint.yaml") -dolphins = Dataset(meta_path / "dolphins.yaml") -polbooks = Dataset(meta_path / "polbooks.yaml") -netscience = Dataset(meta_path / "netscience.yaml") -cyber = Dataset(meta_path / "cyber.yaml") -small_line = Dataset(meta_path / "small_line.yaml") -small_tree = Dataset(meta_path / "small_tree.yaml") -toy_graph = Dataset(meta_path / "toy_graph.yaml") -toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml") -email_Eu_core = Dataset(meta_path / "email-Eu-core.yaml") -ktruss_polbooks = Dataset(meta_path / "ktruss_polbooks.yaml") - - -# batches of datasets -DATASETS_UNDIRECTED = [karate, dolphins] - -DATASETS_UNDIRECTED_WEIGHTS = [netscience] - -DATASETS_UNRENUMBERED = [karate_disjoint] - -DATASETS = [dolphins, netscience, karate_disjoint] - -DATASETS_SMALL = [karate, dolphins, polbooks] - -STRONGDATASETS = [dolphins, netscience, email_Eu_core] - -DATASETS_KTRUSS = [(polbooks, ktruss_polbooks)] - -MEDIUM_DATASETS = [polbooks] - -SMALL_DATASETS = [karate, dolphins, netscience] - -RLY_SMALL_DATASETS = [small_line, small_tree] - -ALL_DATASETS = [karate, dolphins, netscience, polbooks, small_line, small_tree] - -ALL_DATASETS_WGT = [karate, dolphins, netscience, polbooks, small_line, small_tree] - -TEST_GROUP = [dolphins, netscience] diff --git a/python/cugraph/cugraph/experimental/datasets/dataset.py b/python/cugraph/cugraph/experimental/datasets/dataset.py deleted file mode 100644 index 6b395d50fef..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/dataset.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import yaml -import os -from pathlib import Path -from cugraph.structure.graph_classes import Graph - - -class DefaultDownloadDir: - """ - Maintains the path to the download directory used by Dataset instances. - Instances of this class are typically shared by several Dataset instances - in order to allow for the download directory to be defined and updated by - a single object. - """ - - def __init__(self): - self._path = Path( - os.environ.get("RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets") - ) - - @property - def path(self): - """ - If `path` is not set, set it to the environment variable - RAPIDS_DATASET_ROOT_DIR. If the variable is not set, default to the - user's home directory. - """ - if self._path is None: - self._path = Path( - os.environ.get( - "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets" - ) - ) - return self._path - - @path.setter - def path(self, new): - self._path = Path(new) - - def clear(self): - self._path = None - - -default_download_dir = DefaultDownloadDir() - - -class Dataset: - """ - A Dataset Object, used to easily import edgelist data and cuGraph.Graph - instances. - - Parameters - ---------- - meta_data_file_name : yaml file - The metadata file for the specific graph dataset, which includes - information on the name, type, url link, data loading format, graph - properties - """ - - def __init__( - self, - metadata_yaml_file=None, - csv_file=None, - csv_header=None, - csv_delim=" ", - csv_col_names=None, - csv_col_dtypes=None, - ): - self._metadata_file = None - self._dl_path = default_download_dir - self._edgelist = None - self._path = None - - if metadata_yaml_file is not None and csv_file is not None: - raise ValueError("cannot specify both metadata_yaml_file and csv_file") - - elif metadata_yaml_file is not None: - with open(metadata_yaml_file, "r") as file: - self.metadata = yaml.safe_load(file) - self._metadata_file = Path(metadata_yaml_file) - - elif csv_file is not None: - if csv_col_names is None or csv_col_dtypes is None: - raise ValueError( - "csv_col_names and csv_col_dtypes must both be " - "not None when csv_file is specified." - ) - self._path = Path(csv_file) - if self._path.exists() is False: - raise FileNotFoundError(csv_file) - self.metadata = { - "name": self._path.with_suffix("").name, - "file_type": ".csv", - "url": None, - "header": csv_header, - "delim": csv_delim, - "col_names": csv_col_names, - "col_types": csv_col_dtypes, - } - - else: - raise ValueError("must specify either metadata_yaml_file or csv_file") - - def __str__(self): - """ - Use the basename of the meta_data_file the instance was constructed with, - without any extension, as the string repr. - """ - # The metadata file is likely to have a more descriptive file name, so - # use that one first if present. - # FIXME: this may need to provide a more unique or descriptive string repr - if self._metadata_file is not None: - return self._metadata_file.with_suffix("").name - else: - return self.get_path().with_suffix("").name - - def __download_csv(self, url): - """ - Downloads the .csv file from url to the current download path - (self._dl_path), updates self._path with the full path to the - downloaded file, and returns the latest value of self._path. - """ - self._dl_path.path.mkdir(parents=True, exist_ok=True) - - filename = self.metadata["name"] + self.metadata["file_type"] - if self._dl_path.path.is_dir(): - df = cudf.read_csv(url) - self._path = self._dl_path.path / filename - df.to_csv(self._path, index=False) - - else: - raise RuntimeError( - f"The directory {self._dl_path.path.absolute()}" "does not exist" - ) - return self._path - - def unload(self): - - """ - Remove all saved internal objects, forcing them to be re-created when - accessed. - - NOTE: This will cause calls to get_*() to re-read the dataset file from - disk. The caller should ensure the file on disk has not moved/been - deleted/changed. - """ - self._edgelist = None - - def get_edgelist(self, fetch=False): - """ - Return an Edgelist - - Parameters - ---------- - fetch : Boolean (default=False) - Automatically fetch for the dataset from the 'url' location within - the YAML file. - """ - if self._edgelist is None: - full_path = self.get_path() - if not full_path.is_file(): - if fetch: - full_path = self.__download_csv(self.metadata["url"]) - else: - raise RuntimeError( - f"The datafile {full_path} does not" - " exist. Try get_edgelist(fetch=True)" - " to download the datafile" - ) - header = None - if isinstance(self.metadata["header"], int): - header = self.metadata["header"] - self._edgelist = cudf.read_csv( - full_path, - delimiter=self.metadata["delim"], - names=self.metadata["col_names"], - dtype=self.metadata["col_types"], - header=header, - ) - - return self._edgelist - - def get_graph( - self, - fetch=False, - create_using=Graph, - ignore_weights=False, - store_transposed=False, - ): - """ - Return a Graph object. - - Parameters - ---------- - fetch : Boolean (default=False) - Downloads the dataset from the web. - - create_using: cugraph.Graph (instance or class), optional - (default=Graph) - Specify the type of Graph to create. Can pass in an instance to - create a Graph instance with specified 'directed' attribute. - - ignore_weights : Boolean (default=False) - Ignores weights in the dataset if True, resulting in an - unweighted Graph. If False (the default), weights from the - dataset -if present- will be applied to the Graph. If the - dataset does not contain weights, the Graph returned will - be unweighted regardless of ignore_weights. - """ - if self._edgelist is None: - self.get_edgelist(fetch) - - if create_using is None: - G = Graph() - elif isinstance(create_using, Graph): - # what about BFS if trnaposed is True - attrs = {"directed": create_using.is_directed()} - G = type(create_using)(**attrs) - elif type(create_using) is type: - G = create_using() - else: - raise TypeError( - "create_using must be a cugraph.Graph " - "(or subclass) type or instance, got: " - f"{type(create_using)}" - ) - - if len(self.metadata["col_names"]) > 2 and not (ignore_weights): - G.from_cudf_edgelist( - self._edgelist, - source="src", - destination="dst", - edge_attr="wgt", - store_transposed=store_transposed, - ) - else: - G.from_cudf_edgelist( - self._edgelist, - source="src", - destination="dst", - store_transposed=store_transposed, - ) - return G - - def get_path(self): - """ - Returns the location of the stored dataset file - """ - if self._path is None: - self._path = self._dl_path.path / ( - self.metadata["name"] + self.metadata["file_type"] - ) - - return self._path.absolute() - - -def load_all(force=False): - """ - Looks in `metadata` directory and fetches all datafiles from the the URLs - provided in each YAML file. - - Parameters - force : Boolean (default=False) - Overwrite any existing copies of datafiles. - """ - default_download_dir.path.mkdir(parents=True, exist_ok=True) - - meta_path = Path(__file__).parent.absolute() / "metadata" - for file in meta_path.iterdir(): - meta = None - if file.suffix == ".yaml": - with open(meta_path / file, "r") as metafile: - meta = yaml.safe_load(metafile) - - if "url" in meta: - filename = meta["name"] + meta["file_type"] - save_to = default_download_dir.path / filename - if not save_to.is_file() or force: - df = cudf.read_csv(meta["url"]) - df.to_csv(save_to, index=False) - - -def set_download_dir(path): - """ - Set the download directory for fetching datasets - - Parameters - ---------- - path : String - Location used to store datafiles - """ - if path is None: - default_download_dir.clear() - else: - default_download_dir.path = path - - -def get_download_dir(): - return default_download_dir.path.absolute() diff --git a/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml b/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml deleted file mode 100644 index 69a79db9cd9..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml +++ /dev/null @@ -1,5 +0,0 @@ ---- -fetch: "False" -force: "False" -# path where datasets will be downloaded to and stored -download_dir: "datasets" diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py b/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py deleted file mode 100644 index 081b2ae8260..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml deleted file mode 100644 index 93ab5345442..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: cyber -file_type: .csv -author: N/A -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/cyber.csv -refs: N/A -col_names: - - idx - - srcip - - dstip -col_types: - - int32 - - str - - str -delim: "," -header: 0 -has_loop: true -is_directed: true -is_multigraph: false -is_symmetric: false -number_of_edges: 2546575 -number_of_nodes: 706529 -number_of_lines: 2546576 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml deleted file mode 100644 index e4951375321..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: dolphins -file_type: .csv -author: D. Lusseau -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/dolphins.csv -refs: - D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and S. M. Dawson, - The bottlenose dolphin community of Doubtful Sound features a large proportion of - long-lasting associations, Behavioral Ecology and Sociobiology 54, 396-405 (2003). -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -delim: " " -header: None -has_loop: false -is_directed: true -is_multigraph: false -is_symmetric: false -number_of_edges: 318 -number_of_nodes: 62 -number_of_lines: 318 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml deleted file mode 100644 index 97d0dc82ee3..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: email-Eu-core -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/email-Eu-core.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 25571 -number_of_nodes: 1005 -number_of_lines: 25571 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml deleted file mode 100644 index 0c0eaf78b63..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: karate-disjoint -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-disjoint.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: True -is_multigraph: false -is_symmetric: true -number_of_edges: 312 -number_of_nodes: 68 -number_of_lines: 312 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml deleted file mode 100644 index 273381ed368..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: karate -file_type: .csv -author: Zachary W. -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate.csv -refs: - W. W. Zachary, An information flow model for conflict and fission in small groups, - Journal of Anthropological Research 33, 452-473 (1977). -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: true -is_directed: true -is_multigraph: false -is_symmetric: true -number_of_edges: 156 -number_of_nodes: 34 -number_of_lines: 156 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml deleted file mode 100644 index 3616b8fb3a5..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: karate-asymmetric -file_type: .csv -author: Zachary W. -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-asymmetric.csv -delim: " " -header: None -refs: - W. W. Zachary, An information flow model for conflict and fission in small groups, - Journal of Anthropological Research 33, 452-473 (1977). -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: true -is_directed: false -is_multigraph: false -is_symmetric: false -number_of_edges: 78 -number_of_nodes: 34 -number_of_lines: 78 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml deleted file mode 100644 index 9a8b27f21ae..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: karate-data -file_type: .csv -author: Zachary W. -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-data.csv -refs: - W. W. Zachary, An information flow model for conflict and fission in small groups, - Journal of Anthropological Research 33, 452-473 (1977). -delim: "\t" -header: None -col_names: - - src - - dst -col_types: - - int32 - - int32 -has_loop: true -is_directed: true -is_multigraph: false -is_symmetric: true -number_of_edges: 156 -number_of_nodes: 34 -number_of_lines: 156 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml deleted file mode 100644 index 1b45f86caee..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: karate_undirected -file_type: .csv -author: Zachary W. -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate_undirected.csv -refs: - W. W. Zachary, An information flow model for conflict and fission in small groups, - Journal of Anthropological Research 33, 452-473 (1977). -delim: "\t" -header: None -col_names: - - src - - dst -col_types: - - int32 - - int32 -has_loop: true -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 78 -number_of_nodes: 34 -number_of_lines: 78 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml deleted file mode 100644 index 1ef29b3917e..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: ktruss_polbooks -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/ref/ktruss/polbooks.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: true -is_multigraph: false -is_symmetric: false -number_of_edges: 233 -number_of_nodes: 58 -number_of_lines: 233 - diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml deleted file mode 100644 index 2dca702df3d..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: netscience -file_type: .csv -author: Newman, Mark EJ -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/netscience.csv -refs: Finding community structure in networks using the eigenvectors of matrices. -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: true -is_multigraph: false -is_symmetric: true -number_of_edges: 2742 -number_of_nodes: 1461 -number_of_lines: 5484 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml deleted file mode 100644 index 5816e5672fd..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: polbooks -file_type: .csv -author: V. Krebs -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/polbooks.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -is_directed: true -has_loop: null -is_multigraph: null -is_symmetric: true -number_of_edges: 882 -number_of_nodes: 105 -number_of_lines: 882 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml deleted file mode 100644 index 5b724ac99fd..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: small_line -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/small_line.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 9 -number_of_nodes: 10 -number_of_lines: 8 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml deleted file mode 100644 index 8eeac346d2a..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: small_tree -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/small_tree.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: true -is_multigraph: false -is_symmetric: true -number_of_edges: 11 -number_of_nodes: 9 -number_of_lines: 11 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml deleted file mode 100644 index 819aad06f6a..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: toy_graph -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/toy_graph.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 16 -number_of_nodes: 6 -number_of_lines: 16 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml deleted file mode 100644 index c6e86bdf334..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: toy_graph_undirected -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/toy_graph_undirected.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 8 -number_of_nodes: 6 -number_of_lines: 8 diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py index f5f0bcb06eb..2b4a4fd3ebf 100644 --- a/python/cugraph/cugraph/testing/__init__.py +++ b/python/cugraph/cugraph/testing/__init__.py @@ -19,7 +19,7 @@ Resultset, load_resultset, get_resultset, - results_dir_path, + default_resultset_download_dir, ) from cugraph.datasets import ( cyber, @@ -34,6 +34,11 @@ email_Eu_core, toy_graph, toy_graph_undirected, + soc_livejournal, + cit_patents, + europe_osm, + hollywood, + # twitter, ) # @@ -66,3 +71,4 @@ toy_graph_undirected, ] DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint] +BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood] diff --git a/python/cugraph/cugraph/testing/generate_resultsets.py b/python/cugraph/cugraph/testing/generate_resultsets.py index 9724aca32dc..2ae0f52d88b 100644 --- a/python/cugraph/cugraph/testing/generate_resultsets.py +++ b/python/cugraph/cugraph/testing/generate_resultsets.py @@ -20,8 +20,14 @@ import cudf import cugraph from cugraph.datasets import dolphins, netscience, karate_disjoint, karate -from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path +# from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path +from cugraph.testing import ( + utils, + Resultset, + SMALL_DATASETS, + default_resultset_download_dir, +) _resultsets = {} @@ -224,6 +230,7 @@ def add_resultset(result_data_dictionary, **kwargs): ] ) # Generating ALL results files + results_dir_path = default_resultset_download_dir.path if not results_dir_path.exists(): results_dir_path.mkdir(parents=True, exist_ok=True) diff --git a/python/cugraph/cugraph/testing/resultset.py b/python/cugraph/cugraph/testing/resultset.py index 490e3a7c4ff..9570d7f3e04 100644 --- a/python/cugraph/cugraph/testing/resultset.py +++ b/python/cugraph/cugraph/testing/resultset.py @@ -16,10 +16,12 @@ import urllib.request import cudf -from cugraph.testing import utils +from cugraph.datasets.dataset import ( + DefaultDownloadDir, + default_download_dir, +) - -results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets" +# results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets" class Resultset: @@ -48,6 +50,42 @@ def get_cudf_dataframe(self): _resultsets = {} +def get_resultset(resultset_name, **kwargs): + """ + Returns the golden results for a specific test. + + Parameters + ---------- + resultset_name : String + Name of the test's module (currently just 'traversal' is supported) + + kwargs : + All distinct test details regarding the choice of algorithm, dataset, + and graph + """ + arg_dict = dict(kwargs) + arg_dict["resultset_name"] = resultset_name + # Example: + # {'a': 1, 'z': 9, 'c': 5, 'b': 2} becomes 'a-1-b-2-c-5-z-9' + resultset_key = "-".join( + [ + str(val) + for arg_dict_pair in sorted(arg_dict.items()) + for val in arg_dict_pair + ] + ) + uuid = _resultsets.get(resultset_key) + if uuid is None: + raise KeyError(f"results for {arg_dict} not found") + + results_dir_path = default_resultset_download_dir.path + results_filename = results_dir_path / (uuid + ".csv") + return cudf.read_csv(results_filename) + + +default_resultset_download_dir = DefaultDownloadDir(subdir="tests/resultsets") + + def load_resultset(resultset_name, resultset_download_url): """ Read a mapping file (.csv) in the _results_dir and save the @@ -56,17 +94,21 @@ def load_resultset(resultset_name, resultset_download_url): _results_dir, use resultset_download_url to download a file to install/unpack/etc. to _results_dir first. """ - mapping_file_path = results_dir_path / (resultset_name + "_mappings.csv") + # curr_resultset_download_dir = get_resultset_download_dir() + curr_resultset_download_dir = default_resultset_download_dir.path + # curr_download_dir = path + curr_download_dir = default_download_dir.path + mapping_file_path = curr_resultset_download_dir / (resultset_name + "_mappings.csv") if not mapping_file_path.exists(): # Downloads a tar gz from s3 bucket, then unpacks the results files - compressed_file_dir = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" + compressed_file_dir = curr_download_dir / "tests" compressed_file_path = compressed_file_dir / "resultsets.tar.gz" - if not results_dir_path.exists(): - results_dir_path.mkdir(parents=True, exist_ok=True) + if not curr_resultset_download_dir.exists(): + curr_resultset_download_dir.mkdir(parents=True, exist_ok=True) if not compressed_file_path.exists(): urllib.request.urlretrieve(resultset_download_url, compressed_file_path) tar = tarfile.open(str(compressed_file_path), "r:gz") - tar.extractall(str(results_dir_path)) + tar.extractall(str(curr_resultset_download_dir)) tar.close() # FIXME: This assumes separator is " ", but should this be configurable? @@ -102,35 +144,3 @@ def load_resultset(resultset_name, resultset_download_url): ) _resultsets[resultset_key] = uuid - - -def get_resultset(resultset_name, **kwargs): - """ - Returns the golden results for a specific test. - - Parameters - ---------- - resultset_name : String - Name of the test's module (currently just 'traversal' is supported) - - kwargs : - All distinct test details regarding the choice of algorithm, dataset, - and graph - """ - arg_dict = dict(kwargs) - arg_dict["resultset_name"] = resultset_name - # Example: - # {'a': 1, 'z': 9, 'c': 5, 'b': 2} becomes 'a-1-b-2-c-5-z-9' - resultset_key = "-".join( - [ - str(val) - for arg_dict_pair in sorted(arg_dict.items()) - for val in arg_dict_pair - ] - ) - uuid = _resultsets.get(resultset_key) - if uuid is None: - raise KeyError(f"results for {arg_dict} not found") - - results_filename = results_dir_path / (uuid + ".csv") - return cudf.read_csv(results_filename) diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py index 4277f94a396..478b7e655d5 100644 --- a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py @@ -16,7 +16,7 @@ import dask_cudf from pylibcugraph.testing.utils import gen_fixture_params_product -from cugraph.experimental.datasets import DATASETS_UNDIRECTED +from cugraph.datasets import karate, dolphins import cugraph import cugraph.dask as dcg @@ -41,7 +41,7 @@ def setup_function(): # email_Eu_core is too expensive to test -datasets = DATASETS_UNDIRECTED +datasets = [karate, dolphins] # ============================================================================= diff --git a/python/cugraph/cugraph/tests/nx/test_compat_pr.py b/python/cugraph/cugraph/tests/nx/test_compat_pr.py index 9be3912a33f..45cab7a5674 100644 --- a/python/cugraph/cugraph/tests/nx/test_compat_pr.py +++ b/python/cugraph/cugraph/tests/nx/test_compat_pr.py @@ -24,7 +24,7 @@ import numpy as np from cugraph.testing import utils -from cugraph.experimental.datasets import karate +from cugraph.datasets import karate from pylibcugraph.testing.utils import gen_fixture_params_product diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index c2a4f7c6072..60bc6dbb45a 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -13,11 +13,10 @@ import os import gc -import sys -import warnings from pathlib import Path from tempfile import TemporaryDirectory +import pandas import pytest import cudf @@ -27,6 +26,7 @@ ALL_DATASETS, WEIGHTED_DATASETS, SMALL_DATASETS, + BENCHMARKING_DATASETS, ) from cugraph import datasets @@ -74,27 +74,14 @@ def setup(tmpdir): gc.collect() -@pytest.fixture() -def setup_deprecation_warning_tests(): - """ - Fixture used to set warning filters to 'default' and reload - experimental.datasets module if it has been previously - imported. Tests that import this fixture are expected to - import cugraph.experimental.datasets - """ - warnings.filterwarnings("default") - - if "cugraph.experimental.datasets" in sys.modules: - del sys.modules["cugraph.experimental.datasets"] - - yield - - ############################################################################### # Helpers # check if there is a row where src == dst -def has_loop(df): +def has_selfloop(dataset): + if not dataset.metadata["is_directed"]: + return False + df = dataset.get_edgelist(download=True) df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True) res = df.where(df["src"] == df["dst"]) @@ -109,7 +96,13 @@ def is_symmetric(dataset): else: df = dataset.get_edgelist(download=True) df_a = df.sort_values("src") - df_b = df_a[["dst", "src", "wgt"]] + + # create df with swapped src/dst columns + df_b = None + if "wgt" in df_a.columns: + df_b = df_a[["dst", "src", "wgt"]] + else: + df_b = df_a[["dst", "src"]] df_b.rename(columns={"dst": "src", "src": "dst"}, inplace=True) # created a df by appending the two res = cudf.concat([df_a, df_b]) @@ -157,6 +150,27 @@ def test_download(dataset): assert dataset.get_path().is_file() +@pytest.mark.parametrize("dataset", SMALL_DATASETS) +def test_reader(dataset): + # defaults to using cudf.read_csv + E = dataset.get_edgelist(download=True) + + assert E is not None + assert isinstance(E, cudf.core.dataframe.DataFrame) + dataset.unload() + + # using pandas + E_pd = dataset.get_edgelist(download=True, reader="pandas") + + assert E_pd is not None + assert isinstance(E_pd, pandas.core.frame.DataFrame) + dataset.unload() + + with pytest.raises(ValueError): + dataset.get_edgelist(reader="fail") + dataset.get_edgelist(reader=None) + + @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_get_edgelist(dataset): E = dataset.get_edgelist(download=True) @@ -172,7 +186,6 @@ def test_get_graph(dataset): @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_metadata(dataset): M = dataset.metadata - assert M is not None @@ -310,10 +323,8 @@ def test_is_directed(dataset): @pytest.mark.parametrize("dataset", ALL_DATASETS) -def test_has_loop(dataset): - df = dataset.get_edgelist(download=True) - - assert has_loop(df) == dataset.metadata["has_loop"] +def test_has_selfloop(dataset): + assert has_selfloop(dataset) == dataset.metadata["has_loop"] @pytest.mark.parametrize("dataset", ALL_DATASETS) @@ -328,6 +339,25 @@ def test_is_multigraph(dataset): assert G.is_multigraph() == dataset.metadata["is_multigraph"] +# The datasets used for benchmarks are in their own test, since downloading them +# repeatedly would increase testing overhead significantly +@pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS) +def test_benchmarking_datasets(dataset): + dataset_is_directed = dataset.metadata["is_directed"] + G = dataset.get_graph( + download=True, create_using=Graph(directed=dataset_is_directed) + ) + + assert G.is_directed() == dataset.metadata["is_directed"] + assert G.number_of_nodes() == dataset.metadata["number_of_nodes"] + assert G.number_of_edges() == dataset.metadata["number_of_edges"] + assert has_selfloop(dataset) == dataset.metadata["has_loop"] + assert is_symmetric(dataset) == dataset.metadata["is_symmetric"] + assert G.is_multigraph() == dataset.metadata["is_multigraph"] + + dataset.unload() + + @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_object_getters(dataset): assert dataset.is_directed() == dataset.metadata["is_directed"] @@ -336,32 +366,3 @@ def test_object_getters(dataset): assert dataset.number_of_nodes() == dataset.metadata["number_of_nodes"] assert dataset.number_of_vertices() == dataset.metadata["number_of_nodes"] assert dataset.number_of_edges() == dataset.metadata["number_of_edges"] - - -# -# Test experimental for DeprecationWarnings -# -def test_experimental_dataset_import(setup_deprecation_warning_tests): - with pytest.deprecated_call(): - from cugraph.experimental.datasets import karate - - # unload() is called to pass flake8 - karate.unload() - - -def test_experimental_method_warnings(setup_deprecation_warning_tests): - from cugraph.experimental.datasets import ( - load_all, - set_download_dir, - get_download_dir, - ) - - warnings.filterwarnings("default") - tmpd = TemporaryDirectory() - - with pytest.deprecated_call(): - set_download_dir(tmpd.name) - get_download_dir() - load_all() - - tmpd.cleanup() diff --git a/python/cugraph/cugraph/tests/utils/test_resultset.py b/python/cugraph/cugraph/tests/utils/test_resultset.py new file mode 100644 index 00000000000..5c2298bedb7 --- /dev/null +++ b/python/cugraph/cugraph/tests/utils/test_resultset.py @@ -0,0 +1,71 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from pathlib import Path +from tempfile import TemporaryDirectory + +import cudf +from cugraph.datasets.dataset import ( + set_download_dir, + get_download_dir, +) +from cugraph.testing.resultset import load_resultset, default_resultset_download_dir + +############################################################################### + + +def test_load_resultset(): + with TemporaryDirectory() as tmpd: + + set_download_dir(Path(tmpd)) + default_resultset_download_dir.path = Path(tmpd) / "tests" / "resultsets" + default_resultset_download_dir.path.mkdir(parents=True, exist_ok=True) + + datasets_download_dir = get_download_dir() + resultsets_download_dir = default_resultset_download_dir.path + assert "tests" in os.listdir(datasets_download_dir) + assert "resultsets.tar.gz" not in os.listdir(datasets_download_dir / "tests") + assert "traversal_mappings.csv" not in os.listdir(resultsets_download_dir) + + load_resultset( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + assert "resultsets.tar.gz" in os.listdir(datasets_download_dir / "tests") + assert "traversal_mappings.csv" in os.listdir(resultsets_download_dir) + + +def test_verify_resultset_load(): + # This test is more detailed than test_load_resultset, where for each module, + # we check that every single resultset file is included along with the + # corresponding mapping file. + with TemporaryDirectory() as tmpd: + set_download_dir(Path(tmpd)) + default_resultset_download_dir.path = Path(tmpd) / "tests" / "resultsets" + default_resultset_download_dir.path.mkdir(parents=True, exist_ok=True) + + resultsets_download_dir = default_resultset_download_dir.path + + load_resultset( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + resultsets = os.listdir(resultsets_download_dir) + downloaded_results = cudf.read_csv( + resultsets_download_dir / "traversal_mappings.csv", sep=" " + ) + downloaded_uuids = downloaded_results["#UUID"].values + for resultset_uuid in downloaded_uuids: + assert str(resultset_uuid) + ".csv" in resultsets From 0f28b2ee45130486ca891b757574780ac58dd720 Mon Sep 17 00:00:00 2001 From: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> Date: Mon, 20 Nov 2023 13:04:06 -0500 Subject: [PATCH 3/3] [BUG] Fix Graph Construction From Pandas in cuGraph-PyG (#3985) The current graph construction creates a single pandas dataframe, which for larger datasets (i.e. ogbn-papers100M) cannot be serialized. This PR resolves this by breaking up the dataframe into scattered numpy arrays that are then reassembled. Merge after #3978 Authors: - Alex Barghi (https://github.com/alexbarghi-nv) - Naim (https://github.com/naimnv) Approvers: - Vibhu Jawa (https://github.com/VibhuJawa) - Brad Rees (https://github.com/BradReesWork) - Tingyu Wang (https://github.com/tingyu66) URL: https://github.com/rapidsai/cugraph/pull/3985 --- .../cugraph_pyg/data/cugraph_store.py | 75 +++++++++++++------ .../tests/mg/test_mg_cugraph_loader.py | 1 - .../tests/mg/test_mg_cugraph_store.py | 26 +++++++ 3 files changed, 80 insertions(+), 22 deletions(-) diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py index edeeface4c4..14dc5d84f90 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py @@ -27,11 +27,12 @@ import cugraph import warnings -from cugraph.utilities.utils import import_optional, MissingModule +import dask.array as dar +import dask.dataframe as dd +import dask.distributed as distributed +import dask_cudf -dd = import_optional("dask.dataframe") -distributed = import_optional("dask.distributed") -dask_cudf = import_optional("dask_cudf") +from cugraph.utilities.utils import import_optional, MissingModule torch = import_optional("torch") torch_geometric = import_optional("torch_geometric") @@ -367,6 +368,13 @@ def __infer_offsets( } ) + def __dask_array_from_numpy(self, array: np.ndarray, npartitions: int): + return dar.from_array( + array, + meta=np.array([], dtype=array.dtype), + chunks=max(1, len(array) // npartitions), + ) + def __construct_graph( self, edge_info: Dict[Tuple[str, str, str], List[TensorType]], @@ -464,22 +472,32 @@ def __construct_graph( ] ) - df = pandas.DataFrame( - { - "src": pandas.Series(na_dst) - if order == "CSC" - else pandas.Series(na_src), - "dst": pandas.Series(na_src) - if order == "CSC" - else pandas.Series(na_dst), - "etp": pandas.Series(na_etp), - } - ) - vertex_dtype = df.src.dtype + vertex_dtype = na_src.dtype if multi_gpu: - nworkers = len(distributed.get_client().scheduler_info()["workers"]) - df = dd.from_pandas(df, npartitions=nworkers if len(df) > 32 else 1) + client = distributed.get_client() + nworkers = len(client.scheduler_info()["workers"]) + npartitions = nworkers * 4 + + src_dar = self.__dask_array_from_numpy(na_src, npartitions) + del na_src + + dst_dar = self.__dask_array_from_numpy(na_dst, npartitions) + del na_dst + + etp_dar = self.__dask_array_from_numpy(na_etp, npartitions) + del na_etp + + df = dd.from_dask_array(etp_dar, columns=["etp"]) + df["src"] = dst_dar if order == "CSC" else src_dar + df["dst"] = src_dar if order == "CSC" else dst_dar + + del src_dar + del dst_dar + del etp_dar + + if df.etp.dtype != "int32": + raise ValueError("Edge type must be int32!") # Ensure the dataframe is constructed on each partition # instead of adding additional synchronization head from potential @@ -487,9 +505,9 @@ def __construct_graph( def get_empty_df(): return cudf.DataFrame( { + "etp": cudf.Series([], dtype="int32"), "src": cudf.Series([], dtype=vertex_dtype), "dst": cudf.Series([], dtype=vertex_dtype), - "etp": cudf.Series([], dtype="int32"), } ) @@ -500,9 +518,23 @@ def get_empty_df(): if len(f) > 0 else get_empty_df(), meta=get_empty_df(), - ).reset_index(drop=True) + ).reset_index( + drop=True + ) # should be ok for dask else: - df = cudf.from_pandas(df).reset_index(drop=True) + df = pandas.DataFrame( + { + "src": pandas.Series(na_dst) + if order == "CSC" + else pandas.Series(na_src), + "dst": pandas.Series(na_src) + if order == "CSC" + else pandas.Series(na_dst), + "etp": pandas.Series(na_etp), + } + ) + df = cudf.from_pandas(df) + df.reset_index(drop=True, inplace=True) graph = cugraph.MultiGraph(directed=True) if multi_gpu: @@ -521,6 +553,7 @@ def get_empty_df(): edge_type="etp", ) + del df return graph @property diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py index 55aebf305da..f5035a38621 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_loader.py @@ -15,7 +15,6 @@ from cugraph_pyg.loader import CuGraphNeighborLoader from cugraph_pyg.data import CuGraphStore - from cugraph.utilities.utils import import_optional, MissingModule torch = import_optional("torch") diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index 13c9c90c7c2..be8f8245807 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -386,3 +386,29 @@ def test_mg_frame_handle(graph, dask_client): F, G, N = graph cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) assert isinstance(cugraph_store._EXPERIMENTAL__CuGraphStore__graph._plc_graph, dict) + + +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +def test_cugraph_loader_large_index(dask_client): + large_index = ( + np.random.randint(0, 1_000_000, (100_000_000,)), + np.random.randint(0, 1_000_000, (100_000_000,)), + ) + + large_features = np.random.randint(0, 50, (1_000_000,)) + F = cugraph.gnn.FeatureStore(backend="torch") + F.add_data(large_features, "N", "f") + + store = CuGraphStore( + F, + {("N", "e", "N"): large_index}, + {"N": 1_000_000}, + multi_gpu=True, + ) + + graph = store._subgraph() + assert isinstance(graph, cugraph.Graph) + + el = graph.view_edge_list().compute() + assert (el["src"].values_host - large_index[0]).sum() == 0 + assert (el["dst"].values_host - large_index[1]).sum() == 0