diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cecfd16d959..e0a00b1faf7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -153,6 +153,11 @@ rapids_cpm_init() # lags behind. ### +# Need to make sure rmm is found before cuco so that rmm patches the libcudacxx +# directory to be found by cuco. +include(${rapids-cmake-dir}/cpm/rmm.cmake) +rapids_cpm_rmm(BUILD_EXPORT_SET cugraph-exports + INSTALL_EXPORT_SET cugraph-exports) # Putting this before raft to override RAFT from pulling them in. include(cmake/thirdparty/get_libcudacxx.cmake) include(${rapids-cmake-dir}/cpm/cuco.cmake) diff --git a/datasets/README.md b/datasets/README.md index e42413fc996..a23dc644081 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -120,9 +120,13 @@ The benchmark datasets are described below: | soc-twitter-2010 | 21,297,772 | 265,025,809 | No | No | **cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations. + **soc-LiveJournal** : A graph of the LiveJournal social network. + **europe_osm** : A graph of OpenStreetMap data for Europe. + **hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together. + **soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. 
_NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._ diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py index 083c4a2b37b..1512901822a 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py @@ -24,7 +24,7 @@ import torch import numpy as np from cugraph.gnn import FeatureStore -from cugraph.experimental.datasets import karate +from cugraph.datasets import karate import tempfile diff --git a/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py b/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py index 5f9eac6b2a3..361226c8071 100644 --- a/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py +++ b/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py @@ -17,7 +17,7 @@ import cugraph from cugraph.experimental import PropertyGraph, MGPropertyGraph -from cugraph.experimental import datasets +from cugraph import datasets from cugraph.generators import rmat diff --git a/python/cugraph/cugraph/dask/community/leiden.py b/python/cugraph/cugraph/dask/community/leiden.py index 75582fa48f7..67bd0876ce6 100644 --- a/python/cugraph/cugraph/dask/community/leiden.py +++ b/python/cugraph/cugraph/dask/community/leiden.py @@ -125,7 +125,7 @@ def leiden( Examples -------- - >>> from cugraph.experimental.datasets import karate + >>> from cugraph.datasets import karate >>> G = karate.get_graph(fetch=True) >>> parts, modularity_score = cugraph.leiden(G) diff --git a/python/cugraph/cugraph/dask/community/louvain.py b/python/cugraph/cugraph/dask/community/louvain.py index 8efbbafaf7b..1b091817a1a 100644 --- a/python/cugraph/cugraph/dask/community/louvain.py +++ 
b/python/cugraph/cugraph/dask/community/louvain.py @@ -129,7 +129,7 @@ def louvain( Examples -------- - >>> from cugraph.experimental.datasets import karate + >>> from cugraph.datasets import karate >>> G = karate.get_graph(fetch=True) >>> parts = cugraph.louvain(G) diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index 65a820f108b..ac18274d354 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -39,3 +39,13 @@ small_tree = Dataset(meta_path / "small_tree.yaml") toy_graph = Dataset(meta_path / "toy_graph.yaml") toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml") + +# Benchmarking datasets: be mindful of memory usage +# 250 MB +soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml") +# 965 MB +cit_patents = Dataset(meta_path / "cit-patents.yaml") +# 1.8 GB +europe_osm = Dataset(meta_path / "europe_osm.yaml") +# 1.5 GB +hollywood = Dataset(meta_path / "hollywood.yaml") diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index 877eade7708..dd7aa0df00a 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -14,44 +14,45 @@ import cudf import yaml import os +import pandas as pd from pathlib import Path from cugraph.structure.graph_classes import Graph class DefaultDownloadDir: """ - Maintains the path to the download directory used by Dataset instances. + Maintains a path to be used as a default download directory. + + All DefaultDownloadDir instances are based on RAPIDS_DATASET_ROOT_DIR if + set, or _default_base_dir if not set. + Instances of this class are typically shared by several Dataset instances in order to allow for the download directory to be defined and updated by a single object. 
""" - def __init__(self): - self._path = Path( - os.environ.get("RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets") - ) + _default_base_dir = Path.home() / ".cugraph/datasets" - @property - def path(self): + def __init__(self, *, subdir=""): """ - If `path` is not set, set it to the environment variable - RAPIDS_DATASET_ROOT_DIR. If the variable is not set, default to the - user's home directory. + subdir can be specified to provide a specialized dir under the base dir. """ - if self._path is None: - self._path = Path( - os.environ.get( - "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets" - ) - ) - return self._path + self._subdir = Path(subdir) + self.reset() + + @property + def path(self): + return self._path.absolute() @path.setter def path(self, new): self._path = Path(new) - def clear(self): - self._path = None + def reset(self): + self._basedir = Path( + os.environ.get("RAPIDS_DATASET_ROOT_DIR", self._default_base_dir) + ) + self._path = self._basedir / self._subdir default_download_dir = DefaultDownloadDir() @@ -159,7 +160,7 @@ def unload(self): """ self._edgelist = None - def get_edgelist(self, download=False): + def get_edgelist(self, download=False, reader="cudf"): """ Return an Edgelist @@ -168,6 +169,9 @@ def get_edgelist(self, download=False): download : Boolean (default=False) Automatically download the dataset from the 'url' location within the YAML file. + + reader : 'cudf' or 'pandas' (default='cudf') + The library used to read a CSV and return an edgelist DataFrame. """ if self._edgelist is None: full_path = self.get_path() @@ -180,14 +184,29 @@ def get_edgelist(self, download=False): " exist. 
Try setting download=True" " to download the datafile" ) + header = None if isinstance(self.metadata["header"], int): header = self.metadata["header"] - self._edgelist = cudf.read_csv( - full_path, + + if reader == "cudf": + self.__reader = cudf.read_csv + elif reader == "pandas": + self.__reader = pd.read_csv + else: + raise ValueError( + "reader must be a module with a read_csv function compatible with \ + cudf.read_csv" + ) + + self._edgelist = self.__reader( + filepath_or_buffer=full_path, delimiter=self.metadata["delim"], names=self.metadata["col_names"], - dtype=self.metadata["col_types"], + dtype={ + self.metadata["col_names"][i]: self.metadata["col_types"][i] + for i in range(len(self.metadata["col_types"])) + }, header=header, ) @@ -219,6 +238,10 @@ def get_graph( dataset -if present- will be applied to the Graph. If the dataset does not contain weights, the Graph returned will be unweighted regardless of ignore_weights. + + store_transposed: Boolean (default=False) + If True, stores the transpose of the adjacency matrix. Required + for certain algorithms, such as pagerank. 
""" if self._edgelist is None: self.get_edgelist(download) @@ -237,20 +260,19 @@ def get_graph( "(or subclass) type or instance, got: " f"{type(create_using)}" ) - if len(self.metadata["col_names"]) > 2 and not (ignore_weights): G.from_cudf_edgelist( self._edgelist, - source="src", - destination="dst", - edge_attr="wgt", + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + edge_attr=self.metadata["col_names"][2], store_transposed=store_transposed, ) else: G.from_cudf_edgelist( self._edgelist, - source="src", - destination="dst", + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], store_transposed=store_transposed, ) return G @@ -331,7 +353,7 @@ def download_all(force=False): def set_download_dir(path): """ - Set the download location fors datasets + Set the download location for datasets Parameters ---------- @@ -339,10 +361,10 @@ def set_download_dir(path): Location used to store datafiles """ if path is None: - default_download_dir.clear() + default_download_dir.reset() else: default_download_dir.path = path def get_download_dir(): - return default_download_dir.path.absolute() + return default_download_dir.path diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml new file mode 100644 index 00000000000..d5c4cf195bd --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml @@ -0,0 +1,22 @@ +name: cit-Patents +file_type: .csv +description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations. +author: NBER +refs: + J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time Densification Laws, Shrinking Diameters and Possible Explanations. + ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 16518948 +number_of_nodes: 3774768 +url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml new file mode 100644 index 00000000000..fe0e42a4b86 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml @@ -0,0 +1,21 @@ +name: europe_osm +file_type: .csv +description: A graph of OpenStreetMap data for Europe. +author: M. Kobitzsch / Geofabrik GmbH +refs: + Rossi, Ryan. Ahmed, Nesreen. The Network Data Repository with Interactive Graph Analytics and Visualization. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 54054660 +number_of_nodes: 50912018 +url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml new file mode 100644 index 00000000000..2f09cf7679b --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml @@ -0,0 +1,26 @@ +name: hollywood +file_type: .csv +description: + A graph of movie actors where vertices are actors, and two actors are + joined by an edge whenever they appeared in a movie together. +author: Laboratory for Web Algorithmics (LAW) +refs: + The WebGraph Framework I Compression Techniques, Paolo Boldi + and Sebastiano Vigna, Proc. of the Thirteenth International + World Wide Web Conference (WWW 2004), 2004, Manhattan, USA, + pp. 595--601, ACM Press. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 57515616 +number_of_nodes: 1139905 +url: https://data.rapids.ai/cugraph/datasets/hollywood.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml new file mode 100644 index 00000000000..fafc68acb9b --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml @@ -0,0 +1,22 @@ +name: soc-LiveJournal1 +file_type: .csv +description: A graph of the LiveJournal social network. +author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan +refs: + L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in + Large Social Networks Membership, Growth, and Evolution. KDD, 2006. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 68993773 +number_of_nodes: 4847571 +url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml new file mode 100644 index 00000000000..df5df5735af --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml @@ -0,0 +1,22 @@ +name: soc-twitter-2010 +file_type: .csv +description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. +author: H. Kwak, C. Lee, H. Park, S. Moon +refs: + J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl. + Conf. on Web Search and Data Mining (WSDM '11), 2011. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: false +number_of_edges: 530051354 +number_of_nodes: 21297772 +url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/experimental/datasets/__init__.py b/python/cugraph/cugraph/experimental/datasets/__init__.py deleted file mode 100644 index 18220243df1..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/__init__.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -from cugraph.experimental.datasets.dataset import ( - Dataset, - load_all, - set_download_dir, - get_download_dir, - default_download_dir, -) -from cugraph.experimental.datasets import metadata -from pathlib import Path - -from cugraph.utilities.api_tools import promoted_experimental_warning_wrapper - - -Dataset = promoted_experimental_warning_wrapper(Dataset) -load_all = promoted_experimental_warning_wrapper(load_all) -set_download_dir = promoted_experimental_warning_wrapper(set_download_dir) -get_download_dir = promoted_experimental_warning_wrapper(get_download_dir) - -meta_path = Path(__file__).parent / "metadata" - - -# individual dataset objects -karate = Dataset(meta_path / "karate.yaml") -karate_data = Dataset(meta_path / "karate_data.yaml") -karate_undirected = Dataset(meta_path / "karate_undirected.yaml") -karate_asymmetric = Dataset(meta_path / "karate_asymmetric.yaml") -karate_disjoint = Dataset(meta_path / "karate-disjoint.yaml") -dolphins = Dataset(meta_path / "dolphins.yaml") -polbooks = Dataset(meta_path / "polbooks.yaml") -netscience = Dataset(meta_path / "netscience.yaml") -cyber = Dataset(meta_path / "cyber.yaml") -small_line = Dataset(meta_path / "small_line.yaml") -small_tree = Dataset(meta_path / "small_tree.yaml") -toy_graph = Dataset(meta_path / "toy_graph.yaml") -toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml") -email_Eu_core = Dataset(meta_path / "email-Eu-core.yaml") -ktruss_polbooks = Dataset(meta_path / "ktruss_polbooks.yaml") - - -# batches of datasets -DATASETS_UNDIRECTED = [karate, dolphins] - -DATASETS_UNDIRECTED_WEIGHTS = [netscience] - -DATASETS_UNRENUMBERED = [karate_disjoint] - -DATASETS = [dolphins, netscience, karate_disjoint] - -DATASETS_SMALL = [karate, dolphins, polbooks] - -STRONGDATASETS = [dolphins, netscience, email_Eu_core] - -DATASETS_KTRUSS = [(polbooks, ktruss_polbooks)] - -MEDIUM_DATASETS = [polbooks] - -SMALL_DATASETS = [karate, dolphins, netscience] - -RLY_SMALL_DATASETS = [small_line, 
small_tree] - -ALL_DATASETS = [karate, dolphins, netscience, polbooks, small_line, small_tree] - -ALL_DATASETS_WGT = [karate, dolphins, netscience, polbooks, small_line, small_tree] - -TEST_GROUP = [dolphins, netscience] diff --git a/python/cugraph/cugraph/experimental/datasets/dataset.py b/python/cugraph/cugraph/experimental/datasets/dataset.py deleted file mode 100644 index 6b395d50fef..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/dataset.py +++ /dev/null @@ -1,312 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cudf -import yaml -import os -from pathlib import Path -from cugraph.structure.graph_classes import Graph - - -class DefaultDownloadDir: - """ - Maintains the path to the download directory used by Dataset instances. - Instances of this class are typically shared by several Dataset instances - in order to allow for the download directory to be defined and updated by - a single object. - """ - - def __init__(self): - self._path = Path( - os.environ.get("RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets") - ) - - @property - def path(self): - """ - If `path` is not set, set it to the environment variable - RAPIDS_DATASET_ROOT_DIR. If the variable is not set, default to the - user's home directory. 
- """ - if self._path is None: - self._path = Path( - os.environ.get( - "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets" - ) - ) - return self._path - - @path.setter - def path(self, new): - self._path = Path(new) - - def clear(self): - self._path = None - - -default_download_dir = DefaultDownloadDir() - - -class Dataset: - """ - A Dataset Object, used to easily import edgelist data and cuGraph.Graph - instances. - - Parameters - ---------- - meta_data_file_name : yaml file - The metadata file for the specific graph dataset, which includes - information on the name, type, url link, data loading format, graph - properties - """ - - def __init__( - self, - metadata_yaml_file=None, - csv_file=None, - csv_header=None, - csv_delim=" ", - csv_col_names=None, - csv_col_dtypes=None, - ): - self._metadata_file = None - self._dl_path = default_download_dir - self._edgelist = None - self._path = None - - if metadata_yaml_file is not None and csv_file is not None: - raise ValueError("cannot specify both metadata_yaml_file and csv_file") - - elif metadata_yaml_file is not None: - with open(metadata_yaml_file, "r") as file: - self.metadata = yaml.safe_load(file) - self._metadata_file = Path(metadata_yaml_file) - - elif csv_file is not None: - if csv_col_names is None or csv_col_dtypes is None: - raise ValueError( - "csv_col_names and csv_col_dtypes must both be " - "not None when csv_file is specified." - ) - self._path = Path(csv_file) - if self._path.exists() is False: - raise FileNotFoundError(csv_file) - self.metadata = { - "name": self._path.with_suffix("").name, - "file_type": ".csv", - "url": None, - "header": csv_header, - "delim": csv_delim, - "col_names": csv_col_names, - "col_types": csv_col_dtypes, - } - - else: - raise ValueError("must specify either metadata_yaml_file or csv_file") - - def __str__(self): - """ - Use the basename of the meta_data_file the instance was constructed with, - without any extension, as the string repr. 
- """ - # The metadata file is likely to have a more descriptive file name, so - # use that one first if present. - # FIXME: this may need to provide a more unique or descriptive string repr - if self._metadata_file is not None: - return self._metadata_file.with_suffix("").name - else: - return self.get_path().with_suffix("").name - - def __download_csv(self, url): - """ - Downloads the .csv file from url to the current download path - (self._dl_path), updates self._path with the full path to the - downloaded file, and returns the latest value of self._path. - """ - self._dl_path.path.mkdir(parents=True, exist_ok=True) - - filename = self.metadata["name"] + self.metadata["file_type"] - if self._dl_path.path.is_dir(): - df = cudf.read_csv(url) - self._path = self._dl_path.path / filename - df.to_csv(self._path, index=False) - - else: - raise RuntimeError( - f"The directory {self._dl_path.path.absolute()}" "does not exist" - ) - return self._path - - def unload(self): - - """ - Remove all saved internal objects, forcing them to be re-created when - accessed. - - NOTE: This will cause calls to get_*() to re-read the dataset file from - disk. The caller should ensure the file on disk has not moved/been - deleted/changed. - """ - self._edgelist = None - - def get_edgelist(self, fetch=False): - """ - Return an Edgelist - - Parameters - ---------- - fetch : Boolean (default=False) - Automatically fetch for the dataset from the 'url' location within - the YAML file. - """ - if self._edgelist is None: - full_path = self.get_path() - if not full_path.is_file(): - if fetch: - full_path = self.__download_csv(self.metadata["url"]) - else: - raise RuntimeError( - f"The datafile {full_path} does not" - " exist. 
Try get_edgelist(fetch=True)" - " to download the datafile" - ) - header = None - if isinstance(self.metadata["header"], int): - header = self.metadata["header"] - self._edgelist = cudf.read_csv( - full_path, - delimiter=self.metadata["delim"], - names=self.metadata["col_names"], - dtype=self.metadata["col_types"], - header=header, - ) - - return self._edgelist - - def get_graph( - self, - fetch=False, - create_using=Graph, - ignore_weights=False, - store_transposed=False, - ): - """ - Return a Graph object. - - Parameters - ---------- - fetch : Boolean (default=False) - Downloads the dataset from the web. - - create_using: cugraph.Graph (instance or class), optional - (default=Graph) - Specify the type of Graph to create. Can pass in an instance to - create a Graph instance with specified 'directed' attribute. - - ignore_weights : Boolean (default=False) - Ignores weights in the dataset if True, resulting in an - unweighted Graph. If False (the default), weights from the - dataset -if present- will be applied to the Graph. If the - dataset does not contain weights, the Graph returned will - be unweighted regardless of ignore_weights. 
- """ - if self._edgelist is None: - self.get_edgelist(fetch) - - if create_using is None: - G = Graph() - elif isinstance(create_using, Graph): - # what about BFS if trnaposed is True - attrs = {"directed": create_using.is_directed()} - G = type(create_using)(**attrs) - elif type(create_using) is type: - G = create_using() - else: - raise TypeError( - "create_using must be a cugraph.Graph " - "(or subclass) type or instance, got: " - f"{type(create_using)}" - ) - - if len(self.metadata["col_names"]) > 2 and not (ignore_weights): - G.from_cudf_edgelist( - self._edgelist, - source="src", - destination="dst", - edge_attr="wgt", - store_transposed=store_transposed, - ) - else: - G.from_cudf_edgelist( - self._edgelist, - source="src", - destination="dst", - store_transposed=store_transposed, - ) - return G - - def get_path(self): - """ - Returns the location of the stored dataset file - """ - if self._path is None: - self._path = self._dl_path.path / ( - self.metadata["name"] + self.metadata["file_type"] - ) - - return self._path.absolute() - - -def load_all(force=False): - """ - Looks in `metadata` directory and fetches all datafiles from the the URLs - provided in each YAML file. - - Parameters - force : Boolean (default=False) - Overwrite any existing copies of datafiles. 
- """ - default_download_dir.path.mkdir(parents=True, exist_ok=True) - - meta_path = Path(__file__).parent.absolute() / "metadata" - for file in meta_path.iterdir(): - meta = None - if file.suffix == ".yaml": - with open(meta_path / file, "r") as metafile: - meta = yaml.safe_load(metafile) - - if "url" in meta: - filename = meta["name"] + meta["file_type"] - save_to = default_download_dir.path / filename - if not save_to.is_file() or force: - df = cudf.read_csv(meta["url"]) - df.to_csv(save_to, index=False) - - -def set_download_dir(path): - """ - Set the download directory for fetching datasets - - Parameters - ---------- - path : String - Location used to store datafiles - """ - if path is None: - default_download_dir.clear() - else: - default_download_dir.path = path - - -def get_download_dir(): - return default_download_dir.path.absolute() diff --git a/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml b/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml deleted file mode 100644 index 69a79db9cd9..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml +++ /dev/null @@ -1,5 +0,0 @@ ---- -fetch: "False" -force: "False" -# path where datasets will be downloaded to and stored -download_dir: "datasets" diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py b/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py deleted file mode 100644 index 081b2ae8260..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml deleted file mode 100644 index 93ab5345442..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: cyber -file_type: .csv -author: N/A -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/cyber.csv -refs: N/A -col_names: - - idx - - srcip - - dstip -col_types: - - int32 - - str - - str -delim: "," -header: 0 -has_loop: true -is_directed: true -is_multigraph: false -is_symmetric: false -number_of_edges: 2546575 -number_of_nodes: 706529 -number_of_lines: 2546576 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml deleted file mode 100644 index e4951375321..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: dolphins -file_type: .csv -author: D. Lusseau -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/dolphins.csv -refs: - D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and S. M. Dawson, - The bottlenose dolphin community of Doubtful Sound features a large proportion of - long-lasting associations, Behavioral Ecology and Sociobiology 54, 396-405 (2003). 
-col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -delim: " " -header: None -has_loop: false -is_directed: true -is_multigraph: false -is_symmetric: false -number_of_edges: 318 -number_of_nodes: 62 -number_of_lines: 318 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml deleted file mode 100644 index 97d0dc82ee3..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: email-Eu-core -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/email-Eu-core.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 25571 -number_of_nodes: 1005 -number_of_lines: 25571 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml deleted file mode 100644 index 0c0eaf78b63..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: karate-disjoint -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-disjoint.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: True -is_multigraph: false -is_symmetric: true -number_of_edges: 312 -number_of_nodes: 68 -number_of_lines: 312 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml deleted file mode 100644 index 273381ed368..00000000000 --- 
a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: karate -file_type: .csv -author: Zachary W. -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate.csv -refs: - W. W. Zachary, An information flow model for conflict and fission in small groups, - Journal of Anthropological Research 33, 452-473 (1977). -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: true -is_directed: true -is_multigraph: false -is_symmetric: true -number_of_edges: 156 -number_of_nodes: 34 -number_of_lines: 156 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml deleted file mode 100644 index 3616b8fb3a5..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: karate-asymmetric -file_type: .csv -author: Zachary W. -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-asymmetric.csv -delim: " " -header: None -refs: - W. W. Zachary, An information flow model for conflict and fission in small groups, - Journal of Anthropological Research 33, 452-473 (1977). -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: true -is_directed: false -is_multigraph: false -is_symmetric: false -number_of_edges: 78 -number_of_nodes: 34 -number_of_lines: 78 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml deleted file mode 100644 index 9a8b27f21ae..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: karate-data -file_type: .csv -author: Zachary W. 
-url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-data.csv -refs: - W. W. Zachary, An information flow model for conflict and fission in small groups, - Journal of Anthropological Research 33, 452-473 (1977). -delim: "\t" -header: None -col_names: - - src - - dst -col_types: - - int32 - - int32 -has_loop: true -is_directed: true -is_multigraph: false -is_symmetric: true -number_of_edges: 156 -number_of_nodes: 34 -number_of_lines: 156 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml deleted file mode 100644 index 1b45f86caee..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: karate_undirected -file_type: .csv -author: Zachary W. -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate_undirected.csv -refs: - W. W. Zachary, An information flow model for conflict and fission in small groups, - Journal of Anthropological Research 33, 452-473 (1977). 
-delim: "\t" -header: None -col_names: - - src - - dst -col_types: - - int32 - - int32 -has_loop: true -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 78 -number_of_nodes: 34 -number_of_lines: 78 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml deleted file mode 100644 index 1ef29b3917e..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: ktruss_polbooks -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/ref/ktruss/polbooks.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: true -is_multigraph: false -is_symmetric: false -number_of_edges: 233 -number_of_nodes: 58 -number_of_lines: 233 - diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml deleted file mode 100644 index 2dca702df3d..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: netscience -file_type: .csv -author: Newman, Mark EJ -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/netscience.csv -refs: Finding community structure in networks using the eigenvectors of matrices. 
-delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: true -is_multigraph: false -is_symmetric: true -number_of_edges: 2742 -number_of_nodes: 1461 -number_of_lines: 5484 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml deleted file mode 100644 index 5816e5672fd..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: polbooks -file_type: .csv -author: V. Krebs -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/polbooks.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -is_directed: true -has_loop: null -is_multigraph: null -is_symmetric: true -number_of_edges: 882 -number_of_nodes: 105 -number_of_lines: 882 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml deleted file mode 100644 index 5b724ac99fd..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: small_line -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/small_line.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 9 -number_of_nodes: 10 -number_of_lines: 8 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml deleted file mode 100644 index 8eeac346d2a..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml +++ /dev/null @@ -1,22 
+0,0 @@ -name: small_tree -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/small_tree.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: true -is_multigraph: false -is_symmetric: true -number_of_edges: 11 -number_of_nodes: 9 -number_of_lines: 11 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml deleted file mode 100644 index 819aad06f6a..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: toy_graph -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/toy_graph.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 16 -number_of_nodes: 6 -number_of_lines: 16 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml deleted file mode 100644 index c6e86bdf334..00000000000 --- a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: toy_graph_undirected -file_type: .csv -author: null -url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/toy_graph_undirected.csv -refs: null -delim: " " -header: None -col_names: - - src - - dst - - wgt -col_types: - - int32 - - int32 - - float32 -has_loop: false -is_directed: false -is_multigraph: false -is_symmetric: true -number_of_edges: 8 -number_of_nodes: 6 -number_of_lines: 8 diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py index 
f5f0bcb06eb..2b4a4fd3ebf 100644 --- a/python/cugraph/cugraph/testing/__init__.py +++ b/python/cugraph/cugraph/testing/__init__.py @@ -19,7 +19,7 @@ Resultset, load_resultset, get_resultset, - results_dir_path, + default_resultset_download_dir, ) from cugraph.datasets import ( cyber, @@ -34,6 +34,11 @@ email_Eu_core, toy_graph, toy_graph_undirected, + soc_livejournal, + cit_patents, + europe_osm, + hollywood, + # twitter, ) # @@ -66,3 +71,4 @@ toy_graph_undirected, ] DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint] +BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood] diff --git a/python/cugraph/cugraph/testing/generate_resultsets.py b/python/cugraph/cugraph/testing/generate_resultsets.py index 9724aca32dc..2ae0f52d88b 100644 --- a/python/cugraph/cugraph/testing/generate_resultsets.py +++ b/python/cugraph/cugraph/testing/generate_resultsets.py @@ -20,8 +20,14 @@ import cudf import cugraph from cugraph.datasets import dolphins, netscience, karate_disjoint, karate -from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path +# from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path +from cugraph.testing import ( + utils, + Resultset, + SMALL_DATASETS, + default_resultset_download_dir, +) _resultsets = {} @@ -224,6 +230,7 @@ def add_resultset(result_data_dictionary, **kwargs): ] ) # Generating ALL results files + results_dir_path = default_resultset_download_dir.path if not results_dir_path.exists(): results_dir_path.mkdir(parents=True, exist_ok=True) diff --git a/python/cugraph/cugraph/testing/resultset.py b/python/cugraph/cugraph/testing/resultset.py index 490e3a7c4ff..9570d7f3e04 100644 --- a/python/cugraph/cugraph/testing/resultset.py +++ b/python/cugraph/cugraph/testing/resultset.py @@ -16,10 +16,12 @@ import urllib.request import cudf -from cugraph.testing import utils +from cugraph.datasets.dataset import ( + DefaultDownloadDir, + default_download_dir, +) - -results_dir_path = 
utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets" +# results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets" class Resultset: @@ -48,6 +50,42 @@ def get_cudf_dataframe(self): _resultsets = {} +def get_resultset(resultset_name, **kwargs): + """ + Returns the golden results for a specific test. + + Parameters + ---------- + resultset_name : String + Name of the test's module (currently just 'traversal' is supported) + + kwargs : + All distinct test details regarding the choice of algorithm, dataset, + and graph + """ + arg_dict = dict(kwargs) + arg_dict["resultset_name"] = resultset_name + # Example: + # {'a': 1, 'z': 9, 'c': 5, 'b': 2} becomes 'a-1-b-2-c-5-z-9' + resultset_key = "-".join( + [ + str(val) + for arg_dict_pair in sorted(arg_dict.items()) + for val in arg_dict_pair + ] + ) + uuid = _resultsets.get(resultset_key) + if uuid is None: + raise KeyError(f"results for {arg_dict} not found") + + results_dir_path = default_resultset_download_dir.path + results_filename = results_dir_path / (uuid + ".csv") + return cudf.read_csv(results_filename) + + +default_resultset_download_dir = DefaultDownloadDir(subdir="tests/resultsets") + + def load_resultset(resultset_name, resultset_download_url): """ Read a mapping file (.csv) in the _results_dir and save the @@ -56,17 +94,21 @@ def load_resultset(resultset_name, resultset_download_url): _results_dir, use resultset_download_url to download a file to install/unpack/etc. to _results_dir first. 
""" - mapping_file_path = results_dir_path / (resultset_name + "_mappings.csv") + # curr_resultset_download_dir = get_resultset_download_dir() + curr_resultset_download_dir = default_resultset_download_dir.path + # curr_download_dir = path + curr_download_dir = default_download_dir.path + mapping_file_path = curr_resultset_download_dir / (resultset_name + "_mappings.csv") if not mapping_file_path.exists(): # Downloads a tar gz from s3 bucket, then unpacks the results files - compressed_file_dir = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" + compressed_file_dir = curr_download_dir / "tests" compressed_file_path = compressed_file_dir / "resultsets.tar.gz" - if not results_dir_path.exists(): - results_dir_path.mkdir(parents=True, exist_ok=True) + if not curr_resultset_download_dir.exists(): + curr_resultset_download_dir.mkdir(parents=True, exist_ok=True) if not compressed_file_path.exists(): urllib.request.urlretrieve(resultset_download_url, compressed_file_path) tar = tarfile.open(str(compressed_file_path), "r:gz") - tar.extractall(str(results_dir_path)) + tar.extractall(str(curr_resultset_download_dir)) tar.close() # FIXME: This assumes separator is " ", but should this be configurable? @@ -102,35 +144,3 @@ def load_resultset(resultset_name, resultset_download_url): ) _resultsets[resultset_key] = uuid - - -def get_resultset(resultset_name, **kwargs): - """ - Returns the golden results for a specific test. 
- - Parameters - ---------- - resultset_name : String - Name of the test's module (currently just 'traversal' is supported) - - kwargs : - All distinct test details regarding the choice of algorithm, dataset, - and graph - """ - arg_dict = dict(kwargs) - arg_dict["resultset_name"] = resultset_name - # Example: - # {'a': 1, 'z': 9, 'c': 5, 'b': 2} becomes 'a-1-b-2-c-5-z-9' - resultset_key = "-".join( - [ - str(val) - for arg_dict_pair in sorted(arg_dict.items()) - for val in arg_dict_pair - ] - ) - uuid = _resultsets.get(resultset_key) - if uuid is None: - raise KeyError(f"results for {arg_dict} not found") - - results_filename = results_dir_path / (uuid + ".csv") - return cudf.read_csv(results_filename) diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py index 4277f94a396..478b7e655d5 100644 --- a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py +++ b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py @@ -16,7 +16,7 @@ import dask_cudf from pylibcugraph.testing.utils import gen_fixture_params_product -from cugraph.experimental.datasets import DATASETS_UNDIRECTED +from cugraph.datasets import karate, dolphins import cugraph import cugraph.dask as dcg @@ -41,7 +41,7 @@ def setup_function(): # email_Eu_core is too expensive to test -datasets = DATASETS_UNDIRECTED +datasets = [karate, dolphins] # ============================================================================= diff --git a/python/cugraph/cugraph/tests/nx/test_compat_pr.py b/python/cugraph/cugraph/tests/nx/test_compat_pr.py index 9be3912a33f..45cab7a5674 100644 --- a/python/cugraph/cugraph/tests/nx/test_compat_pr.py +++ b/python/cugraph/cugraph/tests/nx/test_compat_pr.py @@ -24,7 +24,7 @@ import numpy as np from cugraph.testing import utils -from cugraph.experimental.datasets import karate +from cugraph.datasets import karate from 
pylibcugraph.testing.utils import gen_fixture_params_product diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index c2a4f7c6072..60bc6dbb45a 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -13,11 +13,10 @@ import os import gc -import sys -import warnings from pathlib import Path from tempfile import TemporaryDirectory +import pandas import pytest import cudf @@ -27,6 +26,7 @@ ALL_DATASETS, WEIGHTED_DATASETS, SMALL_DATASETS, + BENCHMARKING_DATASETS, ) from cugraph import datasets @@ -74,27 +74,14 @@ def setup(tmpdir): gc.collect() -@pytest.fixture() -def setup_deprecation_warning_tests(): - """ - Fixture used to set warning filters to 'default' and reload - experimental.datasets module if it has been previously - imported. Tests that import this fixture are expected to - import cugraph.experimental.datasets - """ - warnings.filterwarnings("default") - - if "cugraph.experimental.datasets" in sys.modules: - del sys.modules["cugraph.experimental.datasets"] - - yield - - ############################################################################### # Helpers # check if there is a row where src == dst -def has_loop(df): +def has_selfloop(dataset): + if not dataset.metadata["is_directed"]: + return False + df = dataset.get_edgelist(download=True) df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True) res = df.where(df["src"] == df["dst"]) @@ -109,7 +96,13 @@ def is_symmetric(dataset): else: df = dataset.get_edgelist(download=True) df_a = df.sort_values("src") - df_b = df_a[["dst", "src", "wgt"]] + + # create df with swapped src/dst columns + df_b = None + if "wgt" in df_a.columns: + df_b = df_a[["dst", "src", "wgt"]] + else: + df_b = df_a[["dst", "src"]] df_b.rename(columns={"dst": "src", "src": "dst"}, inplace=True) # created a df by appending the two res = cudf.concat([df_a, df_b]) @@ -157,6 +150,27 @@ 
def test_download(dataset): assert dataset.get_path().is_file() +@pytest.mark.parametrize("dataset", SMALL_DATASETS) +def test_reader(dataset): + # defaults to using cudf.read_csv + E = dataset.get_edgelist(download=True) + + assert E is not None + assert isinstance(E, cudf.core.dataframe.DataFrame) + dataset.unload() + + # using pandas + E_pd = dataset.get_edgelist(download=True, reader="pandas") + + assert E_pd is not None + assert isinstance(E_pd, pandas.core.frame.DataFrame) + dataset.unload() + + with pytest.raises(ValueError): + dataset.get_edgelist(reader="fail") + dataset.get_edgelist(reader=None) + + @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_get_edgelist(dataset): E = dataset.get_edgelist(download=True) @@ -172,7 +186,6 @@ def test_get_graph(dataset): @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_metadata(dataset): M = dataset.metadata - assert M is not None @@ -310,10 +323,8 @@ def test_is_directed(dataset): @pytest.mark.parametrize("dataset", ALL_DATASETS) -def test_has_loop(dataset): - df = dataset.get_edgelist(download=True) - - assert has_loop(df) == dataset.metadata["has_loop"] +def test_has_selfloop(dataset): + assert has_selfloop(dataset) == dataset.metadata["has_loop"] @pytest.mark.parametrize("dataset", ALL_DATASETS) @@ -328,6 +339,25 @@ def test_is_multigraph(dataset): assert G.is_multigraph() == dataset.metadata["is_multigraph"] +# The datasets used for benchmarks are in their own test, since downloading them +# repeatedly would increase testing overhead significantly +@pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS) +def test_benchmarking_datasets(dataset): + dataset_is_directed = dataset.metadata["is_directed"] + G = dataset.get_graph( + download=True, create_using=Graph(directed=dataset_is_directed) + ) + + assert G.is_directed() == dataset.metadata["is_directed"] + assert G.number_of_nodes() == dataset.metadata["number_of_nodes"] + assert G.number_of_edges() == dataset.metadata["number_of_edges"] + 
assert has_selfloop(dataset) == dataset.metadata["has_loop"] + assert is_symmetric(dataset) == dataset.metadata["is_symmetric"] + assert G.is_multigraph() == dataset.metadata["is_multigraph"] + + dataset.unload() + + @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_object_getters(dataset): assert dataset.is_directed() == dataset.metadata["is_directed"] @@ -336,32 +366,3 @@ def test_object_getters(dataset): assert dataset.number_of_nodes() == dataset.metadata["number_of_nodes"] assert dataset.number_of_vertices() == dataset.metadata["number_of_nodes"] assert dataset.number_of_edges() == dataset.metadata["number_of_edges"] - - -# -# Test experimental for DeprecationWarnings -# -def test_experimental_dataset_import(setup_deprecation_warning_tests): - with pytest.deprecated_call(): - from cugraph.experimental.datasets import karate - - # unload() is called to pass flake8 - karate.unload() - - -def test_experimental_method_warnings(setup_deprecation_warning_tests): - from cugraph.experimental.datasets import ( - load_all, - set_download_dir, - get_download_dir, - ) - - warnings.filterwarnings("default") - tmpd = TemporaryDirectory() - - with pytest.deprecated_call(): - set_download_dir(tmpd.name) - get_download_dir() - load_all() - - tmpd.cleanup() diff --git a/python/cugraph/cugraph/tests/utils/test_resultset.py b/python/cugraph/cugraph/tests/utils/test_resultset.py new file mode 100644 index 00000000000..5c2298bedb7 --- /dev/null +++ b/python/cugraph/cugraph/tests/utils/test_resultset.py @@ -0,0 +1,71 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from pathlib import Path +from tempfile import TemporaryDirectory + +import cudf +from cugraph.datasets.dataset import ( + set_download_dir, + get_download_dir, +) +from cugraph.testing.resultset import load_resultset, default_resultset_download_dir + +############################################################################### + + +def test_load_resultset(): + with TemporaryDirectory() as tmpd: + + set_download_dir(Path(tmpd)) + default_resultset_download_dir.path = Path(tmpd) / "tests" / "resultsets" + default_resultset_download_dir.path.mkdir(parents=True, exist_ok=True) + + datasets_download_dir = get_download_dir() + resultsets_download_dir = default_resultset_download_dir.path + assert "tests" in os.listdir(datasets_download_dir) + assert "resultsets.tar.gz" not in os.listdir(datasets_download_dir / "tests") + assert "traversal_mappings.csv" not in os.listdir(resultsets_download_dir) + + load_resultset( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + assert "resultsets.tar.gz" in os.listdir(datasets_download_dir / "tests") + assert "traversal_mappings.csv" in os.listdir(resultsets_download_dir) + + +def test_verify_resultset_load(): + # This test is more detailed than test_load_resultset, where for each module, + # we check that every single resultset file is included along with the + # corresponding mapping file. 
+ with TemporaryDirectory() as tmpd: + set_download_dir(Path(tmpd)) + default_resultset_download_dir.path = Path(tmpd) / "tests" / "resultsets" + default_resultset_download_dir.path.mkdir(parents=True, exist_ok=True) + + resultsets_download_dir = default_resultset_download_dir.path + + load_resultset( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + resultsets = os.listdir(resultsets_download_dir) + downloaded_results = cudf.read_csv( + resultsets_download_dir / "traversal_mappings.csv", sep=" " + ) + downloaded_uuids = downloaded_results["#UUID"].values + for resultset_uuid in downloaded_uuids: + assert str(resultset_uuid) + ".csv" in resultsets