From db990945af571f9f9d2d22ab831fc0fa9e163eba Mon Sep 17 00:00:00 2001
From: root
Date: Wed, 30 Aug 2023 21:58:45 +0000
Subject: [PATCH 01/14] Added tests to verify load_resultsets by refactoring

---
 python/cugraph/cugraph/testing/resultset.py   | 77 ++++++++++++++++++-
 .../cugraph/tests/utils/test_resultset.py     | 72 +++++++++++++++++
 2 files changed, 148 insertions(+), 1 deletion(-)
 create mode 100644 python/cugraph/cugraph/tests/utils/test_resultset.py

diff --git a/python/cugraph/cugraph/testing/resultset.py b/python/cugraph/cugraph/testing/resultset.py
index 490e3a7c4ff..c6dbcbf7aab 100644
--- a/python/cugraph/cugraph/testing/resultset.py
+++ b/python/cugraph/cugraph/testing/resultset.py
@@ -17,7 +17,10 @@
 import cudf
 from cugraph.testing import utils
-
+from cugraph.datasets.dataset import (
+    DefaultDownloadDir,
+    get_download_dir,
+)
 
 results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets"
 
@@ -134,3 +137,75 @@ def get_resultset(resultset_name, **kwargs):
     results_filename = results_dir_path / (uuid + ".csv")
 
     return cudf.read_csv(results_filename)
+
+
+default_resultset_download_dir = DefaultDownloadDir()
+
+
+def set_resultset_download_dir(path):
+    if path is None:
+        default_resultset_download_dir.clear()
+    else:
+        default_resultset_download_dir.path = path
+
+
+def get_resultset_download_dir():
+    return default_resultset_download_dir.path.absolute()
+
+
+def load_resultset2(resultset_name, resultset_download_url):
+    """
+    Read a mapping file (.csv) in the _results_dir and save the
+    mappings between each unique set of args/identifiers to UUIDs to the
+    _resultsets dictionary. If <resultset_name>_mappings.csv does not exist in
+    _results_dir, use resultset_download_url to download a file to
+    install/unpack/etc. to _results_dir first.
+    """
+    curr_resultset_download_dir = get_resultset_download_dir()
+    curr_download_dir = get_download_dir()
+    mapping_file_path = curr_resultset_download_dir / (resultset_name + "_mappings.csv")
+    if not mapping_file_path.exists():
+        # Downloads a tar gz from s3 bucket, then unpacks the results files
+        compressed_file_dir = curr_download_dir / "tests"
+        compressed_file_path = compressed_file_dir / "resultsets.tar.gz"
+        if not curr_resultset_download_dir.exists():
+            curr_resultset_download_dir.mkdir(parents=True, exist_ok=True)
+        if not compressed_file_path.exists():
+            urllib.request.urlretrieve(resultset_download_url, compressed_file_path)
+        tar = tarfile.open(str(compressed_file_path), "r:gz")
+        tar.extractall(str(curr_resultset_download_dir))
+        tar.close()
+
+    # FIXME: This assumes separator is " ", but should this be configurable?
+    sep = " "
+    with open(mapping_file_path) as mapping_file:
+        for line in mapping_file.readlines():
+            if line.startswith("#"):
+                continue
+
+            (uuid, *row_args) = line.split(sep)
+            if (len(row_args) % 2) != 0:
+                raise ValueError(
+                    f'bad row in {mapping_file_path}: "{line}", must '
+                    "contain UUID followed by an even number of items"
+                )
+            row_keys = row_args[::2]
+            row_vals = row_args[1::2]
+            row_keys = " ".join(row_keys).split()
+            row_vals = " ".join(row_vals).split()
+            arg_dict = dict(zip(row_keys, row_vals))
+            arg_dict["resultset_name"] = resultset_name
+            # Create a unique string key for the _resultsets dict based on
+            # sorted row_keys. Looking up results based on args will also have
+            # to sort, but this will ensure results can be looked up without
+            # requiring that a specific order be maintained.
Example: + # {'a': 1, 'z': 9, 'c': 5, 'b': 2} becomes 'a-1-b-2-c-5-z-9' + resultset_key = "-".join( + [ + str(val) + for arg_dict_pair in sorted(arg_dict.items()) + for val in arg_dict_pair + ] + ) + + _resultsets[resultset_key] = uuid diff --git a/python/cugraph/cugraph/tests/utils/test_resultset.py b/python/cugraph/cugraph/tests/utils/test_resultset.py new file mode 100644 index 00000000000..eaaba796d2e --- /dev/null +++ b/python/cugraph/cugraph/tests/utils/test_resultset.py @@ -0,0 +1,72 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from pathlib import Path +from tempfile import TemporaryDirectory + +import cudf +from cugraph.datasets.dataset import ( + set_download_dir, + get_download_dir, +) +from cugraph.testing.resultset import ( + set_resultset_download_dir, + get_resultset_download_dir, + load_resultset2, +) + +############################################################################### + + +def test_load_resultset2(): + with TemporaryDirectory() as tmpd: + set_download_dir(Path(tmpd)) + set_resultset_download_dir(Path(tmpd) / "tests" / "resultsets") + get_resultset_download_dir().mkdir(parents=True, exist_ok=True) + + datasets_download_dir = get_download_dir() + resultsets_download_dir = get_resultset_download_dir() + assert "tests" in os.listdir(datasets_download_dir) + assert "resultsets.tar.gz" not in os.listdir(datasets_download_dir / "tests") + assert "traversal_mappings.csv" not in os.listdir(resultsets_download_dir) + + load_resultset2( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + assert "resultsets.tar.gz" in os.listdir(datasets_download_dir / "tests") + assert "traversal_mappings.csv" in os.listdir(resultsets_download_dir) + + +def test_verify_resultset_load(): + with TemporaryDirectory() as tmpd: + set_download_dir(Path(tmpd)) + set_resultset_download_dir(Path(tmpd) / "tests" / "resultsets") + get_resultset_download_dir().mkdir(parents=True, exist_ok=True) + + # datasets_download_dir = get_download_dir() + resultsets_download_dir = get_resultset_download_dir() + + load_resultset2( + "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" + ) + + resultsets = os.listdir(resultsets_download_dir) + downloaded_results = cudf.read_csv( + resultsets_download_dir / "traversal_mappings.csv", sep=" " + ) + downloaded_uuids = downloaded_results["#UUID"].values + for resultset_uuid in downloaded_uuids: + assert str(resultset_uuid) + ".csv" in resultsets From 5c3739676038706c373fd1600cc2df38e2e431ae Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez Date: Tue, 5 Sep 2023 21:42:11 +0000 Subject: [PATCH 02/14] Added benchmark datasets yaml files for use with Dataset --- python/cugraph/cugraph/datasets/__init__.py | 5 ++++ .../datasets/metadata/cit-patents.yaml | 22 ++++++++++++++++ .../cugraph/datasets/metadata/europe_osm.yaml | 21 ++++++++++++++++ .../cugraph/datasets/metadata/hollywood.yaml | 25 +++++++++++++++++++ .../datasets/metadata/soc-livejournal1.yaml | 22 
++++++++++++++++
 .../datasets/metadata/soc-twitter-2010.yaml   | 22 ++++++++++++++++
 .../cugraph/tests/utils/test_dataset.py       |  1 +
 7 files changed, 118 insertions(+)
 create mode 100644 python/cugraph/cugraph/datasets/metadata/cit-patents.yaml
 create mode 100644 python/cugraph/cugraph/datasets/metadata/europe_osm.yaml
 create mode 100644 python/cugraph/cugraph/datasets/metadata/hollywood.yaml
 create mode 100644 python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml
 create mode 100644 python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml

diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py
index 7ba274c5960..462b4d977a9 100644
--- a/python/cugraph/cugraph/datasets/__init__.py
+++ b/python/cugraph/cugraph/datasets/__init__.py
@@ -38,3 +38,8 @@
 small_tree = Dataset(meta_path / "small_tree.yaml")
 toy_graph = Dataset(meta_path / "toy_graph.yaml")
 toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml")
+# soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml") # 250MB
+# cit_patents = Dataset(meta_path / "cit-patents.yaml") # 965MB
+# europe_osm = Dataset(meta_path / "europe_osm.yaml") # 1.8GB
+# hollywood = Dataset(meta_path / "hollywood.yaml") # 1.5GB
+# twitter = Dataset(meta_path / "soc-twitter-2010.yaml") # 8.8GB
diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml
new file mode 100644
index 00000000000..d5c4cf195bd
--- /dev/null
+++ b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml
@@ -0,0 +1,22 @@
+name: cit-Patents
+file_type: .csv
+description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
+author: NBER
+refs:
+    J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time Densification Laws, Shrinking Diameters and Possible Explanations.
+    ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005.
+delim: " "
+header: None
+col_names:
+    - src
+    - dst
+col_types:
+    - int32
+    - int32
+has_loop: true
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 16518948
+number_of_nodes: 3774768
+url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv
\ No newline at end of file
diff --git a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml
new file mode 100644
index 00000000000..fe0e42a4b86
--- /dev/null
+++ b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml
@@ -0,0 +1,21 @@
+name: europe_osm
+file_type: .csv
+description: A graph of OpenStreetMap data for Europe.
+author: M. Kobitzsh / Geofabrik GmbH
+refs:
+    Rossi, Ryan. Ahmed, Nesreen. The Network Data Repository with Interactive Graph Analytics and Visualization.
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 54054660 +number_of_nodes: 50912018 +url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml new file mode 100644 index 00000000000..8a671c98269 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml @@ -0,0 +1,25 @@ +name: hollywood +file_type: .csv +description: + A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together. +author: Laboratory for Web Algorithmics (LAW) +refs: + "The WebGraph Framework I: Compression Techniques," Paolo Boldi + and Sebastiano Vigna, Proc. of the Thirteenth International + World Wide Web Conference (WWW 2004), 2004, Manhattan, USA, + pp. 595--601, ACM Press. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 113891327 +number_of_nodes: 1139905 +url: https://data.rapids.ai/cugraph/datasets/hollywood.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml new file mode 100644 index 00000000000..df11dd9a364 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml @@ -0,0 +1,22 @@ +name: soc-LiveJournal1 +file_type: .csv +description: A graph of the LiveJournal social network. +author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan +refs: + L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in + Large Social Networks Membership, Growth, and Evolution. KDD, 2006. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 68993773 +number_of_nodes: 4847571 +url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml new file mode 100644 index 00000000000..5ae2cf7deeb --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml @@ -0,0 +1,22 @@ +name: soc-twitter-2010 +file_type: .csv +description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. +author: H. Kwak, C. Lee, H. Park, S. Moon +refs: + J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl. + Conf. on Web Search and Data Mining (WSDM '11), 2011. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: false +is_multigraph: false +is_symmetric: false +number_of_edges: 530051354 +number_of_nodes: 21297772 +url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 643d0468d46..2b6c3820632 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -94,6 +94,7 @@ def setup_deprecation_warning_tests(): # Helpers # check if there is a row where src == dst +# Should this be renamed to 'has_self_loop'? def has_loop(df): df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True) res = df.where(df["src"] == df["dst"]) From 93fb3e7b0b9ca3163ff6607a7605ffe29ec75362 Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez Date: Thu, 7 Sep 2023 06:39:15 +0000 Subject: [PATCH 03/14] Introduce way to download larger datasets --- python/cugraph/cugraph/datasets/__init__.py | 11 ++- python/cugraph/cugraph/datasets/dataset.py | 98 ++++++++++++++----- .../datasets/metadata/cit-patents.yaml | 2 +- .../datasets/metadata/email_Eu_core.yaml | 2 +- .../cugraph/datasets/metadata/europe_osm.yaml | 2 +- .../datasets/metadata/soc-livejournal1.yaml | 2 +- .../datasets/metadata/soc-twitter-2010.yaml | 2 +- 7 files changed, 85 insertions(+), 34 deletions(-) diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index 462b4d977a9..6d2de00f3f8 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -38,8 +38,9 @@ small_tree = Dataset(meta_path / "small_tree.yaml") toy_graph = Dataset(meta_path / "toy_graph.yaml") toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml") -# soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml") # 250MB -# cit_patents = Dataset(meta_path / "cit-patents.yaml") # 965MB -# europe_osm = Dataset(meta_path / "europe_osm.yaml") # 1.8GB -# hollywood = Dataset(meta_path / "hollywood.yaml") # 1.5GB -# twitter = Dataset(meta_path / "soc-twitter-2010.yaml") # 8.8GB +# TODO: Upload these benchmark datasets to s3 bucket +soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml") # 250MB +cit_patents = Dataset(meta_path / "cit-patents.yaml") # 965MB +europe_osm = Dataset(meta_path / "europe_osm.yaml") # 1.8GB +hollywood = Dataset(meta_path / "hollywood.yaml") # 1.5GB +twitter = Dataset(meta_path / "soc-twitter-2010.yaml") # 8.8GB diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index b276a87b88e..fb0dfb39352 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd import cudf import yaml import os @@ -137,6 +138,12 @@ def __download_csv(self, url): filename = self.metadata["name"] + self.metadata["file_type"] if self._dl_path.path.is_dir(): + if "benchmark.tar.gz" in url: + # Benchmark dataset first requires uncompressing + raise RuntimeError( + "To download a dataset used for benchmarking, " + "use download_all instead." 
+ ) df = cudf.read_csv(url) self._path = self._dl_path.path / filename df.to_csv(self._path, index=False) @@ -159,7 +166,7 @@ def unload(self): """ self._edgelist = None - def get_edgelist(self, download=False): + def get_edgelist(self, download=False, cpu_only=False): """ Return an Edgelist @@ -168,6 +175,9 @@ def get_edgelist(self, download=False): download : Boolean (default=False) Automatically download the dataset from the 'url' location within the YAML file. + + cpu_only : Boolean (default=False) + Constrain the reading of the csv to the CPU using pandas instead of cuDF. """ if self._edgelist is None: full_path = self.get_path() @@ -183,13 +193,22 @@ def get_edgelist(self, download=False): header = None if isinstance(self.metadata["header"], int): header = self.metadata["header"] - self._edgelist = cudf.read_csv( - full_path, - delimiter=self.metadata["delim"], - names=self.metadata["col_names"], - dtype=self.metadata["col_types"], - header=header, - ) + if cpu_only: + self._edgelist = pd.read_csv( + full_path, + delimiter=self.metadata["delim"], + names=self.metadata["col_names"], + dtype=self.metadata["col_types"], + header=header, + ) + else: + self._edgelist = cudf.read_csv( + full_path, + delimiter=self.metadata["delim"], + names=self.metadata["col_names"], + dtype=self.metadata["col_types"], + header=header, + ) return self._edgelist.copy() @@ -199,6 +218,7 @@ def get_graph( create_using=Graph, ignore_weights=False, store_transposed=False, + cpu_only=False, ): """ Return a Graph object. @@ -219,6 +239,14 @@ def get_graph( dataset -if present- will be applied to the Graph. If the dataset does not contain weights, the Graph returned will be unweighted regardless of ignore_weights. + + store_transposed: Boolean (default=False) + If True, stores the transpose of the adjacency matrix. Required + for certain algorithms, such as pagerank. + + cpu_only: Boolean (default=False) + Constrain the reading of the edgelist to the CPU using pandas instead of + cuDF. 
""" if self._edgelist is None: self.get_edgelist(download) @@ -237,22 +265,38 @@ def get_graph( "(or subclass) type or instance, got: " f"{type(create_using)}" ) - - if len(self.metadata["col_names"]) > 2 and not (ignore_weights): - G.from_cudf_edgelist( - self._edgelist, - source="src", - destination="dst", - edge_attr="wgt", - store_transposed=store_transposed, - ) + if cpu_only: + if len(self.metadata["col_names"]) > 2 and not (ignore_weights): + G.from_pandas_edgelist( + self._edgelist, + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + edge_attr=self.metadata["col_names"][2], + store_transposed=store_transposed, + ) + else: + G.from_pandas_edgelist( + self._edgelist, + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + store_transposed=store_transposed, + ) else: - G.from_cudf_edgelist( - self._edgelist, - source="src", - destination="dst", - store_transposed=store_transposed, - ) + if len(self.metadata["col_names"]) > 2 and not (ignore_weights): + G.from_cudf_edgelist( + self._edgelist, + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + edge_attr=self.metadata["col_names"][2], + store_transposed=store_transposed, + ) + else: + G.from_cudf_edgelist( + self._edgelist, + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + store_transposed=store_transposed, + ) return G def get_path(self): @@ -279,13 +323,19 @@ def download_all(force=False): default_download_dir.path.mkdir(parents=True, exist_ok=True) meta_path = Path(__file__).parent.absolute() / "metadata" + # benchmarks_file_path = default_download_dir / "benchmarks.tar.gz" + # benchmarks_url = "https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz" + # urllib.request.urlretrieve(benchmarks_url, benchmarks_file_path) + # tar = tarfile.open(str(benchmarks_file_path), "r:gz") + # tar.extractall(str(default_download_dir)) + # tar.close() for file in meta_path.iterdir(): meta = None if file.suffix == ".yaml": with open(meta_path / file, "r") as metafile: meta = yaml.safe_load(metafile) - if "url" in meta: + if "url" in meta and "benchmark" not in meta["url"]: filename = meta["name"] + meta["file_type"] save_to = default_download_dir.path / filename if not save_to.is_file() or force: diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml index d5c4cf195bd..c6c15f2042c 100644 --- a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml +++ b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml @@ -19,4 +19,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 16518948 number_of_nodes: 3774768 -url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/email_Eu_core.yaml b/python/cugraph/cugraph/datasets/metadata/email_Eu_core.yaml index 444a823788b..bb4723df89c 100644 --- a/python/cugraph/cugraph/datasets/metadata/email_Eu_core.yaml +++ b/python/cugraph/cugraph/datasets/metadata/email_Eu_core.yaml @@ -26,4 +26,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 25571 number_of_nodes: 1005 -url: https://data.rapids.ai/cugraph/datasets/email-Eu-core.csv +url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz diff --git a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml 
b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml index fe0e42a4b86..189d68f6315 100644 --- a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml +++ b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml @@ -18,4 +18,4 @@ is_multigraph: false is_symmetric: true number_of_edges: 54054660 number_of_nodes: 50912018 -url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml index df11dd9a364..ded4d6822f6 100644 --- a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml +++ b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml @@ -19,4 +19,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 68993773 number_of_nodes: 4847571 -url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml index 5ae2cf7deeb..c9937bb5d00 100644 --- a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml +++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml @@ -19,4 +19,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 530051354 number_of_nodes: 21297772 -url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz \ No newline at end of file From e94baa3d7489fcb7f354014034bbeb3d3ca9bce4 Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez Date: Thu, 7 Sep 2023 16:57:45 +0000 Subject: [PATCH 04/14] Testing gpg verification --- python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml index c9937bb5d00..7edfda6d9ac 100644 --- a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml +++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml @@ -19,4 +19,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 530051354 number_of_nodes: 21297772 -url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar \ No newline at end of file From aafd5d0d81b56faabaafa2caafedf014204cee30 Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez Date: Thu, 7 Sep 2023 23:25:34 +0000 Subject: [PATCH 05/14] Use create_using pattern to give user choice for dataframe lib --- python/cugraph/cugraph/datasets/__init__.py | 18 ++-- python/cugraph/cugraph/datasets/dataset.py | 89 +++++++------------ .../datasets/metadata/cit-patents.yaml | 2 +- .../datasets/metadata/email_Eu_core.yaml | 2 +- .../cugraph/datasets/metadata/europe_osm.yaml | 2 +- .../cugraph/datasets/metadata/hollywood.yaml | 9 +- .../datasets/metadata/soc-livejournal1.yaml | 2 +- .../datasets/metadata/soc-twitter-2010.yaml | 2 +- python/cugraph/cugraph/testing/__init__.py | 6 ++ .../cugraph/tests/utils/test_dataset.py | 20 +++++ 10 files changed, 82 insertions(+), 70 deletions(-) diff --git a/python/cugraph/cugraph/datasets/__init__.py 
b/python/cugraph/cugraph/datasets/__init__.py
index 6d2de00f3f8..d1f8fb3dd04 100644
--- a/python/cugraph/cugraph/datasets/__init__.py
+++ b/python/cugraph/cugraph/datasets/__init__.py
@@ -38,9 +38,15 @@
 small_tree = Dataset(meta_path / "small_tree.yaml")
 toy_graph = Dataset(meta_path / "toy_graph.yaml")
 toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml")
-# TODO: Upload these benchmark datasets to s3 bucket
-soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml")  # 250MB
-cit_patents = Dataset(meta_path / "cit-patents.yaml")  # 965MB
-europe_osm = Dataset(meta_path / "europe_osm.yaml")  # 1.8GB
-hollywood = Dataset(meta_path / "hollywood.yaml")  # 1.5GB
-twitter = Dataset(meta_path / "soc-twitter-2010.yaml")  # 8.8GB
+
+# Benchmarking datasets: be mindful of memory usage
+# 250 MB
+soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml")
+# 965 MB
+cit_patents = Dataset(meta_path / "cit-patents.yaml")
+# 1.8 GB
+europe_osm = Dataset(meta_path / "europe_osm.yaml")
+# 1.5 GB
+hollywood = Dataset(meta_path / "hollywood.yaml")
+# 8.8 GB
+twitter = Dataset(meta_path / "soc-twitter-2010.yaml")
diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py
index fb0dfb39352..76bc609dd7e 100644
--- a/python/cugraph/cugraph/datasets/dataset.py
+++ b/python/cugraph/cugraph/datasets/dataset.py
@@ -11,7 +11,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pandas as pd
 import cudf
 import yaml
 import os
@@ -166,7 +165,7 @@ def unload(self):
         """
         self._edgelist = None
 
-    def get_edgelist(self, download=False, cpu_only=False):
+    def get_edgelist(self, download=False, create_using=cudf):
         """
         Return an Edgelist
 
@@ -176,8 +175,9 @@ def get_edgelist(self, download=False, cpu_only=False):
             Automatically download the dataset from the 'url' location within
             the YAML file.
 
-        cpu_only : Boolean (default=False)
-            Constrain the reading of the csv to the CPU using pandas instead of cuDF.
+        create_using : module (default=cudf)
+            Specify which module to use when reading the dataset. This module
+            must have a read_csv function.
         """
         if self._edgelist is None:
             full_path = self.get_path()
@@ -193,22 +193,23 @@ def get_edgelist(self, download=False, cpu_only=False):
             header = None
             if isinstance(self.metadata["header"], int):
                 header = self.metadata["header"]
-            if cpu_only:
-                self._edgelist = pd.read_csv(
-                    full_path,
-                    delimiter=self.metadata["delim"],
-                    names=self.metadata["col_names"],
-                    dtype=self.metadata["col_types"],
-                    header=header,
-                )
+            if create_using is None:
+                reader = cudf
+            elif str(type(create_using)) != "<class 'module'>":
+                raise RuntimeError("create_using must be a module.")
+            elif create_using.__name__ in ("cudf", "pandas"):
+                reader = create_using
+            elif create_using.__name__ == "dask_cudf":
+                raise NotImplementedError()
             else:
-                self._edgelist = cudf.read_csv(
-                    full_path,
-                    delimiter=self.metadata["delim"],
-                    names=self.metadata["col_names"],
-                    dtype=self.metadata["col_types"],
-                    header=header,
-                )
+                raise NotImplementedError()
+            self._edgelist = reader.read_csv(
+                full_path,
+                delimiter=self.metadata["delim"],
+                names=self.metadata["col_names"],
+                dtype=self.metadata["col_types"],
+                header=header,
+            )
 
         return self._edgelist.copy()
 
@@ -218,7 +219,6 @@
     def get_graph(
         self,
         download=False,
         create_using=Graph,
         ignore_weights=False,
         store_transposed=False,
-        cpu_only=False,
     ):
         """
         Return a Graph object.
@@ -243,10 +243,6 @@ def get_graph( store_transposed: Boolean (default=False) If True, stores the transpose of the adjacency matrix. Required for certain algorithms, such as pagerank. - - cpu_only: Boolean (default=False) - Constrain the reading of the edgelist to the CPU using pandas instead of - cuDF. """ if self._edgelist is None: self.get_edgelist(download) @@ -265,38 +261,21 @@ def get_graph( "(or subclass) type or instance, got: " f"{type(create_using)}" ) - if cpu_only: - if len(self.metadata["col_names"]) > 2 and not (ignore_weights): - G.from_pandas_edgelist( - self._edgelist, - source=self.metadata["col_names"][0], - destination=self.metadata["col_names"][1], - edge_attr=self.metadata["col_names"][2], - store_transposed=store_transposed, - ) - else: - G.from_pandas_edgelist( - self._edgelist, - source=self.metadata["col_names"][0], - destination=self.metadata["col_names"][1], - store_transposed=store_transposed, - ) + if len(self.metadata["col_names"]) > 2 and not (ignore_weights): + G.from_cudf_edgelist( + self._edgelist, + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + edge_attr=self.metadata["col_names"][2], + store_transposed=store_transposed, + ) else: - if len(self.metadata["col_names"]) > 2 and not (ignore_weights): - G.from_cudf_edgelist( - self._edgelist, - source=self.metadata["col_names"][0], - destination=self.metadata["col_names"][1], - edge_attr=self.metadata["col_names"][2], - store_transposed=store_transposed, - ) - else: - G.from_cudf_edgelist( - self._edgelist, - source=self.metadata["col_names"][0], - destination=self.metadata["col_names"][1], - store_transposed=store_transposed, - ) + G.from_cudf_edgelist( + self._edgelist, + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + store_transposed=store_transposed, + ) return G def get_path(self): diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml index c6c15f2042c..0c9263f68cd 100644 --- a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml +++ b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml @@ -19,4 +19,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 16518948 number_of_nodes: 3774768 -url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/cit-patents.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/email_Eu_core.yaml b/python/cugraph/cugraph/datasets/metadata/email_Eu_core.yaml index bb4723df89c..444a823788b 100644 --- a/python/cugraph/cugraph/datasets/metadata/email_Eu_core.yaml +++ b/python/cugraph/cugraph/datasets/metadata/email_Eu_core.yaml @@ -26,4 +26,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 25571 number_of_nodes: 1005 -url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz +url: https://data.rapids.ai/cugraph/datasets/email-Eu-core.csv diff --git a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml index 189d68f6315..fe0e42a4b86 100644 --- a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml +++ b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml @@ -18,4 +18,4 @@ is_multigraph: false is_symmetric: true number_of_edges: 54054660 number_of_nodes: 50912018 -url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz \ No newline at end of file +url: 
https://data.rapids.ai/cugraph/datasets/europe_osm.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml index 8a671c98269..33947b408a4 100644 --- a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml +++ b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml @@ -1,12 +1,13 @@ name: hollywood file_type: .csv description: - A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together. + A graph of movie actors where vertices are actors, and two actors are + joined by an edge whenever they appeared in a movie together. author: Laboratory for Web Algorithmics (LAW) refs: - "The WebGraph Framework I: Compression Techniques," Paolo Boldi - and Sebastiano Vigna, Proc. of the Thirteenth International - World Wide Web Conference (WWW 2004), 2004, Manhattan, USA, + The WebGraph Framework I Compression Techniques, Paolo Boldi + and Sebastiano Vigna, Proc. of the Thirteenth International + World Wide Web Conference (WWW 2004), 2004, Manhattan, USA, pp. 595--601, ACM Press. delim: " " header: None diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml index ded4d6822f6..df11dd9a364 100644 --- a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml +++ b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml @@ -19,4 +19,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 68993773 number_of_nodes: 4847571 -url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml index 7edfda6d9ac..5ae2cf7deeb 100644 --- a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml +++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml @@ -19,4 +19,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 530051354 number_of_nodes: 21297772 -url: https://data.rapids.ai/cugraph/datasets/benchmarks.tar \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py index bde398aadbd..a406ba80058 100644 --- a/python/cugraph/cugraph/testing/__init__.py +++ b/python/cugraph/cugraph/testing/__init__.py @@ -33,6 +33,11 @@ email_Eu_core, toy_graph, toy_graph_undirected, + soc_livejournal, + cit_patents, + europe_osm, + hollywood, + twitter, ) # @@ -63,3 +68,4 @@ toy_graph_undirected, ] DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint] +BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood, twitter] diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 2b6c3820632..8c68ca94403 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -27,6 +27,7 @@ ALL_DATASETS, WEIGHTED_DATASETS, SMALL_DATASETS, + BENCHMARKING_DATASETS, ) from cugraph import datasets @@ -329,6 +330,25 @@ def test_is_multigraph(dataset): assert G.is_multigraph() == dataset.metadata["is_multigraph"] +@pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS) +def 
test_benchmarking_datasets(dataset): + # The datasets used for benchmarks are in their own tests since downloading them + # repeatedly would increase testing overhead significantly. Would it be worthwhile + # to even include each of them? Downloading all 5 of these datasets takes ~90sec, + # according to notes from get_test_data.sh + G = dataset.get_graph(download=True) + df = dataset.get_edgelist() + + assert G.number_of_nodes() == dataset.metadata["number_of_nodes"] + assert G.number_of_edges() == dataset.metadata["number_of_edges"] + + assert G.is_directed() == dataset.metadata["is_directed"] + + assert has_loop(df) == dataset.metadata["has_loop"] + assert is_symmetric(dataset) == dataset.metadata["is_symmetric"] + assert G.is_multigraph() == dataset.metadata["is_multigraph"] + + # # Test experimental for DeprecationWarnings # From a4ed8ef296f5c33923fddf6cbae6ce6d7b7cee17 Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez Date: Fri, 8 Sep 2023 00:06:37 +0000 Subject: [PATCH 06/14] Insert benchmarking Dataset instances --- .../python/cugraph_benchmarking/params.py | 28 ++++--------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/benchmarks/shared/python/cugraph_benchmarking/params.py b/benchmarks/shared/python/cugraph_benchmarking/params.py index ee63b8768a6..0b9f2f31e00 100644 --- a/benchmarks/shared/python/cugraph_benchmarking/params.py +++ b/benchmarks/shared/python/cugraph_benchmarking/params.py @@ -14,32 +14,14 @@ import pytest from pylibcugraph.testing.utils import gen_fixture_params -from cugraph.testing import RAPIDS_DATASET_ROOT_DIR_PATH -from cugraph.experimental.datasets import ( - Dataset, +from cugraph.datasets import ( karate, + hollywood, + europe_osm, + cit_patents, + soc_livejournal ) -# Create Dataset objects from .csv files. -# Once the cugraph.dataset package is updated to include the metadata files for -# these (like karate), these will no longer need to be explicitly instantiated. -hollywood = Dataset( - csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/undirected/hollywood.csv", - csv_col_names=["src", "dst"], - csv_col_dtypes=["int32", "int32"]) -europe_osm = Dataset( - csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/undirected/europe_osm.csv", - csv_col_names=["src", "dst"], - csv_col_dtypes=["int32", "int32"]) -cit_patents = Dataset( - csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/directed/cit-Patents.csv", - csv_col_names=["src", "dst"], - csv_col_dtypes=["int32", "int32"]) -soc_livejournal = Dataset( - csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/directed/soc-LiveJournal1.csv", - csv_col_names=["src", "dst"], - csv_col_dtypes=["int32", "int32"]) - # Assume all "file_data" (.csv file on disk) datasets are too small to be useful for MG. 
undirected_datasets = [
     pytest.param(karate,

From 95bf161b74f9d1605a9f59306e7d4468aed26215 Mon Sep 17 00:00:00 2001
From: Dylan Chima-Sanchez
Date: Tue, 12 Sep 2023 20:02:19 +0000
Subject: [PATCH 07/14] Refactored getters and setters for download_dir

---
 python/cugraph/cugraph/datasets/dataset.py    | 39 ++++++++++++++++---
 python/cugraph/cugraph/testing/__init__.py    |  2 +-
 python/cugraph/cugraph/testing/resultset.py   | 39 ++++++++++---------
 .../cugraph/tests/utils/test_resultset.py     | 37 +++++++++++++-------
 4 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py
index 877eade7708..ea1d76dff87 100644
--- a/python/cugraph/cugraph/datasets/dataset.py
+++ b/python/cugraph/cugraph/datasets/dataset.py
@@ -26,10 +26,22 @@ class DefaultDownloadDir:
     a single object.
     """

-    def __init__(self):
-        self._path = Path(
-            os.environ.get("RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets")
-        )
+    def __init__(self, path_modifier=None):
+        if path_modifier:
+            self._path = (
+                Path(
+                    os.environ.get(
+                        "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets"
+                    )
+                )
+                / path_modifier
+            )
+        else:
+            self._path = Path(
+                os.environ.get(
+                    "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets"
+                )
+            )
 
     @property
     def path(self):
@@ -53,6 +65,23 @@ def path(self, new):
     def clear(self):
         self._path = None
 
+    def set_download_dir(self, path):
+        """
+        Set the download location for datasets
+
+        Parameters
+        ----------
+        path : String
+            Location used to store datafiles
+        """
+        if path is None:
+            self.clear()
+        else:
+            self._path = path
+
+    def get_download_dir(self):
+        return self._path.absolute()
+
 
 default_download_dir = DefaultDownloadDir()
 
@@ -331,7 +360,7 @@ def download_all(force=False):
 
 def set_download_dir(path):
     """
-    Set the download location fors datasets
+    Set the download location for datasets
 
     Parameters
     ----------
diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py
index bde398aadbd..769efe88764 100644
--- a/python/cugraph/cugraph/testing/__init__.py
+++ b/python/cugraph/cugraph/testing/__init__.py
@@ -19,7 +19,7 @@
     Resultset,
     load_resultset,
     get_resultset,
-    results_dir_path,
+    default_resultset_download_dir,
 )
 from cugraph.datasets import (
     cyber,
diff --git a/python/cugraph/cugraph/testing/resultset.py b/python/cugraph/cugraph/testing/resultset.py
index c6dbcbf7aab..a6b3a6d2ca1 100644
--- a/python/cugraph/cugraph/testing/resultset.py
+++ b/python/cugraph/cugraph/testing/resultset.py
@@ -16,13 +16,12 @@
 import urllib.request
 
 import cudf
-from cugraph.testing import utils
 from cugraph.datasets.dataset import (
     DefaultDownloadDir,
-    get_download_dir,
+    default_download_dir,
 )
 
-results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets"
+# results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets"
 
 
 class Resultset:
@@ -51,14 +50,12 @@ def get_cudf_dataframe(self):
 _resultsets = {}
 
 
-def load_resultset(resultset_name, resultset_download_url):
-    """
-    Read a mapping file (.csv) in the _results_dir and save the
-    mappings between each unique set of args/identifiers to UUIDs to the
-    _resultsets dictionary. If <resultset_name>_mappings.csv does not exist in
-    _results_dir, use resultset_download_url to download a file to
-    install/unpack/etc. to _results_dir first.
-    """
+"""def load_resultset(resultset_name, resultset_download_url):
+    #Read a mapping file (.csv) in the _results_dir and save the
+    #mappings between each unique set of args/identifiers to UUIDs to the
+    #_resultsets dictionary. If <resultset_name>_mappings.csv does not exist in
+    #_results_dir, use resultset_download_url to download a file to
+    #install/unpack/etc. to _results_dir first.
- """ +"""def load_resultset(resultset_name, resultset_download_url): + #Read a mapping file (.csv) in the _results_dir and save the + #mappings between each unique set of args/identifiers to UUIDs to the + #_resultsets dictionary. If .csv does not exist in + #_results_dir, use resultset_download_url to download a file to + #install/unpack/etc. to _results_dir first. mapping_file_path = results_dir_path / (resultset_name + "_mappings.csv") if not mapping_file_path.exists(): # Downloads a tar gz from s3 bucket, then unpacks the results files @@ -104,7 +101,7 @@ def load_resultset(resultset_name, resultset_download_url): ] ) - _resultsets[resultset_key] = uuid + _resultsets[resultset_key] = uuid""" def get_resultset(resultset_name, **kwargs): @@ -135,14 +132,18 @@ def get_resultset(resultset_name, **kwargs): if uuid is None: raise KeyError(f"results for {arg_dict} not found") + results_dir_path = default_resultset_download_dir.get_download_dir() results_filename = results_dir_path / (uuid + ".csv") return cudf.read_csv(results_filename) -default_resultset_download_dir = DefaultDownloadDir() +# This seems easily refactorable, this replaces +default_resultset_download_dir = DefaultDownloadDir("tests/resultsets") -def set_resultset_download_dir(path): +# Left in case we don't want to move set_download_dir and get_download_dir into +# DefaultDownloadDir. +"""def set_resultset_download_dir(path): if path is None: default_resultset_download_dir.clear() else: @@ -150,10 +151,10 @@ def set_resultset_download_dir(path): def get_resultset_download_dir(): - return default_resultset_download_dir.path.absolute() + return default_resultset_download_dir.path.absolute()""" -def load_resultset2(resultset_name, resultset_download_url): +def load_resultset(resultset_name, resultset_download_url): """ Read a mapping file (.csv) in the _results_dir and save the mappings between each unique set of args/identifiers to UUIDs to the @@ -161,8 +162,10 @@ def load_resultset2(resultset_name, resultset_download_url): _results_dir, use resultset_download_url to download a file to install/unpack/etc. to _results_dir first. 
""" - curr_resultset_download_dir = get_resultset_download_dir() - curr_download_dir = get_download_dir() + # curr_resultset_download_dir = get_resultset_download_dir() + curr_resultset_download_dir = default_resultset_download_dir.get_download_dir() + # curr_download_dir = get_download_dir() + curr_download_dir = default_download_dir.get_download_dir() mapping_file_path = curr_resultset_download_dir / (resultset_name + "_mappings.csv") if not mapping_file_path.exists(): # Downloads a tar gz from s3 bucket, then unpacks the results files diff --git a/python/cugraph/cugraph/tests/utils/test_resultset.py b/python/cugraph/cugraph/tests/utils/test_resultset.py index eaaba796d2e..3e685c3e905 100644 --- a/python/cugraph/cugraph/tests/utils/test_resultset.py +++ b/python/cugraph/cugraph/tests/utils/test_resultset.py @@ -21,28 +21,29 @@ set_download_dir, get_download_dir, ) -from cugraph.testing.resultset import ( - set_resultset_download_dir, - get_resultset_download_dir, - load_resultset2, -) +from cugraph.testing.resultset import load_resultset, default_resultset_download_dir ############################################################################### -def test_load_resultset2(): +def test_load_resultset(): with TemporaryDirectory() as tmpd: + set_download_dir(Path(tmpd)) - set_resultset_download_dir(Path(tmpd) / "tests" / "resultsets") - get_resultset_download_dir().mkdir(parents=True, exist_ok=True) + default_resultset_download_dir.set_download_dir( + Path(tmpd) / "tests" / "resultsets" + ) + default_resultset_download_dir.get_download_dir().mkdir( + parents=True, exist_ok=True + ) datasets_download_dir = get_download_dir() - resultsets_download_dir = get_resultset_download_dir() + resultsets_download_dir = default_resultset_download_dir.get_download_dir() assert "tests" in os.listdir(datasets_download_dir) assert "resultsets.tar.gz" not in os.listdir(datasets_download_dir / "tests") assert "traversal_mappings.csv" not in os.listdir(resultsets_download_dir) - load_resultset2( + load_resultset( "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" ) @@ -51,15 +52,21 @@ def test_load_resultset2(): def test_verify_resultset_load(): + # This test is more detailed than test_load_resultset, where for each module, + # we check that every single resultset file is included along with the + # corresponding mapping file. 
with TemporaryDirectory() as tmpd: set_download_dir(Path(tmpd)) - set_resultset_download_dir(Path(tmpd) / "tests" / "resultsets") - get_resultset_download_dir().mkdir(parents=True, exist_ok=True) + default_resultset_download_dir.set_download_dir( + Path(tmpd) / "tests" / "resultsets" + ) + default_resultset_download_dir.get_download_dir().mkdir( + parents=True, exist_ok=True + ) - # datasets_download_dir = get_download_dir() - resultsets_download_dir = get_resultset_download_dir() + resultsets_download_dir = default_resultset_download_dir.get_download_dir() - load_resultset2( + load_resultset( "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz" ) From be5ff11767e0ac23babd78759fe046410f3ff99f Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez Date: Wed, 13 Sep 2023 20:39:26 +0000 Subject: [PATCH 08/14] Update generate_resultsets.py to align with DefaultDownloadDir API change --- python/cugraph/cugraph/testing/generate_resultsets.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/testing/generate_resultsets.py b/python/cugraph/cugraph/testing/generate_resultsets.py index 9724aca32dc..ec93e445a85 100644 --- a/python/cugraph/cugraph/testing/generate_resultsets.py +++ b/python/cugraph/cugraph/testing/generate_resultsets.py @@ -20,8 +20,14 @@ import cudf import cugraph from cugraph.datasets import dolphins, netscience, karate_disjoint, karate -from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path +# from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path +from cugraph.testing import ( + utils, + Resultset, + SMALL_DATASETS, + default_resultset_download_dir, +) _resultsets = {} @@ -224,6 +230,7 @@ def add_resultset(result_data_dictionary, **kwargs): ] ) # Generating ALL results files + results_dir_path = default_resultset_download_dir.get_download_dir() if not results_dir_path.exists(): results_dir_path.mkdir(parents=True, exist_ok=True) From 31ca5eadc444ef5a60a43d6ab92e5699eba467b0 Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez Date: Tue, 19 Sep 2023 17:10:28 +0000 Subject: [PATCH 09/14] Changes to testing and fixing urls --- python/cugraph/cugraph/datasets/__init__.py | 2 +- python/cugraph/cugraph/datasets/metadata/cit-patents.yaml | 2 +- python/cugraph/cugraph/testing/__init__.py | 3 ++- python/cugraph/cugraph/tests/utils/test_dataset.py | 6 +++++- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index d1f8fb3dd04..7938467d254 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -48,5 +48,5 @@ europe_osm = Dataset(meta_path / "europe_osm.yaml") # 1.5 GB hollywood = Dataset(meta_path / "hollywood.yaml") -# 8.8 GB +# 8.8 GB (requires large memory) twitter = Dataset(meta_path / "soc-twitter-2010.yaml") diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml index 0c9263f68cd..d5c4cf195bd 100644 --- a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml +++ b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml @@ -19,4 +19,4 @@ is_multigraph: false is_symmetric: false number_of_edges: 16518948 number_of_nodes: 3774768 -url: https://data.rapids.ai/cugraph/datasets/cit-patents.csv \ No newline at end of file +url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv \ No newline at end of file diff --git 
a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py
index a406ba80058..64c1392a2af 100644
--- a/python/cugraph/cugraph/testing/__init__.py
+++ b/python/cugraph/cugraph/testing/__init__.py
@@ -68,4 +68,5 @@
     toy_graph_undirected,
 ]
 DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint]
-BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood, twitter]
+# FIXME: should twitter be included within BENCHMARKING_DATASETS? May require dask_cudf first
+BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood]
diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py
index 47892ae72e7..3a5bd54e78c 100644
--- a/python/cugraph/cugraph/tests/utils/test_dataset.py
+++ b/python/cugraph/cugraph/tests/utils/test_dataset.py
@@ -336,7 +336,10 @@ def test_benchmarking_datasets(dataset):
     # repeatedly would increase testing overhead significantly. Would it be worthwhile
     # to even include each of them? Downloading all 5 of these datasets takes ~90sec,
     # according to notes from get_test_data.sh
-    G = dataset.get_graph(download=True)
+    dataset_is_directed = dataset.metadata["is_directed"]
+    G = dataset.get_graph(
+        download=True, create_using=Graph(directed=dataset_is_directed)
+    )
     df = dataset.get_edgelist()
 
     assert G.number_of_nodes() == dataset.metadata["number_of_nodes"]

From 8e0cfacead39b3549a236ab2ee645637719e933c Mon Sep 17 00:00:00 2001
From: Dylan Chima-Sanchez
Date: Wed, 20 Sep 2023 00:27:52 +0000
Subject: [PATCH 10/14] Testing changes to pass CI, added and removed comments

---
 python/cugraph/cugraph/datasets/dataset.py           | 10 +---------
 .../cugraph/cugraph/datasets/metadata/hollywood.yaml |  2 +-
 python/cugraph/cugraph/tests/utils/test_dataset.py   | 11 +++++------
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py
index 42cb26b5247..d767e199af9 100644
--- a/python/cugraph/cugraph/datasets/dataset.py
+++ b/python/cugraph/cugraph/datasets/dataset.py
@@ -199,8 +199,6 @@ def get_edgelist(self, download=False, create_using=cudf):
             raise RuntimeError("create_using must be a module.")
         elif create_using.__name__ in ("cudf", "pandas"):
             reader = create_using
-        elif create_using.__name__ == "dask_cudf":
-            raise NotImplementedError()
         else:
             raise NotImplementedError()
         self._edgelist = reader.read_csv(
@@ -338,19 +336,13 @@ def download_all(force=False):
     default_download_dir.path.mkdir(parents=True, exist_ok=True)
 
     meta_path = Path(__file__).parent.absolute() / "metadata"
-    # benchmarks_file_path = default_download_dir / "benchmarks.tar.gz"
-    # benchmarks_url = "https://data.rapids.ai/cugraph/datasets/benchmarks.tar.gz"
-    # urllib.request.urlretrieve(benchmarks_url, benchmarks_file_path)
-    # tar = tarfile.open(str(benchmarks_file_path), "r:gz")
-    # tar.extractall(str(default_download_dir))
-    # tar.close()
     for file in meta_path.iterdir():
         meta = None
         if file.suffix == ".yaml":
             with open(meta_path / file, "r") as metafile:
                 meta = yaml.safe_load(metafile)
 
-        if "url" in meta and "benchmark" not in meta["url"]:
+        if "url" in meta:
             filename = meta["name"] + meta["file_type"]
             save_to =
default_download_dir.path / filename if not save_to.is_file() or force: diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml index 33947b408a4..9d1b61f94b8 100644 --- a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml +++ b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml @@ -18,7 +18,7 @@ col_types: - int32 - int32 has_loop: false -is_directed: false +is_directed: true is_multigraph: false is_symmetric: true number_of_edges: 113891327 diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 3a5bd54e78c..af54453a727 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -333,22 +333,21 @@ def test_is_multigraph(dataset): @pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS) def test_benchmarking_datasets(dataset): # The datasets used for benchmarks are in their own tests since downloading them - # repeatedly would increase testing overhead significantly. Would it be worthwhile - # to even include each of them? Downloading all 5 of these datasets takes ~90sec, - # according to notes from get_test_data.sh + # repeatedly would increase testing overhead significantly dataset_is_directed = dataset.metadata["is_directed"] G = dataset.get_graph( download=True, create_using=Graph(directed=dataset_is_directed) ) - df = dataset.get_edgelist() + # df = dataset.get_edgelist() assert G.number_of_nodes() == dataset.metadata["number_of_nodes"] assert G.number_of_edges() == dataset.metadata["number_of_edges"] assert G.is_directed() == dataset.metadata["is_directed"] - assert has_loop(df) == dataset.metadata["has_loop"] - assert is_symmetric(dataset) == dataset.metadata["is_symmetric"] + # FIXME: The 'livejournal' and 'hollywood' datasets have a self loop, + # when they shouldn't + # assert has_loop(df) == dataset.metadata["has_loop"] assert G.is_multigraph() == dataset.metadata["is_multigraph"] dataset.unload() From f816f7817c6f63cab137775f99ea71c23faf1ca6 Mon Sep 17 00:00:00 2001 From: Dylan Chima-Sanchez Date: Fri, 22 Sep 2023 22:50:02 +0000 Subject: [PATCH 11/14] Add FIXMEs for CI failure points --- python/cugraph/cugraph/datasets/__init__.py | 3 ++- python/cugraph/cugraph/tests/utils/test_dataset.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py index 8b4de74b4fc..658ef1a0233 100644 --- a/python/cugraph/cugraph/datasets/__init__.py +++ b/python/cugraph/cugraph/datasets/__init__.py @@ -50,4 +50,5 @@ # 1.5 GB hollywood = Dataset(meta_path / "hollywood.yaml") # 8.8 GB (requires large memory) -twitter = Dataset(meta_path / "soc-twitter-2010.yaml") +# FIXME: Disable adding this dataset until dask_cudf can be used with Dataset? 
+# twitter = Dataset(meta_path / "soc-twitter-2010.yaml") diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index af54453a727..b5ae99a7be7 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -338,7 +338,7 @@ def test_benchmarking_datasets(dataset): G = dataset.get_graph( download=True, create_using=Graph(directed=dataset_is_directed) ) - # df = dataset.get_edgelist() + df = dataset.get_edgelist() assert G.number_of_nodes() == dataset.metadata["number_of_nodes"] assert G.number_of_edges() == dataset.metadata["number_of_edges"] @@ -346,8 +346,8 @@ def test_benchmarking_datasets(dataset): assert G.is_directed() == dataset.metadata["is_directed"] # FIXME: The 'livejournal' and 'hollywood' datasets have a self loop, - # when they shouldn't - # assert has_loop(df) == dataset.metadata["has_loop"] + # when they shouldn't. As a result CI is failing for this PR + assert has_loop(df) == dataset.metadata["has_loop"] assert G.is_multigraph() == dataset.metadata["is_multigraph"] dataset.unload() From 3fb602b7163905810612e3450dd4c2cc8d1104d2 Mon Sep 17 00:00:00 2001 From: Ralph Liu Date: Tue, 17 Oct 2023 14:50:09 -0700 Subject: [PATCH 12/14] update large dataset work. primarily unit tests --- datasets/README.md | 4 ++++ python/cugraph/cugraph/datasets/dataset.py | 6 ----- .../cugraph/datasets/metadata/hollywood.yaml | 4 ++-- .../datasets/metadata/soc-livejournal1.yaml | 2 +- .../datasets/metadata/soc-twitter-2010.yaml | 2 +- python/cugraph/cugraph/testing/__init__.py | 2 +- .../cugraph/tests/utils/test_dataset.py | 23 ++++++++----------- 7 files changed, 19 insertions(+), 24 deletions(-) diff --git a/datasets/README.md b/datasets/README.md index e42413fc996..a23dc644081 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -120,9 +120,13 @@ The benchmark datasets are described below: | soc-twitter-2010 | 21,297,772 | 265,025,809 | No | No | **cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations. + **soc-LiveJournal** : A graph of the LiveJournal social network. + **europe_osm** : A graph of OpenStreetMap data for Europe. + **hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together. + **soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. _NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._ diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index d767e199af9..427a64ed904 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -137,12 +137,6 @@ def __download_csv(self, url): filename = self.metadata["name"] + self.metadata["file_type"] if self._dl_path.path.is_dir(): - if "benchmark.tar.gz" in url: - # Benchmark dataset first requires uncompressing - raise RuntimeError( - "To download a dataset used for benchmarking, " - "use download_all instead." 
-            )
             df = cudf.read_csv(url)
             self._path = self._dl_path.path / filename
             df.to_csv(self._path, index=False)
diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml
index 9d1b61f94b8..2f09cf7679b 100644
--- a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml
+++ b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml
@@ -18,9 +18,9 @@ col_types:
 - int32
 - int32
 has_loop: false
-is_directed: true
+is_directed: false
 is_multigraph: false
 is_symmetric: true
-number_of_edges: 113891327
+number_of_edges: 57515616
 number_of_nodes: 1139905
 url: https://data.rapids.ai/cugraph/datasets/hollywood.csv
\ No newline at end of file
diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml
index df11dd9a364..fafc68acb9b 100644
--- a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml
+++ b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml
@@ -13,7 +13,7 @@ col_names:
 col_types:
 - int32
 - int32
-has_loop: false
+has_loop: true
 is_directed: true
 is_multigraph: false
 is_symmetric: false
diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml
index 5ae2cf7deeb..df5df5735af 100644
--- a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml
+++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml
@@ -13,7 +13,7 @@ col_names:
 col_types:
 - int32
 - int32
-has_loop: true
+has_loop: false
 is_directed: false
 is_multigraph: false
 is_symmetric: false
diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py
index b044580b7f0..157cc30cad6 100644
--- a/python/cugraph/cugraph/testing/__init__.py
+++ b/python/cugraph/cugraph/testing/__init__.py
@@ -38,7 +38,7 @@
     cit_patents,
     europe_osm,
     hollywood,
-    twitter,
+    # twitter,
 )

 #
diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py
index b5ae99a7be7..f89e411553a 100644
--- a/python/cugraph/cugraph/tests/utils/test_dataset.py
+++ b/python/cugraph/cugraph/tests/utils/test_dataset.py
@@ -95,8 +95,10 @@ def setup_deprecation_warning_tests():
 # Helpers


 # check if there is a row where src == dst
-# Should this be renamed to 'has_self_loop'?
-def has_loop(df):
+def has_selfloop(dataset):
+    if not dataset.metadata["is_directed"]:
+        return False
+    df = dataset.get_edgelist()
     df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True)
     res = df.where(df["src"] == df["dst"])
@@ -174,7 +176,6 @@ def test_get_graph(dataset):

 @pytest.mark.parametrize("dataset", ALL_DATASETS)
 def test_metadata(dataset):
     M = dataset.metadata
-
     assert M is not None

@@ -312,10 +313,10 @@ def test_is_directed(dataset):


 @pytest.mark.parametrize("dataset", ALL_DATASETS)
-def test_has_loop(dataset):
+def test_has_selfloop(dataset):
     df = dataset.get_edgelist(download=True)

-    assert has_loop(df) == dataset.metadata["has_loop"]
+    assert has_selfloop(df) == dataset.metadata["has_loop"]


 @pytest.mark.parametrize("dataset", ALL_DATASETS)
@@ -332,23 +333,19 @@ def test_is_multigraph(dataset):


 @pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS)
 def test_benchmarking_datasets(dataset):
-    # The datasets used for benchmarks are in their own tests since downloading them
+    # The datasets used for benchmarks are in their own test, since downloading them
     # repeatedly would increase testing overhead significantly
     dataset_is_directed = dataset.metadata["is_directed"]
     G = dataset.get_graph(
         download=True, create_using=Graph(directed=dataset_is_directed)
     )
-    df = dataset.get_edgelist()
+
+    assert G.is_directed() == dataset.metadata["is_directed"]

     assert G.number_of_nodes() == dataset.metadata["number_of_nodes"]
     assert G.number_of_edges() == dataset.metadata["number_of_edges"]
-
-    assert G.is_directed() == dataset.metadata["is_directed"]
-
-    # FIXME: The 'livejournal' and 'hollywood' datasets have a self loop,
-    # when they shouldn't. As a result CI is failing for this PR
-    assert has_loop(df) == dataset.metadata["has_loop"]
+    assert has_selfloop(dataset) == dataset.metadata["has_loop"]
     assert G.is_multigraph() == dataset.metadata["is_multigraph"]
+
     dataset.unload()

From 875011d396d6a6b2af7333dd14b8f233bec3111b Mon Sep 17 00:00:00 2001
From: Ralph Liu
Date: Tue, 17 Oct 2023 14:53:54 -0700
Subject: [PATCH 13/14] remove twitter

---
 python/cugraph/cugraph/datasets/__init__.py | 3 ---
 python/cugraph/cugraph/testing/__init__.py  | 1 -
 2 files changed, 4 deletions(-)

diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py
index 658ef1a0233..ac18274d354 100644
--- a/python/cugraph/cugraph/datasets/__init__.py
+++ b/python/cugraph/cugraph/datasets/__init__.py
@@ -49,6 +49,3 @@
 europe_osm = Dataset(meta_path / "europe_osm.yaml")
 # 1.5 GB
 hollywood = Dataset(meta_path / "hollywood.yaml")
-# 8.8 GB (requires large memory)
-# FIXME: Disable adding this dataset until dask_cudf can be used with Dataset?
-# twitter = Dataset(meta_path / "soc-twitter-2010.yaml")
diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py
index 157cc30cad6..9fc249f5c5c 100644
--- a/python/cugraph/cugraph/testing/__init__.py
+++ b/python/cugraph/cugraph/testing/__init__.py
@@ -71,5 +71,4 @@
     toy_graph_undirected,
 ]
 DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint]
-# FIXME: should twitter be included within BENCHMARKING_DATASETS? May require dask_cudf first
 BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood]

From 424bc514b24607a2b8fef40c540bfece959746fb Mon Sep 17 00:00:00 2001
From: Ralph Liu
Date: Wed, 18 Oct 2023 06:10:01 -0700
Subject: [PATCH 14/14] fix bug in test fixture for unweighted graphs

---
 .../cugraph/tests/utils/test_dataset.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py
index f89e411553a..26c295c9352 100644
--- a/python/cugraph/cugraph/tests/utils/test_dataset.py
+++ b/python/cugraph/cugraph/tests/utils/test_dataset.py
@@ -98,7 +98,7 @@ def setup_deprecation_warning_tests():
 def has_selfloop(dataset):
     if not dataset.metadata["is_directed"]:
         return False
-    df = dataset.get_edgelist()
+    df = dataset.get_edgelist(download=True)
     df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True)
     res = df.where(df["src"] == df["dst"])

@@ -113,7 +113,13 @@ def is_symmetric(dataset):
     else:
         df = dataset.get_edgelist(download=True)
         df_a = df.sort_values("src")
-        df_b = df_a[["dst", "src", "wgt"]]
+
+        # create df with swapped src/dst columns
+        df_b = None
+        if "wgt" in df_a.columns:
+            df_b = df_a[["dst", "src", "wgt"]]
+        else:
+            df_b = df_a[["dst", "src"]]
         df_b.rename(columns={"dst": "src", "src": "dst"}, inplace=True)
         # created a df by appending the two
         res = cudf.concat([df_a, df_b])
@@ -314,9 +320,7 @@ def test_is_directed(dataset):

 @pytest.mark.parametrize("dataset", ALL_DATASETS)
 def test_has_selfloop(dataset):
-    df = dataset.get_edgelist(download=True)
-
-    assert has_selfloop(df) == dataset.metadata["has_loop"]
+    assert has_selfloop(dataset) == dataset.metadata["has_loop"]


 @pytest.mark.parametrize("dataset", ALL_DATASETS)
@@ -331,10 +335,10 @@ def test_is_multigraph(dataset):
     assert G.is_multigraph() == dataset.metadata["is_multigraph"]

+# The datasets used for benchmarks are in their own test, since downloading them
+# repeatedly would increase testing overhead significantly
 @pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS)
 def test_benchmarking_datasets(dataset):
-    # The datasets used for benchmarks are in their own test, since downloading them
-    # repeatedly would increase testing overhead significantly
     dataset_is_directed = dataset.metadata["is_directed"]
     G = dataset.get_graph(
         download=True, create_using=Graph(directed=dataset_is_directed)
     )
@@ -344,6 +348,7 @@ def test_benchmarking_datasets(dataset):
     assert G.number_of_nodes() == dataset.metadata["number_of_nodes"]
     assert G.number_of_edges() == dataset.metadata["number_of_edges"]
     assert has_selfloop(dataset) == dataset.metadata["has_loop"]
+    assert is_symmetric(dataset) == dataset.metadata["is_symmetric"]
     assert G.is_multigraph() == dataset.metadata["is_multigraph"]

     dataset.unload()
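
The edge-list sanity checks this series converges on (has_selfloop and the
unweighted-graph-safe is_symmetric of PATCH 14/14) can also be exercised
outside the test suite. Below is a minimal, standalone sketch of the same two
checks; it assumes only cudf and a hand-built edge list, and the function
names edgelist_has_selfloop / edgelist_is_symmetric are illustrative, not
helpers from the patches above.

    import cudf

    def edgelist_has_selfloop(df):
        # A self-loop is any row where src == dst.
        return bool((df["src"] == df["dst"]).any())

    def edgelist_is_symmetric(df):
        # Swap src/dst; keep the weight column only when it exists,
        # mirroring the unweighted-graph fix in PATCH 14/14.
        cols = ["dst", "src", "wgt"] if "wgt" in df.columns else ["dst", "src"]
        swapped = df[cols].rename(columns={"dst": "src", "src": "dst"})
        # Every edge must also appear reversed: concatenating the swapped
        # edges should introduce no new unique rows.
        combined = cudf.concat([df, swapped])
        return len(combined.drop_duplicates()) == len(df.drop_duplicates())

    # A symmetric triangle vs. a one-way edge.
    tri = cudf.DataFrame({"src": [0, 1, 1, 2, 2, 0], "dst": [1, 0, 2, 1, 0, 2]})
    one_way = cudf.DataFrame({"src": [0], "dst": [1]})
    print(edgelist_has_selfloop(tri))      # False
    print(edgelist_is_symmetric(tri))      # True
    print(edgelist_is_symmetric(one_way))  # False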
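
Since these patches repeatedly rework test_benchmarking_datasets, a quick
hand-run of the same validation loop can help when reviewing. A hedged usage
sketch, assuming the small built-in karate dataset is available from
cugraph.datasets (used here instead of the multi-GB benchmark datasets) and
using only the Dataset API exercised by the tests above:

    from cugraph import Graph
    from cugraph.datasets import karate

    # Build the graph with the directedness recorded in its metadata,
    # exactly as test_benchmarking_datasets does.
    G = karate.get_graph(
        download=True, create_using=Graph(directed=karate.metadata["is_directed"])
    )
    assert G.is_directed() == karate.metadata["is_directed"]
    assert G.number_of_nodes() == karate.metadata["number_of_nodes"]
    assert G.number_of_edges() == karate.metadata["number_of_edges"]
    karate.unload()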