diff --git a/datasets/README.md b/datasets/README.md
index e42413fc996..a23dc644081 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -120,9 +120,13 @@ The benchmark datasets are described below:
 | soc-twitter-2010 | 21,297,772 | 265,025,809 | No | No |
 
 **cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
+**soc-LiveJournal** : A graph of the LiveJournal social network.
+**europe_osm** : A graph of OpenStreetMap data for Europe.
+**hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together.
+**soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.
 
 _NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._
diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py
index 65a820f108b..ac18274d354 100644
--- a/python/cugraph/cugraph/datasets/__init__.py
+++ b/python/cugraph/cugraph/datasets/__init__.py
@@ -39,3 +39,13 @@
 small_tree = Dataset(meta_path / "small_tree.yaml")
 toy_graph = Dataset(meta_path / "toy_graph.yaml")
 toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml")
+
+# Benchmarking datasets: be mindful of memory usage
+# 250 MB
+soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml")
+# 965 MB
+cit_patents = Dataset(meta_path / "cit-patents.yaml")
+# 1.8 GB
+europe_osm = Dataset(meta_path / "europe_osm.yaml")
+# 1.5 GB
+hollywood = Dataset(meta_path / "hollywood.yaml")
diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py
index 877eade7708..b36042606e7 100644
--- a/python/cugraph/cugraph/datasets/dataset.py
+++ b/python/cugraph/cugraph/datasets/dataset.py
@@ -26,10 +26,22 @@ class DefaultDownloadDir:
     a single object.
     """
 
-    def __init__(self):
-        self._path = Path(
-            os.environ.get("RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets")
-        )
+    def __init__(self, path_modifier=None):
+        base_path = Path(
+            os.environ.get(
+                "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets"
+            )
+        )
+        # Append the optional modifier rather than duplicating the base-path
+        # lookup in two branches
+        self._path = base_path / path_modifier if path_modifier else base_path
 
     @property
     def path(self):
@@ -53,6 +65,23 @@ def path(self, new):
     def clear(self):
         self._path = None
 
+    def set_download_dir(self, path):
+        """
+        Set the download location for datasets
+
+        Parameters
+        ----------
+        path : String
+            Location used to store datafiles
+        """
+        if path is None:
+            self.clear()
+        else:
+            self._path = path
+
+    def get_download_dir(self):
+        # go through the `path` property rather than the raw attribute
+        return self.path.absolute()
+
 
 default_download_dir = DefaultDownloadDir()
 
@@ -159,7 +188,7 @@ def unload(self):
         """
         self._edgelist = None
 
-    def get_edgelist(self, download=False):
+    def get_edgelist(self, download=False, create_using=cudf):
         """
         Return an Edgelist
 
@@ -168,6 +197,10 @@
         download : Boolean (default=False)
             Automatically download the dataset from the 'url' location within
             the YAML file.
+
+        create_using : module (default=cudf)
+            Specify which module to use when reading the dataset. This module
+            must have a read_csv function.
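+
+        Examples
+        --------
+        A minimal usage sketch (assumes the bundled karate dataset; a module
+        such as pandas that exposes read_csv may be passed instead of cudf):
+
+        >>> import cudf
+        >>> from cugraph.datasets import karate
+        >>> df = karate.get_edgelist(download=True, create_using=cudf)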
""" if self._edgelist is None: full_path = self.get_path() @@ -183,7 +216,15 @@ def get_edgelist(self, download=False): header = None if isinstance(self.metadata["header"], int): header = self.metadata["header"] - self._edgelist = cudf.read_csv( + if create_using is None: + reader = cudf + elif str(type(create_using)) != "": + raise RuntimeError("create_using must be a module.") + elif create_using.__name__ == "cudf" or "pandas": + reader = create_using + else: + raise NotImplementedError() + self._edgelist = reader.read_csv( full_path, delimiter=self.metadata["delim"], names=self.metadata["col_names"], @@ -219,6 +260,10 @@ def get_graph( dataset -if present- will be applied to the Graph. If the dataset does not contain weights, the Graph returned will be unweighted regardless of ignore_weights. + + store_transposed: Boolean (default=False) + If True, stores the transpose of the adjacency matrix. Required + for certain algorithms, such as pagerank. """ if self._edgelist is None: self.get_edgelist(download) @@ -237,20 +282,19 @@ def get_graph( "(or subclass) type or instance, got: " f"{type(create_using)}" ) - if len(self.metadata["col_names"]) > 2 and not (ignore_weights): G.from_cudf_edgelist( self._edgelist, - source="src", - destination="dst", - edge_attr="wgt", + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + edge_attr=self.metadata["col_names"][2], store_transposed=store_transposed, ) else: G.from_cudf_edgelist( self._edgelist, - source="src", - destination="dst", + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], store_transposed=store_transposed, ) return G @@ -331,7 +375,7 @@ def download_all(force=False): def set_download_dir(path): """ - Set the download location fors datasets + Set the download location for datasets Parameters ---------- diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml new file mode 100644 index 00000000000..d5c4cf195bd --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml @@ -0,0 +1,22 @@ +name: cit-Patents +file_type: .csv +description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations. +author: NBER +refs: + J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time Densification Laws, Shrinking Diameters and Possible Explanations. + ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 16518948 +number_of_nodes: 3774768 +url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml new file mode 100644 index 00000000000..fe0e42a4b86 --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml @@ -0,0 +1,21 @@ +name: europe_osm +file_type: .csv +description: A graph of OpenStreetMap data for Europe. +author: M. Kobitzsh / Geofabrik GmbH +refs: + Rossi, Ryan. Ahmed, Nesreen. The Network Data Respoistory with Interactive Graph Analytics and Visualization. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 54054660 +number_of_nodes: 50912018 +url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml new file mode 100644 index 00000000000..2f09cf7679b --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml @@ -0,0 +1,26 @@ +name: hollywood +file_type: .csv +description: + A graph of movie actors where vertices are actors, and two actors are + joined by an edge whenever they appeared in a movie together. +author: Laboratory for Web Algorithmics (LAW) +refs: + The WebGraph Framework I Compression Techniques, Paolo Boldi + and Sebastiano Vigna, Proc. of the Thirteenth International + World Wide Web Conference (WWW 2004), 2004, Manhattan, USA, + pp. 595--601, ACM Press. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 57515616 +number_of_nodes: 1139905 +url: https://data.rapids.ai/cugraph/datasets/hollywood.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml new file mode 100644 index 00000000000..fafc68acb9b --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml @@ -0,0 +1,22 @@ +name: soc-LiveJournal1 +file_type: .csv +description: A graph of the LiveJournal social network. +author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan +refs: + L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in + Large Social Networks Membership, Growth, and Evolution. KDD, 2006. +delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 68993773 +number_of_nodes: 4847571 +url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml new file mode 100644 index 00000000000..df5df5735af --- /dev/null +++ b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml @@ -0,0 +1,22 @@ +name: soc-twitter-2010 +file_type: .csv +description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i. +author: H. Kwak, C. Lee, H. Park, S. Moon +refs: + J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl. + Conf. on Web Search and Data Mining (WSDM '11), 2011. 
+delim: " " +header: None +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: false +number_of_edges: 530051354 +number_of_nodes: 21297772 +url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv \ No newline at end of file diff --git a/python/cugraph/cugraph/testing/__init__.py b/python/cugraph/cugraph/testing/__init__.py index f5f0bcb06eb..2b4a4fd3ebf 100644 --- a/python/cugraph/cugraph/testing/__init__.py +++ b/python/cugraph/cugraph/testing/__init__.py @@ -19,7 +19,7 @@ Resultset, load_resultset, get_resultset, - results_dir_path, + default_resultset_download_dir, ) from cugraph.datasets import ( cyber, @@ -34,6 +34,11 @@ email_Eu_core, toy_graph, toy_graph_undirected, + soc_livejournal, + cit_patents, + europe_osm, + hollywood, + # twitter, ) # @@ -66,3 +71,4 @@ toy_graph_undirected, ] DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint] +BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood] diff --git a/python/cugraph/cugraph/testing/generate_resultsets.py b/python/cugraph/cugraph/testing/generate_resultsets.py index 9724aca32dc..ec93e445a85 100644 --- a/python/cugraph/cugraph/testing/generate_resultsets.py +++ b/python/cugraph/cugraph/testing/generate_resultsets.py @@ -20,8 +20,14 @@ import cudf import cugraph from cugraph.datasets import dolphins, netscience, karate_disjoint, karate -from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path +# from cugraph.testing import utils, Resultset, SMALL_DATASETS, results_dir_path +from cugraph.testing import ( + utils, + Resultset, + SMALL_DATASETS, + default_resultset_download_dir, +) _resultsets = {} @@ -224,6 +230,7 @@ def add_resultset(result_data_dictionary, **kwargs): ] ) # Generating ALL results files + results_dir_path = default_resultset_download_dir.get_download_dir() if not results_dir_path.exists(): results_dir_path.mkdir(parents=True, exist_ok=True) diff --git a/python/cugraph/cugraph/testing/resultset.py b/python/cugraph/cugraph/testing/resultset.py index 490e3a7c4ff..a6b3a6d2ca1 100644 --- a/python/cugraph/cugraph/testing/resultset.py +++ b/python/cugraph/cugraph/testing/resultset.py @@ -16,10 +16,12 @@ import urllib.request import cudf -from cugraph.testing import utils +from cugraph.datasets.dataset import ( + DefaultDownloadDir, + default_download_dir, +) - -results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets" +# results_dir_path = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "tests" / "resultsets" class Resultset: @@ -48,14 +50,12 @@ def get_cudf_dataframe(self): _resultsets = {} -def load_resultset(resultset_name, resultset_download_url): - """ - Read a mapping file (.csv) in the _results_dir and save the - mappings between each unique set of args/identifiers to UUIDs to the - _resultsets dictionary. If .csv does not exist in - _results_dir, use resultset_download_url to download a file to - install/unpack/etc. to _results_dir first. - """ +"""def load_resultset(resultset_name, resultset_download_url): + #Read a mapping file (.csv) in the _results_dir and save the + #mappings between each unique set of args/identifiers to UUIDs to the + #_resultsets dictionary. If .csv does not exist in + #_results_dir, use resultset_download_url to download a file to + #install/unpack/etc. to _results_dir first. 
     mapping_file_path = results_dir_path / (resultset_name + "_mappings.csv")
     if not mapping_file_path.exists():
         # Downloads a tar gz from s3 bucket, then unpacks the results files
@@ -101,7 +101,7 @@ def load_resultset(resultset_name, resultset_download_url):
         ]
     )
 
-    _resultsets[resultset_key] = uuid
+    _resultsets[resultset_key] = uuid"""
 
 
 def get_resultset(resultset_name, **kwargs):
@@ -132,5 +132,83 @@
     if uuid is None:
         raise KeyError(f"results for {arg_dict} not found")
 
+    results_dir_path = default_resultset_download_dir.get_download_dir()
     results_filename = results_dir_path / (uuid + ".csv")
     return cudf.read_csv(results_filename)
+
+
+# This replaces the module-level results_dir_path that was formerly defined
+# at the top of this file.
+default_resultset_download_dir = DefaultDownloadDir("tests/resultsets")
+
+
+# Left in case we don't want to move set_download_dir and get_download_dir into
+# DefaultDownloadDir.
+"""def set_resultset_download_dir(path):
+    if path is None:
+        default_resultset_download_dir.clear()
+    else:
+        default_resultset_download_dir.path = path
+
+
+def get_resultset_download_dir():
+    return default_resultset_download_dir.path.absolute()"""
+
+
+def load_resultset(resultset_name, resultset_download_url):
+    """
+    Read a mapping file (.csv) in the _results_dir and save the
+    mappings between each unique set of args/identifiers to UUIDs to the
+    _resultsets dictionary. If .csv does not exist in
+    _results_dir, use resultset_download_url to download a file to
+    install/unpack/etc. to _results_dir first.
+    """
+    # curr_resultset_download_dir = get_resultset_download_dir()
+    curr_resultset_download_dir = default_resultset_download_dir.get_download_dir()
+    # curr_download_dir = get_download_dir()
+    curr_download_dir = default_download_dir.get_download_dir()
+    mapping_file_path = curr_resultset_download_dir / (resultset_name + "_mappings.csv")
+    if not mapping_file_path.exists():
+        # Downloads a tar gz from s3 bucket, then unpacks the results files
+        compressed_file_dir = curr_download_dir / "tests"
+        compressed_file_path = compressed_file_dir / "resultsets.tar.gz"
+        if not curr_resultset_download_dir.exists():
+            curr_resultset_download_dir.mkdir(parents=True, exist_ok=True)
+        if not compressed_file_path.exists():
+            urllib.request.urlretrieve(resultset_download_url, compressed_file_path)
+        tar = tarfile.open(str(compressed_file_path), "r:gz")
+        tar.extractall(str(curr_resultset_download_dir))
+        tar.close()
+
+    # FIXME: This assumes separator is " ", but should this be configurable?
+    sep = " "
+    with open(mapping_file_path) as mapping_file:
+        for line in mapping_file.readlines():
+            if line.startswith("#"):
+                continue
+
+            (uuid, *row_args) = line.split(sep)
+            if (len(row_args) % 2) != 0:
+                raise ValueError(
+                    f'bad row in {mapping_file_path}: "{line}", must '
+                    "contain UUID followed by an even number of items"
+                )
+            row_keys = row_args[::2]
+            row_vals = row_args[1::2]
+            row_keys = " ".join(row_keys).split()
+            row_vals = " ".join(row_vals).split()
+            arg_dict = dict(zip(row_keys, row_vals))
+            arg_dict["resultset_name"] = resultset_name
+            # Create a unique string key for the _resultsets dict based on
+            # sorted row_keys. Looking up results based on args will also have
+            # to sort, but this will ensure results can be looked up without
+            # requiring a specific order. Example:
+            # {'a': 1, 'z': 9, 'c': 5, 'b': 2} becomes 'a-1-b-2-c-5-z-9'
+            resultset_key = "-".join(
+                [
+                    str(val)
+                    for arg_dict_pair in sorted(arg_dict.items())
+                    for val in arg_dict_pair
+                ]
+            )
+
+            _resultsets[resultset_key] = uuid
diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py
index c2a4f7c6072..26c295c9352 100644
--- a/python/cugraph/cugraph/tests/utils/test_dataset.py
+++ b/python/cugraph/cugraph/tests/utils/test_dataset.py
@@ -27,6 +27,7 @@
     ALL_DATASETS,
     WEIGHTED_DATASETS,
     SMALL_DATASETS,
+    BENCHMARKING_DATASETS,
 )
 from cugraph import datasets
 
@@ -94,7 +95,10 @@ def setup_deprecation_warning_tests():
 # Helpers
 
 # check if there is a row where src == dst
-def has_loop(df):
+def has_selfloop(dataset):
+    # undirected datasets are assumed to have no self-loops
+    if not dataset.metadata["is_directed"]:
+        return False
+    df = dataset.get_edgelist(download=True)
     df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True)
     res = df.where(df["src"] == df["dst"])
 
@@ -109,7 +113,13 @@ def is_symmetric(dataset):
     else:
         df = dataset.get_edgelist(download=True)
         df_a = df.sort_values("src")
-        df_b = df_a[["dst", "src", "wgt"]]
+
+        # create df with swapped src/dst columns
+        if "wgt" in df_a.columns:
+            df_b = df_a[["dst", "src", "wgt"]]
+        else:
+            df_b = df_a[["dst", "src"]]
         df_b.rename(columns={"dst": "src", "src": "dst"}, inplace=True)
         # created a df by appending the two
         res = cudf.concat([df_a, df_b])
@@ -172,7 +182,6 @@ def test_get_graph(dataset):
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
 def test_metadata(dataset):
     M = dataset.metadata
-
     assert M is not None
 
@@ -310,10 +319,8 @@ def test_is_directed(dataset):
 
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
-def test_has_loop(dataset):
-    df = dataset.get_edgelist(download=True)
-
-    assert has_loop(df) == dataset.metadata["has_loop"]
+def test_has_selfloop(dataset):
+    assert has_selfloop(dataset) == dataset.metadata["has_loop"]
 
 
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
@@ -328,6 +335,25 @@ def test_is_multigraph(dataset):
     assert G.is_multigraph() == dataset.metadata["is_multigraph"]
 
 
+# The benchmark datasets are tested in their own test, since downloading them
+# for every parametrized test would add significant overhead
+@pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS)
+def test_benchmarking_datasets(dataset):
+    dataset_is_directed = dataset.metadata["is_directed"]
+    G = dataset.get_graph(
+        download=True, create_using=Graph(directed=dataset_is_directed)
+    )
+
+    assert G.is_directed() == dataset.metadata["is_directed"]
+    assert G.number_of_nodes() == dataset.metadata["number_of_nodes"]
+    assert G.number_of_edges() == dataset.metadata["number_of_edges"]
+    assert has_selfloop(dataset) == dataset.metadata["has_loop"]
+    assert is_symmetric(dataset) == dataset.metadata["is_symmetric"]
+    assert G.is_multigraph() == dataset.metadata["is_multigraph"]
+
+    dataset.unload()
+
+
 @pytest.mark.parametrize("dataset", ALL_DATASETS)
 def test_object_getters(dataset):
     assert dataset.is_directed() == dataset.metadata["is_directed"]
diff --git a/python/cugraph/cugraph/tests/utils/test_resultset.py b/python/cugraph/cugraph/tests/utils/test_resultset.py
new file mode 100644
index 00000000000..3e685c3e905
--- /dev/null
+++ b/python/cugraph/cugraph/tests/utils/test_resultset.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import cudf
+from cugraph.datasets.dataset import (
+    set_download_dir,
+    get_download_dir,
+)
+from cugraph.testing.resultset import load_resultset, default_resultset_download_dir
+
+###############################################################################
+
+
+def test_load_resultset():
+    with TemporaryDirectory() as tmpd:
+        set_download_dir(Path(tmpd))
+        default_resultset_download_dir.set_download_dir(
+            Path(tmpd) / "tests" / "resultsets"
+        )
+        default_resultset_download_dir.get_download_dir().mkdir(
+            parents=True, exist_ok=True
+        )
+
+        datasets_download_dir = get_download_dir()
+        resultsets_download_dir = default_resultset_download_dir.get_download_dir()
+        assert "tests" in os.listdir(datasets_download_dir)
+        assert "resultsets.tar.gz" not in os.listdir(datasets_download_dir / "tests")
+        assert "traversal_mappings.csv" not in os.listdir(resultsets_download_dir)
+
+        load_resultset(
+            "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz"
+        )
+
+        assert "resultsets.tar.gz" in os.listdir(datasets_download_dir / "tests")
+        assert "traversal_mappings.csv" in os.listdir(resultsets_download_dir)
+
+
+def test_verify_resultset_load():
+    # This test is more detailed than test_load_resultset: for each module, it
+    # checks that every resultset file is present along with the corresponding
+    # mapping file.
+    with TemporaryDirectory() as tmpd:
+        set_download_dir(Path(tmpd))
+        default_resultset_download_dir.set_download_dir(
+            Path(tmpd) / "tests" / "resultsets"
+        )
+        default_resultset_download_dir.get_download_dir().mkdir(
+            parents=True, exist_ok=True
+        )
+
+        resultsets_download_dir = default_resultset_download_dir.get_download_dir()
+
+        load_resultset(
+            "traversal", "https://data.rapids.ai/cugraph/results/resultsets.tar.gz"
+        )
+
+        resultsets = os.listdir(resultsets_download_dir)
+        downloaded_results = cudf.read_csv(
+            resultsets_download_dir / "traversal_mappings.csv", sep=" "
+        )
+        downloaded_uuids = downloaded_results["#UUID"].values
+        for resultset_uuid in downloaded_uuids:
+            assert str(resultset_uuid) + ".csv" in resultsets
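+
+
+def test_default_download_dir_path_modifier():
+    # A minimal additional sketch (not part of the original test plan): it
+    # exercises the new DefaultDownloadDir path_modifier argument added in
+    # dataset.py, and assumes RAPIDS_DATASET_ROOT_DIR is read as the base
+    # path when the object is constructed.
+    from cugraph.datasets.dataset import DefaultDownloadDir
+
+    with TemporaryDirectory() as tmpd:
+        prev = os.environ.get("RAPIDS_DATASET_ROOT_DIR")
+        os.environ["RAPIDS_DATASET_ROOT_DIR"] = tmpd
+        try:
+            ddd = DefaultDownloadDir(path_modifier="tests/resultsets")
+            expected = (Path(tmpd) / "tests" / "resultsets").absolute()
+            assert ddd.get_download_dir() == expected
+        finally:
+            # restore the environment so other tests see the original value
+            if prev is None:
+                del os.environ["RAPIDS_DATASET_ROOT_DIR"]
+            else:
+                os.environ["RAPIDS_DATASET_ROOT_DIR"] = prev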