Resultset and Dataset Refactors (#3957)

This PR replaces and is a continuation of #3857 (by @betochimas) > This PR primarily adds testing for the `Resultset` class, introduced earlier in 23.10. The tests take a similar approach to test_dataset, creating a temporary directory to test downloading all result files. To align `Resultset` and `Dataset`, the setter and getter for each download directory are moved into `DefaultDownloadDir`, so that each class shares an instance of `DefaultDownloadDir` and can be configured independently, although their default locations are still both dependent on the RAPIDS_DATASET_ROOT_DIR environment variable. The old patterns are present but commented-out, so this change would be breaking. This PR also removes the deprecated `experimental.datasets` package because it has been promoted to stable for at least one release. Authors: - Ralph Liu (https://github.com/nv-rliu) - Dylan Chima-Sanchez (https://github.com/betochimas) - Rick Ratzel (https://github.com/rlratzel) - Brad Rees (https://github.com/BradReesWork) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: #3957
rapidsai · Nov 19, 2023 · 0684f9d · 0684f9d
1 parent 5d43f14
commit 0684f9d
Show file tree

Hide file tree

Showing 38 changed files with 379 additions and 882 deletions.
diff --git a/datasets/README.md b/datasets/README.md
@@ -120,9 +120,13 @@ The benchmark datasets are described below:
 | soc-twitter-2010  | 21,297,772 |   265,025,809 | No       | No       |
 
 **cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
+
 **soc-LiveJournal** : A graph of the LiveJournal social network.
+
 **europe_osm** : A graph of OpenStreetMap data for Europe.
+
 **hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together.
+
 **soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.
 
 _NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._

diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py
@@ -24,7 +24,7 @@
 import torch
 import numpy as np
 from cugraph.gnn import FeatureStore
-from cugraph.experimental.datasets import karate
+from cugraph.datasets import karate
 
 import tempfile
 

diff --git a/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py b/python/cugraph-service/server/cugraph_service_server/testing/benchmark_server_extension.py
@@ -17,7 +17,7 @@
 
 import cugraph
 from cugraph.experimental import PropertyGraph, MGPropertyGraph
-from cugraph.experimental import datasets
+from cugraph import datasets
 from cugraph.generators import rmat
 
 

diff --git a/python/cugraph/cugraph/dask/community/leiden.py b/python/cugraph/cugraph/dask/community/leiden.py
@@ -125,7 +125,7 @@ def leiden(
 
     Examples
     --------
-    >>> from cugraph.experimental.datasets import karate
+    >>> from cugraph.datasets import karate
     >>> G = karate.get_graph(fetch=True)
     >>> parts, modularity_score = cugraph.leiden(G)
 

diff --git a/python/cugraph/cugraph/dask/community/louvain.py b/python/cugraph/cugraph/dask/community/louvain.py
@@ -129,7 +129,7 @@ def louvain(
 
     Examples
     --------
-    >>> from cugraph.experimental.datasets import karate
+    >>> from cugraph.datasets import karate
     >>> G = karate.get_graph(fetch=True)
     >>> parts = cugraph.louvain(G)
 

diff --git a/python/cugraph/cugraph/datasets/__init__.py b/python/cugraph/cugraph/datasets/__init__.py
@@ -39,3 +39,13 @@
 small_tree = Dataset(meta_path / "small_tree.yaml")
 toy_graph = Dataset(meta_path / "toy_graph.yaml")
 toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml")
+
+# Benchmarking datasets: be mindful of memory usage
+# 250 MB
+soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml")
+# 965 MB
+cit_patents = Dataset(meta_path / "cit-patents.yaml")
+# 1.8 GB
+europe_osm = Dataset(meta_path / "europe_osm.yaml")
+# 1.5 GB
+hollywood = Dataset(meta_path / "hollywood.yaml")
diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py
@@ -14,44 +14,45 @@
 import cudf
 import yaml
 import os
+import pandas as pd
 from pathlib import Path
 from cugraph.structure.graph_classes import Graph
 
 
 class DefaultDownloadDir:
     """
-    Maintains the path to the download directory used by Dataset instances.
+    Maintains a path to be used as a default download directory.
+
+    All DefaultDownloadDir instances are based on RAPIDS_DATASET_ROOT_DIR if
+    set, or _default_base_dir if not set.
+
     Instances of this class are typically shared by several Dataset instances
     in order to allow for the download directory to be defined and updated by
     a single object.
     """
 
-    def __init__(self):
-        self._path = Path(
-            os.environ.get("RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets")
-        )
+    _default_base_dir = Path.home() / ".cugraph/datasets"
 
-    @property
-    def path(self):
+    def __init__(self, *, subdir=""):
         """
-        If `path` is not set, set it to the environment variable
-        RAPIDS_DATASET_ROOT_DIR. If the variable is not set, default to the
-        user's home directory.
+        subdir can be specified to provide a specialized dir under the base dir.
         """
-        if self._path is None:
-            self._path = Path(
-                os.environ.get(
-                    "RAPIDS_DATASET_ROOT_DIR", Path.home() / ".cugraph/datasets"
-                )
-            )
-        return self._path
+        self._subdir = Path(subdir)
+        self.reset()
+
+    @property
+    def path(self):
+        return self._path.absolute()
 
     @path.setter
     def path(self, new):
         self._path = Path(new)
 
-    def clear(self):
-        self._path = None
+    def reset(self):
+        self._basedir = Path(
+            os.environ.get("RAPIDS_DATASET_ROOT_DIR", self._default_base_dir)
+        )
+        self._path = self._basedir / self._subdir
 
 
 default_download_dir = DefaultDownloadDir()
@@ -159,7 +160,7 @@ def unload(self):
         """
         self._edgelist = None
 
-    def get_edgelist(self, download=False):
+    def get_edgelist(self, download=False, reader="cudf"):
         """
         Return an Edgelist
 
@@ -168,6 +169,9 @@ def get_edgelist(self, download=False):
         download : Boolean (default=False)
             Automatically download the dataset from the 'url' location within
             the YAML file.
+
+        reader : 'cudf' or 'pandas' (default='cudf')
+            The library used to read a CSV and return an edgelist DataFrame.
         """
         if self._edgelist is None:
             full_path = self.get_path()
@@ -180,14 +184,29 @@ def get_edgelist(self, download=False):
                         " exist. Try setting download=True"
                         " to download the datafile"
                     )
+
             header = None
             if isinstance(self.metadata["header"], int):
                 header = self.metadata["header"]
-            self._edgelist = cudf.read_csv(
-                full_path,
+
+            if reader == "cudf":
+                self.__reader = cudf.read_csv
+            elif reader == "pandas":
+                self.__reader = pd.read_csv
+            else:
+                raise ValueError(
+                    "reader must be a module with a read_csv function compatible with \
+                     cudf.read_csv"
+                )
+
+            self._edgelist = self.__reader(
+                filepath_or_buffer=full_path,
                 delimiter=self.metadata["delim"],
                 names=self.metadata["col_names"],
-                dtype=self.metadata["col_types"],
+                dtype={
+                    self.metadata["col_names"][i]: self.metadata["col_types"][i]
+                    for i in range(len(self.metadata["col_types"]))
+                },
                 header=header,
             )
 
@@ -219,6 +238,10 @@ def get_graph(
             dataset -if present- will be applied to the Graph. If the
             dataset does not contain weights, the Graph returned will
             be unweighted regardless of ignore_weights.
+
+        store_transposed: Boolean (default=False)
+            If True, stores the transpose of the adjacency matrix.  Required
+            for certain algorithms, such as pagerank.
         """
         if self._edgelist is None:
             self.get_edgelist(download)
@@ -237,20 +260,19 @@ def get_graph(
                 "(or subclass) type or instance, got: "
                 f"{type(create_using)}"
             )
-
         if len(self.metadata["col_names"]) > 2 and not (ignore_weights):
             G.from_cudf_edgelist(
                 self._edgelist,
-                source="src",
-                destination="dst",
-                edge_attr="wgt",
+                source=self.metadata["col_names"][0],
+                destination=self.metadata["col_names"][1],
+                edge_attr=self.metadata["col_names"][2],
                 store_transposed=store_transposed,
             )
         else:
             G.from_cudf_edgelist(
                 self._edgelist,
-                source="src",
-                destination="dst",
+                source=self.metadata["col_names"][0],
+                destination=self.metadata["col_names"][1],
                 store_transposed=store_transposed,
             )
         return G
@@ -331,18 +353,18 @@ def download_all(force=False):
 
 def set_download_dir(path):
     """
-    Set the download location fors datasets
+    Set the download location for datasets
 
     Parameters
     ----------
     path : String
         Location used to store datafiles
     """
     if path is None:
-        default_download_dir.clear()
+        default_download_dir.reset()
     else:
         default_download_dir.path = path
 
 
 def get_download_dir():
-    return default_download_dir.path.absolute()
+    return default_download_dir.path
diff --git a/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml b/python/cugraph/cugraph/datasets/metadata/cit-patents.yaml
@@ -0,0 +1,22 @@
+name: cit-Patents
+file_type: .csv
+description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
+author: NBER
+refs:
+  J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time Densification Laws, Shrinking Diameters and Possible Explanations. 
+  ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005.
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: true
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 16518948
+number_of_nodes: 3774768
+url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv
diff --git a/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml b/python/cugraph/cugraph/datasets/metadata/europe_osm.yaml
@@ -0,0 +1,21 @@
+name: europe_osm
+file_type: .csv
+description: A graph of OpenStreetMap data for Europe.
+author: M. Kobitzsch / Geofabrik GmbH
+refs:
+  Rossi, Ryan. Ahmed, Nesreen. The Network Data Repository with Interactive Graph Analytics and Visualization.
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: false
+is_directed: false
+is_multigraph: false
+is_symmetric: true
+number_of_edges: 54054660
+number_of_nodes: 50912018
+url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv
diff --git a/python/cugraph/cugraph/datasets/metadata/hollywood.yaml b/python/cugraph/cugraph/datasets/metadata/hollywood.yaml
@@ -0,0 +1,26 @@
+name: hollywood
+file_type: .csv
+description:
+  A graph of movie actors where vertices are actors, and two actors are
+  joined by an edge whenever they appeared in a movie together.
+author: Laboratory for Web Algorithmics (LAW)
+refs:
+  The WebGraph Framework I Compression Techniques, Paolo Boldi
+  and Sebastiano Vigna, Proc. of the Thirteenth International
+  World Wide Web Conference (WWW 2004), 2004, Manhattan, USA,
+  pp. 595--601, ACM Press.
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: false
+is_directed: false
+is_multigraph: false
+is_symmetric: true
+number_of_edges: 57515616
+number_of_nodes: 1139905
+url: https://data.rapids.ai/cugraph/datasets/hollywood.csv
diff --git a/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml b/python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml
@@ -0,0 +1,22 @@
+name: soc-LiveJournal1
+file_type: .csv
+description:  A graph of the LiveJournal social network.
+author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan
+refs:
+  L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in        
+  Large Social Networks Membership, Growth, and Evolution. KDD, 2006.
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: true
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 68993773
+number_of_nodes: 4847571
+url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv
diff --git a/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml b/python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml
@@ -0,0 +1,22 @@
+name: soc-twitter-2010
+file_type: .csv
+description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.
+author: H. Kwak, C. Lee, H. Park, S. Moon
+refs:
+  J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl.        
+  Conf. on Web Search and Data Mining (WSDM '11), 2011. 
+delim: " "
+header: None
+col_names:
+  - src
+  - dst
+col_types:
+  - int32
+  - int32
+has_loop: false
+is_directed: false
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 530051354
+number_of_nodes: 21297772
+url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv