Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Investigate pyarrow version pinning (cudf.utils.nvtx_annotation issue) #3950

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
5c37396
Added benchmark datasets yaml files for use with Dataset
betochimas Sep 5, 2023
93fb3e7
Introduce way to download larger datasets
betochimas Sep 7, 2023
b236d85
Merge branch 'rapidsai:branch-23.10' into branch-23.10-large-datasets
betochimas Sep 7, 2023
e94baa3
Testing gpg verification
betochimas Sep 7, 2023
aafd5d0
Use create_using pattern to give user choice for dataframe lib
betochimas Sep 7, 2023
23edacf
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 7, 2023
a4ed8ef
Insert benchmarking Dataset instances
betochimas Sep 8, 2023
ddb8c35
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 8, 2023
31ca5ea
Changes to testing and fixing urls
betochimas Sep 19, 2023
8e0cfac
Testing changes to pass CI, added and removed comments
betochimas Sep 20, 2023
d15f601
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 20, 2023
045597a
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 22, 2023
a85cb7e
Merge branch 'branch-23.10' into branch-23.10-large-datasets
betochimas Sep 22, 2023
f816f78
Add FIXMEs for CI failure points
betochimas Sep 22, 2023
12c3d66
Merge branch 'branch-23.10-large-datasets' of https://github.com/beto…
nv-rliu Oct 9, 2023
7c0db5d
Merge branch 'rapidsai:branch-23.12' into branch-23.12-large-datasets
nv-rliu Oct 16, 2023
abda461
Merge branch 'rapidsai:branch-23.12' into branch-23.12-large-datasets
nv-rliu Oct 17, 2023
3fb602b
update large dataset work. primarily unit tests
nv-rliu Oct 17, 2023
858e155
Merge branch 'branch-23.12-large-datasets' of github.com:nv-rliu/cugr…
nv-rliu Oct 17, 2023
875011d
remove twitter
nv-rliu Oct 17, 2023
424bc51
fix bug in test fixture for unweighted graphs
nv-rliu Oct 18, 2023
27c8d89
Merge branch 'branch-23.12' of github.com:nv-rliu/cugraph into branch…
nv-rliu Oct 23, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions datasets/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,13 @@ The benchmark datasets are described below:
| soc-twitter-2010 | 21,297,772 | 265,025,809 | No | No |

**cit-Patents** : A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.

**soc-LiveJournal** : A graph of the LiveJournal social network.

**europe_osm** : A graph of OpenStreetMap data for Europe.

**hollywood** : A graph of movie actors where vertices are actors, and two actors are joined by an edge whenever they appeared in a movie together.

**soc-twitter-2010** : A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.

_NOTE: the benchmark datasets were converted to a CSV format from their original format described in the reference URL below, and in doing so had edge weights and isolated vertices discarded._
Expand Down
10 changes: 10 additions & 0 deletions python/cugraph/cugraph/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,13 @@
small_tree = Dataset(meta_path / "small_tree.yaml")
toy_graph = Dataset(meta_path / "toy_graph.yaml")
toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml")

# Benchmarking datasets: be mindful of memory usage.
# NOTE(review): the original size labels were swapped — soc-LiveJournal1
# (68,993,773 edges per its YAML) is several times larger than cit-Patents
# (16,518,948 edges), so it cannot be the 250 MB file.
# ~965 MB
soc_livejournal = Dataset(meta_path / "soc-livejournal1.yaml")
# ~250 MB
cit_patents = Dataset(meta_path / "cit-patents.yaml")
# ~1.8 GB
europe_osm = Dataset(meta_path / "europe_osm.yaml")
# ~1.5 GB
hollywood = Dataset(meta_path / "hollywood.yaml")
31 changes: 23 additions & 8 deletions python/cugraph/cugraph/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def unload(self):
"""
self._edgelist = None

def get_edgelist(self, download=False):
def get_edgelist(self, download=False, create_using=cudf):
"""
Return an Edgelist

Expand All @@ -168,6 +168,10 @@ def get_edgelist(self, download=False):
download : Boolean (default=False)
Automatically download the dataset from the 'url' location within
the YAML file.

create_using : module (default=cudf)
Specify which module to use when reading the dataset. This module
must have a read_csv function.
"""
if self._edgelist is None:
full_path = self.get_path()
Expand All @@ -183,7 +187,15 @@ def get_edgelist(self, download=False):
header = None
if isinstance(self.metadata["header"], int):
header = self.metadata["header"]
self._edgelist = cudf.read_csv(
# Resolve which dataframe module's read_csv will load the edgelist.
if create_using is None:
    reader = cudf
elif str(type(create_using)) != "<class 'module'>":
    raise RuntimeError("create_using must be a module.")
elif create_using.__name__ in ("cudf", "pandas"):
    # Fixed: the original condition was `== "cudf" or "pandas"`, which is
    # always truthy ("pandas" is a non-empty string), so ANY module was
    # accepted and the NotImplementedError branch below was unreachable.
    reader = create_using
else:
    # Only cudf and pandas expose the read_csv interface this loader uses.
    raise NotImplementedError()
self._edgelist = reader.read_csv(
full_path,
delimiter=self.metadata["delim"],
names=self.metadata["col_names"],
Expand Down Expand Up @@ -219,6 +231,10 @@ def get_graph(
dataset -if present- will be applied to the Graph. If the
dataset does not contain weights, the Graph returned will
be unweighted regardless of ignore_weights.

store_transposed: Boolean (default=False)
If True, stores the transpose of the adjacency matrix. Required
for certain algorithms, such as pagerank.
"""
if self._edgelist is None:
self.get_edgelist(download)
Expand All @@ -237,20 +253,19 @@ def get_graph(
"(or subclass) type or instance, got: "
f"{type(create_using)}"
)

if len(self.metadata["col_names"]) > 2 and not (ignore_weights):
G.from_cudf_edgelist(
self._edgelist,
source="src",
destination="dst",
edge_attr="wgt",
source=self.metadata["col_names"][0],
destination=self.metadata["col_names"][1],
edge_attr=self.metadata["col_names"][2],
store_transposed=store_transposed,
)
else:
G.from_cudf_edgelist(
self._edgelist,
source="src",
destination="dst",
source=self.metadata["col_names"][0],
destination=self.metadata["col_names"][1],
store_transposed=store_transposed,
)
return G
Expand Down
22 changes: 22 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/cit-patents.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: cit-Patents
file_type: .csv
description: A citation graph that includes all citations made by patents granted between 1975 and 1999, totaling 16,522,438 citations.
author: NBER
refs:
J. Leskovec, J. Kleinberg and C. Faloutsos. Graphs over Time: Densification Laws, Shrinking Diameters and Possible Explanations.
ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD), 2005.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: true
is_directed: true
is_multigraph: false
is_symmetric: false
number_of_edges: 16518948
number_of_nodes: 3774768
url: https://data.rapids.ai/cugraph/datasets/cit-Patents.csv
21 changes: 21 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/europe_osm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
name: europe_osm
file_type: .csv
description: A graph of OpenStreetMap data for Europe.
author: M. Kobitzsh / Geofabrik GmbH
refs:
Rossi, Ryan. Ahmed, Nesreen. The Network Data Repository with Interactive Graph Analytics and Visualization.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: false
is_directed: false
is_multigraph: false
is_symmetric: true
number_of_edges: 54054660
number_of_nodes: 50912018
url: https://data.rapids.ai/cugraph/datasets/europe_osm.csv
26 changes: 26 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/hollywood.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
name: hollywood
file_type: .csv
description:
A graph of movie actors where vertices are actors, and two actors are
joined by an edge whenever they appeared in a movie together.
author: Laboratory for Web Algorithmics (LAW)
refs:
The WebGraph Framework I: Compression Techniques, Paolo Boldi
and Sebastiano Vigna, Proc. of the Thirteenth International
World Wide Web Conference (WWW 2004), 2004, Manhattan, USA,
pp. 595--601, ACM Press.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: false
is_directed: false
is_multigraph: false
is_symmetric: true
number_of_edges: 57515616
number_of_nodes: 1139905
url: https://data.rapids.ai/cugraph/datasets/hollywood.csv
22 changes: 22 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/soc-livejournal1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: soc-LiveJournal1
file_type: .csv
description: A graph of the LiveJournal social network.
author: L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan
refs:
L. Backstrom, D. Huttenlocher, J. Kleinberg, X. Lan. Group Formation in
Large Social Networks: Membership, Growth, and Evolution. KDD, 2006.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: true
is_directed: true
is_multigraph: false
is_symmetric: false
number_of_edges: 68993773
number_of_nodes: 4847571
url: https://data.rapids.ai/cugraph/datasets/soc-LiveJournal1.csv
22 changes: 22 additions & 0 deletions python/cugraph/cugraph/datasets/metadata/soc-twitter-2010.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: soc-twitter-2010
file_type: .csv
description: A network of follower relationships from a snapshot of Twitter in 2010, where an edge from i to j indicates that j is a follower of i.
author: H. Kwak, C. Lee, H. Park, S. Moon
refs:
J. Yang, J. Leskovec. Temporal Variation in Online Media. ACM Intl.
Conf. on Web Search and Data Mining (WSDM '11), 2011.
delim: " "
header: None
col_names:
- src
- dst
col_types:
- int32
- int32
has_loop: false
is_directed: false
is_multigraph: false
is_symmetric: false
number_of_edges: 530051354
number_of_nodes: 21297772
url: https://data.rapids.ai/cugraph/datasets/soc-twitter-2010.csv
6 changes: 6 additions & 0 deletions python/cugraph/cugraph/testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
email_Eu_core,
toy_graph,
toy_graph_undirected,
soc_livejournal,
cit_patents,
europe_osm,
hollywood,
# twitter,
)

#
Expand Down Expand Up @@ -66,3 +71,4 @@
toy_graph_undirected,
]
DEFAULT_DATASETS = [dolphins, netscience, karate_disjoint]
BENCHMARKING_DATASETS = [soc_livejournal, cit_patents, europe_osm, hollywood]
40 changes: 33 additions & 7 deletions python/cugraph/cugraph/tests/utils/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
ALL_DATASETS,
WEIGHTED_DATASETS,
SMALL_DATASETS,
BENCHMARKING_DATASETS,
)
from cugraph import datasets

Expand Down Expand Up @@ -94,7 +95,10 @@ def setup_deprecation_warning_tests():
# Helpers

# check if there is a row where src == dst
def has_loop(df):
def has_selfloop(dataset):
if not dataset.metadata["is_directed"]:
return False
df = dataset.get_edgelist(download=True)
df.rename(columns={df.columns[0]: "src", df.columns[1]: "dst"}, inplace=True)
res = df.where(df["src"] == df["dst"])

Expand All @@ -109,7 +113,13 @@ def is_symmetric(dataset):
else:
df = dataset.get_edgelist(download=True)
df_a = df.sort_values("src")
df_b = df_a[["dst", "src", "wgt"]]

# create df with swapped src/dst columns
df_b = None
if "wgt" in df_a.columns:
df_b = df_a[["dst", "src", "wgt"]]
else:
df_b = df_a[["dst", "src"]]
df_b.rename(columns={"dst": "src", "src": "dst"}, inplace=True)
# created a df by appending the two
res = cudf.concat([df_a, df_b])
Expand Down Expand Up @@ -172,7 +182,6 @@ def test_get_graph(dataset):
@pytest.mark.parametrize("dataset", ALL_DATASETS)
def test_metadata(dataset):
    # Every Dataset must expose a parsed metadata mapping from its YAML file.
    assert dataset.metadata is not None


Expand Down Expand Up @@ -310,10 +319,8 @@ def test_is_directed(dataset):


@pytest.mark.parametrize("dataset", ALL_DATASETS)
def test_has_selfloop(dataset):
    # The superseded test_has_loop body (diff residue) is removed; only the
    # replacement test survives. Expected self-loop presence comes from the
    # dataset's YAML metadata.
    assert has_selfloop(dataset) == dataset.metadata["has_loop"]


@pytest.mark.parametrize("dataset", ALL_DATASETS)
Expand All @@ -328,6 +335,25 @@ def test_is_multigraph(dataset):
assert G.is_multigraph() == dataset.metadata["is_multigraph"]


# The datasets used for benchmarks are in their own test, since downloading
# them repeatedly would increase testing overhead significantly.
@pytest.mark.parametrize("dataset", BENCHMARKING_DATASETS)
def test_benchmarking_datasets(dataset):
    """Validate a large benchmark dataset's graph against its YAML metadata.

    Downloads the dataset, builds a Graph with the directedness recorded in
    the metadata, and checks node/edge counts and structural properties.
    """
    dataset_is_directed = dataset.metadata["is_directed"]
    try:
        G = dataset.get_graph(
            download=True, create_using=Graph(directed=dataset_is_directed)
        )

        assert G.is_directed() == dataset.metadata["is_directed"]
        assert G.number_of_nodes() == dataset.metadata["number_of_nodes"]
        assert G.number_of_edges() == dataset.metadata["number_of_edges"]
        assert has_selfloop(dataset) == dataset.metadata["has_loop"]
        assert is_symmetric(dataset) == dataset.metadata["is_symmetric"]
        assert G.is_multigraph() == dataset.metadata["is_multigraph"]
    finally:
        # Fix: the original called unload() only after all asserts passed, so
        # one failing assertion left a multi-GB edgelist resident for the rest
        # of the parametrized test session.
        dataset.unload()


@pytest.mark.parametrize("dataset", ALL_DATASETS)
def test_object_getters(dataset):
assert dataset.is_directed() == dataset.metadata["is_directed"]
Expand Down
Loading