
Merge branch 'branch-24.02' into fix-gatconv
tingyu66 authored Jan 9, 2024
2 parents 38d87b7 + c7b720d commit 4072871
Showing 16 changed files with 213 additions and 45 deletions.
@@ -28,7 +28,7 @@
)

from cugraph.structure.symmetrize import symmetrize
from cugraph.experimental.gnn import BulkSampler
from cugraph.gnn import BulkSampler

import cugraph

2 changes: 1 addition & 1 deletion cpp/src/community/flatten_dendrogram.hpp
@@ -75,7 +75,7 @@ void leiden_partition_at_level(raft::handle_t const& handle,
thrust::make_counting_iterator<size_t>(0),
thrust::make_counting_iterator<size_t>((level - 1) / 2),
[&handle, &dendrogram, &local_vertex_ids_v, &d_partition, local_num_verts](size_t l) {
cugraph::relabel<vertex_t, false>(
cugraph::relabel<vertex_t, multi_gpu>(
handle,
std::tuple<vertex_t const*, vertex_t const*>(dendrogram.get_level_ptr_nocheck(2 * l + 1),
dendrogram.get_level_ptr_nocheck(2 * l + 2)),
2 changes: 1 addition & 1 deletion python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
@@ -17,7 +17,7 @@
import cupy as cp
import cudf
from cugraph.utilities.utils import import_optional
from cugraph.experimental import BulkSampler
from cugraph.gnn import BulkSampler
from dask.distributed import default_client, Event
from cugraph_dgl.dataloading import (
HomogenousBulkSamplerDataset,
6 changes: 1 addition & 5 deletions python/cugraph-pyg/cugraph_pyg/data/__init__.py
@@ -11,8 +11,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from cugraph.utilities.api_tools import experimental_warning_wrapper

from cugraph_pyg.data.cugraph_store import EXPERIMENTAL__CuGraphStore

CuGraphStore = experimental_warning_wrapper(EXPERIMENTAL__CuGraphStore)
from cugraph_pyg.data.cugraph_store import CuGraphStore
2 changes: 1 addition & 1 deletion python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
@@ -199,7 +199,7 @@ def cast(cls, *args, **kwargs):
return cls(*args, **kwargs)


class EXPERIMENTAL__CuGraphStore:
class CuGraphStore:
"""
Duck-typed version of PyG's GraphStore and FeatureStore.
"""
12 changes: 2 additions & 10 deletions python/cugraph-pyg/cugraph_pyg/loader/__init__.py
@@ -11,14 +11,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from cugraph.utilities.api_tools import experimental_warning_wrapper
from cugraph_pyg.loader.cugraph_node_loader import CuGraphNeighborLoader

from cugraph_pyg.loader.cugraph_node_loader import EXPERIMENTAL__CuGraphNeighborLoader

CuGraphNeighborLoader = experimental_warning_wrapper(
EXPERIMENTAL__CuGraphNeighborLoader
)

from cugraph_pyg.loader.cugraph_node_loader import EXPERIMENTAL__BulkSampleLoader

BulkSampleLoader = experimental_warning_wrapper(EXPERIMENTAL__BulkSampleLoader)
from cugraph_pyg.loader.cugraph_node_loader import BulkSampleLoader
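
Taken together, the cugraph-pyg __init__ changes above drop the EXPERIMENTAL__ prefixes and the experimental_warning_wrapper indirection, so the store and loader classes are now exported under their plain names. A minimal sketch of the promoted import paths (imports only; constructor arguments are not part of this diff):

# Sketch based on the renamed exports in this commit; assumes cugraph-pyg is installed.
from cugraph_pyg.data import CuGraphStore  # was EXPERIMENTAL__CuGraphStore
from cugraph_pyg.loader import CuGraphNeighborLoader, BulkSampleLoader  # were EXPERIMENTAL__*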
8 changes: 4 additions & 4 deletions python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
@@ -20,7 +20,7 @@
import cupy
import cudf

from cugraph.experimental.gnn import BulkSampler
from cugraph.gnn import BulkSampler
from cugraph.utilities.utils import import_optional, MissingModule

from cugraph_pyg.data import CuGraphStore
Expand All @@ -42,7 +42,7 @@
)


class EXPERIMENTAL__BulkSampleLoader:
class BulkSampleLoader:

__ex_parquet_file = re.compile(r"batch=([0-9]+)\-([0-9]+)\.parquet")

@@ -478,7 +478,7 @@ def __iter__(self):
return self


class EXPERIMENTAL__CuGraphNeighborLoader:
class CuGraphNeighborLoader:
def __init__(
self,
data: Union[CuGraphStore, Tuple[CuGraphStore, CuGraphStore]],
@@ -527,7 +527,7 @@ def batch_size(self) -> int:
return self.__batch_size

def __iter__(self):
self.current_loader = EXPERIMENTAL__BulkSampleLoader(
self.current_loader = BulkSampleLoader(
self.__feature_store,
self.__graph_store,
self.__input_nodes,
126 changes: 116 additions & 10 deletions python/cugraph/cugraph/datasets/dataset.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -12,10 +12,13 @@
# limitations under the License.

import cudf
import dask_cudf
import yaml
import os
import pandas as pd
import cugraph.dask as dcg
from pathlib import Path
import urllib.request
from cugraph.structure.graph_classes import Graph


@@ -138,9 +141,8 @@ def __download_csv(self, url):

filename = self.metadata["name"] + self.metadata["file_type"]
if self._dl_path.path.is_dir():
df = cudf.read_csv(url)
self._path = self._dl_path.path / filename
df.to_csv(self._path, index=False)
urllib.request.urlretrieve(url, str(self._path))

else:
raise RuntimeError(
@@ -149,7 +151,6 @@ def __download_csv(self, url):
return self._path

def unload(self):

"""
Remove all saved internal objects, forcing them to be re-created when
accessed.
@@ -162,7 +163,7 @@ def unload(self):

def get_edgelist(self, download=False, reader="cudf"):
"""
Return an Edgelist
Return an Edgelist.
Parameters
----------
@@ -212,6 +213,47 @@ def get_edgelist(self, download=False, reader="cudf"):

return self._edgelist.copy()

def get_dask_edgelist(self, download=False):
"""
Return a distributed Edgelist.
Parameters
----------
download : Boolean (default=False)
Automatically download the dataset from the 'url' location within
the YAML file.
"""
if self._edgelist is None:
full_path = self.get_path()
if not full_path.is_file():
if download:
full_path = self.__download_csv(self.metadata["url"])
else:
raise RuntimeError(
f"The datafile {full_path} does not"
" exist. Try setting download=True"
" to download the datafile"
)

header = None
if isinstance(self.metadata["header"], int):
header = self.metadata["header"]

blocksize = dcg.get_chunksize(full_path)
self._edgelist = dask_cudf.read_csv(
path=full_path,
blocksize=blocksize,
delimiter=self.metadata["delim"],
names=self.metadata["col_names"],
dtype={
self.metadata["col_names"][i]: self.metadata["col_types"][i]
for i in range(len(self.metadata["col_types"]))
},
header=header,
)

return self._edgelist.copy()

def get_graph(
self,
download=False,
@@ -249,10 +291,10 @@ def get_graph(
if create_using is None:
G = Graph()
elif isinstance(create_using, Graph):
# what about BFS if trnaposed is True
# what about BFS if transposed is True
attrs = {"directed": create_using.is_directed()}
G = type(create_using)(**attrs)
elif type(create_using) is type:
elif issubclass(create_using, Graph):
G = create_using()
else:
raise TypeError(
@@ -277,9 +319,74 @@ def get_graph(
)
return G

def get_dask_graph(
self,
download=False,
create_using=Graph,
ignore_weights=False,
store_transposed=False,
):
"""
Return a distributed Graph object.
Parameters
----------
download : Boolean (default=False)
Downloads the dataset from the web.
create_using: cugraph.Graph (instance or class), optional
(default=Graph)
Specify the type of Graph to create. Can pass in an instance to
create a Graph instance with specified 'directed' attribute.
ignore_weights : Boolean (default=False)
Ignores weights in the dataset if True, resulting in an
unweighted Graph. If False (the default), weights from the
dataset -if present- will be applied to the Graph. If the
dataset does not contain weights, the Graph returned will
be unweighted regardless of ignore_weights.
store_transposed : bool, optional (default=False)
If True, stores the transpose of the adjacency matrix. Required
for certain algorithms.
"""
if self._edgelist is None:
self.get_dask_edgelist(download)

if create_using is None:
G = Graph()
elif isinstance(create_using, Graph):
attrs = {"directed": create_using.is_directed()}
G = type(create_using)(**attrs)
elif issubclass(create_using, Graph):
G = create_using()
else:
raise TypeError(
"create_using must be a cugraph.Graph "
"(or subclass) type or instance, got: "
f"{type(create_using)}"
)

if len(self.metadata["col_names"]) > 2 and not (ignore_weights):
G.from_dask_cudf_edgelist(
self._edgelist,
source=self.metadata["col_names"][0],
destination=self.metadata["col_names"][1],
edge_attr=self.metadata["col_names"][2],
store_transposed=store_transposed,
)
else:
G.from_dask_cudf_edgelist(
self._edgelist,
source=self.metadata["col_names"][0],
destination=self.metadata["col_names"][1],
store_transposed=store_transposed,
)
return G

def get_path(self):
"""
Returns the location of the stored dataset file
Returns the location of the stored dataset file.
"""
if self._path is None:
self._path = self._dl_path.path / (
@@ -347,8 +454,7 @@ def download_all(force=False):
filename = meta["name"] + meta["file_type"]
save_to = default_download_dir.path / filename
if not save_to.is_file() or force:
df = cudf.read_csv(meta["url"])
df.to_csv(save_to, index=False)
urllib.request.urlretrieve(meta["url"], str(save_to))


def set_download_dir(path):
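
The get_dask_edgelist and get_dask_graph methods added above mirror the single-GPU get_edgelist/get_graph accessors, but read the CSV through dask_cudf (with a blocksize from cugraph.dask.get_chunksize) and build the graph via from_dask_cudf_edgelist. A minimal usage sketch, assuming a dask-cuda cluster and cugraph comms are already running (cluster setup is outside this diff):

# Sketch only: requires an active Dask CUDA cluster/client and initialized cugraph comms.
from cugraph.datasets import karate

ddf = karate.get_dask_edgelist(download=True)  # distributed dask_cudf.DataFrame
G = karate.get_dask_graph(download=True)       # multi-GPU Graph (reuses the cached edgelist)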
4 changes: 2 additions & 2 deletions python/cugraph/cugraph/experimental/__init__.py
@@ -48,9 +48,9 @@
experimental_warning_wrapper(EXPERIMENTAL__find_bicliques)
)

from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler
from cugraph.gnn.data_loading import BulkSampler

BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler)
BulkSampler = promoted_experimental_warning_wrapper(BulkSampler)


from cugraph.link_prediction.jaccard import jaccard, jaccard_coefficient
6 changes: 3 additions & 3 deletions python/cugraph/cugraph/experimental/gnn/__init__.py
@@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler
from cugraph.utilities.api_tools import experimental_warning_wrapper
from cugraph.gnn.data_loading import BulkSampler
from cugraph.utilities.api_tools import promoted_experimental_warning_wrapper

BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler)
BulkSampler = promoted_experimental_warning_wrapper(BulkSampler)
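
With BulkSampler promoted out of the experimental namespace, cugraph.gnn becomes the canonical import location, while the cugraph.experimental paths keep working through promoted_experimental_warning_wrapper (assumed, from its name, to warn that the API has been promoted). A minimal sketch of the two import paths after this change:

# Assumption: the legacy path still resolves but goes through the promotion-warning shim.
from cugraph.gnn import BulkSampler  # stable location
from cugraph.experimental.gnn import BulkSampler as LegacyBulkSampler  # legacy, expected to warn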
1 change: 1 addition & 0 deletions python/cugraph/cugraph/gnn/__init__.py
@@ -12,3 +12,4 @@
# limitations under the License.

from .feature_storage.feat_storage import FeatureStore
from .data_loading.bulk_sampler import BulkSampler
2 changes: 1 addition & 1 deletion python/cugraph/cugraph/gnn/data_loading/__init__.py
@@ -11,4 +11,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from cugraph.gnn.data_loading.bulk_sampler import EXPERIMENTAL__BulkSampler
from cugraph.gnn.data_loading.bulk_sampler import BulkSampler
4 changes: 2 additions & 2 deletions python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py
@@ -31,7 +31,7 @@
import time


class EXPERIMENTAL__BulkSampler:
class BulkSampler:
"""
Performs sampling based on input seeds grouped into batches by
a batch id. Writes the output minibatches to parquet, with
@@ -158,7 +158,7 @@ def add_batches(
Examples
--------
>>> import cudf
>>> from cugraph.experimental.gnn import BulkSampler
>>> from cugraph.gnn import BulkSampler
>>> from cugraph.datasets import karate
>>> import tempfile
>>> df = cudf.DataFrame({
2 changes: 1 addition & 1 deletion python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py
@@ -17,7 +17,7 @@
import cupy
import cugraph
from cugraph.datasets import karate, email_Eu_core
from cugraph.experimental.gnn import BulkSampler
from cugraph.gnn import BulkSampler
from cugraph.utilities.utils import create_directory_with_overwrite

import os
@@ -22,7 +22,7 @@
import cugraph
import dask_cudf
from cugraph.datasets import karate, email_Eu_core
from cugraph.experimental import BulkSampler
from cugraph.gnn import BulkSampler
from cugraph.utilities.utils import create_directory_with_overwrite

