nx-cugraph: add CC for undirected graphs to fix k-truss (#3965)

Fixes #3963 and adds `connected_components`, `is_connected`, `node_connected_component`, and `number_connected_components`. Also updates `_groupby` to handle groups that are not consecutive integers starting with 0. In addition, `plc.weakly_connected_components` does not handle isolated nodes well, so this is handled at the Python layer, as was done in #3897.

Authors:
- Erik Welch (https://github.com/eriknw)

Approvers:
- Rick Ratzel (https://github.com/rlratzel)

URL: #3965
rapidsai · Oct 31, 2023 · eb1e515 · eb1e515
1 parent 24845ca
commit eb1e515
Show file tree

Hide file tree

Showing 12 changed files with 243 additions and 17 deletions.
diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py
@@ -38,6 +38,7 @@
         "complete_bipartite_graph",
         "complete_graph",
         "complete_multipartite_graph",
+        "connected_components",
         "cubical_graph",
         "cycle_graph",
         "davis_southern_women_graph",
@@ -56,6 +57,7 @@
         "house_x_graph",
         "icosahedral_graph",
         "in_degree_centrality",
+        "is_connected",
         "is_isolate",
         "isolates",
         "k_truss",
@@ -66,7 +68,9 @@
         "lollipop_graph",
         "louvain_communities",
         "moebius_kantor_graph",
+        "node_connected_component",
         "null_graph",
+        "number_connected_components",
         "number_of_isolates",
         "number_of_selfloops",
         "octahedral_graph",
@@ -91,6 +95,10 @@
         "betweenness_centrality": "`weight` parameter is not yet supported.",
         "edge_betweenness_centrality": "`weight` parameter is not yet supported.",
         "from_pandas_edgelist": "cudf.DataFrame inputs also supported.",
+        "k_truss": (
+            "Currently raises `NotImplementedError` for graphs with more than one connected\n"
+            "component when k >= 3. We expect to fix this soon."
+        ),
         "louvain_communities": "`seed` parameter is currently ignored.",
         # END: extra_docstrings
     },

diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
@@ -10,8 +10,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from . import bipartite, centrality, community
+from . import bipartite, centrality, community, components
 from .bipartite import complete_bipartite_graph
 from .centrality import *
+from .components import *
 from .core import *
 from .isolate import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py
@@ -62,7 +62,7 @@ def louvain_communities(
         resolution=resolution,
         do_expensive_check=False,
     )
-    groups = _groupby(clusters, vertices)
+    groups = _groupby(clusters, vertices, groups_are_canonical=True)
     rv = [set(G._nodearray_to_list(node_ids)) for node_ids in groups.values()]
     # TODO: PLC doesn't handle isolated vertices yet, so this is a temporary fix
     isolates = _isolates(G)

diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .connected import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+
+import cupy as cp
+import networkx as nx
+import pylibcugraph as plc
+
+from nx_cugraph.convert import _to_undirected_graph
+from nx_cugraph.utils import _groupby, networkx_algorithm, not_implemented_for
+
+from ..isolate import _isolates
+
+__all__ = [
+    "number_connected_components",
+    "connected_components",
+    "is_connected",
+    "node_connected_component",
+]
+
+
+@not_implemented_for("directed")
+@networkx_algorithm
+def number_connected_components(G):
+    # Count the components by consuming what `connected_components` produces.
+    # (Plain comments on purpose: this function's docstring would be added to
+    # the NetworkX docstring by `networkx_algorithm`.)
+    num_components = 0
+    for _component in connected_components(G):
+        num_components += 1
+    return num_components
+    # PREFERRED IMPLEMENTATION, BUT PLC DOES NOT HANDLE ISOLATED VERTICES WELL
+    # G = _to_undirected_graph(G)
+    # unused_node_ids, labels = plc.weakly_connected_components(
+    #     resource_handle=plc.ResourceHandle(),
+    #     graph=G._get_plc_graph(),
+    #     offsets=None,
+    #     indices=None,
+    #     weights=None,
+    #     labels=None,
+    #     do_expensive_check=False,
+    # )
+    # return cp.unique(labels).size
+
+
+@number_connected_components._can_run
+def _(G):
+    # Report whether `number_connected_components` can handle `G`: only
+    # undirected graphs qualify.
+    # NetworkX <= 3.2.1 does not check directedness for us
+    try:
+        undirected = not G.is_directed()
+    except Exception:
+        # Best effort: anything that cannot answer `is_directed()` is rejected.
+        return False
+    return undirected
+
+
+@not_implemented_for("directed")
+@networkx_algorithm
+def connected_components(G):
+    # Produce one set of node keys per connected component of `G`.
+    # The result is a list when the graph has no edges and a lazy iterator
+    # otherwise. Plain comments are used on purpose: this function's docstring
+    # would be added to the NetworkX docstring by `networkx_algorithm`.
+    G = _to_undirected_graph(G)
+    if G.src_indices.size == 0:
+        # TODO: PLC doesn't handle empty graphs (or isolated nodes) gracefully!
+        # With no edges, every node is its own component.
+        return [{key} for key in G._nodeiter_to_iter(range(len(G)))]
+    node_ids, labels = plc.weakly_connected_components(
+        resource_handle=plc.ResourceHandle(),
+        graph=G._get_plc_graph(),
+        offsets=None,
+        indices=None,
+        weights=None,
+        labels=None,
+        do_expensive_check=False,
+    )
+    # Bucket the node ids by component label (labels are not treated as
+    # consecutive integers; `groups_are_canonical` is left at its default).
+    groups = _groupby(labels, node_ids)
+    # Lazy: each component is converted to a set only when it is consumed.
+    it = (G._nodearray_to_set(connected_ids) for connected_ids in groups.values())
+    # TODO: PLC doesn't handle isolated vertices yet, so this is a temporary fix
+    isolates = _isolates(G)
+    if isolates.size > 0:
+        # Keep only the isolated nodes whose id exceeds every id PLC returned.
+        # NOTE(review): presumably these are exactly the nodes PLC leaves out
+        # of its result -- confirm against PLC's behavior.
+        isolates = isolates[isolates > node_ids.max()]
+        if isolates.size > 0:
+            # Append each remaining isolate as its own single-node component.
+            it = itertools.chain(
+                it, ({node} for node in G._nodearray_to_list(isolates))
+            )
+    return it
+
+
+@not_implemented_for("directed")
+@networkx_algorithm
+def is_connected(G):
+    # A graph is connected when its first connected component already
+    # contains every node. The null graph raises NetworkXPointlessConcept.
+    G = _to_undirected_graph(G)
+    num_nodes = len(G)
+    if num_nodes == 0:
+        raise nx.NetworkXPointlessConcept(
+            "Connectivity is undefined for the null graph."
+        )
+    first_component = next(iter(connected_components(G)), None)
+    if first_component is None:  # pragma: no cover
+        # A non-null graph is expected to have at least one component.
+        raise RuntimeError
+    return len(first_component) == num_nodes
+    # PREFERRED IMPLEMENTATION, BUT PLC DOES NOT HANDLE ISOLATED VERTICES WELL
+    # unused_node_ids, labels = plc.weakly_connected_components(
+    #     resource_handle=plc.ResourceHandle(),
+    #     graph=G._get_plc_graph(),
+    #     offsets=None,
+    #     indices=None,
+    #     weights=None,
+    #     labels=None,
+    #     do_expensive_check=False,
+    # )
+    # return labels.size == len(G) and cp.unique(labels).size == 1
+
+
+@not_implemented_for("directed")
+@networkx_algorithm
+def node_connected_component(G, n):
+    # Return the set of node keys in the connected component that contains `n`.
+    # Raises KeyError when the graph maps keys to ids and `n` is not a key.
+    # (Plain comments on purpose: this function's docstring would be added to
+    # the NetworkX docstring by `networkx_algorithm`.)
+    # We could also do plain BFS from n
+    G = _to_undirected_graph(G)
+    # Resolve the key first so an unknown key still raises before any shortcut.
+    node_id = n if G.key_to_id is None else G.key_to_id[n]
+    if G.src_indices.size == 0:
+        # Same guard as `connected_components`: PLC doesn't handle empty graphs
+        # gracefully, and with no edges `n` is alone in its component.
+        return {n}
+    node_ids, labels = plc.weakly_connected_components(
+        resource_handle=plc.ResourceHandle(),
+        graph=G._get_plc_graph(),
+        offsets=None,
+        indices=None,
+        weights=None,
+        labels=None,
+        do_expensive_check=False,
+    )
+    # Position(s) of `n` in PLC's output, if it was returned at all.
+    indices = cp.nonzero(node_ids == node_id)[0]
+    if indices.size == 0:
+        # PLC did not report `n` (presumably because it is isolated), so its
+        # component is just itself.
+        return {n}
+    # Every node sharing `n`'s component label belongs to the same component.
+    return G._nodearray_to_set(node_ids[labels == labels[indices[0]]])
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py
@@ -24,13 +24,18 @@
 @not_implemented_for("multigraph")
 @networkx_algorithm
 def k_truss(G, k):
+    """
+    Currently raises `NotImplementedError` for graphs with more than one connected
+    component when k >= 3. We expect to fix this soon.
+    """
     if is_nx := isinstance(G, nx.Graph):
         G = nxcg.from_networkx(G, preserve_all_attrs=True)
     if nxcg.number_of_selfloops(G) > 0:
         raise nx.NetworkXError(
             "Input graph has self loops which is not permitted; "
             "Consider using G.remove_edges_from(nx.selfloop_edges(G))."
         )
+
     # TODO: create renumbering helper function(s)
     if k < 3:
         # k-truss graph is comprised of nodes incident on k-2 triangles, so k<3 is a
@@ -49,6 +54,11 @@ def k_truss(G, k):
         # Renumber step 1: edge values (no changes needed)
         edge_values = {key: val.copy() for key, val in G.edge_values.items()}
         edge_masks = {key: val.copy() for key, val in G.edge_masks.items()}
+    elif (ncc := nxcg.number_connected_components(G)) > 1:
+        raise NotImplementedError(
+            "nx_cugraph.k_truss does not yet work on graphs with more than one "
+            f"connected component (this graph has {ncc}). We expect to fix this soon."
+        )
     else:
         edge_dtype = _get_int_dtype(G.src_indices.size - 1)
         edge_indices = cp.arange(G.src_indices.size, dtype=edge_dtype)

diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py
@@ -692,6 +692,11 @@ def _nodearray_to_list(self, node_ids: cp.ndarray[IndexValue]) -> list[NodeKey]:
             return node_ids.tolist()
         return list(self._nodeiter_to_iter(node_ids.tolist()))
 
+    def _nodearray_to_set(self, node_ids: cp.ndarray[IndexValue]) -> set[NodeKey]:
+        """Convert an array of node ids into a Python set of node keys.
+
+        When the graph has no ``key_to_id`` mapping, the ids themselves are
+        returned; otherwise they are passed through ``_nodeiter_to_iter``.
+        """
+        ids = node_ids.tolist()
+        if self.key_to_id is not None:
+            ids = self._nodeiter_to_iter(ids)
+        return set(ids)
+
     def _nodearray_to_dict(
         self, values: cp.ndarray[NodeValue]
     ) -> dict[NodeKey, NodeValue]:

diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py
@@ -223,11 +223,20 @@ def key(testpath):
                         }
                     )
 
+        too_slow = "Too slow to run"
+        skip = {
+            key("test_tree_isomorphism.py:test_positive"): too_slow,
+            key("test_tree_isomorphism.py:test_negative"): too_slow,
+        }
+
         for item in items:
             kset = set(item.keywords)
             for (test_name, keywords), reason in xfail.items():
                 if item.name == test_name and keywords.issubset(kset):
                     item.add_marker(pytest.mark.xfail(reason=reason))
+            for (test_name, keywords), reason in skip.items():
+                if item.name == test_name and keywords.issubset(kset):
+                    item.add_marker(pytest.mark.skip(reason=reason))
 
     @classmethod
     def can_run(cls, name, args, kwargs):

diff --git a/python/nx-cugraph/nx_cugraph/tests/test_ktruss.py b/python/nx-cugraph/nx_cugraph/tests/test_ktruss.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import networkx as nx
+import pytest
+
+import nx_cugraph as nxcg
+
+
+@pytest.mark.parametrize(
+    "get_graph", [nx.florentine_families_graph, nx.les_miserables_graph]
+)
+def test_k_truss(get_graph):
+    """Check that ``nxcg.k_truss`` matches ``nx.k_truss`` for increasing ``k``."""
+    nx_graph = get_graph()
+    cg_graph = nxcg.from_networkx(nx_graph, preserve_all_attrs=True)
+    for k in range(10):
+        expected = nx.k_truss(nx_graph, k)
+        result = nxcg.k_truss(cg_graph, k)
+        assert nx.utils.graphs_equal(expected, nxcg.to_networkx(result))
+        if expected.number_of_edges() == 0:
+            # Stop once the reference truss has no edges left.
+            break
diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py
@@ -13,6 +13,7 @@
 from __future__ import annotations
 
 from functools import partial, update_wrapper
+from textwrap import dedent
 
 from networkx.utils.decorators import nodes_or_number, not_implemented_for
 
@@ -65,7 +66,9 @@ def __new__(
             )
         instance.extra_params = extra_params
         # The docstring on our function is added to the NetworkX docstring.
-        instance.extra_doc = func.__doc__
+        instance.extra_doc = (
+            dedent(func.__doc__.lstrip("\n").rstrip()) if func.__doc__ else None
+        )
         # Copy __doc__ from NetworkX
         if instance.name in _registered_algorithms:
             instance.__doc__ = _registered_algorithms[instance.name].__doc__

diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py
@@ -21,40 +21,56 @@
 import cupy as cp
 import numpy as np
 
+try:
+    from itertools import pairwise  # Python >=3.10
+except ImportError:
+
+    def pairwise(it):
+        """Yield successive overlapping pairs ``(a, b), (b, c), ...`` from ``it``.
+
+        Fallback used when ``itertools.pairwise`` cannot be imported.
+        Nothing is yielded when ``it`` has fewer than two items.
+        """
+        iterator = iter(it)
+        try:
+            prev = next(iterator)
+        except StopIteration:
+            # Empty input: there are no pairs to produce.
+            return
+        for cur in iterator:
+            yield (prev, cur)
+            prev = cur
+
+
 __all__ = ["index_dtype", "_groupby", "_seed_to_int", "_get_int_dtype"]
 
 # This may switch to np.uint32 at some point
 index_dtype = np.int32
 
 
-def _groupby(groups: cp.ndarray, values: cp.ndarray) -> dict[int, cp.ndarray]:
+def _groupby(
+    groups: cp.ndarray, values: cp.ndarray, groups_are_canonical: bool = False
+) -> dict[int, cp.ndarray]:
     """Perform a groupby operation given an array of group IDs and array of values.
 
     Parameters
     ----------
     groups : cp.ndarray
         Array that holds the group IDs.
-        Group IDs are assumed to be consecutive integers from 0.
     values : cp.ndarray
         Array of values to be grouped according to groups.
         Must be the same size as groups array.
+    groups_are_canonical : bool, default False
+        Whether the group IDs are consecutive integers beginning with 0.
+        When True, the result is keyed by 0, 1, 2, ... in sorted-group order
+        instead of looking up the actual group IDs.
 
     Returns
     -------
     dict with group IDs as keys and cp.ndarray as values.
+    An empty dict is returned when ``groups`` is empty.
     """
-    # It would actually be easy to support groups that aren't consecutive integers,
-    # but let's wait until we need it to implement it.
-    sorted_groups = cp.argsort(groups)
-    sorted_values = values[sorted_groups]
-    rv = {}
-    start = 0
-    for i, end in enumerate(
-        [*(cp.nonzero(cp.diff(groups[sorted_groups]))[0] + 1).tolist(), groups.size]
-    ):
-        rv[i] = sorted_values[start:end]
-        start = end
-    return rv
+    # Nothing to group; this also avoids indexing `sorted_groups[0]` below.
+    if groups.size == 0:
+        return {}
+    # Sort so that equal group IDs are adjacent, and reorder values to match.
+    sort_indices = cp.argsort(groups)
+    sorted_groups = groups[sort_indices]
+    sorted_values = values[sort_indices]
+    # Prepend a value that differs from the first group ID so that index 0 is
+    # always reported as the start of a group (canonical groups start at 0).
+    prepend = 1 if groups_are_canonical else sorted_groups[0] + 1
+    # Positions in the sorted order where a new group begins.
+    left_bounds = cp.nonzero(cp.diff(sorted_groups, prepend=prepend))[0]
+    # Consecutive (start, end) slices; the last group ends at `groups.size`.
+    boundaries = pairwise(itertools.chain(left_bounds.tolist(), [groups.size]))
+    if groups_are_canonical:
+        # Keys are simply the running index of each group.
+        it = enumerate(boundaries)
+    else:
+        # Keys are the actual group IDs found at the start of each group.
+        it = zip(sorted_groups[left_bounds].tolist(), boundaries)
+    return {group: sorted_values[start:end] for group, (start, end) in it}
 
 
 def _seed_to_int(seed: int | Random | None) -> int:

diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml
@@ -218,6 +218,7 @@ ignore = [
 # Allow assert, print, RNG, and no docstring
 "nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"]
 "_nx_cugraph/__init__.py" = ["E501"]
+"nx_cugraph/algorithms/**/*py" = ["D205", "D401"]  # Allow flexible docstrings for algorithms
 
 [tool.ruff.flake8-annotations]
 mypy-init-return = true