From eb1e515553ef133f0ea484a663b1b3ff82b1d5c6 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 31 Oct 2023 17:50:52 -0500 Subject: [PATCH] nx-cugraph: add CC for undirected graphs to fix k-truss (#3965) Fixes #3963 and add `connected_components`, `is_connected`, `node_connected_component`, and `number_connected_components`. Also updated `_groupby` to handle groups that are not consecutive integers starting with 0. Also, `plc.weakly_connected_components` does not handle isolated nodes well, and I needed to handle this at the Python layer as was done in #3897 Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3965 --- python/nx-cugraph/_nx_cugraph/__init__.py | 8 ++ .../nx_cugraph/algorithms/__init__.py | 3 +- .../algorithms/community/louvain.py | 2 +- .../algorithms/components/__init__.py | 13 ++ .../algorithms/components/connected.py | 130 ++++++++++++++++++ .../nx-cugraph/nx_cugraph/algorithms/core.py | 10 ++ python/nx-cugraph/nx_cugraph/classes/graph.py | 5 + python/nx-cugraph/nx_cugraph/interface.py | 9 ++ .../nx_cugraph/tests/test_ktruss.py | 30 ++++ .../nx-cugraph/nx_cugraph/utils/decorators.py | 5 +- python/nx-cugraph/nx_cugraph/utils/misc.py | 44 ++++-- python/nx-cugraph/pyproject.toml | 1 + 12 files changed, 243 insertions(+), 17 deletions(-) create mode 100644 python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py create mode 100644 python/nx-cugraph/nx_cugraph/algorithms/components/connected.py create mode 100644 python/nx-cugraph/nx_cugraph/tests/test_ktruss.py diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index af1df04644c..8ef976aabf1 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -38,6 +38,7 @@ "complete_bipartite_graph", "complete_graph", "complete_multipartite_graph", + "connected_components", "cubical_graph", "cycle_graph", "davis_southern_women_graph", @@ -56,6 +57,7 @@ "house_x_graph", "icosahedral_graph", "in_degree_centrality", + "is_connected", "is_isolate", "isolates", "k_truss", @@ -66,7 +68,9 @@ "lollipop_graph", "louvain_communities", "moebius_kantor_graph", + "node_connected_component", "null_graph", + "number_connected_components", "number_of_isolates", "number_of_selfloops", "octahedral_graph", @@ -91,6 +95,10 @@ "betweenness_centrality": "`weight` parameter is not yet supported.", "edge_betweenness_centrality": "`weight` parameter is not yet supported.", "from_pandas_edgelist": "cudf.DataFrame inputs also supported.", + "k_truss": ( + "Currently raises `NotImplementedError` for graphs with more than one connected\n" + "component when k >= 3. We expect to fix this soon." + ), "louvain_communities": "`seed` parameter is currently ignored.", # END: extra_docstrings }, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py index 69feb8f6437..87b1967fa93 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py @@ -10,8 +10,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from . import bipartite, centrality, community +from . import bipartite, centrality, community, components from .bipartite import complete_bipartite_graph from .centrality import * +from .components import * from .core import * from .isolate import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py index 62261d109a2..45a3429d2ee 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py @@ -62,7 +62,7 @@ def louvain_communities( resolution=resolution, do_expensive_check=False, ) - groups = _groupby(clusters, vertices) + groups = _groupby(clusters, vertices, groups_are_canonical=True) rv = [set(G._nodearray_to_list(node_ids)) for node_ids in groups.values()] # TODO: PLC doesn't handle isolated vertices yet, so this is a temporary fix isolates = _isolates(G) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py new file mode 100644 index 00000000000..26816ef3692 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .connected import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py new file mode 100644 index 00000000000..41f3457d542 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py @@ -0,0 +1,130 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import itertools + +import cupy as cp +import networkx as nx +import pylibcugraph as plc + +from nx_cugraph.convert import _to_undirected_graph +from nx_cugraph.utils import _groupby, networkx_algorithm, not_implemented_for + +from ..isolate import _isolates + +__all__ = [ + "number_connected_components", + "connected_components", + "is_connected", + "node_connected_component", +] + + +@not_implemented_for("directed") +@networkx_algorithm +def number_connected_components(G): + return sum(1 for _ in connected_components(G)) + # PREFERRED IMPLEMENTATION, BUT PLC DOES NOT HANDLE ISOLATED VERTICES WELL + # G = _to_undirected_graph(G) + # unused_node_ids, labels = plc.weakly_connected_components( + # resource_handle=plc.ResourceHandle(), + # graph=G._get_plc_graph(), + # offsets=None, + # indices=None, + # weights=None, + # labels=None, + # do_expensive_check=False, + # ) + # return cp.unique(labels).size + + +@number_connected_components._can_run +def _(G): + # NetworkX <= 3.2.1 does not check directedness for us + try: + return not G.is_directed() + except Exception: + return False + + +@not_implemented_for("directed") +@networkx_algorithm +def connected_components(G): + G = _to_undirected_graph(G) + if G.src_indices.size == 0: + # TODO: PLC doesn't handle empty graphs (or isolated nodes) gracefully! + return [{key} for key in G._nodeiter_to_iter(range(len(G)))] + node_ids, labels = plc.weakly_connected_components( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + offsets=None, + indices=None, + weights=None, + labels=None, + do_expensive_check=False, + ) + groups = _groupby(labels, node_ids) + it = (G._nodearray_to_set(connected_ids) for connected_ids in groups.values()) + # TODO: PLC doesn't handle isolated vertices yet, so this is a temporary fix + isolates = _isolates(G) + if isolates.size > 0: + isolates = isolates[isolates > node_ids.max()] + if isolates.size > 0: + it = itertools.chain( + it, ({node} for node in G._nodearray_to_list(isolates)) + ) + return it + + +@not_implemented_for("directed") +@networkx_algorithm +def is_connected(G): + G = _to_undirected_graph(G) + if len(G) == 0: + raise nx.NetworkXPointlessConcept( + "Connectivity is undefined for the null graph." + ) + for community in connected_components(G): + return len(community) == len(G) + raise RuntimeError # pragma: no cover + # PREFERRED IMPLEMENTATION, BUT PLC DOES NOT HANDLE ISOLATED VERTICES WELL + # unused_node_ids, labels = plc.weakly_connected_components( + # resource_handle=plc.ResourceHandle(), + # graph=G._get_plc_graph(), + # offsets=None, + # indices=None, + # weights=None, + # labels=None, + # do_expensive_check=False, + # ) + # return labels.size == len(G) and cp.unique(labels).size == 1 + + +@not_implemented_for("directed") +@networkx_algorithm +def node_connected_component(G, n): + # We could also do plain BFS from n + G = _to_undirected_graph(G) + node_id = n if G.key_to_id is None else G.key_to_id[n] + node_ids, labels = plc.weakly_connected_components( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + offsets=None, + indices=None, + weights=None, + labels=None, + do_expensive_check=False, + ) + indices = cp.nonzero(node_ids == node_id)[0] + if indices.size == 0: + return {n} + return G._nodearray_to_set(node_ids[labels == labels[indices[0]]]) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py index 33e79793553..2219388bc58 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/core.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/core.py @@ -24,6 +24,10 @@ @not_implemented_for("multigraph") @networkx_algorithm def k_truss(G, k): + """ + Currently raises `NotImplementedError` for graphs with more than one connected + component when k >= 3. We expect to fix this soon. + """ if is_nx := isinstance(G, nx.Graph): G = nxcg.from_networkx(G, preserve_all_attrs=True) if nxcg.number_of_selfloops(G) > 0: @@ -31,6 +35,7 @@ def k_truss(G, k): "Input graph has self loops which is not permitted; " "Consider using G.remove_edges_from(nx.selfloop_edges(G))." ) + # TODO: create renumbering helper function(s) if k < 3: # k-truss graph is comprised of nodes incident on k-2 triangles, so k<3 is a @@ -49,6 +54,11 @@ def k_truss(G, k): # Renumber step 1: edge values (no changes needed) edge_values = {key: val.copy() for key, val in G.edge_values.items()} edge_masks = {key: val.copy() for key, val in G.edge_masks.items()} + elif (ncc := nxcg.number_connected_components(G)) > 1: + raise NotImplementedError( + "nx_cugraph.k_truss does not yet work on graphs with more than one " + f"connected component (this graph has {ncc}). We expect to fix this soon." + ) else: edge_dtype = _get_int_dtype(G.src_indices.size - 1) edge_indices = cp.arange(G.src_indices.size, dtype=edge_dtype) diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py index 2048c4c3d72..23004651fc5 100644 --- a/python/nx-cugraph/nx_cugraph/classes/graph.py +++ b/python/nx-cugraph/nx_cugraph/classes/graph.py @@ -692,6 +692,11 @@ def _nodearray_to_list(self, node_ids: cp.ndarray[IndexValue]) -> list[NodeKey]: return node_ids.tolist() return list(self._nodeiter_to_iter(node_ids.tolist())) + def _nodearray_to_set(self, node_ids: cp.ndarray[IndexValue]) -> set[NodeKey]: + if self.key_to_id is None: + return set(node_ids.tolist()) + return set(self._nodeiter_to_iter(node_ids.tolist())) + def _nodearray_to_dict( self, values: cp.ndarray[NodeValue] ) -> dict[NodeKey, NodeValue]: diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py index fd0b1483d73..875f8621021 100644 --- a/python/nx-cugraph/nx_cugraph/interface.py +++ b/python/nx-cugraph/nx_cugraph/interface.py @@ -223,11 +223,20 @@ def key(testpath): } ) + too_slow = "Too slow to run" + skip = { + key("test_tree_isomorphism.py:test_positive"): too_slow, + key("test_tree_isomorphism.py:test_negative"): too_slow, + } + for item in items: kset = set(item.keywords) for (test_name, keywords), reason in xfail.items(): if item.name == test_name and keywords.issubset(kset): item.add_marker(pytest.mark.xfail(reason=reason)) + for (test_name, keywords), reason in skip.items(): + if item.name == test_name and keywords.issubset(kset): + item.add_marker(pytest.mark.skip(reason=reason)) @classmethod def can_run(cls, name, args, kwargs): diff --git a/python/nx-cugraph/nx_cugraph/tests/test_ktruss.py b/python/nx-cugraph/nx_cugraph/tests/test_ktruss.py new file mode 100644 index 00000000000..a3e4cee3124 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_ktruss.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import networkx as nx +import pytest + +import nx_cugraph as nxcg + + +@pytest.mark.parametrize( + "get_graph", [nx.florentine_families_graph, nx.les_miserables_graph] +) +def test_k_truss(get_graph): + Gnx = get_graph() + Gcg = nxcg.from_networkx(Gnx, preserve_all_attrs=True) + for k in range(10): + Hnx = nx.k_truss(Gnx, k) + Hcg = nxcg.k_truss(Gcg, k) + assert nx.utils.graphs_equal(Hnx, nxcg.to_networkx(Hcg)) + if Hnx.number_of_edges() == 0: + break diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py index 0f15d236ecd..0048aee51bb 100644 --- a/python/nx-cugraph/nx_cugraph/utils/decorators.py +++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py @@ -13,6 +13,7 @@ from __future__ import annotations from functools import partial, update_wrapper +from textwrap import dedent from networkx.utils.decorators import nodes_or_number, not_implemented_for @@ -65,7 +66,9 @@ def __new__( ) instance.extra_params = extra_params # The docstring on our function is added to the NetworkX docstring. - instance.extra_doc = func.__doc__ + instance.extra_doc = ( + dedent(func.__doc__.lstrip("\n").rstrip()) if func.__doc__ else None + ) # Copy __doc__ from NetworkX if instance.name in _registered_algorithms: instance.__doc__ = _registered_algorithms[instance.name].__doc__ diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py index 9683df5e7f9..26f023bdcec 100644 --- a/python/nx-cugraph/nx_cugraph/utils/misc.py +++ b/python/nx-cugraph/nx_cugraph/utils/misc.py @@ -21,40 +21,56 @@ import cupy as cp import numpy as np +try: + from itertools import pairwise # Python >=3.10 +except ImportError: + + def pairwise(it): + it = iter(it) + for prev in it: + for cur in it: + yield (prev, cur) + prev = cur + + __all__ = ["index_dtype", "_groupby", "_seed_to_int", "_get_int_dtype"] # This may switch to np.uint32 at some point index_dtype = np.int32 -def _groupby(groups: cp.ndarray, values: cp.ndarray) -> dict[int, cp.ndarray]: +def _groupby( + groups: cp.ndarray, values: cp.ndarray, groups_are_canonical: bool = False +) -> dict[int, cp.ndarray]: """Perform a groupby operation given an array of group IDs and array of values. Parameters ---------- groups : cp.ndarray Array that holds the group IDs. - Group IDs are assumed to be consecutive integers from 0. values : cp.ndarray Array of values to be grouped according to groups. Must be the same size as groups array. + groups_are_canonical : bool, default False + Whether the group IDs are consecutive integers beginning with 0. Returns ------- dict with group IDs as keys and cp.ndarray as values. """ - # It would actually be easy to support groups that aren't consecutive integers, - # but let's wait until we need it to implement it. - sorted_groups = cp.argsort(groups) - sorted_values = values[sorted_groups] - rv = {} - start = 0 - for i, end in enumerate( - [*(cp.nonzero(cp.diff(groups[sorted_groups]))[0] + 1).tolist(), groups.size] - ): - rv[i] = sorted_values[start:end] - start = end - return rv + if groups.size == 0: + return {} + sort_indices = cp.argsort(groups) + sorted_groups = groups[sort_indices] + sorted_values = values[sort_indices] + prepend = 1 if groups_are_canonical else sorted_groups[0] + 1 + left_bounds = cp.nonzero(cp.diff(sorted_groups, prepend=prepend))[0] + boundaries = pairwise(itertools.chain(left_bounds.tolist(), [groups.size])) + if groups_are_canonical: + it = enumerate(boundaries) + else: + it = zip(sorted_groups[left_bounds].tolist(), boundaries) + return {group: sorted_values[start:end] for group, (start, end) in it} def _seed_to_int(seed: int | Random | None) -> int: diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml index 3d3029da232..7e51efd4fe4 100644 --- a/python/nx-cugraph/pyproject.toml +++ b/python/nx-cugraph/pyproject.toml @@ -218,6 +218,7 @@ ignore = [ # Allow assert, print, RNG, and no docstring "nx_cugraph/**/tests/*py" = ["S101", "S311", "T201", "D103", "D100"] "_nx_cugraph/__init__.py" = ["E501"] +"nx_cugraph/algorithms/**/*py" = ["D205", "D401"] # Allow flexible docstrings for algorithms [tool.ruff.flake8-annotations] mypy-init-return = true