From 447cdfd2a74478cd1c72fc9f617a703e7523f9bb Mon Sep 17 00:00:00 2001
From: Erik Welch
Date: Mon, 30 Oct 2023 22:13:21 -0500
Subject: [PATCH] nx-cugraph: add CC for undirected graphs to fix k-truss

---
# Reviewer notes (free text between "---" and the first "diff --git"; not part
# of the commit content):
# - Adds nx_cugraph/algorithms/components/connected.py, exporting
#   number_connected_components, connected_components, is_connected and
#   node_connected_component. Each is decorated with
#   not_implemented_for("directed"); connected_components and
#   node_connected_component call plc.weakly_connected_components directly,
#   the other two are built on connected_components.
# - connected_components short-circuits when G.src_indices.size == 0 and
#   appends isolated nodes (those with id > node_ids.max()) as singleton sets.
# - k_truss now raises NotImplementedError when number_connected_components(G)
#   is greater than 1.
# - _groupby gains groups_are_canonical (default False), returns {} for empty
#   input, and keys the result by the actual group IDs when the flag is False;
#   louvain_communities passes groups_are_canonical=True.
# - graph.py gains a _nodearray_to_set method next to _nodearray_to_list.
# - NOTE(review): node_connected_component has no edgeless-graph guard like the
#   one in connected_components -- confirm PLC behaviour for that input.
# - NOTE(review): line breaks, indentation and blank context lines below were
#   restored from a whitespace-collapsed copy so that each hunk matches its
#   "@@" line counts -- verify against the target tree before applying.
 python/nx-cugraph/_nx_cugraph/__init__.py     |   4 +
 .../nx_cugraph/algorithms/__init__.py         |   3 +-
 .../algorithms/community/louvain.py           |   2 +-
 .../algorithms/components/__init__.py         |  13 ++
 .../algorithms/components/connected.py        | 130 ++++++++++++++++++
 .../nx-cugraph/nx_cugraph/algorithms/core.py  |   6 +
 python/nx-cugraph/nx_cugraph/classes/graph.py |   5 +
 python/nx-cugraph/nx_cugraph/utils/misc.py    |  45 ++++--
 8 files changed, 192 insertions(+), 16 deletions(-)
 create mode 100644 python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py
 create mode 100644 python/nx-cugraph/nx_cugraph/algorithms/components/connected.py

diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py
index 965b5b232ab..73c384c0525 100644
--- a/python/nx-cugraph/_nx_cugraph/__init__.py
+++ b/python/nx-cugraph/_nx_cugraph/__init__.py
@@ -30,13 +30,17 @@
     "functions": {
         # BEGIN: functions
         "betweenness_centrality",
+        "connected_components",
         "degree_centrality",
         "edge_betweenness_centrality",
         "in_degree_centrality",
+        "is_connected",
         "is_isolate",
         "isolates",
         "k_truss",
         "louvain_communities",
+        "node_connected_component",
+        "number_connected_components",
         "number_of_isolates",
         "number_of_selfloops",
         "out_degree_centrality",
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
index 22600bfdc2d..dc2f84d7f82 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py
@@ -10,7 +10,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from . import centrality, community
+from . import centrality, community, components
 from .centrality import *
+from .components import *
 from .core import *
 from .isolate import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py
index 62261d109a2..45a3429d2ee 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py
@@ -62,7 +62,7 @@ def louvain_communities(
         resolution=resolution,
         do_expensive_check=False,
     )
-    groups = _groupby(clusters, vertices)
+    groups = _groupby(clusters, vertices, groups_are_canonical=True)
     rv = [set(G._nodearray_to_list(node_ids)) for node_ids in groups.values()]
     # TODO: PLC doesn't handle isolated vertices yet, so this is a temporary fix
     isolates = _isolates(G)
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py
new file mode 100644
index 00000000000..26816ef3692
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .connected import *
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py
new file mode 100644
index 00000000000..41f3457d542
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import itertools
+
+import cupy as cp
+import networkx as nx
+import pylibcugraph as plc
+
+from nx_cugraph.convert import _to_undirected_graph
+from nx_cugraph.utils import _groupby, networkx_algorithm, not_implemented_for
+
+from ..isolate import _isolates
+
+__all__ = [
+    "number_connected_components",
+    "connected_components",
+    "is_connected",
+    "node_connected_component",
+]
+
+
+@not_implemented_for("directed")
+@networkx_algorithm
+def number_connected_components(G):
+    return sum(1 for _ in connected_components(G))
+    # PREFERRED IMPLEMENTATION, BUT PLC DOES NOT HANDLE ISOLATED VERTICES WELL
+    # G = _to_undirected_graph(G)
+    # unused_node_ids, labels = plc.weakly_connected_components(
+    #     resource_handle=plc.ResourceHandle(),
+    #     graph=G._get_plc_graph(),
+    #     offsets=None,
+    #     indices=None,
+    #     weights=None,
+    #     labels=None,
+    #     do_expensive_check=False,
+    # )
+    # return cp.unique(labels).size
+
+
+@number_connected_components._can_run
+def _(G):
+    # NetworkX <= 3.2.1 does not check directedness for us
+    try:
+        return not G.is_directed()
+    except Exception:
+        return False
+
+
+@not_implemented_for("directed")
+@networkx_algorithm
+def connected_components(G):
+    G = _to_undirected_graph(G)
+    if G.src_indices.size == 0:
+        # TODO: PLC doesn't handle empty graphs (or isolated nodes) gracefully!
+        return [{key} for key in G._nodeiter_to_iter(range(len(G)))]
+    node_ids, labels = plc.weakly_connected_components(
+        resource_handle=plc.ResourceHandle(),
+        graph=G._get_plc_graph(),
+        offsets=None,
+        indices=None,
+        weights=None,
+        labels=None,
+        do_expensive_check=False,
+    )
+    groups = _groupby(labels, node_ids)
+    it = (G._nodearray_to_set(connected_ids) for connected_ids in groups.values())
+    # TODO: PLC doesn't handle isolated vertices yet, so this is a temporary fix
+    isolates = _isolates(G)
+    if isolates.size > 0:
+        isolates = isolates[isolates > node_ids.max()]
+        if isolates.size > 0:
+            it = itertools.chain(
+                it, ({node} for node in G._nodearray_to_list(isolates))
+            )
+    return it
+
+
+@not_implemented_for("directed")
+@networkx_algorithm
+def is_connected(G):
+    G = _to_undirected_graph(G)
+    if len(G) == 0:
+        raise nx.NetworkXPointlessConcept(
+            "Connectivity is undefined for the null graph."
+        )
+    for community in connected_components(G):
+        return len(community) == len(G)
+    raise RuntimeError  # pragma: no cover
+    # PREFERRED IMPLEMENTATION, BUT PLC DOES NOT HANDLE ISOLATED VERTICES WELL
+    # unused_node_ids, labels = plc.weakly_connected_components(
+    #     resource_handle=plc.ResourceHandle(),
+    #     graph=G._get_plc_graph(),
+    #     offsets=None,
+    #     indices=None,
+    #     weights=None,
+    #     labels=None,
+    #     do_expensive_check=False,
+    # )
+    # return labels.size == len(G) and cp.unique(labels).size == 1
+
+
+@not_implemented_for("directed")
+@networkx_algorithm
+def node_connected_component(G, n):
+    # We could also do plain BFS from n
+    G = _to_undirected_graph(G)
+    node_id = n if G.key_to_id is None else G.key_to_id[n]
+    node_ids, labels = plc.weakly_connected_components(
+        resource_handle=plc.ResourceHandle(),
+        graph=G._get_plc_graph(),
+        offsets=None,
+        indices=None,
+        weights=None,
+        labels=None,
+        do_expensive_check=False,
+    )
+    indices = cp.nonzero(node_ids == node_id)[0]
+    if indices.size == 0:
+        return {n}
+    return G._nodearray_to_set(node_ids[labels == labels[indices[0]]])
diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py
index 0a64dd71c69..a5f3e08a5ac 100644
--- a/python/nx-cugraph/nx_cugraph/algorithms/core.py
+++ b/python/nx-cugraph/nx_cugraph/algorithms/core.py
@@ -32,6 +32,12 @@ def k_truss(G, k):
             "Input graph has self loops which is not permitted; "
             "Consider using G.remove_edges_from(nx.selfloop_edges(G))."
         )
+    if (ncc := nxcg.number_connected_components(G)) > 1:
+        raise NotImplementedError(
+            "nx_cugraph.k_truss does not yet work on graphs with more than one "
+            f"connected component (this graph has {ncc}). We expect to fix this soon."
+        )
+
     # TODO: create renumbering helper function(s)
     if k < 3:
         # k-truss graph is comprised of nodes incident on k-2 triangles, so k<3 is a
diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py
index 166b6b9dc6b..b948593eb23 100644
--- a/python/nx-cugraph/nx_cugraph/classes/graph.py
+++ b/python/nx-cugraph/nx_cugraph/classes/graph.py
@@ -558,6 +558,11 @@ def _nodearray_to_list(self, node_ids: cp.ndarray[IndexValue]) -> list[NodeKey]:
             return node_ids.tolist()
         return list(self._nodeiter_to_iter(node_ids.tolist()))
 
+    def _nodearray_to_set(self, node_ids: cp.ndarray[IndexValue]) -> set[NodeKey]:
+        if self.key_to_id is None:
+            return set(node_ids.tolist())
+        return set(self._nodeiter_to_iter(node_ids.tolist()))
+
     def _nodearray_to_dict(
         self, values: cp.ndarray[NodeValue]
     ) -> dict[NodeKey, NodeValue]:
diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py
index 72e4094b8b7..dad49e265d7 100644
--- a/python/nx-cugraph/nx_cugraph/utils/misc.py
+++ b/python/nx-cugraph/nx_cugraph/utils/misc.py
@@ -12,43 +12,60 @@
 # limitations under the License.
 from __future__ import annotations
 
+import itertools
 import operator as op
 import sys
 from random import Random
 
 import cupy as cp
 
+try:
+    from itertools import pairwise  # Python >=3.10
+except ImportError:
+
+    def pairwise(it):
+        it = iter(it)
+        for prev in it:
+            for cur in it:
+                yield (prev, cur)
+                prev = cur
+
+
 __all__ = ["_groupby", "_seed_to_int"]
 
 
-def _groupby(groups: cp.ndarray, values: cp.ndarray) -> dict[int, cp.ndarray]:
+def _groupby(
+    groups: cp.ndarray, values: cp.ndarray, groups_are_canonical: bool = False
+) -> dict[int, cp.ndarray]:
     """Perform a groupby operation given an array of group IDs and array of values.
 
     Parameters
     ----------
     groups : cp.ndarray
         Array that holds the group IDs.
-        Group IDs are assumed to be consecutive integers from 0.
     values : cp.ndarray
         Array of values to be grouped according to groups.
         Must be the same size as groups array.
+    groups_are_canonical : bool, default False
+        Whether the group IDs are consecutive integers beginning with 0.
 
     Returns
     -------
     dict with group IDs as keys and cp.ndarray as values.
     """
-    # It would actually be easy to support groups that aren't consecutive integers,
-    # but let's wait until we need it to implement it.
-    sorted_groups = cp.argsort(groups)
-    sorted_values = values[sorted_groups]
-    rv = {}
-    start = 0
-    for i, end in enumerate(
-        [*(cp.nonzero(cp.diff(groups[sorted_groups]))[0] + 1).tolist(), groups.size]
-    ):
-        rv[i] = sorted_values[start:end]
-        start = end
-    return rv
+    if groups.size == 0:
+        return {}
+    sort_indices = cp.argsort(groups)
+    sorted_groups = groups[sort_indices]
+    sorted_values = values[sort_indices]
+    prepend = 1 if groups_are_canonical else sorted_groups[0] + 1
+    left_bounds = cp.nonzero(cp.diff(sorted_groups, prepend=prepend))[0]
+    boundaries = pairwise(itertools.chain(left_bounds.tolist(), [groups.size]))
+    if groups_are_canonical:
+        it = enumerate(boundaries)
+    else:
+        it = zip(sorted_groups[left_bounds].tolist(), boundaries)
+    return {group: sorted_values[start:end] for group, (start, end) in it}
 
 
 def _seed_to_int(seed: int | Random | None) -> int: