From 6e5e0667b19106b61c98107b0cc58ff2ef7d4202 Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Fri, 29 Sep 2023 03:44:59 +0200 Subject: [PATCH] Adds logic to handle isolated vertices at python layer (#3886) - Adds logic to handle isolated vertices - Clamp downs max level to a sane number closes #3804 Authors: - Naim (https://github.com/naimnv) Approvers: - Joseph Nke (https://github.com/jnke2016) - Brad Rees (https://github.com/BradReesWork) - Chuck Hastings (https://github.com/ChuckHastings) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/3886 --- python/cugraph/cugraph/community/louvain.py | 91 +++++++++++++++---- .../cugraph/tests/community/test_louvain.py | 16 ++++ python/cugraph/cugraph/utilities/utils.py | 4 +- 3 files changed, 91 insertions(+), 20 deletions(-) diff --git a/python/cugraph/cugraph/community/louvain.py b/python/cugraph/cugraph/community/louvain.py index 7f9742c8f09..0bedd427824 100644 --- a/python/cugraph/cugraph/community/louvain.py +++ b/python/cugraph/cugraph/community/louvain.py @@ -11,7 +11,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Union, Tuple +from cugraph.structure import Graph from cugraph.utilities import ( + is_nx_graph_type, ensure_cugraph_obj_for_nx, df_score_to_dictionary, ) @@ -21,9 +24,26 @@ from pylibcugraph import louvain as pylibcugraph_louvain from pylibcugraph import ResourceHandle +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + +VERTEX_COL_NAME = "vertex" +CLUSTER_ID_COL_NAME = "partition" + # FIXME: max_level should default to 100 once max_iter is removed -def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): +def louvain( + G: Union[Graph, "networkx.Graph"], + max_level: Union[int, None] = None, + max_iter: Union[int, None] = None, + resolution: float = 1.0, + threshold: float = 1e-7, +) -> Tuple[Union[cudf.DataFrame, dict], float]: """ Compute the modularity optimizing partition of the input graph using the Louvain method @@ -48,6 +68,9 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): than the specified number of levels. No error occurs when the algorithm terminates early in this manner. + If max_level > 500, it will be set to 500 and a warning is emitted + in order to prevent excessive runtime. + max_iter : integer, optional (default=None) This parameter is deprecated in favor of max_level. Previously it was used to control the maximum number of levels of the Louvain @@ -68,18 +91,21 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): Returns ------- - parts : cudf.DataFrame - GPU data frame of size V containing two columns the vertex id and the - partition id it is assigned to. + result: cudf.DataFrame or dict + If input graph G is of type cugraph.Graph, a GPU dataframe + with two columns. + + result[VERTEX_COL_NAME] : cudf.Series + Contains the vertex identifiers + result[CLUSTER_ID_COL_NAME] : cudf.Series + Contains the partition assigned to the vertices - df['vertex'] : cudf.Series - Contains the vertex identifiers - df['partition'] : cudf.Series - Contains the partition assigned to the vertices + If input graph G is of type networkx.Graph, a dict + Dictionary of vertices and their partition ids. modularity_score : float - a floating point number containing the global modularity score of the - partitioning. + A floating point number containing the global modularity score + of the partitioning. Examples -------- @@ -89,6 +115,17 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): """ + # FIXME: Onece the graph construction calls support isolated vertices through + # the C API (the C++ interface already supports this) then there will be + # no need to compute isolated vertices here. + + isolated_vertices = list() + if is_nx_graph_type(type(G)): + isolated_vertices = [v for v in range(G.number_of_nodes()) if G.degree[v] == 0] + else: + # FIXME: Gather isolated vertices of G + pass + G, isNx = ensure_cugraph_obj_for_nx(G) if G.is_directed(): @@ -112,7 +149,12 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): if max_level is None: max_level = 100 - vertex, partition, mod_score = pylibcugraph_louvain( + if max_level > 500: + w_msg = "max_level is set too high, clamping it down to 500." + warnings.warn(w_msg) + max_level = 500 + + vertex, partition, modularity_score = pylibcugraph_louvain( resource_handle=ResourceHandle(), graph=G._plc_graph, max_level=max_level, @@ -121,14 +163,27 @@ def louvain(G, max_level=None, max_iter=None, resolution=1.0, threshold=1e-7): do_expensive_check=False, ) - df = cudf.DataFrame() - df["vertex"] = vertex - df["partition"] = partition + result = cudf.DataFrame() + result[VERTEX_COL_NAME] = vertex + result[CLUSTER_ID_COL_NAME] = partition + + if len(isolated_vertices) > 0: + unique_cids = result[CLUSTER_ID_COL_NAME].unique() + max_cluster_id = -1 if len(result) == 0 else unique_cids.max() + + isolated_vtx_and_cids = cudf.DataFrame() + isolated_vtx_and_cids[VERTEX_COL_NAME] = isolated_vertices + isolated_vtx_and_cids[CLUSTER_ID_COL_NAME] = [ + (max_cluster_id + i + 1) for i in range(len(isolated_vertices)) + ] + result = cudf.concat( + [result, isolated_vtx_and_cids], ignore_index=True, sort=False + ) - if G.renumbered: - df = G.unrenumber(df, "vertex") + if G.renumbered and len(G.input_df) > 0: + result = G.unrenumber(result, VERTEX_COL_NAME) if isNx is True: - df = df_score_to_dictionary(df, "partition") + result = df_score_to_dictionary(result, CLUSTER_ID_COL_NAME) - return df, mod_score + return result, modularity_score diff --git a/python/cugraph/cugraph/tests/community/test_louvain.py b/python/cugraph/cugraph/tests/community/test_louvain.py index 183be071a44..5441998fb46 100644 --- a/python/cugraph/cugraph/tests/community/test_louvain.py +++ b/python/cugraph/cugraph/tests/community/test_louvain.py @@ -142,3 +142,19 @@ def test_louvain_csr_graph(is_weighted): assert len(parition_diffs) == 0 assert mod_csr == mod_coo + + +@pytest.mark.sg +def test_louvain_nx_graph_with_isolated_nodes(): + # Cluster IDs are expected to unique if all nodes are isolated + G = nx.Graph() + G.add_nodes_from(range(5)) + result, _ = cugraph.louvain(G) + assert set(result.keys()) == set(G.nodes) + assert len(set(result.values())) == G.number_of_nodes() + + # A graph with 5 nodes, where 3 of the nodes are isolated + G.add_edge(1, 2) + result, _ = cugraph.louvain(G) + assert set(result.keys()) == set(G.nodes) + assert len(set(result.values())) == G.number_of_nodes() - 1 diff --git a/python/cugraph/cugraph/utilities/utils.py b/python/cugraph/cugraph/utilities/utils.py index e68b5dd4880..7a54a0bf2cf 100644 --- a/python/cugraph/cugraph/utilities/utils.py +++ b/python/cugraph/cugraph/utilities/utils.py @@ -364,8 +364,8 @@ def is_matrix_type(m): return is_cp_matrix_type(m) or is_sp_matrix_type(m) -def is_nx_graph_type(g): - return g in __nx_graph_types +def is_nx_graph_type(graph_type): + return graph_type in __nx_graph_types def is_cugraph_graph_type(g):