From 9dd01d82c17b6ada9c6701a9f3238e10053ce8a6 Mon Sep 17 00:00:00 2001 From: Don Acosta <97529984+acostadon@users.noreply.github.com> Date: Fri, 29 Sep 2023 15:17:22 -0400 Subject: [PATCH] fixes force atlas to allow string as vertex names (#3891) fixes force atlas to allow string as vertex names and removes need for mtx formated datasets. resolves #3610 Authors: - Don Acosta (https://github.com/acostadon) - Brad Rees (https://github.com/BradReesWork) Approvers: - Brad Rees (https://github.com/BradReesWork) - ralph (https://github.com/nv-rliu) URL: https://github.com/rapidsai/cugraph/pull/3891 --- .../cugraph/layout/force_atlas2_wrapper.pyx | 8 +- .../cugraph/tests/layout/test_force_atlas2.py | 145 ++++++++---------- 2 files changed, 67 insertions(+), 86 deletions(-) diff --git a/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx b/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx index 4258be3ef71..5a2784e2363 100644 --- a/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx +++ b/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx @@ -56,9 +56,11 @@ def force_atlas2(input_graph, if not input_graph.edgelist: input_graph.view_edge_list() - # FIXME: This implementation assumes that the number of vertices - # is the max vertex ID + 1 which is not always the case. - num_verts = input_graph.nodes().max() + 1 + # this code allows handling of renumbered graphs + if input_graph.is_renumbered(): + num_verts = input_graph.renumber_map.df_internal_to_external['id'].max()+1 + else: + num_verts = input_graph.nodes().max() + 1 num_edges = len(input_graph.edgelist.edgelist_df['src']) cdef GraphCOOView[int,int,float] graph_float diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py index 495a2d945c0..6b1fd6bcc4e 100644 --- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py @@ -13,13 +13,46 @@ import time import pytest -import scipy.io -from sklearn.manifold import trustworthiness -import cudf import cugraph +from cugraph.structure import number_map from cugraph.internals import GraphBasedDimRedCallback -from cugraph.datasets import karate, polbooks, dolphins, netscience +from sklearn.manifold import trustworthiness +import scipy.io +from cugraph.datasets import ( + karate, + polbooks, + dolphins, + netscience, + dining_prefs, +) + +# FIXME Removed the multi column positional due to it being non-deterministic +# need to replace this coverage. Issue 3890 in cuGraph repo was created. + +# This method renumbers a dataframe so it can be tested using Trustworthiness. +# it converts a dataframe with string vertex ids to a renumbered int one. + + +def renumbered_edgelist(df): + renumbered_df, num_map = number_map.NumberMap.renumber(df, "src", "dst") + new_df = renumbered_df[["renumbered_src", "renumbered_dst", "wgt"]] + column_names = {"renumbered_src": "src", "renumbered_dst": "dst"} + new_df = new_df.rename(columns=column_names) + return new_df + + +# This method converts a dataframe to a sparce matrix that is required by +# scipy Trustworthiness to verify the layout +def get_coo_array(edgelist): + coo = edgelist + x = max(coo["src"].max(), coo["dst"].max()) + 1 + row = coo["src"].to_numpy() + col = coo["dst"].to_numpy() + data = coo["wgt"].to_numpy() + M = scipy.sparse.coo_array((data, (row, col)), shape=(x, x)) + + return M def cugraph_call( @@ -37,11 +70,15 @@ def cugraph_call( strong_gravity_mode, gravity, callback=None, + renumber=False, ): - G = cugraph.Graph() + if cu_M["src"] is not int or cu_M["dst"] is not int: + renumber = True + else: + renumber = False G.from_cudf_edgelist( - cu_M, source="src", destination="dst", edge_attr="wgt", renumber=False + cu_M, source="src", destination="dst", edge_attr="wgt", renumber=renumber ) t1 = time.time() @@ -66,7 +103,19 @@ def cugraph_call( return pos -DATASETS = [(karate, 0.70), (polbooks, 0.75), (dolphins, 0.66), (netscience, 0.66)] +DATASETS = [ + (karate, 0.70), + (polbooks, 0.75), + (dolphins, 0.66), + (netscience, 0.66), + (dining_prefs, 0.50), +] + +DATASETS2 = [ + (polbooks, 0.75), + (dolphins, 0.66), + (netscience, 0.66), +] MAX_ITERATIONS = [500] @@ -95,8 +144,7 @@ def on_train_end(self, positions): @pytest.mark.parametrize("max_iter", MAX_ITERATIONS) @pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE) def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): - cu_M = graph_file.get_edgelist() - dataset_path = graph_file.get_path() + cu_M = graph_file.get_edgelist(download=True) test_callback = TestCallback() cu_pos = cugraph_call( cu_M, @@ -126,9 +174,11 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): iterations on a given graph. """ - matrix_file = dataset_path.with_suffix(".mtx") - M = scipy.io.mmread(matrix_file) - M = M.toarray() + if "string" in graph_file.metadata["col_types"]: + df = renumbered_edgelist(graph_file.get_edgelist(download=True)) + M = get_coo_array(df) + else: + M = get_coo_array(graph_file.get_edgelist(download=True)) cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) print(cu_trust, score) assert cu_trust > score @@ -138,74 +188,3 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): assert test_callback.on_epoch_end_called_count == max_iter # verify `on_train_end` was only called once assert test_callback.on_train_end_called_count == 1 - - -# FIXME: this test occasionally fails - skipping to prevent CI failures but -# need to revisit ASAP -@pytest.mark.sg -@pytest.mark.skip(reason="non-deterministric - needs fixing!") -@pytest.mark.parametrize("graph_file, score", DATASETS[:-1]) -@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) -@pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE) -def test_force_atlas2_multi_column_pos_list( - graph_file, score, max_iter, barnes_hut_optimize -): - cu_M = graph_file.get_edgelist() - dataset_path = graph_file.get_path() - test_callback = TestCallback() - pos = cugraph_call( - cu_M, - max_iter=max_iter, - pos_list=None, - outbound_attraction_distribution=True, - lin_log_mode=False, - prevent_overlapping=False, - edge_weight_influence=1.0, - jitter_tolerance=1.0, - barnes_hut_optimize=False, - barnes_hut_theta=0.5, - scaling_ratio=2.0, - strong_gravity_mode=False, - gravity=1.0, - callback=test_callback, - ) - - cu_M.rename(columns={"0": "src_0", "1": "dst_0"}, inplace=True) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - - G = cugraph.Graph() - G.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"], edge_attr="2" - ) - - pos_list = cudf.DataFrame() - pos_list["vertex_0"] = pos["vertex"] - pos_list["vertex_1"] = pos_list["vertex_0"] + 1000 - pos_list["x"] = pos["x"] - pos_list["y"] = pos["y"] - - cu_pos = cugraph.force_atlas2( - G, - max_iter=max_iter, - pos_list=pos_list, - outbound_attraction_distribution=True, - lin_log_mode=False, - prevent_overlapping=False, - edge_weight_influence=1.0, - jitter_tolerance=1.0, - barnes_hut_optimize=False, - barnes_hut_theta=0.5, - scaling_ratio=2.0, - strong_gravity_mode=False, - gravity=1.0, - callback=test_callback, - ) - - cu_pos = cu_pos.sort_values("0_vertex") - matrix_file = dataset_path.with_suffix(".mtx") - M = scipy.io.mmread(matrix_file) - M = M.todense() - cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) - print(cu_trust, score) - assert cu_trust > score