From f5d008afddb8ee3d8d6f9c9e60f88eeceb079e5e Mon Sep 17 00:00:00 2001 From: acostadon Date: Wed, 27 Sep 2023 14:53:16 -0400 Subject: [PATCH 1/6] fixed force atlas to work with string vertex ids and added tests removed dependence on mtx files in test. --- .../cugraph/layout/force_atlas2_wrapper.pyx | 8 ++- .../cugraph/tests/layout/test_force_atlas2.py | 71 ++++++++++++++++--- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx b/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx index 4258be3ef71..5a2784e2363 100644 --- a/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx +++ b/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx @@ -56,9 +56,11 @@ def force_atlas2(input_graph, if not input_graph.edgelist: input_graph.view_edge_list() - # FIXME: This implementation assumes that the number of vertices - # is the max vertex ID + 1 which is not always the case. - num_verts = input_graph.nodes().max() + 1 + # this code allows handling of renumbered graphs + if input_graph.is_renumbered(): + num_verts = input_graph.renumber_map.df_internal_to_external['id'].max()+1 + else: + num_verts = input_graph.nodes().max() + 1 num_edges = len(input_graph.edgelist.edgelist_df['src']) cdef GraphCOOView[int,int,float] graph_float diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py index 495a2d945c0..9eb8309fb67 100644 --- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py @@ -13,13 +13,49 @@ import time import pytest -import scipy.io -from sklearn.manifold import trustworthiness import cudf import cugraph +from cugraph.structure import number_map from cugraph.internals import GraphBasedDimRedCallback -from cugraph.datasets import karate, polbooks, dolphins, netscience +from sklearn.manifold import trustworthiness +import scipy.io +from cugraph.datasets import ( + karate, + polbooks, + dolphins, + netscience, + dining_prefs, +) + +# Temporarily suppress warnings till networkX fixes deprecation warnings +# (Using or importing the ABCs from 'collections' instead of from +# 'collections.abc' is deprecated, and in 3.8 it will stop working) for +# python 3.7. Also, these import fa2 and import networkx need to be +# relocated in the third-party group once this gets fixed. + + +# This method renumbers a dataframe so it can be tested using Trustworthiness. +# it converts a dataframe with string vertex ids to a renumbered int one. +def renumbered_edgelist(df): + renumbered_df, num_map = number_map.NumberMap.renumber(df, "src", "dst") + new_df = renumbered_df[["renumbered_src", "renumbered_dst", "wgt"]] + column_names = {"renumbered_src": "src", "renumbered_dst": "dst"} + new_df = new_df.rename(columns=column_names) + return new_df + + +# This method converts a dataframe to a sparce matrix that is required by +# scipy Trustworthiness to verify the layout +def get_coo_array(edgelist): + coo = edgelist + x = max(coo["src"].max(), coo["dst"].max()) + 1 + row = coo["src"].to_numpy() + col = coo["dst"].to_numpy() + data = coo["wgt"].to_numpy() + M = scipy.sparse.coo_array((data, (row, col)), shape=(x, x)) + + return M def cugraph_call( @@ -37,11 +73,15 @@ def cugraph_call( strong_gravity_mode, gravity, callback=None, + renumber=False, ): - G = cugraph.Graph() + if cu_M["src"] is not int or cu_M["dst"] is not int: + renumber = True + else: + renumber = False G.from_cudf_edgelist( - cu_M, source="src", destination="dst", edge_attr="wgt", renumber=False + cu_M, source="src", destination="dst", edge_attr="wgt", renumber=renumber ) t1 = time.time() @@ -66,7 +106,13 @@ def cugraph_call( return pos -DATASETS = [(karate, 0.70), (polbooks, 0.75), (dolphins, 0.66), (netscience, 0.66)] +DATASETS = [ + (karate, 0.70), + (polbooks, 0.75), + (dolphins, 0.66), + (netscience, 0.66), + (dining_prefs, 0.50), +] MAX_ITERATIONS = [500] @@ -96,7 +142,6 @@ def on_train_end(self, positions): @pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE) def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): cu_M = graph_file.get_edgelist() - dataset_path = graph_file.get_path() test_callback = TestCallback() cu_pos = cugraph_call( cu_M, @@ -126,9 +171,14 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): iterations on a given graph. """ - matrix_file = dataset_path.with_suffix(".mtx") - M = scipy.io.mmread(matrix_file) - M = M.toarray() + # matrix_file = dataset_path.with_suffix(".mtx") + # M = scipy.io.mmread(matrix_file) + # M = M.toarray() + if "string" in graph_file.metadata["col_types"]: + df = renumbered_edgelist(graph_file.get_edgelist()) + M = get_coo_array(df) + else: + M = get_coo_array(graph_file.get_edgelist()) cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) print(cu_trust, score) assert cu_trust > score @@ -205,6 +255,7 @@ def test_force_atlas2_multi_column_pos_list( cu_pos = cu_pos.sort_values("0_vertex") matrix_file = dataset_path.with_suffix(".mtx") M = scipy.io.mmread(matrix_file) + M = cugraph.structure.graph_to_csr(G) M = M.todense() cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) print(cu_trust, score) From eaa7c6f21b3b1048ee2a6dea32940bd134b1b47c Mon Sep 17 00:00:00 2001 From: acostadon Date: Wed, 27 Sep 2023 15:57:07 -0400 Subject: [PATCH 2/6] removed multi-column test due to it being non-deterministic --- .../cugraph/tests/layout/test_force_atlas2.py | 80 ++----------------- 1 file changed, 6 insertions(+), 74 deletions(-) diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py index 9eb8309fb67..025b5213f77 100644 --- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py @@ -114,6 +114,12 @@ def cugraph_call( (dining_prefs, 0.50), ] +DATASETS2 = [ + (polbooks, 0.75), + (dolphins, 0.66), + (netscience, 0.66), +] + MAX_ITERATIONS = [500] BARNES_HUT_OPTIMIZE = [False, True] @@ -171,9 +177,6 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): iterations on a given graph. """ - # matrix_file = dataset_path.with_suffix(".mtx") - # M = scipy.io.mmread(matrix_file) - # M = M.toarray() if "string" in graph_file.metadata["col_types"]: df = renumbered_edgelist(graph_file.get_edgelist()) M = get_coo_array(df) @@ -189,74 +192,3 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): # verify `on_train_end` was only called once assert test_callback.on_train_end_called_count == 1 - -# FIXME: this test occasionally fails - skipping to prevent CI failures but -# need to revisit ASAP -@pytest.mark.sg -@pytest.mark.skip(reason="non-deterministric - needs fixing!") -@pytest.mark.parametrize("graph_file, score", DATASETS[:-1]) -@pytest.mark.parametrize("max_iter", MAX_ITERATIONS) -@pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE) -def test_force_atlas2_multi_column_pos_list( - graph_file, score, max_iter, barnes_hut_optimize -): - cu_M = graph_file.get_edgelist() - dataset_path = graph_file.get_path() - test_callback = TestCallback() - pos = cugraph_call( - cu_M, - max_iter=max_iter, - pos_list=None, - outbound_attraction_distribution=True, - lin_log_mode=False, - prevent_overlapping=False, - edge_weight_influence=1.0, - jitter_tolerance=1.0, - barnes_hut_optimize=False, - barnes_hut_theta=0.5, - scaling_ratio=2.0, - strong_gravity_mode=False, - gravity=1.0, - callback=test_callback, - ) - - cu_M.rename(columns={"0": "src_0", "1": "dst_0"}, inplace=True) - cu_M["src_1"] = cu_M["src_0"] + 1000 - cu_M["dst_1"] = cu_M["dst_0"] + 1000 - - G = cugraph.Graph() - G.from_cudf_edgelist( - cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"], edge_attr="2" - ) - - pos_list = cudf.DataFrame() - pos_list["vertex_0"] = pos["vertex"] - pos_list["vertex_1"] = pos_list["vertex_0"] + 1000 - pos_list["x"] = pos["x"] - pos_list["y"] = pos["y"] - - cu_pos = cugraph.force_atlas2( - G, - max_iter=max_iter, - pos_list=pos_list, - outbound_attraction_distribution=True, - lin_log_mode=False, - prevent_overlapping=False, - edge_weight_influence=1.0, - jitter_tolerance=1.0, - barnes_hut_optimize=False, - barnes_hut_theta=0.5, - scaling_ratio=2.0, - strong_gravity_mode=False, - gravity=1.0, - callback=test_callback, - ) - - cu_pos = cu_pos.sort_values("0_vertex") - matrix_file = dataset_path.with_suffix(".mtx") - M = scipy.io.mmread(matrix_file) - M = cugraph.structure.graph_to_csr(G) - M = M.todense() - cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) - print(cu_trust, score) - assert cu_trust > score From 0972bd4145704b59ffe575a564b48361ea3a8847 Mon Sep 17 00:00:00 2001 From: acostadon Date: Wed, 27 Sep 2023 15:59:17 -0400 Subject: [PATCH 3/6] removed dependency pointed out by flak8 --- python/cugraph/cugraph/tests/layout/test_force_atlas2.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py index 025b5213f77..9e5a6a323ea 100644 --- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py @@ -14,7 +14,6 @@ import time import pytest -import cudf import cugraph from cugraph.structure import number_map from cugraph.internals import GraphBasedDimRedCallback @@ -191,4 +190,3 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): assert test_callback.on_epoch_end_called_count == max_iter # verify `on_train_end` was only called once assert test_callback.on_train_end_called_count == 1 - From 5fcf90450744308bfee0846e98aad29930f871da Mon Sep 17 00:00:00 2001 From: acostadon Date: Thu, 28 Sep 2023 09:03:08 -0400 Subject: [PATCH 4/6] added download equals True --- python/cugraph/cugraph/tests/layout/test_force_atlas2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py index 9e5a6a323ea..87ace22fa1f 100644 --- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py @@ -146,7 +146,7 @@ def on_train_end(self, positions): @pytest.mark.parametrize("max_iter", MAX_ITERATIONS) @pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE) def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): - cu_M = graph_file.get_edgelist() + cu_M = graph_file.get_edgelist(download=True) test_callback = TestCallback() cu_pos = cugraph_call( cu_M, @@ -177,10 +177,10 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): """ if "string" in graph_file.metadata["col_types"]: - df = renumbered_edgelist(graph_file.get_edgelist()) + df = renumbered_edgelist(graph_file.get_edgelist(download=True)) M = get_coo_array(df) else: - M = get_coo_array(graph_file.get_edgelist()) + M = get_coo_array(graph_file.get_edgelist(download=True)) cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) print(cu_trust, score) assert cu_trust > score From b3a3bdf9bc3a1433a05503282c1b6fc50db3a8e8 Mon Sep 17 00:00:00 2001 From: acostadon Date: Fri, 29 Sep 2023 07:52:27 -0400 Subject: [PATCH 5/6] added fixme and issue per review comments --- .../cugraph/cugraph/tests/layout/test_force_atlas2.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py index 87ace22fa1f..6b1fd6bcc4e 100644 --- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py @@ -27,15 +27,13 @@ dining_prefs, ) -# Temporarily suppress warnings till networkX fixes deprecation warnings -# (Using or importing the ABCs from 'collections' instead of from -# 'collections.abc' is deprecated, and in 3.8 it will stop working) for -# python 3.7. Also, these import fa2 and import networkx need to be -# relocated in the third-party group once this gets fixed. - +# FIXME Removed the multi column positional due to it being non-deterministic +# need to replace this coverage. Issue 3890 in cuGraph repo was created. # This method renumbers a dataframe so it can be tested using Trustworthiness. # it converts a dataframe with string vertex ids to a renumbered int one. + + def renumbered_edgelist(df): renumbered_df, num_map = number_map.NumberMap.renumber(df, "src", "dst") new_df = renumbered_df[["renumbered_src", "renumbered_dst", "wgt"]] From 481e71ee411bb40b87af8bad5ac3cfe00217eb0a Mon Sep 17 00:00:00 2001 From: acostadon Date: Fri, 29 Sep 2023 08:05:04 -0400 Subject: [PATCH 6/6] flak8 reformat --- python/cugraph/cugraph/tests/layout/test_force_atlas2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py index ab95aeec4be..6b1fd6bcc4e 100644 --- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py @@ -26,6 +26,7 @@ netscience, dining_prefs, ) + # FIXME Removed the multi column positional due to it being non-deterministic # need to replace this coverage. Issue 3890 in cuGraph repo was created.