Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixes force atlas to allow string as vertex names #3891

Merged
merged 9 commits into from
Sep 29, 2023
8 changes: 5 additions & 3 deletions python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ def force_atlas2(input_graph,
if not input_graph.edgelist:
input_graph.view_edge_list()

# FIXME: This implementation assumes that the number of vertices
# is the max vertex ID + 1 which is not always the case.
num_verts = input_graph.nodes().max() + 1
# this code allows handling of renumbered graphs
if input_graph.is_renumbered():
num_verts = input_graph.renumber_map.df_internal_to_external['id'].max()+1
else:
num_verts = input_graph.nodes().max() + 1
num_edges = len(input_graph.edgelist.edgelist_df['src'])

cdef GraphCOOView[int,int,float] graph_float
Expand Down
145 changes: 63 additions & 82 deletions python/cugraph/cugraph/tests/layout/test_force_atlas2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,48 @@

import time
import pytest
import scipy.io
from sklearn.manifold import trustworthiness

import cudf
import cugraph
from cugraph.structure import number_map
from cugraph.internals import GraphBasedDimRedCallback
from cugraph.datasets import karate, polbooks, dolphins, netscience
from sklearn.manifold import trustworthiness
import scipy.io
from cugraph.datasets import (
karate,
polbooks,
dolphins,
netscience,
dining_prefs,
)

# Temporarily suppress warnings till networkX fixes deprecation warnings
# (Using or importing the ABCs from 'collections' instead of from
# 'collections.abc' is deprecated, and in 3.8 it will stop working) for
# python 3.7. Also, these import fa2 and import networkx need to be
# relocated in the third-party group once this gets fixed.
acostadon marked this conversation as resolved.
Show resolved Hide resolved


# This method renumbers a dataframe so it can be tested using Trustworthiness.
# it converts a dataframe with string vertex ids to a renumbered int one.
def renumbered_edgelist(df):
renumbered_df, num_map = number_map.NumberMap.renumber(df, "src", "dst")
new_df = renumbered_df[["renumbered_src", "renumbered_dst", "wgt"]]
column_names = {"renumbered_src": "src", "renumbered_dst": "dst"}
new_df = new_df.rename(columns=column_names)
return new_df


# This method converts a dataframe to a sparce matrix that is required by
# scipy Trustworthiness to verify the layout
def get_coo_array(edgelist):
coo = edgelist
x = max(coo["src"].max(), coo["dst"].max()) + 1
row = coo["src"].to_numpy()
col = coo["dst"].to_numpy()
data = coo["wgt"].to_numpy()
M = scipy.sparse.coo_array((data, (row, col)), shape=(x, x))

return M


def cugraph_call(
Expand All @@ -37,11 +72,15 @@ def cugraph_call(
strong_gravity_mode,
gravity,
callback=None,
renumber=False,
):

G = cugraph.Graph()
if cu_M["src"] is not int or cu_M["dst"] is not int:
renumber = True
else:
renumber = False
G.from_cudf_edgelist(
cu_M, source="src", destination="dst", edge_attr="wgt", renumber=False
cu_M, source="src", destination="dst", edge_attr="wgt", renumber=renumber
)

t1 = time.time()
Expand All @@ -66,7 +105,19 @@ def cugraph_call(
return pos


DATASETS = [(karate, 0.70), (polbooks, 0.75), (dolphins, 0.66), (netscience, 0.66)]
DATASETS = [
(karate, 0.70),
(polbooks, 0.75),
(dolphins, 0.66),
(netscience, 0.66),
(dining_prefs, 0.50),
]

DATASETS2 = [
(polbooks, 0.75),
(dolphins, 0.66),
(netscience, 0.66),
]


MAX_ITERATIONS = [500]
Expand Down Expand Up @@ -96,7 +147,6 @@ def on_train_end(self, positions):
@pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE)
def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
cu_M = graph_file.get_edgelist()
dataset_path = graph_file.get_path()
test_callback = TestCallback()
cu_pos = cugraph_call(
cu_M,
Expand Down Expand Up @@ -126,9 +176,11 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
iterations on a given graph.
"""

matrix_file = dataset_path.with_suffix(".mtx")
M = scipy.io.mmread(matrix_file)
M = M.toarray()
if "string" in graph_file.metadata["col_types"]:
df = renumbered_edgelist(graph_file.get_edgelist())
M = get_coo_array(df)
else:
M = get_coo_array(graph_file.get_edgelist())
cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas())
print(cu_trust, score)
assert cu_trust > score
Expand All @@ -138,74 +190,3 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
assert test_callback.on_epoch_end_called_count == max_iter
# verify `on_train_end` was only called once
assert test_callback.on_train_end_called_count == 1


# FIXME: this test occasionally fails - skipping to prevent CI failures but
# need to revisit ASAP
acostadon marked this conversation as resolved.
Show resolved Hide resolved
@pytest.mark.sg
@pytest.mark.skip(reason="non-deterministric - needs fixing!")
@pytest.mark.parametrize("graph_file, score", DATASETS[:-1])
@pytest.mark.parametrize("max_iter", MAX_ITERATIONS)
@pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE)
def test_force_atlas2_multi_column_pos_list(
graph_file, score, max_iter, barnes_hut_optimize
):
cu_M = graph_file.get_edgelist()
dataset_path = graph_file.get_path()
test_callback = TestCallback()
pos = cugraph_call(
cu_M,
max_iter=max_iter,
pos_list=None,
outbound_attraction_distribution=True,
lin_log_mode=False,
prevent_overlapping=False,
edge_weight_influence=1.0,
jitter_tolerance=1.0,
barnes_hut_optimize=False,
barnes_hut_theta=0.5,
scaling_ratio=2.0,
strong_gravity_mode=False,
gravity=1.0,
callback=test_callback,
)

cu_M.rename(columns={"0": "src_0", "1": "dst_0"}, inplace=True)
cu_M["src_1"] = cu_M["src_0"] + 1000
cu_M["dst_1"] = cu_M["dst_0"] + 1000

G = cugraph.Graph()
G.from_cudf_edgelist(
cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"], edge_attr="2"
)

pos_list = cudf.DataFrame()
pos_list["vertex_0"] = pos["vertex"]
pos_list["vertex_1"] = pos_list["vertex_0"] + 1000
pos_list["x"] = pos["x"]
pos_list["y"] = pos["y"]

cu_pos = cugraph.force_atlas2(
G,
max_iter=max_iter,
pos_list=pos_list,
outbound_attraction_distribution=True,
lin_log_mode=False,
prevent_overlapping=False,
edge_weight_influence=1.0,
jitter_tolerance=1.0,
barnes_hut_optimize=False,
barnes_hut_theta=0.5,
scaling_ratio=2.0,
strong_gravity_mode=False,
gravity=1.0,
callback=test_callback,
)

cu_pos = cu_pos.sort_values("0_vertex")
matrix_file = dataset_path.with_suffix(".mtx")
M = scipy.io.mmread(matrix_file)
M = M.todense()
cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas())
print(cu_trust, score)
assert cu_trust > score
Loading