Skip to content

Commit

Permalink
fixes force atlas to allow string as vertex names (#3891)
Browse files Browse the repository at this point in the history
Fixes Force Atlas to allow strings as vertex names and removes the need for mtx-formatted datasets.
resolves #3610

Authors:
  - Don Acosta (https://github.com/acostadon)
  - Brad Rees (https://github.com/BradReesWork)

Approvers:
  - Brad Rees (https://github.com/BradReesWork)
  - ralph (https://github.com/nv-rliu)

URL: #3891
  • Loading branch information
acostadon authored Sep 29, 2023
1 parent eed1223 commit 9dd01d8
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 86 deletions.
8 changes: 5 additions & 3 deletions python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ def force_atlas2(input_graph,
if not input_graph.edgelist:
input_graph.view_edge_list()

# FIXME: This implementation assumes that the number of vertices
# is the max vertex ID + 1 which is not always the case.
num_verts = input_graph.nodes().max() + 1
# this code allows handling of renumbered graphs
if input_graph.is_renumbered():
num_verts = input_graph.renumber_map.df_internal_to_external['id'].max()+1
else:
num_verts = input_graph.nodes().max() + 1
num_edges = len(input_graph.edgelist.edgelist_df['src'])

cdef GraphCOOView[int,int,float] graph_float
Expand Down
145 changes: 62 additions & 83 deletions python/cugraph/cugraph/tests/layout/test_force_atlas2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,46 @@

import time
import pytest
import scipy.io
from sklearn.manifold import trustworthiness

import cudf
import cugraph
from cugraph.structure import number_map
from cugraph.internals import GraphBasedDimRedCallback
from cugraph.datasets import karate, polbooks, dolphins, netscience
from sklearn.manifold import trustworthiness
import scipy.io
from cugraph.datasets import (
karate,
polbooks,
dolphins,
netscience,
dining_prefs,
)

# FIXME: The multi-column positional test was removed because it is
# non-deterministic; this coverage needs to be replaced (cuGraph issue 3890).

# This function renumbers a dataframe so its layout can be checked with
# trustworthiness: it converts a dataframe with string vertex ids into one
# with renumbered integer vertex ids.


def renumbered_edgelist(df):
    """Return the edge list ``df`` with renumbered vertex ids.

    ``df`` is passed to ``NumberMap.renumber`` with ``"src"`` and ``"dst"``
    as the source and destination columns. From the renumbered dataframe
    only ``renumbered_src``, ``renumbered_dst`` and ``wgt`` are kept, and
    the two renumbered columns are renamed back to ``src`` and ``dst``.

    Parameters
    ----------
    df : DataFrame
        Edge list with ``src``, ``dst`` and ``wgt`` columns.

    Returns
    -------
    DataFrame
        A dataframe with columns ``src``, ``dst`` and ``wgt``.
    """
    # renumber() also returns the number map; it is not needed here.
    renumbered_df, _ = number_map.NumberMap.renumber(df, "src", "dst")
    return renumbered_df[["renumbered_src", "renumbered_dst", "wgt"]].rename(
        columns={"renumbered_src": "src", "renumbered_dst": "dst"}
    )


# This function converts an edge-list dataframe to the sparse matrix that is
# passed to scikit-learn's trustworthiness to verify the layout.
def get_coo_array(edgelist):
    """Build a square SciPy sparse COO array from a weighted edge list.

    Parameters
    ----------
    edgelist : DataFrame
        Must provide ``src``, ``dst`` and ``wgt`` columns whose Series
        support ``.max()`` and ``.to_numpy()``. ``src`` and ``dst`` are
        used as row and column indices.

    Returns
    -------
    scipy.sparse.coo_array
        Array of shape ``(n, n)`` where ``n`` is the largest value found in
        ``src`` or ``dst`` plus one; entry ``(src, dst)`` holds ``wgt``.
    """
    # Import the submodule explicitly: the module-level imports only bring
    # in ``scipy.io``, so ``scipy.sparse`` would otherwise be reachable only
    # as a side effect of that import.
    import scipy.sparse

    num_vertices = max(edgelist["src"].max(), edgelist["dst"].max()) + 1
    rows = edgelist["src"].to_numpy()
    cols = edgelist["dst"].to_numpy()
    weights = edgelist["wgt"].to_numpy()

    return scipy.sparse.coo_array(
        (weights, (rows, cols)), shape=(num_vertices, num_vertices)
    )


def cugraph_call(
Expand All @@ -37,11 +70,15 @@ def cugraph_call(
strong_gravity_mode,
gravity,
callback=None,
renumber=False,
):

G = cugraph.Graph()
if cu_M["src"] is not int or cu_M["dst"] is not int:
renumber = True
else:
renumber = False
G.from_cudf_edgelist(
cu_M, source="src", destination="dst", edge_attr="wgt", renumber=False
cu_M, source="src", destination="dst", edge_attr="wgt", renumber=renumber
)

t1 = time.time()
Expand All @@ -66,7 +103,19 @@ def cugraph_call(
return pos


DATASETS = [(karate, 0.70), (polbooks, 0.75), (dolphins, 0.66), (netscience, 0.66)]
# (dataset, score) pairs used to parametrize tests as "graph_file, score".
# `score` is the value the computed trustworthiness must exceed
# (`assert cu_trust > score`).
DATASETS = [
(karate, 0.70),
(polbooks, 0.75),
(dolphins, 0.66),
(netscience, 0.66),
(dining_prefs, 0.50),
]

# NOTE(review): DATASETS2 is not referenced in the visible part of this
# file - confirm it is used somewhere, or remove it.
DATASETS2 = [
(polbooks, 0.75),
(dolphins, 0.66),
(netscience, 0.66),
]


MAX_ITERATIONS = [500]
Expand Down Expand Up @@ -95,8 +144,7 @@ def on_train_end(self, positions):
@pytest.mark.parametrize("max_iter", MAX_ITERATIONS)
@pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE)
def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
cu_M = graph_file.get_edgelist()
dataset_path = graph_file.get_path()
cu_M = graph_file.get_edgelist(download=True)
test_callback = TestCallback()
cu_pos = cugraph_call(
cu_M,
Expand Down Expand Up @@ -126,9 +174,11 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
iterations on a given graph.
"""

matrix_file = dataset_path.with_suffix(".mtx")
M = scipy.io.mmread(matrix_file)
M = M.toarray()
if "string" in graph_file.metadata["col_types"]:
df = renumbered_edgelist(graph_file.get_edgelist(download=True))
M = get_coo_array(df)
else:
M = get_coo_array(graph_file.get_edgelist(download=True))
cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas())
print(cu_trust, score)
assert cu_trust > score
Expand All @@ -138,74 +188,3 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
assert test_callback.on_epoch_end_called_count == max_iter
# verify `on_train_end` was only called once
assert test_callback.on_train_end_called_count == 1


# FIXME: this test occasionally fails - skipping to prevent CI failures but
# need to revisit ASAP
# Exercises force_atlas2 with a `pos_list` keyed by two-column vertex ids:
# positions from a first run (via cugraph_call) seed a second run on a graph
# whose sources/destinations are each given by two columns, and the resulting
# layout must have a trustworthiness above `score`.
# The test is skipped unconditionally (see the skip reason below).
@pytest.mark.sg
@pytest.mark.skip(reason="non-deterministric - needs fixing!")
# DATASETS[:-1] parametrizes over every DATASETS entry except the last one.
@pytest.mark.parametrize("graph_file, score", DATASETS[:-1])
@pytest.mark.parametrize("max_iter", MAX_ITERATIONS)
@pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE)
def test_force_atlas2_multi_column_pos_list(
graph_file, score, max_iter, barnes_hut_optimize
):
cu_M = graph_file.get_edgelist()
dataset_path = graph_file.get_path()
test_callback = TestCallback()
# First pass: no initial positions. barnes_hut_optimize is hard-coded to
# False here and in the second pass, so the parametrized
# `barnes_hut_optimize` argument is never used in this test body.
pos = cugraph_call(
cu_M,
max_iter=max_iter,
pos_list=None,
outbound_attraction_distribution=True,
lin_log_mode=False,
prevent_overlapping=False,
edge_weight_influence=1.0,
jitter_tolerance=1.0,
barnes_hut_optimize=False,
barnes_hut_theta=0.5,
scaling_ratio=2.0,
strong_gravity_mode=False,
gravity=1.0,
callback=test_callback,
)

# NOTE(review): this renames columns "0"/"1" and later passes edge_attr="2",
# while other code in this file addresses the edge list as
# "src"/"dst"/"wgt" - confirm which column names get_edgelist() returns.
cu_M.rename(columns={"0": "src_0", "1": "dst_0"}, inplace=True)
# Second id column for each endpoint: the first id offset by 1000.
cu_M["src_1"] = cu_M["src_0"] + 1000
cu_M["dst_1"] = cu_M["dst_0"] + 1000

G = cugraph.Graph()
G.from_cudf_edgelist(
cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"], edge_attr="2"
)

# Seed positions from the first pass, keyed by the same two-column vertex id
# (vertex_0, vertex_1 = vertex_0 + 1000).
pos_list = cudf.DataFrame()
pos_list["vertex_0"] = pos["vertex"]
pos_list["vertex_1"] = pos_list["vertex_0"] + 1000
pos_list["x"] = pos["x"]
pos_list["y"] = pos["y"]

# Second pass: same settings as the first, but starting from pos_list.
cu_pos = cugraph.force_atlas2(
G,
max_iter=max_iter,
pos_list=pos_list,
outbound_attraction_distribution=True,
lin_log_mode=False,
prevent_overlapping=False,
edge_weight_influence=1.0,
jitter_tolerance=1.0,
barnes_hut_optimize=False,
barnes_hut_theta=0.5,
scaling_ratio=2.0,
strong_gravity_mode=False,
gravity=1.0,
callback=test_callback,
)

# NOTE(review): presumably orders rows by the first vertex-id column so they
# line up with the rows of M - confirm the "0_vertex" output column name.
cu_pos = cu_pos.sort_values("0_vertex")
# The reference matrix is read from the .mtx file that shares the dataset
# file's path (same name, ".mtx" suffix).
matrix_file = dataset_path.with_suffix(".mtx")
M = scipy.io.mmread(matrix_file)
M = M.todense()
cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas())
print(cu_trust, score)
assert cu_trust > score

0 comments on commit 9dd01d8

Please sign in to comment.