From 9dd01d82c17b6ada9c6701a9f3238e10053ce8a6 Mon Sep 17 00:00:00 2001
From: Don Acosta <97529984+acostadon@users.noreply.github.com>
Date: Fri, 29 Sep 2023 15:17:22 -0400
Subject: [PATCH] fixes force atlas to allow string as vertex names (#3891)

fixes force atlas to allow string as vertex names and removes need for mtx formated datasets.
resolves #3610

Authors:
  - Don Acosta (https://github.com/acostadon)
  - Brad Rees (https://github.com/BradReesWork)

Approvers:
  - Brad Rees (https://github.com/BradReesWork)
  - ralph (https://github.com/nv-rliu)

URL: https://github.com/rapidsai/cugraph/pull/3891
---
 .../cugraph/layout/force_atlas2_wrapper.pyx   |   8 +-
 .../cugraph/tests/layout/test_force_atlas2.py | 145 ++++++++----------
 2 files changed, 67 insertions(+), 86 deletions(-)

diff --git a/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx b/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx
index 4258be3ef71..5a2784e2363 100644
--- a/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx
+++ b/python/cugraph/cugraph/layout/force_atlas2_wrapper.pyx
@@ -56,9 +56,11 @@ def force_atlas2(input_graph,
     if not input_graph.edgelist:
         input_graph.view_edge_list()
 
-    # FIXME: This implementation assumes that the number of vertices
-    # is the max vertex ID + 1 which is not always the case.
-    num_verts = input_graph.nodes().max() + 1
+    # this code allows handling of renumbered graphs
+    if input_graph.is_renumbered():
+        num_verts = input_graph.renumber_map.df_internal_to_external['id'].max()+1
+    else:
+        num_verts = input_graph.nodes().max() + 1
     num_edges = len(input_graph.edgelist.edgelist_df['src'])
 
     cdef GraphCOOView[int,int,float] graph_float
diff --git a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py
index 495a2d945c0..6b1fd6bcc4e 100644
--- a/python/cugraph/cugraph/tests/layout/test_force_atlas2.py
+++ b/python/cugraph/cugraph/tests/layout/test_force_atlas2.py
@@ -13,13 +13,46 @@
 
 import time
 import pytest
-import scipy.io
-from sklearn.manifold import trustworthiness
 
-import cudf
 import cugraph
+from cugraph.structure import number_map
 from cugraph.internals import GraphBasedDimRedCallback
-from cugraph.datasets import karate, polbooks, dolphins, netscience
+from sklearn.manifold import trustworthiness
+import scipy.io
+from cugraph.datasets import (
+    karate,
+    polbooks,
+    dolphins,
+    netscience,
+    dining_prefs,
+)
+
+# FIXME Removed the multi column positional due to it being non-deterministic
+# need to replace this coverage. Issue 3890 in cuGraph repo was created.
+
+# This method renumbers a dataframe so it can be tested using Trustworthiness.
+# it converts a dataframe with string vertex ids to a renumbered int one.
+
+
+def renumbered_edgelist(df):
+    renumbered_df, num_map = number_map.NumberMap.renumber(df, "src", "dst")
+    new_df = renumbered_df[["renumbered_src", "renumbered_dst", "wgt"]]
+    column_names = {"renumbered_src": "src", "renumbered_dst": "dst"}
+    new_df = new_df.rename(columns=column_names)
+    return new_df
+
+
+# This method converts a dataframe to a sparce matrix that is required by
+# scipy Trustworthiness to verify the layout
+def get_coo_array(edgelist):
+    coo = edgelist
+    x = max(coo["src"].max(), coo["dst"].max()) + 1
+    row = coo["src"].to_numpy()
+    col = coo["dst"].to_numpy()
+    data = coo["wgt"].to_numpy()
+    M = scipy.sparse.coo_array((data, (row, col)), shape=(x, x))
+
+    return M
 
 
 def cugraph_call(
@@ -37,11 +70,15 @@ def cugraph_call(
     strong_gravity_mode,
     gravity,
     callback=None,
+    renumber=False,
 ):
-
     G = cugraph.Graph()
+    if cu_M["src"] is not int or cu_M["dst"] is not int:
+        renumber = True
+    else:
+        renumber = False
     G.from_cudf_edgelist(
-        cu_M, source="src", destination="dst", edge_attr="wgt", renumber=False
+        cu_M, source="src", destination="dst", edge_attr="wgt", renumber=renumber
     )
 
     t1 = time.time()
@@ -66,7 +103,19 @@ def cugraph_call(
     return pos
 
 
-DATASETS = [(karate, 0.70), (polbooks, 0.75), (dolphins, 0.66), (netscience, 0.66)]
+DATASETS = [
+    (karate, 0.70),
+    (polbooks, 0.75),
+    (dolphins, 0.66),
+    (netscience, 0.66),
+    (dining_prefs, 0.50),
+]
+
+DATASETS2 = [
+    (polbooks, 0.75),
+    (dolphins, 0.66),
+    (netscience, 0.66),
+]
 
 
 MAX_ITERATIONS = [500]
@@ -95,8 +144,7 @@ def on_train_end(self, positions):
 @pytest.mark.parametrize("max_iter", MAX_ITERATIONS)
 @pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE)
 def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
-    cu_M = graph_file.get_edgelist()
-    dataset_path = graph_file.get_path()
+    cu_M = graph_file.get_edgelist(download=True)
     test_callback = TestCallback()
     cu_pos = cugraph_call(
         cu_M,
@@ -126,9 +174,11 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
         iterations on a given graph.
     """
 
-    matrix_file = dataset_path.with_suffix(".mtx")
-    M = scipy.io.mmread(matrix_file)
-    M = M.toarray()
+    if "string" in graph_file.metadata["col_types"]:
+        df = renumbered_edgelist(graph_file.get_edgelist(download=True))
+        M = get_coo_array(df)
+    else:
+        M = get_coo_array(graph_file.get_edgelist(download=True))
     cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas())
     print(cu_trust, score)
     assert cu_trust > score
@@ -138,74 +188,3 @@ def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize):
     assert test_callback.on_epoch_end_called_count == max_iter
     # verify `on_train_end` was only called once
     assert test_callback.on_train_end_called_count == 1
-
-
-# FIXME: this test occasionally fails - skipping to prevent CI failures but
-# need to revisit ASAP
-@pytest.mark.sg
-@pytest.mark.skip(reason="non-deterministric - needs fixing!")
-@pytest.mark.parametrize("graph_file, score", DATASETS[:-1])
-@pytest.mark.parametrize("max_iter", MAX_ITERATIONS)
-@pytest.mark.parametrize("barnes_hut_optimize", BARNES_HUT_OPTIMIZE)
-def test_force_atlas2_multi_column_pos_list(
-    graph_file, score, max_iter, barnes_hut_optimize
-):
-    cu_M = graph_file.get_edgelist()
-    dataset_path = graph_file.get_path()
-    test_callback = TestCallback()
-    pos = cugraph_call(
-        cu_M,
-        max_iter=max_iter,
-        pos_list=None,
-        outbound_attraction_distribution=True,
-        lin_log_mode=False,
-        prevent_overlapping=False,
-        edge_weight_influence=1.0,
-        jitter_tolerance=1.0,
-        barnes_hut_optimize=False,
-        barnes_hut_theta=0.5,
-        scaling_ratio=2.0,
-        strong_gravity_mode=False,
-        gravity=1.0,
-        callback=test_callback,
-    )
-
-    cu_M.rename(columns={"0": "src_0", "1": "dst_0"}, inplace=True)
-    cu_M["src_1"] = cu_M["src_0"] + 1000
-    cu_M["dst_1"] = cu_M["dst_0"] + 1000
-
-    G = cugraph.Graph()
-    G.from_cudf_edgelist(
-        cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"], edge_attr="2"
-    )
-
-    pos_list = cudf.DataFrame()
-    pos_list["vertex_0"] = pos["vertex"]
-    pos_list["vertex_1"] = pos_list["vertex_0"] + 1000
-    pos_list["x"] = pos["x"]
-    pos_list["y"] = pos["y"]
-
-    cu_pos = cugraph.force_atlas2(
-        G,
-        max_iter=max_iter,
-        pos_list=pos_list,
-        outbound_attraction_distribution=True,
-        lin_log_mode=False,
-        prevent_overlapping=False,
-        edge_weight_influence=1.0,
-        jitter_tolerance=1.0,
-        barnes_hut_optimize=False,
-        barnes_hut_theta=0.5,
-        scaling_ratio=2.0,
-        strong_gravity_mode=False,
-        gravity=1.0,
-        callback=test_callback,
-    )
-
-    cu_pos = cu_pos.sort_values("0_vertex")
-    matrix_file = dataset_path.with_suffix(".mtx")
-    M = scipy.io.mmread(matrix_file)
-    M = M.todense()
-    cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas())
-    print(cu_trust, score)
-    assert cu_trust > score