From 47e4a5bc88e68c9273976cea4e5d2335ad639baf Mon Sep 17 00:00:00 2001
From: Ralph Liu <ralphl@nvidia.com>
Date: Wed, 13 Mar 2024 08:46:57 -0700
Subject: [PATCH] Adding test_edge_betweenness_centrality_mg

---
 .../test_edge_betweenness_centrality_mg.py    | 227 +++++++-----------
 1 file changed, 88 insertions(+), 139 deletions(-)

diff --git a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py
index 478b7e655d5..0dde5ef476d 100644
--- a/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py
+++ b/python/cugraph/cugraph/tests/centrality/test_edge_betweenness_centrality_mg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -14,14 +14,9 @@
 import gc
 import pytest
 
-import dask_cudf
-from pylibcugraph.testing.utils import gen_fixture_params_product
-from cugraph.datasets import karate, dolphins
-
 import cugraph
 import cugraph.dask as dcg
-
-# from cugraph.dask.common.mg_utils import is_single_gpu
+from cugraph.datasets import karate, dolphins
 
 
 # =============================================================================
@@ -33,79 +28,35 @@ def setup_function():
     gc.collect()
 
 
-IS_DIRECTED = [True, False]
-INCLUDE_WEIGHTS = [False, True]
-INCLUDE_EDGE_IDS = [False, True]
-NORMALIZED_OPTIONS = [False, True]
-SUBSET_SIZE_OPTIONS = [4, None]
+# =============================================================================
+# Parameters
+# =============================================================================
 
 
-# email_Eu_core is too expensive to test
-datasets = [karate, dolphins]
+DATASETS = [karate, dolphins]
+IS_DIRECTED = [True, False]
+IS_WEIGHTED = [True, False]
+INCLUDE_EDGE_IDS = [True, False]
+IS_NORMALIZED = [True, False]
+SUBSET_SIZES = [4, None]
 
 
 # =============================================================================
-# Pytest fixtures
+# Helper functions
 # =============================================================================
 
 
-fixture_params = gen_fixture_params_product(
-    (datasets, "graph_file"),
-    (IS_DIRECTED, "directed"),
-    (INCLUDE_WEIGHTS, "include_weights"),
-    (INCLUDE_EDGE_IDS, "include_edgeids"),
-    (NORMALIZED_OPTIONS, "normalized"),
-    (SUBSET_SIZE_OPTIONS, "subset_size"),
-)
-
-
-@pytest.fixture(scope="module", params=fixture_params)
-def input_combo(request):
-    """
-    Simply return the current combination of params as a dictionary for use in
-    tests or other parameterized fixtures.
-    """
-    parameters = dict(
-        zip(
-            (
-                "graph_file",
-                "directed",
-                "include_weights",
-                "include_edge_ids",
-                "normalized",
-                "subset_size",
-                "subset_seed",
-            ),
-            request.param,
-        )
-    )
-
-    return parameters
-
-
-@pytest.fixture(scope="module")
-def input_expected_output(input_combo):
-    """
-    This fixture returns the inputs and expected results from the edge
-    betweenness centrality algo.
-    (based on cuGraph edge betweenness centrality) which can be used
-    for validation.
-    """
-    directed = input_combo["directed"]
-    normalized = input_combo["normalized"]
-    k = input_combo["subset_size"]
-    subset_seed = 42
-    edge_ids = input_combo["include_edge_ids"]
-    weight = input_combo["include_weights"]
+def get_sg_graph(dataset, directed, edge_ids):
+    dataset.unload()
+    df = dataset.get_edgelist()
 
-    df = input_combo["graph_file"].get_edgelist()
     if edge_ids:
         if not directed:
             # Edge ids not supported for undirected graph
-            return
-        dtype = df.dtypes[0]
+            return None
+        dtype = df.dtypes.iloc[0]
         edge_id = "edge_id"
-        df["edge_id"] = df.index
+        df[edge_id] = df.index
         df = df.astype(dtype)
 
     else:
@@ -115,30 +66,13 @@ def input_expected_output(input_combo):
     G.from_cudf_edgelist(
         df, source="src", destination="dst", weight="wgt", edge_id=edge_id
     )
-    if isinstance(k, int):
-        k = G.select_random_vertices(subset_seed, k)
 
-    input_combo["k"] = k
-    # Save the results back to the input_combo dictionary to prevent redundant
-    # cuGraph runs. Other tests using the input_combo fixture will look for
-    # them, and if not present they will have to re-run the same cuGraph call.
-    sg_cugraph_edge_bc = (
-        cugraph.edge_betweenness_centrality(G, k, normalized)
-        .sort_values(["src", "dst"])
-        .reset_index(drop=True)
-    )
+    return G
 
-    input_data_path = input_combo["graph_file"].get_path()
 
-    input_combo["sg_cugraph_results"] = sg_cugraph_edge_bc
-    chunksize = dcg.get_chunksize(input_data_path)
-    ddf = dask_cudf.read_csv(
-        input_data_path,
-        chunksize=chunksize,
-        delimiter=" ",
-        names=["src", "dst", "value"],
-        dtype=["int32", "int32", "float32"],
-    )
+def get_mg_graph(dataset, directed, edge_ids, weight):
+    dataset.unload()
+    ddf = dataset.get_dask_edgelist()
 
     if weight:
         weight = ddf
@@ -154,20 +88,16 @@ def input_expected_output(input_combo):
         edge_id = None
 
     dg = cugraph.Graph(directed=directed)
-
     dg.from_dask_cudf_edgelist(
         ddf,
         source="src",
         destination="dst",
-        weight="value",
+        weight="wgt",
         edge_id=edge_id,
         renumber=True,
     )
 
-    input_combo["MGGraph"] = dg
-    input_combo["include_weights"] = weight
-
-    return input_combo
+    return dg, weight
 
 
 # =============================================================================
@@ -175,57 +105,76 @@ def input_expected_output(input_combo):
 # =============================================================================
 
 
-# @pytest.mark.skipif(
-#    is_single_gpu(), reason="skipping MG testing on Single GPU system"
-# )
 @pytest.mark.mg
+@pytest.mark.parametrize("dataset", DATASETS)
+@pytest.mark.parametrize("directed", IS_DIRECTED)
+@pytest.mark.parametrize("weighted", IS_WEIGHTED)
+@pytest.mark.parametrize("edge_ids", INCLUDE_EDGE_IDS)
+@pytest.mark.parametrize("normalized", IS_NORMALIZED)
+@pytest.mark.parametrize("subset_size", SUBSET_SIZES)
 def test_dask_mg_edge_betweenness_centrality(
-    dask_client, benchmark, input_expected_output
+    dask_client,
+    dataset,
+    directed,
+    weighted,
+    edge_ids,
+    normalized,
+    subset_size,
+    benchmark,
 ):
-    if input_expected_output is not None:
-        dg = input_expected_output["MGGraph"]
-        k = input_expected_output["k"]
-        normalized = input_expected_output["normalized"]
-        weight = input_expected_output["include_weights"]
-        if weight is not None:
-            with pytest.raises(NotImplementedError):
-                result_edge_bc = benchmark(
-                    dcg.edge_betweenness_centrality, dg, k, normalized, weight=weight
-                )
-
-        else:
+    g = get_sg_graph(dataset, directed, edge_ids)
+
+    if g is None:
+        pytest.skip("Edge_ids not supported for undirected graph")
+
+    dg, weight = get_mg_graph(dataset, directed, edge_ids, weighted)
+    subset_seed = 42
+
+    k = subset_size
+    if isinstance(k, int):
+        k = g.select_random_vertices(subset_seed, k)
+
+    sg_cugraph_edge_bc = (
+        cugraph.edge_betweenness_centrality(g, k, normalized)
+        .sort_values(["src", "dst"])
+        .reset_index(drop=True)
+    )
+
+    if weight is not None:
+        with pytest.raises(NotImplementedError):
             result_edge_bc = benchmark(
                 dcg.edge_betweenness_centrality, dg, k, normalized, weight=weight
             )
-            result_edge_bc = (
-                result_edge_bc.compute()
-                .sort_values(["src", "dst"])
-                .reset_index(drop=True)
-                .rename(columns={"betweenness_centrality": "mg_betweenness_centrality"})
-            )
 
-            if len(result_edge_bc.columns) > 3:
-                result_edge_bc = result_edge_bc.rename(
-                    columns={"edge_id": "mg_edge_id"}
-                )
+    else:
+        result_edge_bc = benchmark(
+            dcg.edge_betweenness_centrality, dg, k, normalized, weight=weight
+        )
+        result_edge_bc = (
+            result_edge_bc.compute()
+            .sort_values(["src", "dst"])
+            .reset_index(drop=True)
+            .rename(columns={"betweenness_centrality": "mg_betweenness_centrality"})
+        )
 
-            expected_output = input_expected_output["sg_cugraph_results"].reset_index(
-                drop=True
-            )
-            result_edge_bc["betweenness_centrality"] = expected_output[
-                "betweenness_centrality"
-            ]
-            if len(expected_output.columns) > 3:
-                result_edge_bc["edge_id"] = expected_output["edge_id"]
-                edge_id_diff = result_edge_bc.query("mg_edge_id != edge_id")
-                assert len(edge_id_diff) == 0
-
-            edge_bc_diffs1 = result_edge_bc.query(
-                "mg_betweenness_centrality - betweenness_centrality > 0.01"
-            )
-            edge_bc_diffs2 = result_edge_bc.query(
-                "betweenness_centrality - mg_betweenness_centrality < -0.01"
-            )
+        if len(result_edge_bc.columns) > 3:
+            result_edge_bc = result_edge_bc.rename(columns={"edge_id": "mg_edge_id"})
+
+        expected_output = sg_cugraph_edge_bc.reset_index(drop=True)
+        result_edge_bc["betweenness_centrality"] = expected_output[
+            "betweenness_centrality"
+        ]
+        if len(expected_output.columns) > 3:
+            result_edge_bc["edge_id"] = expected_output["edge_id"]
+            edge_id_diff = result_edge_bc.query("mg_edge_id != edge_id")
+            assert len(edge_id_diff) == 0
+
+        edge_bc_diffs1 = result_edge_bc.query(
+            "mg_betweenness_centrality - betweenness_centrality > 0.01"
+        )
+        edge_bc_diffs2 = result_edge_bc.query(
+            "betweenness_centrality - mg_betweenness_centrality < -0.01"
+        )
 
-            assert len(edge_bc_diffs1) == 0
-            assert len(edge_bc_diffs2) == 0
+        assert len(edge_bc_diffs1) == 0
+        assert len(edge_bc_diffs2) == 0