Commit
Merge pull request #19 from rlratzel/branch-24.10-nxcg_benchmarking
Merge `rlratzel:branch-24.10-nxcg_benchmarking` into Forked Branch
nv-rliu authored Aug 19, 2024
2 parents 4c26e7a + b626c3c commit f2a0f77
Showing 4 changed files with 274 additions and 2 deletions.
31 changes: 29 additions & 2 deletions benchmarks/nx-cugraph/pytest-based/bench_algos.py
@@ -271,9 +271,8 @@ def bench_from_networkx(benchmark, graph_obj):


# normalized_param_values = [True, False]
# k_param_values = [10, 100]
normalized_param_values = [True]
-k_param_values = [10]
+k_param_values = [10, 100, 1000]


@pytest.mark.parametrize(
@@ -282,6 +281,10 @@ def bench_from_networkx(benchmark, graph_obj):
@pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}")
def bench_betweenness_centrality(benchmark, graph_obj, backend_wrapper, normalized, k):
    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)

    if k > G.number_of_nodes():
        pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}")

    result = benchmark.pedantic(
        target=backend_wrapper(nx.betweenness_centrality),
        args=(G,),
@@ -305,6 +308,10 @@ def bench_edge_betweenness_centrality(
    benchmark, graph_obj, backend_wrapper, normalized, k
):
    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)

    if k > G.number_of_nodes():
        pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}")

    result = benchmark.pedantic(
        target=backend_wrapper(nx.edge_betweenness_centrality),
        args=(G,),
@@ -473,6 +480,26 @@ def bench_pagerank_personalized(benchmark, graph_obj, backend_wrapper):
    assert type(result) is dict


def bench_shortest_path(benchmark, graph_obj, backend_wrapper):
    """
    Benchmarks nx.shortest_path using the highest-degree node as the source,
    with no target specified.
    """
    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
    node = get_highest_degree_node(graph_obj)

    result = benchmark.pedantic(
        target=backend_wrapper(nx.shortest_path),
        args=(G,),
        kwargs=dict(
            source=node,
        ),
        rounds=rounds,
        iterations=iterations,
        warmup_rounds=warmup_rounds,
    )
    assert type(result) is dict


def bench_single_source_shortest_path_length(benchmark, graph_obj, backend_wrapper):
    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
    node = get_highest_degree_node(graph_obj)
22 changes: 22 additions & 0 deletions benchmarks/nx-cugraph/pytest-based/ensure_dataset_accessible.py
@@ -0,0 +1,22 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
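
# Usage: python ensure_dataset_accessible.py <dataset-name>
# Hyphens in <dataset-name> are converted to underscores to match the
# attribute names in cugraph.datasets (e.g. "email-Eu-core" -> email_Eu_core).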

import sys

import cugraph.datasets as cgds

dataset = sys.argv[1].replace("-", "_")
dataset_obj = getattr(cgds, dataset)

if not dataset_obj.get_path().exists():
    dataset_obj.get_edgelist(download=True)
177 changes: 177 additions & 0 deletions benchmarks/nx-cugraph/pytest-based/gen_table.py
@@ -0,0 +1,177 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
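
# Reads the pytest-benchmark .json result files from the local "logs"
# directory (as written by run-main.sh) and prints an HTML table comparing
# cugraph and networkx runtimes to stdout.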

import re
import pathlib
import json

logs_dir = pathlib.Path("logs")

dataset_patt = re.compile(r".*ds=([\w-]+).*")
backend_patt = re.compile(r".*backend=(\w+).*")
# k values are powers of ten (10, 100, 1000), so match a "1" followed by zeros
k_patt = re.compile(r".*k=(10*).*")

# Organize all benchmark runs by the following hierarchy: algo -> backend -> dataset
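# e.g. benchmarks["pagerank"]["cugraph"]["netscience"] = <mean runtime in seconds>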
benchmarks = {}


def compute_perf_vals(cugraph_runtime, networkx_runtime):
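    """
    Return (speedup, delta) display strings comparing a cugraph and a
    networkx runtime: speedup is the networkx/cugraph runtime ratio, and
    delta is the absolute time saved, scaled to s, ms, or us for readability.
    """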
    speedup_string = f"{networkx_runtime / cugraph_runtime:.3f}X"
    delta = networkx_runtime - cugraph_runtime
    if abs(delta) < 1:
        if abs(delta) < 0.001:
            units = "us"
            delta *= 1e6
        else:
            units = "ms"
            delta *= 1e3
    else:
        units = "s"
    delta_string = f"{delta:.3f}{units}"

    return (speedup_string, delta_string)
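
# e.g. compute_perf_vals(cugraph_runtime=0.5, networkx_runtime=2.0)
# returns ("4.000X", "1.500s")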


# Populate the benchmarks dict from the .json files in logs_dir
for json_file in logs_dir.glob("*.json"):
    # print(f"READING {json_file}")
    try:
        data = json.loads(json_file.read_text())
    except json.decoder.JSONDecodeError:
        # print(f"PROBLEM READING {json_file}, skipping.")
        continue

    for benchmark_run in data["benchmarks"]:
        # example name: "bench_triangles[ds=netscience-backend=cugraph-preconverted]"
        name = benchmark_run["name"]

        algo_name = name.split("[")[0]
        if algo_name.startswith("bench_"):
            algo_name = algo_name[6:]
        # special case for the betweenness_centrality benchmarks, which are
        # also parametrized by k
        match = k_patt.match(name)
        if match is not None:
            algo_name += f", k={match.group(1)}"

        match = dataset_patt.match(name)
        if match is None:
            raise RuntimeError(
                f"benchmark name {name} in file {json_file} has an unexpected format"
            )
        dataset = match.group(1)
        if dataset.endswith("-backend"):
            dataset = dataset[:-8]

        match = backend_patt.match(name)
        if match is None:
            raise RuntimeError(
                f"benchmark name {name} in file {json_file} has an unexpected format"
            )
        backend = match.group(1)
        if backend == "None":
            backend = "networkx"

        runtime = benchmark_run["stats"]["mean"]
        benchmarks.setdefault(algo_name, {}).setdefault(backend, {})[dataset] = runtime


# dump HTML table
ordered_datasets = [
    "netscience",
    "email_Eu_core",
    "cit-patents",
    "hollywood",
    "soc-livejournal1",
]

print(
    """
<html>
<head>
<style>
table {
    table-layout: fixed;
    width: 100%;
    border-collapse: collapse;
}
tbody tr:nth-child(odd) {
    background-color: #ffffff;
}
tbody tr:nth-child(even) {
    background-color: #d3d3d3;
}
tbody td {
    text-align: center;
}
th,
td {
    padding: 10px;
}
</style>
</head>
<table>
<thead>
<tr>
<th></th>"""
)
for ds in ordered_datasets:
    print(f" <th>{ds}</th>")
print(
""" </tr>
</thead>
<tbody>
"""
)


for algo_name in benchmarks:
    algo_runs = benchmarks[algo_name]
    print(" <tr>")
    print(f" <td>{algo_name}</td>")

    # Proceed only if results are present for both cugraph and NX
    if "cugraph" in algo_runs and "networkx" in algo_runs:
        cugraph_algo_runs = algo_runs["cugraph"]
        networkx_algo_runs = algo_runs["networkx"]
        datasets_in_both = set(cugraph_algo_runs).intersection(networkx_algo_runs)

        # populate the table with speedup results for each dataset in the order
        # specified in ordered_datasets. If results for a run using a dataset
        # are not present for both cugraph and NX, output an empty cell.
        for dataset in ordered_datasets:
            if dataset in datasets_in_both:
                cugraph_runtime = cugraph_algo_runs[dataset]
                networkx_runtime = networkx_algo_runs[dataset]
                (speedup, runtime_delta) = compute_perf_vals(
                    cugraph_runtime=cugraph_runtime, networkx_runtime=networkx_runtime
                )
                print(f" <td>{speedup}<br>{runtime_delta}</td>")
            else:
                print(" <td></td>")

    # If a comparison between cugraph and NX cannot be made, output empty cells
    # for each dataset
    else:
        for _ in range(len(ordered_datasets)):
            print(" <td></td>")

    print(" </tr>")

print(
"""
</tbody>
</table>
</html>
"""
)
46 changes: 46 additions & 0 deletions benchmarks/nx-cugraph/pytest-based/run-main.sh
@@ -0,0 +1,46 @@
#!/bin/bash
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Runs the nx-cugraph benchmarks in bench_algos.py for every combination of
# backend, algo, and dataset listed below, writing pytest-benchmark .json
# results and captured output to ./logs.

export RAPIDS_DATASET_ROOT_DIR=/datasets/cugraph
mkdir -p logs

algos="
pagerank
betweenness_centrality
louvain
shortest_path
weakly_connected_components
triangles
bfs_predecessors
"

datasets="
netscience
email_Eu_core
cit_patents
hollywood
soc-livejournal
"

# The "None" backend is default NetworkX;
# the "cugraph-preconverted" backend is nx-cugraph.
backends="
None
cugraph-preconverted
"

for dataset in $datasets; do
    python ensure_dataset_accessible.py $dataset
    for backend in $backends; do
        for algo in $algos; do
            name="${backend}__${algo}__${dataset}"
            echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo and not 1000\" --benchmark-json=\"logs/${name}.json\" bench_algos.py\""
            pytest -sv -k "$backend and $dataset and bench_$algo and not 1000" --benchmark-json="logs/${name}.json" bench_algos.py 2>&1 | tee "logs/${name}.out"
        done
    done
done
