From 0c81f15364697e6d922aced223808c6c78de098e Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:09:52 -0500 Subject: [PATCH] Adds new dataset for benchmarking in the 100k node 1M edge range, adds additional k-values for BC benchmarks (#4726) Adds new dataset for benchmarking in the 100k node 1M edge range. This also updates the benchmark fixture to download the dataset instead of requiring a separate script. This also includes changes from #4721 to add additional k-values for BC benchmarks to run. The addition was done with an inline yaml file and a followup PR will be done to properly add this dataset to the cugraph.datasets API. The inline yaml was done in order to allow benchmarks to run on a system with an existing cugraph installation by simply copying this bench_algos.py, rather than require an updated cugraph install with the new dataset metadata. Authors: - Rick Ratzel (https://github.com/rlratzel) - Ralph Liu (https://github.com/nv-rliu) Approvers: - Ralph Liu (https://github.com/nv-rliu) - Don Acosta (https://github.com/acostadon) URL: https://github.com/rapidsai/cugraph/pull/4726 --- benchmarks/nx-cugraph/pytest-based/README.md | 4 +- .../nx-cugraph/pytest-based/bench_algos.py | 53 +++++++++++++------ .../create_results_summary_page.py | 2 + .../pytest-based/get_graph_bench_dataset.py | 35 ------------ .../pytest-based/run-main-benchmarks.sh | 20 ++++--- 5 files changed, 56 insertions(+), 58 deletions(-) delete mode 100644 benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py diff --git a/benchmarks/nx-cugraph/pytest-based/README.md b/benchmarks/nx-cugraph/pytest-based/README.md index 781550fa560..5d2406bfcd5 100644 --- a/benchmarks/nx-cugraph/pytest-based/README.md +++ b/benchmarks/nx-cugraph/pytest-based/README.md @@ -21,7 +21,9 @@ Our current benchmarks provide the following datasets: #### 1. `run-main-benchmarks.sh` This script allows users to run a small set of commonly-used algorithms across multiple datasets and backends. All results are stored inside a sub-directory (`logs/`) and output files are named based on the combination of parameters for that benchmark. -NOTE: If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running. +NOTE: + - If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running. + - The `betweenness_centrality` benchmark will run with values `[10, 20, 50, 100, 500, 1000]` by default. You can specify only specific k-values to be run by editing `bc_k_values` (line 46) to be passed as a [pytest keyword object](https://docs.pytest.org/en/6.2.x/usage.html#specifying-tests-selecting-tests). **Usage:** - Run with `--cpu-only`: diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py index f88d93c3f17..8852ed2a875 100644 --- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py +++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py @@ -37,6 +37,40 @@ iterations = 1 warmup_rounds = 1 +# FIXME: Add this to cugraph.datasets. This is done here so these benchmarks +# can be run without requiring an updated cugraph install. This temporarily +# adds a dataset based on an Amazon product co-purchasing network. +amazon0302_metadata = """ +name: amazon0302 +description: + Network was collected by crawling Amazon website. It is based on Customers Who Bought This Item Also Bought feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003. +author: J. Leskovec, L. Adamic and B. Adamic +refs: J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007. +delim: "\t" +header: 3 +col_names: + - FromNodeId + - ToNodeId +col_types: + - int32 + - int32 +has_loop: false +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 1234877 +number_of_nodes: 262111 +url: https://snap.stanford.edu/data/amazon0302.txt.gz +""" +amazon0302_metadata_file_name = datasets.default_download_dir.path / "amazon0302.yaml" +if not amazon0302_metadata_file_name.exists(): + amazon0302_metadata_file_name.parent.mkdir(parents=True, exist_ok=True) + with open(amazon0302_metadata_file_name, "w") as f: + f.write(amazon0302_metadata) + +amazon0302_dataset = datasets.Dataset(amazon0302_metadata_file_name) +amazon0302_dataset.metadata["file_type"] = ".gz" + dataset_param_values = [ # name: karate, nodes: 34, edges: 156 pytest.param(datasets.karate, marks=[pytest.mark.small, pytest.mark.undirected]), @@ -46,6 +80,8 @@ pytest.param( datasets.email_Eu_core, marks=[pytest.mark.small, pytest.mark.directed] ), + # name: amazon0302, nodes: 262111, edges: 1234877 + pytest.param(amazon0302_dataset, marks=[pytest.mark.medium, pytest.mark.directed]), # name: cit-Patents, nodes: 3774768, edges: 16518948 pytest.param( datasets.cit_patents, marks=[pytest.mark.medium, pytest.mark.directed] @@ -113,19 +149,7 @@ def nx_graph_from_dataset(dataset_obj): """ create_using = nx.DiGraph if dataset_obj.metadata["is_directed"] else nx.Graph names = dataset_obj.metadata["col_names"] - dtypes = dataset_obj.metadata["col_types"] - if isinstance(dataset_obj.metadata["header"], int): - header = dataset_obj.metadata["header"] - else: - header = None - - pandas_edgelist = pd.read_csv( - dataset_obj.get_path(), - delimiter=dataset_obj.metadata["delim"], - names=names, - dtype=dict(zip(names, dtypes)), - header=header, - ) + pandas_edgelist = dataset_obj.get_edgelist(download=True, reader="pandas") G = nx.from_pandas_edgelist( pandas_edgelist, source=names[0], target=names[1], create_using=create_using ) @@ -272,7 +296,7 @@ def bench_from_networkx(benchmark, graph_obj): # normalized_param_values = [True, False] normalized_param_values = [True] -k_param_values = [10, 100, 1000] +k_param_values = [10, 20, 50, 100, 500, 1000] @pytest.mark.parametrize( @@ -281,7 +305,6 @@ def bench_from_networkx(benchmark, graph_obj): @pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}") def bench_betweenness_centrality(benchmark, graph_obj, backend_wrapper, normalized, k): G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) - if k > G.number_of_nodes(): pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}") diff --git a/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py b/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py index f1cc4b06ccc..df4031e0f61 100644 --- a/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py +++ b/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py @@ -166,6 +166,7 @@ def get_system_info(): ordered_datasets = [ "netscience", "email_Eu_core", + "amazon0302", "cit-patents", "hollywood", "soc-livejournal1", @@ -174,6 +175,7 @@ def get_system_info(): dataset_meta = { "netscience": ["1,461", "5,484", "Yes"], "email_Eu_core": ["1,005", "25,571", "Yes"], + "amazon0302": ["262,111", "1,234,877", "Yes"], "cit-patents": ["3,774,768", "16,518,948", "Yes"], "hollywood": ["1,139,905", "57,515,616", "No"], "soc-livejournal1": ["4,847,571", "68,993,773", "Yes"], diff --git a/benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py b/benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py deleted file mode 100644 index 5a0a15da8ee..00000000000 --- a/benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Checks if a particular dataset has been downloaded inside the datasets dir -(RAPIDS_DATAEST_ROOT_DIR). If not, the file will be downloaded using the -datasets API. - -Positional Arguments: - 1) dataset name (e.g. 'email_Eu_core', 'cit-patents') - available datasets can be found here: `python/cugraph/cugraph/datasets/__init__.py` -""" - -import sys - -import cugraph.datasets as cgds - - -if __name__ == "__main__": - # download and store dataset (csv) by using the Datasets API - dataset = sys.argv[1].replace("-", "_") - dataset_obj = getattr(cgds, dataset) - - if not dataset_obj.get_path().exists(): - dataset_obj.get_edgelist(download=True) diff --git a/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh b/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh index 3059e3d4bdf..6c674a5e428 100755 --- a/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh +++ b/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh @@ -14,7 +14,7 @@ # location to store datasets used for benchmarking -export RAPIDS_DATASET_ROOT_DIR=/datasets/cugraph +export RAPIDS_DATASET_ROOT_DIR=${RAPIDS_DATASET_ROOT_DIR:-/datasets/cugraph} mkdir -p logs # list of algos, datasets, and back-ends to use in combinations @@ -30,6 +30,7 @@ algos=" datasets=" netscience email_Eu_core + amazon0302 cit-patents hollywood soc-livejournal @@ -40,6 +41,11 @@ backends=" None cugraph-preconverted " + +# edit this directly to for pytest +# e.g. -k "and not 100 and not 1000" +bc_k_values="" + # check for --cpu-only or --gpu-only args if [[ "$#" -eq 1 ]]; then case $1 in @@ -58,15 +64,15 @@ fi for algo in $algos; do for dataset in $datasets; do - # this script can be used to download benchmarking datasets by name via cugraph.datasets - python get_graph_bench_dataset.py $dataset for backend in $backends; do name="${backend}__${algo}__${dataset}" echo "Running: $backend, $dataset, bench_$algo" - # command to preproduce test - # echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo and not 1000\" --benchmark-json=\"logs/${name}.json\" bench_algos.py" - pytest -sv \ - -k "$backend and $dataset and bench_$algo and not 1000" \ + + # uncomment to get command for reproducing test + # echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo $bc_k_values\" --benchmark-json=\"logs/${name}.json\" bench_algos.py" + + pytest -sv --co \ + -k "$backend and $dataset and bench_$algo $bc_k_values" \ --benchmark-json="logs/${name}.json" \ bench_algos.py 2>&1 | tee "logs/${name}.out" done