Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds new dataset for benchmarking in the 100k node 1M edge range, adds additional k-values for BC benchmarks #4726

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion benchmarks/nx-cugraph/pytest-based/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ Our current benchmarks provide the following datasets:
#### 1. `run-main-benchmarks.sh`
This script allows users to run a small set of commonly-used algorithms across multiple datasets and backends. All results are stored inside a sub-directory (`logs/`) and output files are named based on the combination of parameters for that benchmark.

NOTE: If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running.
NOTE:
- If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running.
- The `betweenness_centrality` benchmark runs with `k` values `[10, 20, 50, 100, 500, 1000]` by default. To run only some of these k-values, edit `bc_k_values` (line 46 of the script); its contents are appended to the [pytest keyword expression](https://docs.pytest.org/en/6.2.x/usage.html#specifying-tests-selecting-tests) used to select tests.

**Usage:**
- Run with `--cpu-only`:
Expand Down
53 changes: 38 additions & 15 deletions benchmarks/nx-cugraph/pytest-based/bench_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,40 @@
iterations = 1
warmup_rounds = 1

# FIXME: Add this to cugraph.datasets. This is done here so these benchmarks
# can be run without requiring an updated cugraph install. This temporarily
# adds a dataset based on an Amazon product co-purchasing network.
#
# The YAML text below is written verbatim to "amazon0302.yaml" in the cugraph
# datasets download directory and is then loaded through datasets.Dataset.
# NOTE(review): the author/refs citation looks copied from the dataset's web
# page; the third author of the cited paper is presumably B. Huberman -
# confirm before correcting the text.
amazon0302_metadata = """
name: amazon0302
description:
  Network was collected by crawling Amazon website. It is based on Customers Who Bought This Item Also Bought feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003.
author: J. Leskovec, L. Adamic and B. Adamic
refs: J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007.
delim: "\t"
header: 3
col_names:
  - FromNodeId
  - ToNodeId
col_types:
  - int32
  - int32
has_loop: false
is_directed: true
is_multigraph: false
is_symmetric: false
number_of_edges: 1234877
number_of_nodes: 262111
url: https://snap.stanford.edu/data/amazon0302.txt.gz
"""
amazon0302_metadata_file_name = datasets.default_download_dir.path / "amazon0302.yaml"

# Write the metadata file only when it is missing; an existing file is left
# untouched, so edits to the text above do not reach a file already on disk.
if not amazon0302_metadata_file_name.exists():
    amazon0302_metadata_file_name.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the written file does not depend on the locale.
    with open(amazon0302_metadata_file_name, "w", encoding="utf-8") as f:
        f.write(amazon0302_metadata)

amazon0302_dataset = datasets.Dataset(amazon0302_metadata_file_name)
# The URL above ends in ".txt.gz", so the file type is recorded as ".gz" on
# the loaded metadata (it is not part of the YAML text itself).
amazon0302_dataset.metadata["file_type"] = ".gz"

dataset_param_values = [
# name: karate, nodes: 34, edges: 156
pytest.param(datasets.karate, marks=[pytest.mark.small, pytest.mark.undirected]),
Expand All @@ -46,6 +80,8 @@
pytest.param(
datasets.email_Eu_core, marks=[pytest.mark.small, pytest.mark.directed]
),
# name: amazon0302, nodes: 262111, edges: 1234877
pytest.param(amazon0302_dataset, marks=[pytest.mark.medium, pytest.mark.directed]),
# name: cit-Patents, nodes: 3774768, edges: 16518948
pytest.param(
datasets.cit_patents, marks=[pytest.mark.medium, pytest.mark.directed]
Expand Down Expand Up @@ -113,19 +149,7 @@ def nx_graph_from_dataset(dataset_obj):
"""
create_using = nx.DiGraph if dataset_obj.metadata["is_directed"] else nx.Graph
names = dataset_obj.metadata["col_names"]
dtypes = dataset_obj.metadata["col_types"]
if isinstance(dataset_obj.metadata["header"], int):
header = dataset_obj.metadata["header"]
else:
header = None

pandas_edgelist = pd.read_csv(
dataset_obj.get_path(),
delimiter=dataset_obj.metadata["delim"],
names=names,
dtype=dict(zip(names, dtypes)),
header=header,
)
pandas_edgelist = dataset_obj.get_edgelist(download=True, reader="pandas")
G = nx.from_pandas_edgelist(
pandas_edgelist, source=names[0], target=names[1], create_using=create_using
)
Expand Down Expand Up @@ -272,7 +296,7 @@ def bench_from_networkx(benchmark, graph_obj):

# normalized_param_values = [True, False]
normalized_param_values = [True]
k_param_values = [10, 100, 1000]
k_param_values = [10, 20, 50, 100, 500, 1000]


@pytest.mark.parametrize(
Expand All @@ -281,7 +305,6 @@ def bench_from_networkx(benchmark, graph_obj):
@pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}")
def bench_betweenness_centrality(benchmark, graph_obj, backend_wrapper, normalized, k):
G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)

if k > G.number_of_nodes():
pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def get_system_info():
ordered_datasets = [
"netscience",
"email_Eu_core",
"amazon0302",
"cit-patents",
"hollywood",
"soc-livejournal1",
Expand All @@ -174,6 +175,7 @@ def get_system_info():
dataset_meta = {
"netscience": ["1,461", "5,484", "Yes"],
"email_Eu_core": ["1,005", "25,571", "Yes"],
"amazon0302": ["262,111", "1,234,877", "Yes"],
"cit-patents": ["3,774,768", "16,518,948", "Yes"],
"hollywood": ["1,139,905", "57,515,616", "No"],
"soc-livejournal1": ["4,847,571", "68,993,773", "Yes"],
Expand Down
35 changes: 0 additions & 35 deletions benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py

This file was deleted.

20 changes: 13 additions & 7 deletions benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


# location to store datasets used for benchmarking
export RAPIDS_DATASET_ROOT_DIR=/datasets/cugraph
export RAPIDS_DATASET_ROOT_DIR=${RAPIDS_DATASET_ROOT_DIR:-/datasets/cugraph}
mkdir -p logs

# list of algos, datasets, and back-ends to use in combinations
Expand All @@ -30,6 +30,7 @@ algos="
datasets="
netscience
email_Eu_core
amazon0302
cit-patents
hollywood
soc-livejournal
Expand All @@ -40,6 +41,11 @@ backends="
None
cugraph-preconverted
"

# edit this directly to append extra keyword filters to the pytest -k expression
# e.g. -k "and not 100 and not 1000"
bc_k_values=""

# check for --cpu-only or --gpu-only args
if [[ "$#" -eq 1 ]]; then
case $1 in
Expand All @@ -58,15 +64,15 @@ fi

for algo in $algos; do
for dataset in $datasets; do
# this script can be used to download benchmarking datasets by name via cugraph.datasets
python get_graph_bench_dataset.py $dataset
for backend in $backends; do
name="${backend}__${algo}__${dataset}"
echo "Running: $backend, $dataset, bench_$algo"
# command to preproduce test
# echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo and not 1000\" --benchmark-json=\"logs/${name}.json\" bench_algos.py"
pytest -sv \
-k "$backend and $dataset and bench_$algo and not 1000" \

# uncomment to get command for reproducing test
# echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo $bc_k_values\" --benchmark-json=\"logs/${name}.json\" bench_algos.py"

# Run the benchmarks selected by the -k expression (backend, dataset, algo,
# plus any extra filters in $bc_k_values). The pytest-benchmark JSON goes to
# logs/${name}.json and the console output is mirrored to logs/${name}.out.
# NOTE: --co (--collect-only) is deliberately not passed here; with it pytest
# only lists the matching tests and no benchmark is executed.
pytest -sv \
    -k "$backend and $dataset and bench_$algo $bc_k_values" \
    --benchmark-json="logs/${name}.json" \
    bench_algos.py 2>&1 | tee "logs/${name}.out"
done
Expand Down
Loading