Commit 38a1755

Merge branch 'plc-eid-lookup' of https://github.com/alexbarghi-nv/cugraph into plc-eid-lookup

alexbarghi-nv committed Oct 18, 2024
2 parents 340a488 + f3e69a5
Showing 83 changed files with 279 additions and 582 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -68,7 +68,7 @@ repos:
types: [python]
language: python
pass_filenames: false
additional_dependencies: ["networkx>=3.3"]
additional_dependencies: ["networkx>=3.4"]
- repo: local
hooks:
- id: nx-cugraph-readme-update
@@ -78,4 +78,4 @@ repos:
types_or: [python, markdown]
language: python
pass_filenames: false
additional_dependencies: ["networkx>=3.3"]
additional_dependencies: ["networkx>=3.4"]
4 changes: 3 additions & 1 deletion benchmarks/nx-cugraph/pytest-based/README.md
@@ -21,7 +21,9 @@ Our current benchmarks provide the following datasets:
#### 1. `run-main-benchmarks.sh`
This script allows users to run a small set of commonly used algorithms across multiple datasets and backends. All results are stored in a sub-directory (`logs/`), and output files are named based on the combination of parameters for that benchmark.

NOTE: If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running.
NOTE:
- If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running.
- The `betweenness_centrality` benchmark runs with k values `[10, 20, 50, 100, 500, 1000]` by default. You can restrict the run to specific k values by editing `bc_k_values` (line 46), which is passed to pytest as a [keyword expression](https://docs.pytest.org/en/6.2.x/usage.html#specifying-tests-selecting-tests); see the example below.
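  For example, a hypothetical setting that skips the two largest k values (pytest `-k` matches substrings of test ids such as `k=500`):

  ```bash
  # in run-main-benchmarks.sh
  bc_k_values="and not 500 and not 1000"
  ```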

**Usage:**
- Run with `--cpu-only`:
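  ```bash
  # example invocation (sketch); run from benchmarks/nx-cugraph/pytest-based
  ./run-main-benchmarks.sh --cpu-only
  ```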
53 changes: 38 additions & 15 deletions benchmarks/nx-cugraph/pytest-based/bench_algos.py
@@ -37,6 +37,40 @@
iterations = 1
warmup_rounds = 1

# FIXME: Add this to cugraph.datasets. This is done here so these benchmarks
# can be run without requiring an updated cugraph install. This temporarily
# adds a dataset based on an Amazon product co-purchasing network.
amazon0302_metadata = """
name: amazon0302
description:
Network was collected by crawling the Amazon website. It is based on the "Customers Who Bought This Item Also Bought" feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected on March 2, 2003.
author: J. Leskovec, L. Adamic and B. Huberman
refs: J. Leskovec, L. Adamic and B. Huberman. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007.
delim: "\t"
header: 3
col_names:
- FromNodeId
- ToNodeId
col_types:
- int32
- int32
has_loop: false
is_directed: true
is_multigraph: false
is_symmetric: false
number_of_edges: 1234877
number_of_nodes: 262111
url: https://snap.stanford.edu/data/amazon0302.txt.gz
"""
amazon0302_metadata_file_name = datasets.default_download_dir.path / "amazon0302.yaml"
if not amazon0302_metadata_file_name.exists():
amazon0302_metadata_file_name.parent.mkdir(parents=True, exist_ok=True)
with open(amazon0302_metadata_file_name, "w") as f:
f.write(amazon0302_metadata)

amazon0302_dataset = datasets.Dataset(amazon0302_metadata_file_name)
amazon0302_dataset.metadata["file_type"] = ".gz"
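# Example usage (sketch): once registered, amazon0302_dataset can be consumed
# like any built-in cugraph.datasets entry, as nx_graph_from_dataset() does below:
#   edgelist = amazon0302_dataset.get_edgelist(download=True, reader="pandas")
#   G = nx.from_pandas_edgelist(
#       edgelist, source="FromNodeId", target="ToNodeId", create_using=nx.DiGraph)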

dataset_param_values = [
# name: karate, nodes: 34, edges: 156
pytest.param(datasets.karate, marks=[pytest.mark.small, pytest.mark.undirected]),
@@ -46,6 +80,8 @@
pytest.param(
datasets.email_Eu_core, marks=[pytest.mark.small, pytest.mark.directed]
),
# name: amazon0302, nodes: 262111, edges: 1234877
pytest.param(amazon0302_dataset, marks=[pytest.mark.medium, pytest.mark.directed]),
# name: cit-Patents, nodes: 3774768, edges: 16518948
pytest.param(
datasets.cit_patents, marks=[pytest.mark.medium, pytest.mark.directed]
@@ -113,19 +149,7 @@ def nx_graph_from_dataset(dataset_obj):
"""
create_using = nx.DiGraph if dataset_obj.metadata["is_directed"] else nx.Graph
names = dataset_obj.metadata["col_names"]
dtypes = dataset_obj.metadata["col_types"]
if isinstance(dataset_obj.metadata["header"], int):
header = dataset_obj.metadata["header"]
else:
header = None

pandas_edgelist = pd.read_csv(
dataset_obj.get_path(),
delimiter=dataset_obj.metadata["delim"],
names=names,
dtype=dict(zip(names, dtypes)),
header=header,
)
pandas_edgelist = dataset_obj.get_edgelist(download=True, reader="pandas")
G = nx.from_pandas_edgelist(
pandas_edgelist, source=names[0], target=names[1], create_using=create_using
)
@@ -272,7 +296,7 @@ def bench_from_networkx(benchmark, graph_obj):

# normalized_param_values = [True, False]
normalized_param_values = [True]
k_param_values = [10, 100, 1000]
k_param_values = [10, 20, 50, 100, 500, 1000]
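# each k yields a pytest id like "k=10" (via the ids lambda below), which is what
# run-main-benchmarks.sh filters on with pytest -k / bc_k_values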


@pytest.mark.parametrize(
@@ -281,7 +305,6 @@ def bench_from_networkx(benchmark, graph_obj):
@pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}")
def bench_betweenness_centrality(benchmark, graph_obj, backend_wrapper, normalized, k):
G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)

if k > G.number_of_nodes():
pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}")

@@ -166,6 +166,7 @@ def get_system_info():
ordered_datasets = [
"netscience",
"email_Eu_core",
"amazon0302",
"cit-patents",
"hollywood",
"soc-livejournal1",
@@ -174,6 +175,7 @@ def get_system_info():
dataset_meta = {
"netscience": ["1,461", "5,484", "Yes"],
"email_Eu_core": ["1,005", "25,571", "Yes"],
"amazon0302": ["262,111", "1,234,877", "Yes"],
"cit-patents": ["3,774,768", "16,518,948", "Yes"],
"hollywood": ["1,139,905", "57,515,616", "No"],
"soc-livejournal1": ["4,847,571", "68,993,773", "Yes"],
35 changes: 0 additions & 35 deletions benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py

This file was deleted.

18 changes: 12 additions & 6 deletions benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh
@@ -14,7 +14,7 @@


# location to store datasets used for benchmarking
export RAPIDS_DATASET_ROOT_DIR=/datasets/cugraph
export RAPIDS_DATASET_ROOT_DIR=${RAPIDS_DATASET_ROOT_DIR:-/datasets/cugraph}
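# (the ${VAR:-default} expansion preserves any value already exported by the caller)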
mkdir -p logs

# list of algos, datasets, and back-ends to use in combinations
@@ -30,6 +30,7 @@ algos="
datasets="
netscience
email_Eu_core
amazon0302
cit-patents
hollywood
soc-livejournal
@@ -40,6 +41,11 @@ backends="
None
cugraph-preconverted
"

# edit this directly to pass a pytest -k filter for the betweenness centrality k values,
# e.g. bc_k_values="and not 100 and not 1000"
bc_k_values=""

# check for --cpu-only or --gpu-only args
if [[ "$#" -eq 1 ]]; then
case $1 in
@@ -58,15 +64,15 @@ fi

for algo in $algos; do
for dataset in $datasets; do
# this script can be used to download benchmarking datasets by name via cugraph.datasets
python get_graph_bench_dataset.py $dataset
for backend in $backends; do
name="${backend}__${algo}__${dataset}"
echo "Running: $backend, $dataset, bench_$algo"
# command to preproduce test
# echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo and not 1000\" --benchmark-json=\"logs/${name}.json\" bench_algos.py"

# uncomment to get command for reproducing test
# echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo $bc_k_values\" --benchmark-json=\"logs/${name}.json\" bench_algos.py"

pytest -sv \
-k "$backend and $dataset and bench_$algo and not 1000" \
-k "$backend and $dataset and bench_$algo $bc_k_values" \
--benchmark-json="logs/${name}.json" \
bench_algos.py 2>&1 | tee "logs/${name}.out"
done
1 change: 0 additions & 1 deletion ci/test_wheel.sh
@@ -4,7 +4,6 @@
set -eoxu pipefail

package_name=$1
package_dir=$2

python_package_name=$(echo ${package_name}|sed 's/-/_/g')
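# (the sed above maps dashes to underscores, matching wheel filename conventions)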

25 changes: 12 additions & 13 deletions ci/test_wheel_cugraph-dgl.sh
@@ -4,24 +4,16 @@
set -eoxu pipefail

package_name="cugraph-dgl"
package_dir="python/cugraph-dgl"

python_package_name=$(echo ${package_name}|sed 's/-/_/g')

mkdir -p ./dist
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"

# Download wheels built during this job.
# Download the pylibcugraph, cugraph, and cugraph-dgl wheels built in the previous step
RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
RAPIDS_PY_WHEEL_NAME="cugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
python -m pip install ./local-deps/*.whl

# use 'ls' to expand wildcard before adding `[extra]` requires for pip
RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
# pip creates wheels using python package names
python -m pip install $(ls ./dist/${python_package_name}*.whl)[test]


# determine pytorch and DGL sources
PKG_CUDA_VER="$(echo ${CUDA_VERSION} | cut -d '.' -f1,2 | tr -d '.')"
PKG_CUDA_VER_MAJOR=${PKG_CUDA_VER:0:2}
if [[ "${PKG_CUDA_VER_MAJOR}" == "12" ]]; then
@@ -32,8 +24,15 @@ fi
PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}"
DGL_URL="https://data.dgl.ai/wheels/torch-2.3/cu${PYTORCH_CUDA_VER}/repo.html"

rapids-logger "Installing PyTorch and DGL"
rapids-retry python -m pip install torch==2.3.0 --index-url ${PYTORCH_URL}
rapids-retry python -m pip install dgl==2.4.0 --find-links ${DGL_URL}
# echo to expand wildcard before adding `[extra]` requires for pip
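# installing the local wheels and the torch/DGL pins in a single pip invocation
# lets the resolver pick mutually compatible versions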
python -m pip install \
-v \
--extra-index-url "${PYTORCH_URL}" \
--find-links "${DGL_URL}" \
"$(echo ./local-deps/pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./local-deps/cugraph_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./dist/cugraph_dgl_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
'dgl==2.4.0' \
'torch>=2.3.0,<2.4'
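# (torch is pinned below 2.4 to match the DGL wheels published for torch 2.3; see DGL_URL)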

python -m pytest python/cugraph-dgl/tests
19 changes: 9 additions & 10 deletions ci/test_wheel_cugraph-equivariant.sh
@@ -4,19 +4,14 @@
set -eoxu pipefail

package_name="cugraph-equivariant"
package_dir="python/cugraph-equivariant"

python_package_name=$(echo ${package_name}|sed 's/-/_/g')

mkdir -p ./dist
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"

# use 'ls' to expand wildcard before adding `[extra]` requires for pip
# Download the cugraph-equivariant wheel built in the previous step
RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
# pip creates wheels using python package names
python -m pip install $(ls ./dist/${python_package_name}*.whl)[test]


# determine pytorch source
PKG_CUDA_VER="$(echo ${CUDA_VERSION} | cut -d '.' -f1,2 | tr -d '.')"
PKG_CUDA_VER_MAJOR=${PKG_CUDA_VER:0:2}
if [[ "${PKG_CUDA_VER_MAJOR}" == "12" ]]; then
@@ -26,8 +21,12 @@ else
fi
PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}"

rapids-logger "Installing PyTorch and e3nn"
rapids-retry python -m pip install torch --index-url ${PYTORCH_URL}
rapids-retry python -m pip install e3nn
# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install \
-v \
--extra-index-url "${PYTORCH_URL}" \
"$(echo ./dist/cugraph_equivariant_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
'e3nn' \
'torch>=2.3.0,<2.4'

python -m pytest python/cugraph-equivariant/cugraph_equivariant/tests
47 changes: 23 additions & 24 deletions ci/test_wheel_cugraph-pyg.sh
@@ -4,45 +4,44 @@
set -eoxu pipefail

package_name="cugraph-pyg"
package_dir="python/cugraph-pyg"

python_package_name=$(echo ${package_name}|sed 's/-/_/g')

mkdir -p ./dist
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"

# Download wheels built during this job.
# Download the pylibcugraph, cugraph, and cugraph-pyg wheels built in the previous step
RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
RAPIDS_PY_WHEEL_NAME="cugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
python -m pip install ./local-deps/*.whl

# use 'ls' to expand wildcard before adding `[extra]` requires for pip
RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
# pip creates wheels using python package names
python -m pip install $(ls ./dist/${python_package_name}*.whl)[test]

# RAPIDS_DATASET_ROOT_DIR is used by test scripts
export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"

# Used to skip certain examples in CI due to memory limitations
export CI_RUN=1

# determine pytorch and pyg sources
if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then
PYTORCH_URL="https://download.pytorch.org/whl/cu118"
PYG_URL="https://data.pyg.org/whl/torch-2.3.0+cu118.html"
else
PYTORCH_URL="https://download.pytorch.org/whl/cu121"
PYG_URL="https://data.pyg.org/whl/torch-2.3.0+cu121.html"
fi
rapids-logger "Installing PyTorch and PyG dependencies"
rapids-retry python -m pip install torch==2.3.0 --index-url ${PYTORCH_URL}
rapids-retry python -m pip install "torch-geometric>=2.5,<2.6"
rapids-retry python -m pip install \
ogb \
pyg_lib \
torch_scatter \
torch_sparse \
-f ${PYG_URL}

# echo to expand wildcard before adding `[extra]` requires for pip
python -m pip install \
-v \
--extra-index-url "${PYTORCH_URL}" \
--find-links "${PYG_URL}" \
"$(echo ./local-deps/pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./local-deps/cugraph_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./dist/cugraph_pyg_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
'ogb' \
'pyg_lib' \
'torch>=2.3.0,<2.4' \
'torch-geometric>=2.5,<2.6' \
'torch_scatter' \
'torch_sparse'

# RAPIDS_DATASET_ROOT_DIR is used by test scripts
export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"

# Used to skip certain examples in CI due to memory limitations
export CI_RUN=1

rapids-logger "pytest cugraph-pyg (single GPU)"
pushd python/cugraph-pyg/cugraph_pyg
4 changes: 2 additions & 2 deletions ci/test_wheel_cugraph.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

set -eoxu pipefail

@@ -8,4 +8,4 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcugraph-dep
python -m pip install --no-deps ./local-pylibcugraph-dep/pylibcugraph*.whl

./ci/test_wheel.sh cugraph python/cugraph
./ci/test_wheel.sh cugraph
2 changes: 1 addition & 1 deletion ci/test_wheel_nx-cugraph.sh
@@ -8,4 +8,4 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-deps
python -m pip install ./local-deps/*.whl

./ci/test_wheel.sh nx-cugraph python/nx-cugraph
./ci/test_wheel.sh nx-cugraph