Merge branch 'branch-24.02' into branch-24.02-nx_cugraph_readme_update

rapidsai · Feb 2, 2024 · e32918f · e32918f
2 parents 5805292 + 3d52f17
commit e32918f
Show file tree

Hide file tree

Showing 19 changed files with 678 additions and 46 deletions.
diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
diff --git a/benchmarks/nx-cugraph/pytest-based/run-2402.sh b/benchmarks/nx-cugraph/pytest-based/run-2402.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Runs benchmarks for the 24.02 algos.
+# Pass "a", "b", or both as arguments. This is useful for separating batches of runs on different GPUs:
+# CUDA_VISIBLE_DEVICES=1 run-2402.sh b
+
+mkdir -p logs
+
+# The benches="$benches ..." pattern makes it easy to comment out individual runs
+benches=
+
+while [[ $1 != "" ]]; do
+    if [[ $1 == "a" ]]; then
+        benches="$benches bench_ancestors"
+        benches="$benches bench_average_clustering"
+        benches="$benches bench_generic_bfs_edges"
+        benches="$benches bench_bfs_edges"
+        benches="$benches bench_bfs_layers"
+        benches="$benches bench_bfs_predecessors"
+        benches="$benches bench_bfs_successors"
+        benches="$benches bench_bfs_tree"
+        benches="$benches bench_clustering"
+        benches="$benches bench_core_number"
+        benches="$benches bench_descendants"
+    elif [[ $1 == "b" ]]; then
+        benches="$benches bench_descendants_at_distance"
+        benches="$benches bench_is_bipartite"
+        benches="$benches bench_is_strongly_connected"
+        benches="$benches bench_is_weakly_connected"
+        benches="$benches bench_number_strongly_connected_components"
+        benches="$benches bench_number_weakly_connected_components"
+        benches="$benches bench_overall_reciprocity"
+        benches="$benches bench_reciprocity"
+        benches="$benches bench_strongly_connected_components"
+        benches="$benches bench_transitivity"
+        benches="$benches bench_triangles"
+        benches="$benches bench_weakly_connected_components"
+    fi
+    shift
+done
+
+for bench in $benches; do
+    pytest -sv -k "soc-livejournal1" "bench_algos.py::$bench" 2>&1 | tee "logs/${bench}.log"
+done
diff --git a/ci/test_python.sh b/ci/test_python.sh
@@ -63,7 +63,16 @@ pytest \
   tests
 popd
 
-# FIXME: TEMPORARILY disable single-GPU "MG" testing
+# Test runs that include tests that use dask require
+# --import-mode=append. Those tests start a LocalCUDACluster that inherits
+# changes from pytest's modifications to PYTHONPATH (which defaults to
+# prepending source tree paths to PYTHONPATH).  This causes the
+# LocalCUDACluster subprocess to import cugraph from the source tree instead of
+# the install location, and in most cases, the source tree does not have
+# extensions built in-place and will result in ImportErrors.
+#
+# FIXME: TEMPORARILY disable MG PropertyGraph tests (experimental) and
+# bulk sampler IO tests (hangs in CI)
 rapids-logger "pytest cugraph"
 pushd python/cugraph/cugraph
 DASK_WORKER_DEVICES="0" \
@@ -72,14 +81,15 @@ DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="1000s" \
 DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT="1000s" \
 pytest \
   -v \
+  --import-mode=append \
   --benchmark-disable \
   --cache-clear \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph.xml" \
   --cov-config=../../.coveragerc \
   --cov=cugraph \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-coverage.xml" \
   --cov-report=term \
-  -k "not test_property_graph_mg" \
+  -k "not test_property_graph_mg and not test_bulk_sampler_io" \
   tests
 popd
 
@@ -110,12 +120,33 @@ popd
 
 rapids-logger "pytest networkx using nx-cugraph backend"
 pushd python/nx-cugraph
+# Use editable install to make coverage work
+pip install -e . --no-deps
 ./run_nx_tests.sh
 # run_nx_tests.sh outputs coverage data, so check that total coverage is >0.0%
 # in case nx-cugraph failed to load but fallback mode allowed the run to pass.
 _coverage=$(coverage report|grep "^TOTAL")
 echo "nx-cugraph coverage from networkx tests: $_coverage"
 echo $_coverage | awk '{ if ($NF == "0.0%") exit 1 }'
+# Ensure all algorithms were called by comparing covered lines to function lines.
+# Run our tests again (they're fast enough) to add their coverage, then create coverage.json
+pytest \
+  --pyargs nx_cugraph \
+  --config-file=./pyproject.toml \
+  --cov-config=./pyproject.toml \
+  --cov=nx_cugraph \
+  --cov-append \
+  --cov-report=
+coverage report \
+  --include="*/nx_cugraph/algorithms/*" \
+  --omit=__init__.py \
+  --show-missing \
+  --rcfile=./pyproject.toml
+coverage json --rcfile=./pyproject.toml
+python -m nx_cugraph.tests.ensure_algos_covered
+# Exercise (and show results of) scripts that show implemented networkx algorithms
+python -m nx_cugraph.scripts.print_tree --dispatch-name --plc --incomplete --different
+python -m nx_cugraph.scripts.print_table
 popd
 
 rapids-logger "pytest cugraph-service (single GPU)"

diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -eoxu pipefail
 
@@ -21,10 +21,21 @@ arch=$(uname -m)
 if [[ "${arch}" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then
     python ./ci/wheel_smoke_test_${package_name}.py
 else
-    # FIXME: TEMPORARILY disable single-GPU "MG" testing
+    # Test runs that include tests that use dask require
+    # --import-mode=append. See test_python.sh for details.
+    # FIXME: Adding PY_IGNORE_IMPORTMISMATCH=1 to work around conftest.py import
+    # mismatch error seen by nx-cugraph after using pytest 8 and
+    # --import-mode=append.
     RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets \
+    PY_IGNORE_IMPORTMISMATCH=1 \
+    DASK_WORKER_DEVICES="0" \
     DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL="1000s" \
     DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="1000s" \
     DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT="1000s" \
-    python -m pytest ./python/${package_name}/${python_package_name}/tests
+    python -m pytest \
+       -v \
+       --import-mode=append \
+       --benchmark-disable \
+       -k "not test_property_graph_mg and not test_bulk_sampler_io" \
+       ./python/${package_name}/${python_package_name}/tests
 fi
diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -52,6 +52,7 @@ function(find_and_configure_raft)
                 "RAFT_COMPILE_LIBRARY ${PKG_COMPILE_RAFT_LIB}"
                 "BUILD_TESTS OFF"
                 "BUILD_BENCH OFF"
+                "BUILD_CAGRA_HNSWLIB OFF"
     )
 
     if(raft_ADDED)

diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
@@ -39,6 +39,7 @@
 )
 from cugraph.dask.common.mg_utils import run_gc_on_dask_cluster
 import cugraph.dask.comms.comms as Comms
+from cugraph.structure.symmetrize import _memory_efficient_drop_duplicates
 
 
 class simpleDistributedGraphImpl:
@@ -95,6 +96,7 @@ def _make_plc_graph(
         weight_type,
         edge_id_type,
         edge_type_id,
+        drop_multi_edges,
     ):
         weights = None
         edge_ids = None
@@ -149,6 +151,7 @@ def _make_plc_graph(
             num_arrays=num_arrays,
             store_transposed=store_transposed,
             do_expensive_check=False,
+            drop_multi_edges=drop_multi_edges,
         )
         del edata_x
         gc.collect()
@@ -267,7 +270,7 @@ def __from_edgelist(
                 input_ddf,
                 source,
                 destination,
-                multi=self.properties.multi_edge,
+                multi=True,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
             value_col = None
@@ -277,7 +280,7 @@ def __from_edgelist(
                 source,
                 destination,
                 value_col_names,
-                multi=self.properties.multi_edge,
+                multi=True,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
 
@@ -364,6 +367,7 @@ def __from_edgelist(
                 self.weight_type,
                 self.edge_id_type,
                 self.edge_type_id_type,
+                not self.properties.multi_edge,
             )
             for w, edata in persisted_keys_d.items()
         }
@@ -455,6 +459,15 @@ def view_edge_list(self):
                 else:
                     is_multi_column = True
 
+            if not self.properties.multi_edge:
+                # Drop parallel edges for non MultiGraph
+                # FIXME: Drop multi edges with the CAPI instead.
+                _client = default_client()
+                workers = _client.scheduler_info()["workers"]
+                edgelist_df = _memory_efficient_drop_duplicates(
+                    edgelist_df, [srcCol, dstCol], len(workers)
+                )
+
             edgelist_df[srcCol], edgelist_df[dstCol] = edgelist_df[
                 [srcCol, dstCol]
             ].min(axis=1), edgelist_df[[srcCol, dstCol]].max(axis=1)

diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -264,7 +264,7 @@ def __from_edgelist(
                 source,
                 destination,
                 edge_attr,
-                multi=self.properties.multi_edge,
+                multi=self.properties.multi_edge,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
 
@@ -279,7 +279,7 @@ def __from_edgelist(
                 elist,
                 source,
                 destination,
-                multi=self.properties.multi_edge,
+                multi=self.properties.multi_edge,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
 
@@ -298,7 +298,10 @@ def __from_edgelist(
             self._replicate_edgelist()
 
         self._make_plc_graph(
-            value_col=value_col, store_transposed=store_transposed, renumber=renumber
+            value_col=value_col,
+            store_transposed=store_transposed,
+            renumber=renumber,
+            drop_multi_edges=not self.properties.multi_edge,
         )
 
     def to_pandas_edgelist(
@@ -477,13 +480,15 @@ def view_edge_list(self):
                     edgelist_df[simpleGraphImpl.srcCol]
                     <= edgelist_df[simpleGraphImpl.dstCol]
                 ]
+
         elif not use_initial_input_df and self.properties.renumbered:
             # Do not unrenumber the vertices if the initial input df was used
             if not self.properties.directed:
                 edgelist_df = edgelist_df[
                     edgelist_df[simpleGraphImpl.srcCol]
                     <= edgelist_df[simpleGraphImpl.dstCol]
                 ]
+
             edgelist_df = self.renumber_map.unrenumber(
                 edgelist_df, simpleGraphImpl.srcCol
             )
@@ -1084,6 +1089,7 @@ def _make_plc_graph(
         value_col: Dict[str, cudf.DataFrame] = None,
         store_transposed: bool = False,
         renumber: bool = True,
+        drop_multi_edges: bool = False,
     ):
         """
         Parameters
@@ -1100,6 +1106,8 @@ def _make_plc_graph(
             Whether to renumber the vertices of the graph.
             Required if inputted vertex ids are not of
             int32 or int64 type.
+        drop_multi_edges: bool (default=False)
+            Whether to drop multi edges
         """
 
         if value_col is None:
@@ -1163,6 +1171,7 @@ def _make_plc_graph(
             renumber=renumber,
             do_expensive_check=True,
             input_array_format=input_array_format,
+            drop_multi_edges=drop_multi_edges,
         )
 
     def to_directed(self, DiG, store_transposed=False):