Merge pull request #4137 from rapidsai/branch-24.02

Forward-merge branch-24.02 to branch-24.04
rapidsai · Feb 2, 2024 · b4f7a48 · b4f7a48
2 parents 26efc6d + 3d52f17
commit b4f7a48
Show file tree

Hide file tree

Showing 16 changed files with 636 additions and 34 deletions.
diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
diff --git a/benchmarks/nx-cugraph/pytest-based/run-2402.sh b/benchmarks/nx-cugraph/pytest-based/run-2402.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Runs benchmarks for the 24.02 algos.
+# Pass either a or b or both. This is useful for separating batches of runs on different GPUs:
+# CUDA_VISIBLE_DEVICES=1 run-2402.sh b
+
+mkdir -p logs
+
+# benches="$benches ..." pattern is easy to comment out individual runs
+benches=
+
+while [[ $1 != "" ]]; do
+    if [[ $1 == "a" ]]; then
+        benches="$benches bench_ancestors"
+        benches="$benches bench_average_clustering"
+        benches="$benches bench_generic_bfs_edges"
+        benches="$benches bench_bfs_edges"
+        benches="$benches bench_bfs_layers"
+        benches="$benches bench_bfs_predecessors"
+        benches="$benches bench_bfs_successors"
+        benches="$benches bench_bfs_tree"
+        benches="$benches bench_clustering"
+        benches="$benches bench_core_number"
+        benches="$benches bench_descendants"
+    elif [[ $1 == "b" ]]; then
+        benches="$benches bench_descendants_at_distance"
+        benches="$benches bench_is_bipartite"
+        benches="$benches bench_is_strongly_connected"
+        benches="$benches bench_is_weakly_connected"
+        benches="$benches bench_number_strongly_connected_components"
+        benches="$benches bench_number_weakly_connected_components"
+        benches="$benches bench_overall_reciprocity"
+        benches="$benches bench_reciprocity"
+        benches="$benches bench_strongly_connected_components"
+        benches="$benches bench_transitivity"
+        benches="$benches bench_triangles"
+        benches="$benches bench_weakly_connected_components"
+    fi
+    shift
+done
+
+for bench in $benches; do
+    pytest -sv -k "soc-livejournal1" "bench_algos.py::$bench" 2>&1 | tee "logs/${bench}.log"
+done
diff --git a/ci/test_python.sh b/ci/test_python.sh
@@ -120,12 +120,33 @@ popd
 
 rapids-logger "pytest networkx using nx-cugraph backend"
 pushd python/nx-cugraph
+# Use editable install to make coverage work
+pip install -e . --no-deps
 ./run_nx_tests.sh
 # run_nx_tests.sh outputs coverage data, so check that total coverage is >0.0%
 # in case nx-cugraph failed to load but fallback mode allowed the run to pass.
 _coverage=$(coverage report|grep "^TOTAL")
 echo "nx-cugraph coverage from networkx tests: $_coverage"
 echo $_coverage | awk '{ if ($NF == "0.0%") exit 1 }'
+# Ensure all algorithms were called by comparing covered lines to function lines.
+# Run our tests again (they're fast enough) to add their coverage, then create coverage.json
+pytest \
+  --pyargs nx_cugraph \
+  --config-file=./pyproject.toml \
+  --cov-config=./pyproject.toml \
+  --cov=nx_cugraph \
+  --cov-append \
+  --cov-report=
+coverage report \
+  --include="*/nx_cugraph/algorithms/*" \
+  --omit=__init__.py \
+  --show-missing \
+  --rcfile=./pyproject.toml
+coverage json --rcfile=./pyproject.toml
+python -m nx_cugraph.tests.ensure_algos_covered
+# Exercise (and show results of) scripts that show implemented networkx algorithms
+python -m nx_cugraph.scripts.print_tree --dispatch-name --plc --incomplete --different
+python -m nx_cugraph.scripts.print_table
 popd
 
 rapids-logger "pytest cugraph-service (single GPU)"

diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
@@ -39,6 +39,7 @@
 )
 from cugraph.dask.common.mg_utils import run_gc_on_dask_cluster
 import cugraph.dask.comms.comms as Comms
+from cugraph.structure.symmetrize import _memory_efficient_drop_duplicates
 
 
 class simpleDistributedGraphImpl:
@@ -95,6 +96,7 @@ def _make_plc_graph(
         weight_type,
         edge_id_type,
         edge_type_id,
+        drop_multi_edges,
     ):
         weights = None
         edge_ids = None
@@ -149,6 +151,7 @@ def _make_plc_graph(
             num_arrays=num_arrays,
             store_transposed=store_transposed,
             do_expensive_check=False,
+            drop_multi_edges=drop_multi_edges,
         )
         del edata_x
         gc.collect()
@@ -267,7 +270,7 @@ def __from_edgelist(
                 input_ddf,
                 source,
                 destination,
-                multi=self.properties.multi_edge,
+                multi=True,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
             value_col = None
@@ -277,7 +280,7 @@ def __from_edgelist(
                 source,
                 destination,
                 value_col_names,
-                multi=self.properties.multi_edge,
+                multi=True,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
 
@@ -364,6 +367,7 @@ def __from_edgelist(
                 self.weight_type,
                 self.edge_id_type,
                 self.edge_type_id_type,
+                not self.properties.multi_edge,
             )
             for w, edata in persisted_keys_d.items()
         }
@@ -455,6 +459,15 @@ def view_edge_list(self):
                 else:
                     is_multi_column = True
 
+            if not self.properties.multi_edge:
+                # Drop parallel edges for non MultiGraph
+                # FIXME: Drop multi edges with the CAPI instead.
+                _client = default_client()
+                workers = _client.scheduler_info()["workers"]
+                edgelist_df = _memory_efficient_drop_duplicates(
+                    edgelist_df, [srcCol, dstCol], len(workers)
+                )
+
             edgelist_df[srcCol], edgelist_df[dstCol] = edgelist_df[
                 [srcCol, dstCol]
             ].min(axis=1), edgelist_df[[srcCol, dstCol]].max(axis=1)

diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -264,7 +264,7 @@ def __from_edgelist(
                 source,
                 destination,
                 edge_attr,
-                multi=self.properties.multi_edge,
+                multi=self.properties.multi_edge,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
 
@@ -279,7 +279,7 @@ def __from_edgelist(
                 elist,
                 source,
                 destination,
-                multi=self.properties.multi_edge,
+                multi=self.properties.multi_edge,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
 
@@ -298,7 +298,10 @@ def __from_edgelist(
             self._replicate_edgelist()
 
         self._make_plc_graph(
-            value_col=value_col, store_transposed=store_transposed, renumber=renumber
+            value_col=value_col,
+            store_transposed=store_transposed,
+            renumber=renumber,
+            drop_multi_edges=not self.properties.multi_edge,
         )
 
     def to_pandas_edgelist(
@@ -477,13 +480,15 @@ def view_edge_list(self):
                     edgelist_df[simpleGraphImpl.srcCol]
                     <= edgelist_df[simpleGraphImpl.dstCol]
                 ]
+
         elif not use_initial_input_df and self.properties.renumbered:
             # Do not unrenumber the vertices if the initial input df was used
             if not self.properties.directed:
                 edgelist_df = edgelist_df[
                     edgelist_df[simpleGraphImpl.srcCol]
                     <= edgelist_df[simpleGraphImpl.dstCol]
                 ]
+
             edgelist_df = self.renumber_map.unrenumber(
                 edgelist_df, simpleGraphImpl.srcCol
             )
@@ -1084,6 +1089,7 @@ def _make_plc_graph(
         value_col: Dict[str, cudf.DataFrame] = None,
         store_transposed: bool = False,
         renumber: bool = True,
+        drop_multi_edges: bool = False,
     ):
         """
         Parameters
@@ -1100,6 +1106,8 @@ def _make_plc_graph(
             Whether to renumber the vertices of the graph.
             Required if inputted vertex ids are not of
             int32 or int64 type.
+        drop_multi_edges: bool (default=False)
+            Whether to drop multi edges
         """
 
         if value_col is None:
@@ -1163,6 +1171,7 @@ def _make_plc_graph(
             renumber=renumber,
             do_expensive_check=True,
             input_array_format=input_array_format,
+            drop_multi_edges=drop_multi_edges,
         )
 
     def to_directed(self, DiG, store_transposed=False):

diff --git a/python/cugraph/cugraph/structure/symmetrize.py b/python/cugraph/cugraph/structure/symmetrize.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -15,6 +15,7 @@
 import cudf
 import dask_cudf
 from dask.distributed import default_client
+import warnings
 
 
 def symmetrize_df(
@@ -54,6 +55,11 @@ def symmetrize_df(
         Name of the column in the data frame containing the weight ids
 
     multi : bool, optional (default=False)
+        [Deprecated,  Multi will be removed in future version,  and the removal
+        of multi edges will no longer be supported from 'symmetrize'.
+        Multi edges will be removed upon  creation of graph instance directly
+        based on if the graph is `curgaph.MultiGraph` or `cugraph.Graph`.]
+
         Set to True if graph is a Multi(Di)Graph. This allows multiple
         edges instead of dropping them.
 
@@ -84,6 +90,12 @@ def symmetrize_df(
     if multi:
         return result
     else:
+        warnings.warn(
+            "Multi is deprecated and the removal of multi edges will no longer be "
+            "supported from 'symmetrize'. Multi edges will be removed upon creation "
+            "of graph instance.",
+            FutureWarning,
+        )
         vertex_col_name = src_name + dst_name
         result = result.groupby(by=[*vertex_col_name], as_index=False).min()
         return result
@@ -128,6 +140,11 @@ def symmetrize_ddf(
         Name of the column in the data frame containing the weight ids
 
     multi : bool, optional (default=False)
+        [Deprecated,  Multi will be removed in future version,  and the removal
+        of multi edges will no longer be supported from 'symmetrize'.
+        Multi edges will be removed upon  creation of graph instance directly
+        based on if the graph is `curgaph.MultiGraph` or `cugraph.Graph`.]
+
         Set to True if graph is a Multi(Di)Graph. This allows multiple
         edges instead of dropping them.
 
@@ -165,8 +182,15 @@ def symmetrize_ddf(
     else:
         result = ddf
     if multi:
+        result = result.reset_index(drop=True).repartition(npartitions=len(workers) * 2)
         return result
     else:
+        warnings.warn(
+            "Multi is deprecated and the removal of multi edges will no longer be "
+            "supported from 'symmetrize'. Multi edges will be removed upon creation "
+            "of graph instance.",
+            FutureWarning,
+        )
         vertex_col_name = src_name + dst_name
         result = _memory_efficient_drop_duplicates(
             result, vertex_col_name, len(workers)
@@ -181,6 +205,7 @@ def symmetrize(
     value_col_name=None,
     multi=False,
     symmetrize=True,
+    do_expensive_check=False,
 ):
     """
     Take a dataframe of source destination pairs along with associated
@@ -208,6 +233,11 @@ def symmetrize(
         weights column name.
 
     multi : bool, optional (default=False)
+        [Deprecated,  Multi will be removed in future version,  and the removal
+        of multi edges will no longer be supported from 'symmetrize'.
+        Multi edges will be removed upon  creation of graph instance directly
+        based on if the graph is `curgaph.MultiGraph` or `cugraph.Graph`.]
+
         Set to True if graph is a Multi(Di)Graph. This allows multiple
         edges instead of dropping them.
 
@@ -234,8 +264,9 @@ def symmetrize(
     if "edge_id" in input_df.columns and symmetrize:
         raise ValueError("Edge IDs are not supported on undirected graphs")
 
-    csg.null_check(input_df[source_col_name])
-    csg.null_check(input_df[dest_col_name])
+    if do_expensive_check:  # FIXME: Optimize this check as it is currently expensive
+        csg.null_check(input_df[source_col_name])
+        csg.null_check(input_df[dest_col_name])
 
     if isinstance(input_df, dask_cudf.DataFrame):
         output_df = symmetrize_ddf(

diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -26,6 +26,7 @@
 from cugraph.testing import UNDIRECTED_DATASETS
 from cugraph.dask import uniform_neighbor_sample
 from cugraph.dask.common.mg_utils import is_single_gpu
+from cugraph.structure.symmetrize import _memory_efficient_drop_duplicates
 from cugraph.datasets import email_Eu_core, small_tree
 from pylibcugraph.testing.utils import gen_fixture_params_product
 
@@ -135,6 +136,14 @@ def test_mg_uniform_neighbor_sample_simple(dask_client, input_combo):
     dg = input_combo["MGGraph"]
 
     input_df = dg.input_df
+    # Drop parallel edges for non MultiGraph
+    # FIXME: Drop multi edges with the CAPI instead.
+    vertex_col_name = ["src", "dst"]
+    workers = dask_client.scheduler_info()["workers"]
+    input_df = _memory_efficient_drop_duplicates(
+        input_df, vertex_col_name, len(workers)
+    )
+
     result_nbr = uniform_neighbor_sample(
         dg,
         input_combo["start_list"],

diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml
@@ -26,7 +26,7 @@ repos:
       - id: mixed-line-ending
       - id: trailing-whitespace
   - repo: https://github.com/abravalheri/validate-pyproject
-    rev: v0.15
+    rev: v0.16
     hooks:
       - id: validate-pyproject
         name: Validate pyproject.toml

diff --git a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py
@@ -68,7 +68,7 @@ def generic_bfs_edges(G, source, neighbors=None, depth_limit=None, sort_neighbor
         raise NotImplementedError(
             "sort_neighbors argument in generic_bfs_edges is not currently supported"
         )
-    return bfs_edges(source, depth_limit=depth_limit)
+    return bfs_edges(G, source, depth_limit=depth_limit)
 
 
 @generic_bfs_edges._can_run

diff --git a/python/nx-cugraph/nx_cugraph/classes/digraph.py b/python/nx-cugraph/nx_cugraph/classes/digraph.py
@@ -86,9 +86,9 @@ def to_undirected(self, reciprocal=False, as_view=False):
                     key: val[indices].copy() for key, val in self.edge_masks.items()
                 }
             else:
-                src_indices, dst_indices = cp.divmod(
-                    src_dst_indices_new, N, dtype=index_dtype
-                )
+                src_indices, dst_indices = cp.divmod(src_dst_indices_new, N)
+                src_indices = src_indices.astype(index_dtype)
+                dst_indices = dst_indices.astype(index_dtype)
         else:
             src_dst_indices_old_T = self.src_indices + N * self.dst_indices.astype(
                 np.int64
@@ -116,9 +116,9 @@ def to_undirected(self, reciprocal=False, as_view=False):
                 src_dst_indices_new = cp.union1d(
                     src_dst_indices_old, src_dst_indices_old_T
                 )
-                src_indices, dst_indices = cp.divmod(
-                    src_dst_indices_new, N, dtype=index_dtype
-                )
+                src_indices, dst_indices = cp.divmod(src_dst_indices_new, N)
+                src_indices = src_indices.astype(index_dtype)
+                dst_indices = dst_indices.astype(index_dtype)
 
         if self.edge_values:
             recip_indices = cp.lexsort(cp.vstack((src_indices, dst_indices)))