From 7e058e2961b5b8be05877282d26051f36dd73c53 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Fri, 20 Sep 2024 09:12:30 -0500 Subject: [PATCH] Drops duplicate edges in non-MultiGraph PLC `SGGraph` instances (#4658) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Graph input with duplicate edges intended for `Graph`/`DiGraph` instances resulted in internal PLC `SGGraph` instances with duplicate edges, which were effectively treated as MultiGraphs and caused incorrect results from algorithms like `pagerank`. This PR sets the `drop_multi_edges` PLC `SGGraph` ctor option to have PLC remove duplicate edges on `SGGraph` creation. The overhead to drop duplicate edges for non-MultiGraphs is negligible, and in the case of a large test graph (wikipedia data, 37.5M nodes, 464.5M edges) resulted in a minor slowdown for pagerank from 10.5s to 10.7s on my workstation. _edit: an initial measurement suggested an overall speedup (12.2 seconds to 10.7 seconds, likely due to fewer edges to process), but after several re-runs, the pagerank runtime before the change settled to 10.5, and the runtime after the change was typically 10.7._ A test was added that uses pagerank to ensure Graphs vs. MultiGraphs are handled correctly and duplicate edges are dropped as needed. The results when run without `drop_multi_edges` set: ``` > assert actual_pr_for_G == approx(expected_pr_for_G) E assert {0: 0.0875795...7955580949783} == approx({0: 0....32 ± 1.8e-07}) E E comparison failed. 
Mismatched elements: 4 / 4: E Max absolute difference: 0.08785887916592061 E Max relative difference: 0.5007959662968462 E Index | Obtained | Expected E 0 | 0.08757955580949783 | 0.17543839772251532 ± 1.8e-07 E 1 | 0.41242048144340515 | 0.32456160227748454 ± 3.2e-07 E 2 | 0.41242048144340515 | 0.32456160227748454 ± 3.2e-07 E 3 | 0.08757955580949783 | 0.17543839772251532 ± 1.8e-07 ``` The same test passes when run with the changes in this PR to set `drop_multi_edges`. Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Erik Welch (https://github.com/eriknw) URL: https://github.com/rapidsai/cugraph/pull/4658 --- python/nx-cugraph/nx_cugraph/classes/graph.py | 9 +++++ .../nx_cugraph/tests/test_pagerank.py | 36 +++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 python/nx-cugraph/nx_cugraph/tests/test_pagerank.py diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py index 7425eacb2b4..7c01365c0ac 100644 --- a/python/nx-cugraph/nx_cugraph/classes/graph.py +++ b/python/nx-cugraph/nx_cugraph/classes/graph.py @@ -689,6 +689,14 @@ def _get_plc_graph( src_indices = src_indices.astype(index_dtype) dst_indices = dst_indices.astype(index_dtype) + # This sets drop_multi_edges=True for non-multigraph input, which means + # the data in self.src_indices and self.dst_indices may not be + # identical to that contained in the returned plc.SGGraph (the returned + # SGGraph may have fewer edges since duplicates are dropped). Ideally + # self.src_indices and self.dst_indices would be updated to have + # duplicate edges removed for non-multigraph instances, but that + # requires additional code which would be redundant and likely not as + # performant as the code in PLC. 
return plc.SGGraph( resource_handle=plc.ResourceHandle(), graph_properties=plc.GraphProperties( @@ -702,6 +710,7 @@ def _get_plc_graph( renumber=False, do_expensive_check=False, vertices_array=self._node_ids, + drop_multi_edges=not self.is_multigraph(), ) def _sort_edge_indices(self, primary="src"): diff --git a/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py b/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py new file mode 100644 index 00000000000..0b437df2d2f --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_pagerank.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import networkx as nx +import pandas as pd +from pytest import approx + + +def test_pagerank_multigraph(): + """ + Ensures correct differences between pagerank results for Graphs + vs. 
MultiGraphs generated using from_pandas_edgelist() + """ + df = pd.DataFrame({"source": [0, 1, 1, 1, 1, 1, 1, 2], + "target": [1, 2, 2, 2, 2, 2, 2, 3]}) + expected_pr_for_G = nx.pagerank(nx.from_pandas_edgelist(df)) + expected_pr_for_MultiG = nx.pagerank( + nx.from_pandas_edgelist(df, create_using=nx.MultiGraph)) + + G = nx.from_pandas_edgelist(df, backend="cugraph") + actual_pr_for_G = nx.pagerank(G, backend="cugraph") + + MultiG = nx.from_pandas_edgelist(df, create_using=nx.MultiGraph, backend="cugraph") + actual_pr_for_MultiG = nx.pagerank(MultiG, backend="cugraph") + + assert actual_pr_for_G == approx(expected_pr_for_G) + assert actual_pr_for_MultiG == approx(expected_pr_for_MultiG)