From a538745da9c49ab5d463372d6d71c6c5ac2b6816 Mon Sep 17 00:00:00 2001 From: Ralph Liu <137829296+nv-rliu@users.noreply.github.com> Date: Thu, 15 Aug 2024 11:31:27 -0400 Subject: [PATCH] Updates to `cugraph.hypergraph` (Duplicate Col Labels Bug) (#4610) cc: @rlratzel @ChuckHastings This PR addresses failures seen in certain PRs (like [here](https://github.com/rapidsai/cugraph/actions/runs/10372270389/job/28718471674?pr=4606#step:7:5269)) due to a [recent change](https://github.com/rapidsai/cudf/pull/16514) to `cudf` that disallows selecting duplicate column labels. --- In `hypergraph.py`, this PR modifies `_create_hyper_edges` and `_create_direct_edges` to ensure that DataFrames are being indexed by non-duplicate column labels. This is done by taking a list that may include duplicates (`fs`) and removing the duplicate entries; note that going through `set` does not preserve the original column order: ```python fs = list(set(fs)) ``` _This part requires some attention from the author of the unit test @jnke2016_ In `test_hypergraph.py`, this PR adds the `check_like=True` arg to the `assert_frame_equal` calls because the ordering of the columns is different for the two DFs. 
Authors: - Ralph Liu (https://github.com/nv-rliu) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Chuck Hastings (https://github.com/ChuckHastings) - Paul Taylor (https://github.com/trxcllnt) - Joseph Nke (https://github.com/jnke2016) URL: https://github.com/rapidsai/cugraph/pull/4610 --- python/cugraph/cugraph/structure/hypergraph.py | 7 ++++--- .../cugraph/tests/structure/test_hypergraph.py | 11 +++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python/cugraph/cugraph/structure/hypergraph.py b/python/cugraph/cugraph/structure/hypergraph.py index add68cb6dac..b52fef4dcfc 100644 --- a/python/cugraph/cugraph/structure/hypergraph.py +++ b/python/cugraph/cugraph/structure/hypergraph.py @@ -440,6 +440,7 @@ def _create_hyper_edges( for key, col in events[columns].items(): cat = categories.get(key, key) fs = [EVENTID] + ([key] if drop_edge_attrs else edge_attrs) + fs = list(set(fs)) df = events[fs].dropna(subset=[key]) if dropna else events[fs] if len(df) == 0: continue @@ -464,8 +465,7 @@ def _create_hyper_edges( if not drop_edge_attrs: columns += edge_attrs - edges = cudf.concat(edges)[columns] - edges.reset_index(drop=True, inplace=True) + edges = cudf.concat(edges, ignore_index=True)[list(set(columns))] return edges @@ -546,6 +546,7 @@ def _create_direct_edges( for key2, col2 in events[sorted(edge_shape[key1])].items(): cat2 = categories.get(key2, key2) fs = [EVENTID] + ([key1, key2] if drop_edge_attrs else edge_attrs) + fs = list(set(fs)) df = events[fs].dropna(subset=[key1, key2]) if dropna else events[fs] if len(df) == 0: continue @@ -573,7 +574,7 @@ def _create_direct_edges( if not drop_edge_attrs: columns += edge_attrs - edges = cudf.concat(edges)[columns] + edges = cudf.concat(edges)[list(set(columns))] edges.reset_index(drop=True, inplace=True) return edges diff --git a/python/cugraph/cugraph/tests/structure/test_hypergraph.py b/python/cugraph/cugraph/tests/structure/test_hypergraph.py index 848f31b940f..f1dfc17a509 100644 --- 
a/python/cugraph/cugraph/tests/structure/test_hypergraph.py +++ b/python/cugraph/cugraph/tests/structure/test_hypergraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -171,7 +171,8 @@ def test_hyperedges(categorical_metadata): if categorical_metadata: edges = edges.astype({"edge_type": "category"}) - assert_frame_equal(edges, h["edges"], check_dtype=False) + # check_like ignores the order of columns as long as all correct ones are present + assert_frame_equal(edges, h["edges"], check_dtype=False, check_like=True) for (k, v) in [("entities", 12), ("nodes", 15), ("edges", 12), ("events", 3)]: assert len(h[k]) == v @@ -266,7 +267,8 @@ def test_drop_edge_attrs(categorical_metadata): if categorical_metadata: edges = edges.astype({"edge_type": "category"}) - assert_frame_equal(edges, h["edges"], check_dtype=False) + # check_like ignores the order of columns as long as all correct ones are present + assert_frame_equal(edges, h["edges"], check_dtype=False, check_like=True) for (k, v) in [("entities", 9), ("nodes", 12), ("edges", 9), ("events", 3)]: assert len(h[k]) == v @@ -308,7 +310,8 @@ def test_drop_edge_attrs_direct(categorical_metadata): if categorical_metadata: edges = edges.astype({"edge_type": "category"}) - assert_frame_equal(edges, h["edges"], check_dtype=False) + # check_like ignores the order of columns as long as all correct ones are present + assert_frame_equal(edges, h["edges"], check_dtype=False, check_like=True) for (k, v) in [("entities", 9), ("nodes", 9), ("edges", 6), ("events", 0)]: assert len(h[k]) == v