From cc1f3b5756666c3dfb47ff92690f282084cd8aad Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 27 Sep 2024 09:30:18 +0000 Subject: [PATCH] Fix order-preservation in pandas-compat unsorted groupby When sort=False is requested in groupby aggregations and pandas compatibility mode is enabled, we are on the hook to produce the grouped aggregation result in an order which matches the input key order. We previously nearly did this, but the reordering relied on the (incorrect) assumption that when joining two tables with a left join, the resulting gather map for the left table is the identity. This is not the case. To fix this, we must permute the right (result) table gather map by the ordering that makes the left map the identity (AKA, sort by key with the left map as keys) before permuting the result. While here, replace the (bounds-checking) IndexedFrame.take call with usage of the internal (non-bounds-checking) _gather method. This avoids a redundant reduction over the indices, since by construction they are in bounds. - Closes #16908 --- python/cudf/cudf/core/groupby/groupby.py | 27 +++++++++++++---- .../groupby/test_ordering_pandas_compat.py | 29 +++++++++++++++++++ 2 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index be05075a2cd..81b20488d8d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -27,6 +27,7 @@ from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor +from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex @@ -754,17 +755,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): left_cols = list(self.grouping.keys.drop_duplicates()._columns) right_cols = list(result_index._columns) join_keys = [ - _match_join_keys(lcol, rcol, "left") + _match_join_keys(lcol, rcol, "inner") for lcol, rcol in zip(left_cols, right_cols) ] # TODO: In future, see if we can centralize # logic else where that has similar patterns. join_keys = map(list, zip(*join_keys)) - _, indices = libcudf.join.join( - *join_keys, - how="left", + # By construction, left and right keys are related by + # a permutation, so we can use an inner join. + left_order, right_order = libcudf.join.join( + *join_keys, how="inner" + ) + # left order is some permutation of the ordering we + # want, and right order is a matching gather map for + # the result table. Get the correct order by sorting + # the right gather map. + (right_order,) = libcudf.sort.sort_by_key( + [right_order], + [left_order], + [True], + ["first"], + stable=False, + ) + result = result._gather( + GatherMap.from_column_unchecked( + right_order, len(result), nullify=False + ) ) - result = result.take(indices) if not self._as_index: result = result.reset_index() diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py new file mode 100644 index 00000000000..a009802bab0 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"]) +def with_nulls(request): + return request.param + + +@pytest.mark.parametrize("nrows", [30, 300, 300_000]) +@pytest.mark.parametrize("nkeys", [1, 2, 4]) +def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): + key_names = [f"key{key}" for key in range(nkeys)] + key_values = [np.random.randint(100, size=nrows) for _ in key_names] + value = np.random.randint(-100, 100, size=nrows) + df = cudf.DataFrame(dict(zip(key_names, key_values), value=value)) + if with_nulls: + for key in key_names: + df.loc[df[key] == 1, key] = None + with cudf.option_context("mode.pandas_compatible", True): + got = df.groupby(key_names, sort=False).agg({"value": "sum"}) + expect = ( + df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) + ) + assert_eq(expect, got, check_index_type=not with_nulls)