Fix order-preservation in pandas-compat unsorted groupby (#16942)

When sort=False is requested in groupby aggregations and pandas compatibility mode is enabled, we are on the hook to produce the grouped aggregation result in an order which matches the input key order. We previously nearly did this, but the reordering relied on the (incorrect) assumption that when joining two tables with a left join, the resulting gather map for the left table is the identity. This is not the case. To fix this, we must permute the right (result) table gather map by the ordering that makes the left map the identity (AKA, sort by key with the left map as keys) before permuting the result. While here, replace the (bounds-checking) IndexedFrame.take call with usage of the internal (non-bounds-checking) _gather method. This avoids a redundant reduction over the indices, since by construction they are in bounds. - Closes #16908 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: #16942
rapidsai · Sep 30, 2024 · 9b2f892 · 9b2f892
1 parent 2b6408b
commit 9b2f892
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 5 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -27,6 +27,7 @@
 from cudf.core.abc import Serializable
 from cudf.core.column.column import ColumnBase, StructDtype, as_column
 from cudf.core.column_accessor import ColumnAccessor
+from cudf.core.copy_types import GatherMap
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
@@ -754,17 +755,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
                 left_cols = list(self.grouping.keys.drop_duplicates()._columns)
                 right_cols = list(result_index._columns)
                 join_keys = [
-                    _match_join_keys(lcol, rcol, "left")
+                    _match_join_keys(lcol, rcol, "inner")
                     for lcol, rcol in zip(left_cols, right_cols)
                 ]
                 # TODO: In future, see if we can centralize
                 # logic else where that has similar patterns.
                 join_keys = map(list, zip(*join_keys))
-                _, indices = libcudf.join.join(
-                    *join_keys,
-                    how="left",
+                # By construction, left and right keys are related by
+                # a permutation, so we can use an inner join.
+                left_order, right_order = libcudf.join.join(
+                    *join_keys, how="inner"
+                )
+                # left order is some permutation of the ordering we
+                # want, and right order is a matching gather map for
+                # the result table. Get the correct order by sorting
+                # the right gather map.
+                (right_order,) = libcudf.sort.sort_by_key(
+                    [right_order],
+                    [left_order],
+                    [True],
+                    ["first"],
+                    stable=False,
+                )
+                result = result._gather(
+                    GatherMap.from_column_unchecked(
+                        right_order, len(result), nullify=False
+                    )
                 )
-                result = result.take(indices)
 
         if not self._as_index:
             result = result.reset_index()

diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import numpy as np
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"])
+def with_nulls(request):
+    return request.param
+
+
+@pytest.mark.parametrize("nrows", [30, 300, 300_000])
+@pytest.mark.parametrize("nkeys", [1, 2, 4])
+def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
+    key_names = [f"key{key}" for key in range(nkeys)]
+    key_values = [np.random.randint(100, size=nrows) for _ in key_names]
+    value = np.random.randint(-100, 100, size=nrows)
+    df = cudf.DataFrame(dict(zip(key_names, key_values), value=value))
+    if with_nulls:
+        for key in key_names:
+            df.loc[df[key] == 1, key] = None
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = df.groupby(key_names, sort=False).agg({"value": "sum"})
+    expect = (
+        df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"})
+    )
+    assert_eq(expect, got, check_index_type=not with_nulls)