From cc1f3b5756666c3dfb47ff92690f282084cd8aad Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 27 Sep 2024 09:30:18 +0000
Subject: [PATCH] Fix order-preservation in pandas-compat unsorted groupby

When sort=False is requested in groupby aggregations and pandas
compatibility mode is enabled, we are on the hook to produce the
grouped aggregation result in an order which matches the input key
order. We previously nearly did this, but the reordering relied on
the (incorrect) assumption that when joining two tables with a left
join, the resulting gather map for the left table is the identity.
This is not the case.

To fix this, we must permute the right (result) table gather map by
the ordering that makes the left map the identity (AKA, sort by key
with the left map as keys) before permuting the result.

While here, replace the (bounds-checking) IndexedFrame.take call with
usage of the internal (non-bounds-checking) _gather method. This
avoids a redundant reduction over the indices, since by construction
they are in bounds.

- Closes #16908
---
 python/cudf/cudf/core/groupby/groupby.py      | 27 +++++++++++++----
 .../groupby/test_ordering_pandas_compat.py    | 29 +++++++++++++++++++
 2 files changed, 51 insertions(+), 5 deletions(-)
 create mode 100644 python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index be05075a2cd..81b20488d8d 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -27,6 +27,7 @@
 from cudf.core.abc import Serializable
 from cudf.core.column.column import ColumnBase, StructDtype, as_column
 from cudf.core.column_accessor import ColumnAccessor
+from cudf.core.copy_types import GatherMap
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
@@ -754,17 +755,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
                 left_cols = list(self.grouping.keys.drop_duplicates()._columns)
                 right_cols = list(result_index._columns)
                 join_keys = [
-                    _match_join_keys(lcol, rcol, "left")
+                    _match_join_keys(lcol, rcol, "inner")
                     for lcol, rcol in zip(left_cols, right_cols)
                 ]
                 # TODO: In future, see if we can centralize
                 # logic else where that has similar patterns.
                 join_keys = map(list, zip(*join_keys))
-                _, indices = libcudf.join.join(
-                    *join_keys,
-                    how="left",
+                # By construction, left and right keys are related by
+                # a permutation, so we can use an inner join.
+                left_order, right_order = libcudf.join.join(
+                    *join_keys, how="inner"
+                )
+                # left order is some permutation of the ordering we
+                # want, and right order is a matching gather map for
+                # the result table. Get the correct order by sorting
+                # the right gather map.
+                (right_order,) = libcudf.sort.sort_by_key(
+                    [right_order],
+                    [left_order],
+                    [True],
+                    ["first"],
+                    stable=False,
+                )
+                result = result._gather(
+                    GatherMap.from_column_unchecked(
+                        right_order, len(result), nullify=False
+                    )
                 )
-                result = result.take(indices)
 
         if not self._as_index:
             result = result.reset_index()
diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
new file mode 100644
index 00000000000..a009802bab0
--- /dev/null
+++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import numpy as np
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"])
+def with_nulls(request):
+    return request.param
+
+
+@pytest.mark.parametrize("nrows", [30, 300, 300_000])
+@pytest.mark.parametrize("nkeys", [1, 2, 4])
+def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
+    key_names = [f"key{key}" for key in range(nkeys)]
+    key_values = [np.random.randint(100, size=nrows) for _ in key_names]
+    value = np.random.randint(-100, 100, size=nrows)
+    df = cudf.DataFrame(dict(zip(key_names, key_values), value=value))
+    if with_nulls:
+        for key in key_names:
+            df.loc[df[key] == 1, key] = None
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = df.groupby(key_names, sort=False).agg({"value": "sum"})
+    expect = (
+        df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"})
+    )
+    assert_eq(expect, got, check_index_type=not with_nulls)