Skip to content

Commit

Permalink
Fix order-preservation in pandas-compat unsorted groupby (#16942)
Browse files Browse the repository at this point in the history
When sort=False is requested in groupby aggregations and pandas compatibility mode is enabled, we are on the hook to produce the grouped aggregation result in an order which matches the input key order. We previously nearly did this, but the reordering relied on the (incorrect) assumption that when joining two tables with a left join, the resulting gather map for the left table is the identity. This is not the case.

To fix this, we must permute the right (result) table gather map by the ordering that makes the left map the identity (AKA, sort by key with the left map as keys) before permuting the result.

While here, replace the (bounds-checking) IndexedFrame.take call with usage of the internal (non-bounds-checking) _gather method. This avoids a redundant reduction over the indices, since by construction they are in bounds.

- Closes #16908

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #16942
  • Loading branch information
wence- authored Sep 30, 2024
1 parent 2b6408b commit 9b2f892
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 5 deletions.
27 changes: 22 additions & 5 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, StructDtype, as_column
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.copy_types import GatherMap
from cudf.core.join._join_helpers import _match_join_keys
from cudf.core.mixins import Reducible, Scannable
from cudf.core.multiindex import MultiIndex
Expand Down Expand Up @@ -754,17 +755,33 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
left_cols = list(self.grouping.keys.drop_duplicates()._columns)
right_cols = list(result_index._columns)
join_keys = [
_match_join_keys(lcol, rcol, "left")
_match_join_keys(lcol, rcol, "inner")
for lcol, rcol in zip(left_cols, right_cols)
]
# TODO: In future, see if we can centralize
# logic else where that has similar patterns.
join_keys = map(list, zip(*join_keys))
_, indices = libcudf.join.join(
*join_keys,
how="left",
# By construction, left and right keys are related by
# a permutation, so we can use an inner join.
left_order, right_order = libcudf.join.join(
*join_keys, how="inner"
)
# left order is some permutation of the ordering we
# want, and right order is a matching gather map for
# the result table. Get the correct order by sorting
# the right gather map.
(right_order,) = libcudf.sort.sort_by_key(
[right_order],
[left_order],
[True],
["first"],
stable=False,
)
result = result._gather(
GatherMap.from_column_unchecked(
right_order, len(result), nullify=False
)
)
result = result.take(indices)

if not self._as_index:
result = result.reset_index()
Expand Down
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
import numpy as np
import pytest

import cudf
from cudf.testing import assert_eq


@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"])
def with_nulls(request):
return request.param


@pytest.mark.parametrize("nrows", [30, 300, 300_000])
@pytest.mark.parametrize("nkeys", [1, 2, 4])
def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
key_names = [f"key{key}" for key in range(nkeys)]
key_values = [np.random.randint(100, size=nrows) for _ in key_names]
value = np.random.randint(-100, 100, size=nrows)
df = cudf.DataFrame(dict(zip(key_names, key_values), value=value))
if with_nulls:
for key in key_names:
df.loc[df[key] == 1, key] = None
with cudf.option_context("mode.pandas_compatible", True):
got = df.groupby(key_names, sort=False).agg({"value": "sum"})
expect = (
df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"})
)
assert_eq(expect, got, check_index_type=not with_nulls)

0 comments on commit 9b2f892

Please sign in to comment.