diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index d7da42a1708..99b759e2166 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -46,7 +46,7 @@ ListDtype, StructDtype, ) -from cudf.core.groupby import Grouper +from cudf.core.groupby import Grouper, NamedAgg from cudf.core.index import ( BaseIndex, CategoricalIndex, diff --git a/python/cudf/cudf/core/groupby/__init__.py b/python/cudf/cudf/core/groupby/__init__.py index 4375ed3e3da..621edb316cf 100644 --- a/python/cudf/cudf/core/groupby/__init__.py +++ b/python/cudf/cudf/core/groupby/__init__.py @@ -1,8 +1,9 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.groupby.groupby import GroupBy, Grouper +from cudf.core.groupby.groupby import GroupBy, Grouper, NamedAgg __all__ = [ "GroupBy", "Grouper", + "NamedAgg", ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4f283d41b17..6424c8af877 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -76,6 +76,34 @@ def _is_row_of(chunk, obj): ) +NamedAgg = pd.NamedAgg + + +NamedAgg.__doc__ = """ +Helper for column specific aggregation with control over output column names. + +Subclass of typing.NamedTuple. + +Parameters +---------- +column : Hashable + Column label in the DataFrame to apply aggfunc. +aggfunc : function or str + Function to apply to the provided column. + +Examples +-------- +>>> df = cudf.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) +>>> agg_a = cudf.NamedAgg(column="a", aggfunc="min") +>>> agg_1 = cudf.NamedAgg(column=1, aggfunc=lambda x: x.mean()) +>>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) + result_a result_1 +key +1 -1 10.5 +2 1 12.0 +""" + + groupby_doc_template = textwrap.dedent( """Group using a mapper or by a Series of columns. @@ -1296,9 +1324,21 @@ def _normalize_aggs( columns = values._columns aggs_per_column = (aggs,) * len(columns) elif not aggs and kwargs: - column_names, aggs_per_column = kwargs.keys(), kwargs.values() - columns = tuple(self.obj._data[x[0]] for x in kwargs.values()) - aggs_per_column = tuple(x[1] for x in kwargs.values()) + column_names = kwargs.keys() + + def _raise_invalid_type(x): + raise TypeError( + f"Invalid keyword argument {x} of type {type(x)} was passed to agg" + ) + + columns, aggs_per_column = zip( + *( + (self.obj._data[x[0]], x[1]) + if isinstance(x, tuple) + else _raise_invalid_type(x) + for x in kwargs.values() + ) + ) else: raise TypeError("Must provide at least one aggregation function.") diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index 99e7523031b..dc20a27177a 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -56,3 +56,19 @@ def test_dataframe_agg(attr, func): ) assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)( + foo=cudf.NamedAgg(column="b", aggfunc=func), + bar=cudf.NamedAgg(column="a", aggfunc=func), + ) + pd_agg = getattr(pdf.groupby(["a"]), attr)( + foo=("b", func), bar=("a", func) + ) + + assert_eq(agg, pd_agg) + + +def test_dataframe_agg_with_invalid_kwarg(): + with pytest.raises(TypeError, match="Invalid keyword argument"): + df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) + df.groupby("a").agg(foo=set())