From 08fecf2de8e8a21fbc82301879cdaad9c1be4875 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 4 Sep 2024 11:32:20 -0700 Subject: [PATCH 1/4] Initial commit --- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/core/groupby/__init__.py | 5 +++-- python/cudf/cudf/core/groupby/groupby.py | 23 +++++++++++++++++++--- python/cudf/cudf/tests/groupby/test_agg.py | 10 ++++++++++ 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index d7da42a1708..99b759e2166 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -46,7 +46,7 @@ ListDtype, StructDtype, ) -from cudf.core.groupby import Grouper +from cudf.core.groupby import Grouper, NamedAgg from cudf.core.index import ( BaseIndex, CategoricalIndex, diff --git a/python/cudf/cudf/core/groupby/__init__.py b/python/cudf/cudf/core/groupby/__init__.py index 4375ed3e3da..621edb316cf 100644 --- a/python/cudf/cudf/core/groupby/__init__.py +++ b/python/cudf/cudf/core/groupby/__init__.py @@ -1,8 +1,9 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf.core.groupby.groupby import GroupBy, Grouper +from cudf.core.groupby.groupby import GroupBy, Grouper, NamedAgg __all__ = [ "GroupBy", "Grouper", + "NamedAgg", ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4f283d41b17..ee7617013ec 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -76,6 +76,9 @@ def _is_row_of(chunk, obj): ) +NamedAgg = pd.NamedAgg + + groupby_doc_template = textwrap.dedent( """Group using a mapper or by a Series of columns. @@ -1296,9 +1299,23 @@ def _normalize_aggs( columns = values._columns aggs_per_column = (aggs,) * len(columns) elif not aggs and kwargs: - column_names, aggs_per_column = kwargs.keys(), kwargs.values() - columns = tuple(self.obj._data[x[0]] for x in kwargs.values()) - aggs_per_column = tuple(x[1] for x in kwargs.values()) + column_names = kwargs.keys() + + def _raise_invalid_type(x): + raise TypeError( + f"Invalid keyword argument {x} of type {type(x)} was passed to agg" + ) + + columns, aggs_per_column = zip( + *( + (self.obj._data[x[0]], x[1]) + if isinstance(x, tuple) + else (self.obj._data[x.column], x.pyfunc) + if isinstance(x, NamedAgg) + else (_raise_invalid_type(x)) + for x in kwargs.values() + ) + ) else: raise TypeError("Must provide at least one aggregation function.") diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index 99e7523031b..d519c4a3790 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -56,3 +56,13 @@ def test_dataframe_agg(attr, func): ) assert_eq(agg, pd_agg) + + agg = getattr(df.groupby("a"), attr)( + foo=cudf.NamedAgg(column="b", aggfunc=func), + bar=cudf.NamedAgg(column="a", aggfunc=func), + ) + pd_agg = getattr(pdf.groupby(["a"]), attr)( + foo=("b", func), bar=("a", func) + ) + + assert_eq(agg, pd_agg) From 788f94c6899902e60751e0011977b246757a43f9 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Wed, 4 Sep 2024 15:19:15 -0700 Subject: [PATCH 2/4] Update docstring --- python/cudf/cudf/core/groupby/groupby.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ee7617013ec..f62f9c0b4dd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -79,6 +79,31 @@ def _is_row_of(chunk, obj): NamedAgg = pd.NamedAgg +NamedAgg.__doc__ = """ +Helper for column specific aggregation with control over output column names. + +Subclass of typing.NamedTuple. + +Parameters +---------- +column : Hashable + Column label in the DataFrame to apply aggfunc. +aggfunc : function or str + Function to apply to the provided column. + +Examples +-------- +>>> df = cudf.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) +>>> agg_a = cudf.NamedAgg(column="a", aggfunc="min") +>>> agg_1 = cudf.NamedAgg(column=1, aggfunc=lambda x: x.mean()) +>>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) + result_a result_1 +key +1 -1 10.5 +2 1 12.0 +""" + + groupby_doc_template = textwrap.dedent( """Group using a mapper or by a Series of columns. From 7af39816d133d63e29388be958b050e26fde7080 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Mon, 9 Sep 2024 08:10:01 -0700 Subject: [PATCH 3/4] address review --- python/cudf/cudf/core/groupby/groupby.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f62f9c0b4dd..2ef58a54540 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1335,8 +1335,6 @@ def _raise_invalid_type(x): *( (self.obj._data[x[0]], x[1]) if isinstance(x, tuple) - else (self.obj._data[x.column], x.pyfunc) - if isinstance(x, NamedAgg) else (_raise_invalid_type(x)) for x in kwargs.values() ) From 3948464d282732be8d8702c6b3682e39dded0a07 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 10 Sep 2024 08:51:49 -0700 Subject: [PATCH 4/4] address review --- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/tests/groupby/test_agg.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 2ef58a54540..6424c8af877 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1335,7 +1335,7 @@ def _raise_invalid_type(x): *( (self.obj._data[x[0]], x[1]) if isinstance(x, tuple) - else (_raise_invalid_type(x)) + else _raise_invalid_type(x) for x in kwargs.values() ) ) diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index d519c4a3790..dc20a27177a 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -66,3 +66,9 @@ def test_dataframe_agg(attr, func): ) assert_eq(agg, pd_agg) + + +def test_dataframe_agg_with_invalid_kwarg(): + with pytest.raises(TypeError, match="Invalid keyword argument"): + df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) + df.groupby("a").agg(foo=set())