diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 7533ed56647..9d18e023fe8 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -3,7 +3,7 @@ from functools import singledispatch from pandas.errors import DataError -from cudf.api.types import is_string_dtype +from cudf.api.types import _is_categorical_dtype, is_string_dtype from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ( CategoricalDtype, @@ -167,6 +167,46 @@ cdef class GroupBy: included_aggregations_i = [] col_aggregations = [] for agg in aggs: + str_agg = str(agg) + if ( + is_string_dtype(col) + and agg not in _STRING_AGGS + and + ( + str_agg in {"cumsum", "cummin", "cummax"} + or not ( + any(a in str_agg for a in { + "count", + "max", + "min", + "first", + "last", + "nunique", + "unique", + "nth" + }) + or (agg is list) + ) + ) + ): + raise TypeError( + f"function is not supported for this dtype: {agg}" + ) + elif ( + _is_categorical_dtype(col) + and agg not in _CATEGORICAL_AGGS + and ( + str_agg in {"cumsum", "cummin", "cummax"} + or + not ( + any(a in str_agg for a in {"count", "max", "min", "unique"}) + ) + ) + ): + raise TypeError( + f"{col.dtype} type does not support {agg} operations" + ) + agg_obj = make_aggregation(agg) if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations: included_aggregations_i.append((agg, agg_obj.kind)) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index c139b06d20f..674f694a224 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -504,7 +504,6 @@ def test_groupby_apply_jit_unary_reductions( func, dtype, dataset, groupby_jit_datasets ): dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) @@ -1891,9 +1890,6 @@ def test_groupby_nth(n, by): assert_groupby_results_equal(expect, got, check_dtype=False) -@pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43209", -) def test_raise_data_error(): pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) gdf = cudf.from_pandas(pdf) @@ -1904,12 +1900,13 @@ def test_raise_data_error(): ) -def test_drop_unsupported_multi_agg(): +def test_multi_agg(): gdf = cudf.DataFrame( {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]} ) + pdf = gdf.to_pandas() assert_groupby_results_equal( - gdf.groupby("a").agg(["count", "mean"]), + pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), ) @@ -3852,3 +3849,39 @@ def test_group_by_reduce_numeric_only(by, data, func): ) result = getattr(df.groupby(by, sort=True), func)(numeric_only=True) assert_eq(expected, result) + + +@pytest.mark.parametrize( + "op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"] +) +def test_group_by_raises_string_error(op): + df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]}) + + with pytest.raises(TypeError): + df.groupby(df.a).agg(op) + + +@pytest.mark.parametrize( + "op", + [ + "cummax", + "cummin", + "cumprod", + "cumsum", + "mean", + "median", + "prod", + "sum", + list, + ], +) +def test_group_by_raises_category_error(op): + df = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"), + } + ) + + with pytest.raises(TypeError): + df.groupby(df.a).agg(op)