From f69ae1d110ce6389ccef115fe5ca49d36066b8ca Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Mon, 29 Jan 2024 09:22:37 -0600
Subject: [PATCH] Add `Groupby.indices` property and deprecate `obj` in `get_group` (#14912)

This PR:

Introduces Groupby.indices property.
Deprecates obj in Groupby.get_group
---
Reviewer notes (commentary only; this text sits after the three-dash
separator and before the diffstat, so it is not part of the commit message
and not part of any hunk):

* groupby.py, hunk 1 (after `groups`): adds `indices` as a
  `cached_property`. It unpacks `self._grouped()` into
  `group_names, offsets, _, grouped_values` and returns a dict that zips
  `group_names.to_pandas()` with
  `np.split(grouped_values.index.values, offsets[1:-1])`. The docstring
  example shows the result as `{10: array([0, 1]), 40: array([2])}`.
  NOTE(review): the arrays are taken from `grouped_values.index.values`,
  while the new deprecation message recommends feeding them to `.iloc`.
  Presumably these values equal row positions for a default index; whether
  that holds for a frame with a non-default index cannot be told from this
  patch - confirm against `_grouped()`.

* groupby.py, hunk 2 (`get_group`): adds an `else` branch to
  `if obj is None`, so a `FutureWarning` is raised whenever the caller
  passes `obj`. The message points users to
  ``df.iloc[gb.indices.get(name)]``. The return statement is unchanged
  context.
  NOTE(review): `warnings.warn` is called without a `stacklevel`
  argument - confirm that this matches the project's convention.

* test_groupby.py (`test_groupby_get_group`): the pandas and cudf groupby
  objects are now bound to `pgb` / `ggb`, both `get_group` calls are wrapped
  in `expect_warning_if(obj is not None)`, and a second comparison is added
  that checks `pdf.iloc[pgb.indices.get(name)]` against
  `gdf.iloc[ggb.indices.get(name)]` with `assert_eq`.

 python/cudf/cudf/core/groupby/groupby.py | 34 ++++++++++++++++++++++++
 python/cudf/cudf/tests/test_groupby.py   | 13 +++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index b73d5532100..b3577444f6b 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -344,6 +344,33 @@ def groups(self):
             zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1]))
         )
 
+    @cached_property
+    def indices(self):
+        """
+        Dict {group name -> group indices}.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> data = [[10, 20, 30], [10, 30, 40], [40, 50, 30]]
+        >>> df = cudf.DataFrame(data, columns=["a", "b", "c"])
+        >>> df
+            a   b   c
+        0  10  20  30
+        1  10  30  40
+        2  40  50  30
+        >>> df.groupby(by=["a"]).indices
+        {10: array([0, 1]), 40: array([2])}
+        """
+        group_names, offsets, _, grouped_values = self._grouped()
+
+        return dict(
+            zip(
+                group_names.to_pandas(),
+                np.split(grouped_values.index.values, offsets[1:-1]),
+            )
+        )
+
     @_cudf_nvtx_annotate
     def get_group(self, name, obj=None):
         """
@@ -379,6 +406,13 @@ def get_group(self, name, obj=None):
         """
         if obj is None:
             obj = self.obj
+        else:
+            warnings.warn(
+                "obj is deprecated and will be removed in a future version. "
+                "Use ``df.iloc[gb.indices.get(name)]`` "
+                "instead of ``gb.get_group(name, obj=df)``.",
+                FutureWarning,
+            )
 
         return obj.loc[self.groups[name].drop_duplicates()]
 
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 0c71d74f89f..526aa9f503a 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3130,11 +3130,20 @@ def test_groupby_get_group(pdf, group, name, obj):
     else:
         gobj = obj
 
-    expected = pdf.groupby(group).get_group(name=name, obj=obj)
-    actual = gdf.groupby(group).get_group(name=name, obj=gobj)
+    pgb = pdf.groupby(group)
+    ggb = gdf.groupby(group)
+    with expect_warning_if(obj is not None):
+        expected = pgb.get_group(name=name, obj=obj)
+    with expect_warning_if(obj is not None):
+        actual = ggb.get_group(name=name, obj=gobj)
 
     assert_groupby_results_equal(expected, actual)
 
+    expected = pdf.iloc[pgb.indices.get(name)]
+    actual = gdf.iloc[ggb.indices.get(name)]
+
+    assert_eq(expected, actual)
+
 
 @pytest.mark.parametrize(
     "by",