Skip to content

Commit

Permalink
Add Groupby.indices property and deprecate obj in get_group (ra…
Browse files Browse the repository at this point in the history
…pidsai#14912)

This PR:

 Introduces the `GroupBy.indices` property.
 Deprecates the `obj` parameter of `GroupBy.get_group`.
  • Loading branch information
galipremsagar authored Jan 29, 2024
1 parent e74fe0a commit f69ae1d
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 2 deletions.
34 changes: 34 additions & 0 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,33 @@ def groups(self):
zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1]))
)

@cached_property
def indices(self):
    """
    Dict {group name -> group indices}.

    Each key is a group name and each value is the array of index
    values belonging to that group.

    Examples
    --------
    >>> import cudf
    >>> data = [[10, 20, 30], [10, 30, 40], [40, 50, 30]]
    >>> df = cudf.DataFrame(data, columns=["a", "b", "c"])
    >>> df
        a   b   c
    0  10  20  30
    1  10  30  40
    2  40  50  30
    >>> df.groupby(by=["a"]).indices
    {10: array([0, 1]), 40: array([2])}
    """
    names, boundaries, _, values = self._grouped()

    # Materialise the keys first, then slice the index values of the
    # grouped frame into one array per group.
    keys = names.to_pandas()
    # The first and last offsets are dropped so that only the interior
    # cut points are handed to ``np.split``.
    # NOTE(review): these arrays are taken from ``values.index``, i.e. they
    # are index values of the grouped frame. Presumably they coincide with
    # row positions only for a default index -- confirm.
    per_group = np.split(values.index.values, boundaries[1:-1])

    return dict(zip(keys, per_group))

@_cudf_nvtx_annotate
def get_group(self, name, obj=None):
"""
Expand Down Expand Up @@ -379,6 +406,13 @@ def get_group(self, name, obj=None):
"""
if obj is None:
obj = self.obj
else:
warnings.warn(
"obj is deprecated and will be removed in a future version. "
"Use ``df.iloc[gb.indices.get(name)]`` "
"instead of ``gb.get_group(name, obj=df)``.",
FutureWarning,
)

return obj.loc[self.groups[name].drop_duplicates()]

Expand Down
13 changes: 11 additions & 2 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3130,11 +3130,20 @@ def test_groupby_get_group(pdf, group, name, obj):
else:
gobj = obj

expected = pdf.groupby(group).get_group(name=name, obj=obj)
actual = gdf.groupby(group).get_group(name=name, obj=gobj)
pgb = pdf.groupby(group)
ggb = gdf.groupby(group)
with expect_warning_if(obj is not None):
expected = pgb.get_group(name=name, obj=obj)
with expect_warning_if(obj is not None):
actual = ggb.get_group(name=name, obj=gobj)

assert_groupby_results_equal(expected, actual)

expected = pdf.iloc[pgb.indices.get(name)]
actual = gdf.iloc[ggb.indices.get(name)]

assert_eq(expected, actual)


@pytest.mark.parametrize(
"by",
Expand Down

0 comments on commit f69ae1d

Please sign in to comment.