Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward-merge branch-24.10 into branch-24.12 #16846

Merged
merged 1 commit into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 46 additions & 65 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name

if TYPE_CHECKING:
from collections.abc import Generator
from collections.abc import Generator, Hashable

from typing_extensions import Self

Expand Down Expand Up @@ -1041,20 +1041,25 @@ def to_frame(
)

@_performance_tracking
def get_level_values(self, level) -> cudf.Index:
def _level_to_ca_label(self, level) -> tuple[Hashable, int]:
"""
Return the values at the requested level
Convert a level to a ColumnAccessor label and an integer position.

Useful if self._column_names != self.names.

Parameters
----------
level : int or label

Returns
-------
An Index containing the values at the requested level.
tuple[Hashable, int]
(ColumnAccessor label corresponding to level, integer position of the level)
"""
colnames = self._data.names
if level not in colnames:
colnames = self._column_names
try:
level_idx = colnames.index(level)
except ValueError:
if isinstance(level, int):
if level < 0:
level = level + len(colnames)
Expand All @@ -1067,8 +1072,22 @@ def get_level_values(self, level) -> cudf.Index:
level = colnames[level_idx]
else:
raise KeyError(f"Level not found: '{level}'")
else:
level_idx = colnames.index(level)
return level, level_idx

@_performance_tracking
def get_level_values(self, level) -> cudf.Index:
"""
Return the values at the requested level

Parameters
----------
level : int or label

Returns
-------
An Index containing the values at the requested level.
"""
level, level_idx = self._level_to_ca_label(level)
level_values = cudf.Index._from_column(
self._data[level], name=self.names[level_idx]
)
Expand Down Expand Up @@ -1420,57 +1439,6 @@ def from_arrays(
codes=codes, levels=levels, sortorder=sortorder, names=names
)

@_performance_tracking
def _poplevels(self, level) -> None | MultiIndex | cudf.Index:
"""
Remove and return the specified levels from self.

Parameters
----------
level : level name or index, list
One or more levels to remove

Returns
-------
Index composed of the removed levels. If only a single level
is removed, a flat index is returned. If no levels are specified
(empty list), None is returned.
"""
if not pd.api.types.is_list_like(level):
level = (level,)

ilevels = sorted(self._level_index_from_level(lev) for lev in level)

if not ilevels:
return None

popped_data = {}
popped_names = []
names = list(self.names)

# build the popped data and names
for i in ilevels:
n = self._data.names[i]
popped_data[n] = self._data[n]
popped_names.append(self.names[i])

# pop the levels out from self
# this must be done iterating backwards
for i in reversed(ilevels):
n = self._data.names[i]
names.pop(i)
popped_data[n] = self._data.pop(n)

# construct the popped result
popped = cudf.core.index._index_from_data(popped_data)
popped.names = popped_names

# update self
self.names = names
self._levels, self._codes = _compute_levels_and_codes(self._data)

return popped

@_performance_tracking
def swaplevel(self, i=-2, j=-1) -> Self:
"""
Expand Down Expand Up @@ -1523,7 +1491,7 @@ def swaplevel(self, i=-2, j=-1) -> Self:
return midx

@_performance_tracking
def droplevel(self, level=-1) -> MultiIndex | cudf.Index:
def droplevel(self, level=-1) -> Self | cudf.Index:
"""
Removes the specified levels from the MultiIndex.

Expand Down Expand Up @@ -1578,11 +1546,24 @@ def droplevel(self, level=-1) -> MultiIndex | cudf.Index:
>>> idx.droplevel(["first", "second"])
Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third')
"""
mi = self.copy(deep=False)
mi._poplevels(level)
if mi.nlevels == 1:
return mi.get_level_values(mi.names[0])
if is_scalar(level):
level = (level,)
elif len(level) == 0:
return self

new_names = list(self.names)
new_data = self._data.copy(deep=False)
for i in sorted(
(self._level_index_from_level(lev) for lev in level), reverse=True
):
new_names.pop(i)
new_data.pop(self._data.names[i])

if len(new_data) == 1:
return cudf.core.index._index_from_data(new_data)
else:
mi = MultiIndex._from_data(new_data)
mi.names = new_names
return mi

@_performance_tracking
Expand Down Expand Up @@ -1886,7 +1867,7 @@ def __array_function__(self, func, types, args, kwargs):
else:
return NotImplemented

def _level_index_from_level(self, level):
def _level_index_from_level(self, level) -> int:
"""
Return level index from given level name or index
"""
Expand Down
26 changes: 19 additions & 7 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from cudf._lib.transform import one_hot_encode
from cudf._lib.types import size_type_dtype
from cudf.api.extensions import no_default
from cudf.api.types import is_scalar
from cudf.core._compat import PANDAS_LT_300
from cudf.core.column import ColumnBase, as_column, column_empty_like
from cudf.core.column_accessor import ColumnAccessor
Expand Down Expand Up @@ -1227,13 +1228,24 @@ def unstack(df, level, fill_value=None, sort: bool = True):
)
return res
else:
df = df.copy(deep=False)
columns = df.index._poplevels(level)
index = df.index
result = _pivot(df, index, columns)
if result.index.nlevels == 1:
result.index = result.index.get_level_values(result.index.names[0])
return result
index = df.index.droplevel(level)
if is_scalar(level):
columns = df.index.get_level_values(level)
else:
new_names = []
ca_data = {}
for lev in level:
ca_level, level_idx = df.index._level_to_ca_label(lev)
new_names.append(df.index.names[level_idx])
ca_data[ca_level] = df.index._data[ca_level]
columns = type(df.index)._from_data(
ColumnAccessor(ca_data, verify=False)
)
columns.names = new_names
result = _pivot(df, index, columns)
if result.index.nlevels == 1:
result.index = result.index.get_level_values(result.index.names[0])
return result


def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase:
Expand Down
Loading