From 5871ebe6f5a0a77e75bbcbb81c156d939269ed50 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 4 Sep 2023 15:08:45 +0100 Subject: [PATCH 1/3] _get_columns_by_label always returns a Frame --- python/cudf/cudf/core/dataframe.py | 12 +++++++----- python/cudf/cudf/core/frame.py | 4 ++-- python/cudf/cudf/core/series.py | 8 ++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e67604069f1..5a3d25a08a7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -36,7 +36,7 @@ from pandas.core.dtypes.common import is_float, is_integer from pandas.io.formats import console from pandas.io.formats.printing import pprint_thing -from typing_extensions import assert_never +from typing_extensions import Self, assert_never import cudf import cudf.core.common @@ -1830,13 +1830,15 @@ def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() @_cudf_nvtx_annotate - def _get_columns_by_label(self, labels, downcast=False): + def _get_columns_by_label( + self, labels, *, downcast=False + ) -> Self | Series: """ Return columns of dataframe by `labels` If downcast is True, try and downcast from a DataFrame to a Series """ - new_data = super()._get_columns_by_label(labels, downcast) + ca = self._data.select_by_label(labels) if downcast: if is_scalar(labels): nlevels = 1 @@ -1844,11 +1846,11 @@ def _get_columns_by_label(self, labels, downcast=False): nlevels = len(labels) if self._data.multiindex is False or nlevels == self._data.nlevels: out = self._constructor_sliced._from_data( - new_data, index=self.index, name=labels + ca, index=self.index, name=labels ) return out out = self.__class__._from_data( - new_data, index=self.index, columns=new_data.to_pandas_index() + ca, index=self.index, columns=ca.to_pandas_index() ) return out diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b9f052e7626..6224793d6f1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -362,12 +362,12 @@ def equals(self, other): ) @_cudf_nvtx_annotate - def _get_columns_by_label(self, labels, downcast=False): + def _get_columns_by_label(self, labels, *, downcast=False) -> Self: """ Returns columns of the Frame specified by `labels` """ - return self._data.select_by_label(labels) + return self.__class__._from_data(self._data.select_by_label(labels)) @property @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2fef741ac09..78be3085754 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -797,17 +797,17 @@ def deserialize(cls, header, frames): return obj - def _get_columns_by_label(self, labels, downcast=False): + def _get_columns_by_label(self, labels, *, downcast=False) -> Self: """Return the column specified by `labels` For cudf.Series, either the column, or an empty series is returned. Parameter `downcast` does not have effects. """ - new_data = super()._get_columns_by_label(labels, downcast) + ca = self._data.select_by_label(labels) return ( - self.__class__._from_data(data=new_data, index=self.index) - if len(new_data) > 0 + self.__class__._from_data(data=ca, index=self.index) + if len(ca) > 0 else self.__class__(dtype=self.dtype, name=self.name) ) From 1c9ea672aec6238e87b5da2b023ed1a397f48725 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 4 Sep 2023 15:09:51 +0100 Subject: [PATCH 2/3] Handle sort_remaining in sort_index - Closes #14011 --- python/cudf/cudf/core/indexed_frame.py | 31 ++++++++++++++------------ 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4c6eb3a50e9..fbdbbd99ce9 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1530,7 +1530,9 @@ def sort_index( na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if first; last puts NaNs at the end. sort_remaining : bool, default True - Not yet supported + When sorting a multiindex on a subset of its levels, + should entries be lexsorted by the remaining + (non-specified) levels as well? ignore_index : bool, default False if True, index will be replaced with RangeIndex. key : callable, optional @@ -1596,11 +1598,6 @@ def sort_index( if kind is not None: raise NotImplementedError("kind is not yet supported") - if not sort_remaining: - raise NotImplementedError( - "sort_remaining == False is not yet supported" - ) - if key is not None: raise NotImplementedError("key is not yet supported.") @@ -1613,16 +1610,22 @@ def sort_index( if level is not None: # Pandas doesn't handle na_position in case of MultiIndex. na_position = "first" if ascending is True else "last" - labels = [ - idx._get_level_label(lvl) - for lvl in (level if is_list_like(level) else (level,)) - ] - # Explicitly construct a Frame rather than using type(self) - # to avoid constructing a SingleColumnFrame (e.g. Series). - idx = Frame._from_data(idx._data.select_by_label(labels)) + if not is_list_like(level): + level = [level] + by = list(map(idx._get_level_label, level)) + if sort_remaining: + handled = set(by) + by.extend( + filter( + lambda n: n not in handled, + self.index._data.names, + ) + ) + else: + by = list(idx._data.names) inds = idx._get_sorted_inds( - ascending=ascending, na_position=na_position + by=by, ascending=ascending, na_position=na_position ) out = self._gather( GatherMap.from_column_unchecked( From 7ec38f92b846384e4cbd8022f0c5d99c17b3f42f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 4 Sep 2023 15:42:11 +0100 Subject: [PATCH 3/3] Add test of partial multiindex sorting --- python/cudf/cudf/tests/test_multiindex.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index eedc9b0c174..56bd7d709b7 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1897,3 +1897,26 @@ def test_multiindex_empty_slice_pandas_compatibility(): with cudf.option_context("mode.pandas_compatible", True): actual = cudf.from_pandas(expected) assert_eq(expected, actual, exact=False) + + +@pytest.mark.parametrize( + "levels", + itertools.chain.from_iterable( + itertools.permutations(range(3), n) for n in range(1, 4) + ), + ids=str, +) +def test_multiindex_sort_index_partial(levels): + df = pd.DataFrame( + { + "a": [3, 3, 3, 1, 1, 1, 2, 2], + "b": [4, 2, 7, -1, 11, -2, 7, 7], + "c": [4, 4, 2, 3, 3, 3, 1, 1], + "val": [1, 2, 3, 4, 5, 6, 7, 8], + } + ).set_index(["a", "b", "c"]) + cdf = cudf.from_pandas(df) + + expect = df.sort_index(level=levels, sort_remaining=True) + got = cdf.sort_index(level=levels, sort_remaining=True) + assert_eq(expect, got)