From f8eb63e499f94d583d715f5c1f5e6f234589be57 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 29 Jul 2024 12:39:19 -1000 Subject: [PATCH] Align Index APIs with pandas 2.x (#16361) Similar to https://github.com/rapidsai/cudf/pull/16310, the following APIs have been modified to adjust/add parameters * `to_flat_index` * `isin` * `unique` * `transpose` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16361 --- docs/cudf/source/conf.py | 5 ++++ python/cudf/cudf/core/_base_index.py | 25 ++++++++++++++++++-- python/cudf/cudf/core/index.py | 24 +++++++++++++++---- python/cudf/cudf/core/multiindex.py | 16 +++++++++++-- python/cudf/cudf/core/series.py | 8 ------- python/cudf/cudf/core/single_column_frame.py | 7 ++++++ python/cudf/cudf/tests/test_multiindex.py | 9 +++++++ 7 files changed, 78 insertions(+), 16 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index f544536fb31..7421d9be298 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -561,6 +561,11 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "ScalarLike"), ("py:class", "ParentType"), ("py:class", "ColumnLike"), + ("py:class", "ColumnLike"), + ("py:obj", "cudf.Index.transpose"), + ("py:obj", "cudf.Index.T"), + ("py:obj", "cudf.Index.to_flat_index"), + ("py:obj", "cudf.MultiIndex.to_flat_index"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 8fad82c5c46..c91514202c5 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -868,6 +868,24 @@ def to_numpy(self): """Convert to a numpy array.""" 
raise NotImplementedError + def to_flat_index(self) -> Self: + """ + Identity method. + + This is implemented for compatibility with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + def any(self): """ Return whether any elements is True in Index. @@ -945,7 +963,7 @@ def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False): """ raise NotImplementedError - def isin(self, values): + def isin(self, values, level=None): """Return a boolean array where the index values are in values. Compute boolean array of whether each index value is found in @@ -956,6 +974,9 @@ def isin(self, values): ---------- values : set, list-like, Index Sought values. + level : str or int, optional + Name or position of the index level to use (if the index is a + `MultiIndex`). Returns ------- @@ -979,7 +1000,7 @@ def isin(self, values): # ColumnBase.isin). raise NotImplementedError - def unique(self): + def unique(self, level: int | None = None): """ Return unique values in the index. 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1c48b8f4f2d..156cb973a9a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -540,8 +540,12 @@ def memory_usage(self, deep: bool = False) -> int: ) return 0 - def unique(self) -> Self: + def unique(self, level: int | None = None) -> Self: # RangeIndex always has unique values + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return self.copy() @_performance_tracking @@ -964,7 +968,11 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [] return as_column(i, dtype=size_type_dtype) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " @@ -1616,12 +1624,20 @@ def append(self, other): return self._concat(to_concat) - def unique(self): + def unique(self, level: int | None = None) -> Self: + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return cudf.core.index._index_from_data( {self.name: self._values.unique()}, name=self.name ) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 0e1fddd7ed5..2788455aebf 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1156,6 +1156,15 @@ def from_tuples(cls, tuples, sortorder: int | None = None, names=None): def to_numpy(self): return self.values_host + def to_flat_index(self): + 
""" + Convert a MultiIndex to an Index of Tuples containing the level values. + + This is not currently implemented + """ + # TODO: Could implement as Index of ListDtype? + raise NotImplementedError("to_flat_index is not currently supported.") + @property # type: ignore @_performance_tracking def values_host(self): @@ -1734,8 +1743,11 @@ def fillna(self, value): return super().fillna(value=value) @_performance_tracking - def unique(self): - return self.drop_duplicates(keep="first") + def unique(self, level: int | None = None) -> Self | cudf.Index: + if level is None: + return self.drop_duplicates(keep="first") + else: + return self.get_level_values(level).unique() @_performance_tracking def nunique(self, dropna: bool = True) -> int: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8277ccf68fc..10ac1fdfc1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2775,14 +2775,6 @@ def cov(self, other, min_periods=None, ddof: int | None = None): f"{other.dtype}" ) - @_performance_tracking - def transpose(self): - """Return the transpose, which is by definition self.""" - - return self - - T = property(transpose, doc=transpose.__doc__) - @_performance_tracking def duplicated(self, keep="first"): """ diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index b93528f9693..a5ff1223791 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -389,3 +389,10 @@ def where(self, cond, other=None, inplace=False): result = cudf._lib.copying.copy_if_else(input_col, other, cond) return _make_categorical_like(result, self_column) + + @_performance_tracking + def transpose(self): + """Return the transpose, which is by definition self.""" + return self + + T = property(transpose, doc=transpose.__doc__) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 
2c00d48266c..b7314a36e73 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2170,3 +2170,12 @@ def test_bool_raises(): lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], ) + + +def test_unique_level(): + pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]]) + cudf_mi = cudf.MultiIndex.from_pandas(pd_mi) + + result = pd_mi.unique(level=1) + expected = cudf_mi.unique(level=1) + assert_eq(result, expected)