From 54eff4ef3a6cb2e0e10f1064eb2071653a3c9bc8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 9 Apr 2024 04:19:47 -1000 Subject: [PATCH] Avoid .ordered and .categories from being settable in CategoricalColumn and CategoricalDtype (#15475) A rehash of https://github.com/rapidsai/cudf/pull/14979 The `CategoricalDtype.ordered` behavior matches `pandas.CategoricalDtype.ordered` behavior. Also combines `as_ordered` and `as_unordred` into 1 method, and avoids to `as_index` casts that are already performed elsewhere Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15475 --- python/cudf/cudf/core/column/categorical.py | 53 +++++++-------------- python/cudf/cudf/core/dtypes.py | 4 -- python/cudf/cudf/core/index.py | 6 +-- python/cudf/cudf/tests/test_categorical.py | 5 ++ 4 files changed, 26 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 88bb4521a5b..e4620ee5bc4 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -110,7 +110,7 @@ def categories(self) -> "cudf.core.index.Index": """ The categories of this categorical. """ - return cudf.core.index.as_index(self._column.categories) + return self._column.dtype.categories @property def codes(self) -> "cudf.Series": @@ -165,7 +165,7 @@ def as_ordered(self) -> Optional[SeriesOrIndex]: dtype: category Categories (3, int64): [1 < 2 < 10] """ - return self._return_or_inplace(self._column.as_ordered()) + return self._return_or_inplace(self._column.as_ordered(ordered=True)) def as_unordered(self) -> Optional[SeriesOrIndex]: """ @@ -212,8 +212,7 @@ def as_unordered(self) -> Optional[SeriesOrIndex]: dtype: category Categories (3, int64): [1, 2, 10] """ - - return self._return_or_inplace(self._column.as_unordered()) + return self._return_or_inplace(self._column.as_ordered(ordered=False)) def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: """ @@ -631,10 +630,6 @@ def codes(self) -> NumericalColumn: def ordered(self) -> bool: return self.dtype.ordered - @ordered.setter - def ordered(self, value: bool): - self.dtype.ordered = value - def __setitem__(self, key, value): if cudf.api.types.is_scalar( value @@ -1170,9 +1165,11 @@ def _get_decategorized_column(self) -> ColumnBase: def copy(self, deep: bool = True) -> Self: result_col = super().copy(deep=deep) if deep: - result_col.categories = libcudf.copying.copy_column( - self.dtype._categories + dtype_copy = CategoricalDtype( + categories=self.categories.copy(), + ordered=self.ordered, ) + result_col = cast(Self, result_col._with_type_metadata(dtype_copy)) return result_col @cached_property @@ -1411,31 +1408,17 @@ def reorder_categories( ) return self._set_categories(new_categories, ordered=ordered) - def as_ordered(self): - out_col = self - if not out_col.ordered: - out_col = column.build_categorical_column( - categories=self.categories, - codes=self.codes, - mask=self.base_mask, - size=self.base_size, - offset=self.offset, - ordered=True, - ) - return out_col - - def as_unordered(self): - out_col = self - if out_col.ordered: - out_col = column.build_categorical_column( - categories=self.categories, - codes=self.codes, - mask=self.base_mask, - size=self.base_size, - offset=self.offset, - ordered=False, - ) - return out_col + def as_ordered(self, ordered: bool): + if self.dtype.ordered == ordered: + return self + return column.build_categorical_column( + categories=self.categories, + codes=self.codes, + mask=self.base_mask, + size=self.base_size, + offset=self.offset, + ordered=ordered, + ) def _create_empty_categorical_column( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 3bd342e24c2..73617763221 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -205,10 +205,6 @@ def ordered(self) -> bool: """ return self._ordered - @ordered.setter - def ordered(self, value) -> None: - self._ordered = value - @classmethod def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index bd9dc1ae3da..0a7435bd241 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2624,9 +2624,9 @@ def __init__( elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): data = data.set_categories(dtype.categories, ordered=ordered) elif ordered is True and data.ordered is False: - data = data.as_ordered() + data = data.as_ordered(ordered=True) elif ordered is False and data.ordered is True: - data = data.as_unordered() + data = data.as_ordered(ordered=False) super().__init__(data, **kwargs) @property # type: ignore @@ -2643,7 +2643,7 @@ def categories(self): """ The categories of this categorical. """ - return as_index(self._values.categories) + return self.dtype.categories def _is_boolean(self): return False diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index cc3e20b5bac..e21fd53bee4 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -848,6 +848,11 @@ def test_empty_series_category_cast(ordered): assert_eq(expected.dtype.ordered, actual.dtype.ordered) +def test_categorical_dtype_ordered_not_settable(): + with pytest.raises(AttributeError): + cudf.CategoricalDtype().ordered = False + + @pytest.mark.parametrize("scalar", [1, "a", None, 10.2]) def test_cat_from_scalar(scalar): ps = pd.Series(scalar, dtype="category")