From fd44adc9e02dec4cdde9626f46ba231bda4a7ea6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 16 Aug 2024 13:02:49 -1000 Subject: [PATCH] Make CategoricalColumn.__init__ strict (#16456) This PR transfers some of the validation logic in `build_column` directly into `CategoricalColumn` just in case `CategoricalColumn` is called independently of `build_column`. Additionally adds stricter validation of `data`, `dtype` and `children` so the column doesn't represent an invalid state xref https://github.com/rapidsai/cudf/issues/16469 Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16456 --- python/cudf/cudf/_lib/column.pyx | 6 +-- python/cudf/cudf/core/column/categorical.py | 56 +++++++++++++-------- python/cudf/cudf/core/column/column.py | 9 +--- 3 files changed, 40 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 2e400f775d3..e27c595edda 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -86,7 +86,7 @@ cdef class Column: object mask=None, int offset=0, object null_count=None, - object children=() + tuple children=() ): if size < 0: raise ValueError("size must be >=0") @@ -297,11 +297,11 @@ cdef class Column: dtypes = [ base_child.dtype for base_child in self.base_children ] - self._children = [ + self._children = tuple( child._with_type_metadata(dtype) for child, dtype in zip( children, dtypes ) - ] + ) return self._children def set_base_children(self, value): diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 66aed38bffd..1fdaf9f8c07 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -465,6 +465,18 @@ def reorder_categories( ) +def validate_categorical_children(children) -> None: + if not ( + len(children) == 1 + and isinstance(children[0], cudf.core.column.numerical.NumericalColumn) + and children[0].dtype.kind in "iu" + ): + # TODO: Enforce unsigned integer? + raise ValueError( + "Must specify exactly one child NumericalColumn of integers for representing the codes." + ) + + class CategoricalColumn(column.ColumnBase): """ Implements operations for Columns of Categorical type @@ -481,8 +493,7 @@ class CategoricalColumn(column.ColumnBase): respectively """ - dtype: cudf.core.dtypes.CategoricalDtype - _codes: NumericalColumn | None + dtype: CategoricalDtype _children: tuple[NumericalColumn] _VALID_REDUCTIONS = { "max", @@ -499,25 +510,29 @@ class CategoricalColumn(column.ColumnBase): def __init__( self, + data: None, + size: int | None, dtype: CategoricalDtype, mask: Buffer | None = None, - size: int | None = None, offset: int = 0, null_count: int | None = None, - children: tuple["column.ColumnBase", ...] = (), + children: tuple[NumericalColumn] = (), # type: ignore[assignment] ): + if data is not None: + raise ValueError(f"{data=} must be None") + validate_categorical_children(children) if size is None: - for child in children: - assert child.offset == 0 - assert child.base_mask is None - size = children[0].size + child = children[0] + assert child.offset == 0 + assert child.base_mask is None + size = child.size size = size - offset - if isinstance(dtype, pd.api.types.CategoricalDtype): - dtype = CategoricalDtype.from_pandas(dtype) if not isinstance(dtype, CategoricalDtype): - raise ValueError("dtype must be instance of CategoricalDtype") + raise ValueError( + f"{dtype=} must be cudf.CategoricalDtype instance." + ) super().__init__( - data=None, + data=data, size=size, dtype=dtype, mask=mask, @@ -525,7 +540,7 @@ def __init__( null_count=null_count, children=children, ) - self._codes = None + self._codes = self.children[0].set_mask(self.mask) @property def base_size(self) -> int: @@ -558,13 +573,14 @@ def _process_values_for_isin( rhs = cudf.core.column.as_column(values, dtype=self.dtype) return lhs, rhs - def set_base_mask(self, value: Buffer | None): + def set_base_mask(self, value: Buffer | None) -> None: super().set_base_mask(value) - self._codes = None + self._codes = self.children[0].set_mask(self.mask) - def set_base_children(self, value: tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[NumericalColumn]) -> None: # type: ignore[override] super().set_base_children(value) - self._codes = None + validate_categorical_children(value) + self._codes = value[0].set_mask(self.mask) @property def children(self) -> tuple[NumericalColumn]: @@ -586,9 +602,7 @@ def categories(self) -> ColumnBase: @property def codes(self) -> NumericalColumn: - if self._codes is None: - self._codes = self.children[0].set_mask(self.mask) - return cast(cudf.core.column.NumericalColumn, self._codes) + return self._codes @property def ordered(self) -> bool: @@ -1131,7 +1145,7 @@ def _mimic_inplace( ) -> Self | None: out = super()._mimic_inplace(other_col, inplace=inplace) if inplace and isinstance(other_col, CategoricalColumn): - self._codes = other_col._codes + self._codes = other_col.codes return out def view(self, dtype: Dtype) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 090c02da990..19d6bf84d3f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1578,19 +1578,14 @@ def build_column( return col if isinstance(dtype, CategoricalDtype): - if not len(children) == 1: - raise ValueError( - "Must specify exactly one child column for CategoricalColumn" - ) - if not isinstance(children[0], ColumnBase): - raise TypeError("children must be a tuple of Columns") return cudf.core.column.CategoricalColumn( + data=data, # type: ignore[arg-type] dtype=dtype, mask=mask, size=size, offset=offset, null_count=null_count, - children=children, + children=children, # type: ignore[arg-type] ) elif dtype.type is np.datetime64: return cudf.core.column.DatetimeColumn(