Skip to content

Commit

Permalink
Make CategoricalColumn.__init__ strict (#16456)
Browse files Browse the repository at this point in the history
This PR transfers some of the validation logic in `build_column` directly into `CategoricalColumn` just in case `CategoricalColumn` is called independently of `build_column`. Additionally adds stricter validation of `data`, `dtype` and `children` so the column doesn't represent an invalid state

xref #16469

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16456
  • Loading branch information
mroeschke authored Aug 16, 2024
1 parent cb843db commit fd44adc
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 31 deletions.
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/column.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ cdef class Column:
object mask=None,
int offset=0,
object null_count=None,
object children=()
tuple children=()
):
if size < 0:
raise ValueError("size must be >=0")
Expand Down Expand Up @@ -297,11 +297,11 @@ cdef class Column:
dtypes = [
base_child.dtype for base_child in self.base_children
]
self._children = [
self._children = tuple(
child._with_type_metadata(dtype) for child, dtype in zip(
children, dtypes
)
]
)
return self._children

def set_base_children(self, value):
Expand Down
56 changes: 35 additions & 21 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,18 @@ def reorder_categories(
)


def validate_categorical_children(children) -> None:
if not (
len(children) == 1
and isinstance(children[0], cudf.core.column.numerical.NumericalColumn)
and children[0].dtype.kind in "iu"
):
# TODO: Enforce unsigned integer?
raise ValueError(
"Must specify exactly one child NumericalColumn of integers for representing the codes."
)


class CategoricalColumn(column.ColumnBase):
"""
Implements operations for Columns of Categorical type
Expand All @@ -481,8 +493,7 @@ class CategoricalColumn(column.ColumnBase):
respectively
"""

dtype: cudf.core.dtypes.CategoricalDtype
_codes: NumericalColumn | None
dtype: CategoricalDtype
_children: tuple[NumericalColumn]
_VALID_REDUCTIONS = {
"max",
Expand All @@ -499,33 +510,37 @@ class CategoricalColumn(column.ColumnBase):

def __init__(
self,
data: None,
size: int | None,
dtype: CategoricalDtype,
mask: Buffer | None = None,
size: int | None = None,
offset: int = 0,
null_count: int | None = None,
children: tuple["column.ColumnBase", ...] = (),
children: tuple[NumericalColumn] = (), # type: ignore[assignment]
):
if data is not None:
raise ValueError(f"{data=} must be None")
validate_categorical_children(children)
if size is None:
for child in children:
assert child.offset == 0
assert child.base_mask is None
size = children[0].size
child = children[0]
assert child.offset == 0
assert child.base_mask is None
size = child.size
size = size - offset
if isinstance(dtype, pd.api.types.CategoricalDtype):
dtype = CategoricalDtype.from_pandas(dtype)
if not isinstance(dtype, CategoricalDtype):
raise ValueError("dtype must be instance of CategoricalDtype")
raise ValueError(
f"{dtype=} must be cudf.CategoricalDtype instance."
)
super().__init__(
data=None,
data=data,
size=size,
dtype=dtype,
mask=mask,
offset=offset,
null_count=null_count,
children=children,
)
self._codes = None
self._codes = self.children[0].set_mask(self.mask)

@property
def base_size(self) -> int:
Expand Down Expand Up @@ -558,13 +573,14 @@ def _process_values_for_isin(
rhs = cudf.core.column.as_column(values, dtype=self.dtype)
return lhs, rhs

def set_base_mask(self, value: Buffer | None):
def set_base_mask(self, value: Buffer | None) -> None:
super().set_base_mask(value)
self._codes = None
self._codes = self.children[0].set_mask(self.mask)

def set_base_children(self, value: tuple[ColumnBase, ...]):
def set_base_children(self, value: tuple[NumericalColumn]) -> None: # type: ignore[override]
super().set_base_children(value)
self._codes = None
validate_categorical_children(value)
self._codes = value[0].set_mask(self.mask)

@property
def children(self) -> tuple[NumericalColumn]:
Expand All @@ -586,9 +602,7 @@ def categories(self) -> ColumnBase:

@property
def codes(self) -> NumericalColumn:
if self._codes is None:
self._codes = self.children[0].set_mask(self.mask)
return cast(cudf.core.column.NumericalColumn, self._codes)
return self._codes

@property
def ordered(self) -> bool:
Expand Down Expand Up @@ -1131,7 +1145,7 @@ def _mimic_inplace(
) -> Self | None:
out = super()._mimic_inplace(other_col, inplace=inplace)
if inplace and isinstance(other_col, CategoricalColumn):
self._codes = other_col._codes
self._codes = other_col.codes
return out

def view(self, dtype: Dtype) -> ColumnBase:
Expand Down
9 changes: 2 additions & 7 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1578,19 +1578,14 @@ def build_column(
return col

if isinstance(dtype, CategoricalDtype):
if not len(children) == 1:
raise ValueError(
"Must specify exactly one child column for CategoricalColumn"
)
if not isinstance(children[0], ColumnBase):
raise TypeError("children must be a tuple of Columns")
return cudf.core.column.CategoricalColumn(
data=data, # type: ignore[arg-type]
dtype=dtype,
mask=mask,
size=size,
offset=offset,
null_count=null_count,
children=children,
children=children, # type: ignore[arg-type]
)
elif dtype.type is np.datetime64:
return cudf.core.column.DatetimeColumn(
Expand Down

0 comments on commit fd44adc

Please sign in to comment.