Skip to content

Commit

Permalink
Make ListColumn.__init__ strict (#16465)
Browse files Browse the repository at this point in the history
This PR makes `ListColumn.__init__` strict putting restrictions on data, dtype, size and children so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class.

xref #16469

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #16465
  • Loading branch information
mroeschke authored Aug 19, 2024
1 parent 0491778 commit c516fc4
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 25 deletions.
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1625,12 +1625,13 @@ def build_column(
)
elif isinstance(dtype, ListDtype):
return cudf.core.column.ListColumn(
size=size,
data=None,
size=size, # type: ignore[arg-type]
dtype=dtype,
mask=mask,
offset=offset,
null_count=null_count,
children=children,
children=children, # type: ignore[arg-type]
)
elif isinstance(dtype, IntervalDtype):
return cudf.core.column.IntervalColumn(
Expand Down
64 changes: 41 additions & 23 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

from functools import cached_property
from typing import TYPE_CHECKING, Sequence
from typing import TYPE_CHECKING, Sequence, cast

import numpy as np
import pandas as pd
Expand All @@ -29,30 +29,46 @@
from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
from cudf.core.column import ColumnBase, as_column, column
from cudf.core.column.methods import ColumnMethods, ParentType
from cudf.core.column.numerical import NumericalColumn
from cudf.core.dtypes import ListDtype
from cudf.core.missing import NA

if TYPE_CHECKING:
from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
from cudf.core.buffer import Buffer


class ListColumn(ColumnBase):
dtype: ListDtype
_VALID_BINARY_OPERATIONS = {"__add__", "__radd__"}

def __init__(
self,
size,
dtype,
mask=None,
offset=0,
null_count=None,
children=(),
data: None,
size: int,
dtype: ListDtype,
mask: Buffer | None = None,
offset: int = 0,
null_count: int | None = None,
children: tuple[NumericalColumn, ColumnBase] = (), # type: ignore[assignment]
):
if data is not None:
raise ValueError("data must be None")
if not isinstance(dtype, ListDtype):
raise ValueError("dtype must be a cudf.ListDtype")
if not (
len(children) == 2
and isinstance(children[0], NumericalColumn)
# TODO: Enforce int32_t (size_type) used in libcudf?
and children[0].dtype.kind == "i"
and isinstance(children[1], ColumnBase)
):
raise ValueError(
"children must a tuple of 2 columns of (signed integer offsets, list values)"
)
super().__init__(
None,
size,
dtype,
data=data,
size=size,
dtype=dtype,
mask=mask,
offset=offset,
null_count=null_count,
Expand Down Expand Up @@ -131,19 +147,19 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
raise TypeError("can only concatenate list to list")

@property
def elements(self):
def elements(self) -> ColumnBase:
"""
Column containing the elements of each list (may itself be a
ListColumn)
"""
return self.children[1]

@property
def offsets(self):
def offsets(self) -> NumericalColumn:
"""
Integer offsets to elements specifying each row of the ListColumn
"""
return self.children[0]
return cast(NumericalColumn, self.children[0])

def to_arrow(self):
offsets = self.offsets.to_arrow()
Expand Down Expand Up @@ -172,10 +188,9 @@ def set_base_data(self, value):
else:
super().set_base_data(value)

def set_base_children(self, value: tuple[ColumnBase, ...]):
def set_base_children(self, value: tuple[NumericalColumn, ColumnBase]): # type: ignore[override]
super().set_base_children(value)
_, values = value
self._dtype = cudf.ListDtype(element_type=values.dtype)
self._dtype = cudf.ListDtype(element_type=value[1].dtype)

@property
def __cuda_array_interface__(self):
Expand All @@ -196,12 +211,13 @@ def _with_type_metadata(
dtype.element_type
)
return ListColumn(
data=None,
dtype=dtype,
mask=self.base_mask,
size=self.size,
offset=self.offset,
null_count=self.null_count,
children=(self.base_children[0], elements),
children=(self.base_children[0], elements), # type: ignore[arg-type]
)

return self
Expand All @@ -226,24 +242,25 @@ def from_sequences(
"""
data_col = column.column_empty(0)
mask_col = []
offset_col = [0]
offset_vals = [0]
offset = 0

# Build Data, Mask & Offsets
for data in arbitrary:
if cudf._lib.scalar._is_null_host_scalar(data):
mask_col.append(False)
offset_col.append(offset)
offset_vals.append(offset)
else:
mask_col.append(True)
data_col = data_col.append(as_column(data))
offset += len(data)
offset_col.append(offset)
offset_vals.append(offset)

offset_col = column.as_column(offset_col, dtype=size_type_dtype)
offset_col = column.as_column(offset_vals, dtype=size_type_dtype)

# Build ListColumn
res = cls(
data=None,
size=len(arbitrary),
dtype=cudf.ListDtype(data_col.dtype),
mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
Expand Down Expand Up @@ -283,12 +300,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self:
for c in cc:
o = c.children[0]
lc = cudf.core.column.ListColumn( # type: ignore
data=None,
size=c.size,
dtype=cudf.ListDtype(lc.dtype),
mask=c.mask,
offset=c.offset,
null_count=c.null_count,
children=(o, lc),
children=(o, lc), # type: ignore[arg-type]
)
return lc

Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ def _split_by_character(self):
offset_col = col.children[0]

return cudf.core.column.ListColumn(
data=None,
size=len(col),
dtype=cudf.ListDtype(col.dtype),
mask=col.mask,
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,7 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage():
col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64")))
nested_col = col.children[1]
empty_inner = type(nested_col)(
data=None,
size=nested_col.size,
dtype=nested_col.dtype,
mask=nested_col.mask,
Expand All @@ -939,6 +940,7 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage():
),
)
col_empty_offset = type(col)(
data=None,
size=col.size,
dtype=col.dtype,
mask=col.mask,
Expand Down

0 comments on commit c516fc4

Please sign in to comment.