diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 19d6bf84d3f..0857727d23f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1625,12 +1625,13 @@ def build_column( ) elif isinstance(dtype, ListDtype): return cudf.core.column.ListColumn( - size=size, + data=None, + size=size, # type: ignore[arg-type] dtype=dtype, mask=mask, offset=offset, null_count=null_count, - children=children, + children=children, # type: ignore[arg-type] ) elif isinstance(dtype, IntervalDtype): return cudf.core.column.IntervalColumn( diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 1b7cd95b3d0..302f04a0e71 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence +from typing import TYPE_CHECKING, Sequence, cast import numpy as np import pandas as pd @@ -29,30 +29,46 @@ from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType +from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA if TYPE_CHECKING: from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike + from cudf.core.buffer import Buffer class ListColumn(ColumnBase): - dtype: ListDtype _VALID_BINARY_OPERATIONS = {"__add__", "__radd__"} def __init__( self, - size, - dtype, - mask=None, - offset=0, - null_count=None, - children=(), + data: None, + size: int, + dtype: ListDtype, + mask: Buffer | None = None, + offset: int = 0, + null_count: int | None = None, + children: tuple[NumericalColumn, ColumnBase] = (), # type: ignore[assignment] ): + if data is not None: + raise ValueError("data must be None") + if not isinstance(dtype, ListDtype): + raise ValueError("dtype must be a cudf.ListDtype") + if not ( + len(children) == 2 + and isinstance(children[0], NumericalColumn) + # TODO: Enforce int32_t (size_type) used in libcudf? + and children[0].dtype.kind == "i" + and isinstance(children[1], ColumnBase) + ): + raise ValueError( + "children must a tuple of 2 columns of (signed integer offsets, list values)" + ) super().__init__( - None, - size, - dtype, + data=data, + size=size, + dtype=dtype, mask=mask, offset=offset, null_count=null_count, @@ -131,7 +147,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: raise TypeError("can only concatenate list to list") @property - def elements(self): + def elements(self) -> ColumnBase: """ Column containing the elements of each list (may itself be a ListColumn) @@ -139,11 +155,11 @@ def elements(self): return self.children[1] @property - def offsets(self): + def offsets(self) -> NumericalColumn: """ Integer offsets to elements specifying each row of the ListColumn """ - return self.children[0] + return cast(NumericalColumn, self.children[0]) def to_arrow(self): offsets = self.offsets.to_arrow() @@ -172,10 +188,9 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_children(self, value: tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[NumericalColumn, ColumnBase]): # type: ignore[override] super().set_base_children(value) - _, values = value - self._dtype = cudf.ListDtype(element_type=values.dtype) + self._dtype = cudf.ListDtype(element_type=value[1].dtype) @property def __cuda_array_interface__(self): @@ -196,12 +211,13 @@ def _with_type_metadata( dtype.element_type ) return ListColumn( + data=None, dtype=dtype, mask=self.base_mask, size=self.size, offset=self.offset, null_count=self.null_count, - children=(self.base_children[0], elements), + children=(self.base_children[0], elements), # type: ignore[arg-type] ) return self @@ -226,24 +242,25 @@ def from_sequences( """ data_col = column.column_empty(0) mask_col = [] - offset_col = [0] + offset_vals = [0] offset = 0 # Build Data, Mask & Offsets for data in arbitrary: if cudf._lib.scalar._is_null_host_scalar(data): mask_col.append(False) - offset_col.append(offset) + offset_vals.append(offset) else: mask_col.append(True) data_col = data_col.append(as_column(data)) offset += len(data) - offset_col.append(offset) + offset_vals.append(offset) - offset_col = column.as_column(offset_col, dtype=size_type_dtype) + offset_col = column.as_column(offset_vals, dtype=size_type_dtype) # Build ListColumn res = cls( + data=None, size=len(arbitrary), dtype=cudf.ListDtype(data_col.dtype), mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)), @@ -283,12 +300,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: for c in cc: o = c.children[0] lc = cudf.core.column.ListColumn( # type: ignore + data=None, size=c.size, dtype=cudf.ListDtype(lc.dtype), mask=c.mask, offset=c.offset, null_count=c.null_count, - children=(o, lc), + children=(o, lc), # type: ignore[arg-type] ) return lc diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a710a9f46c2..6f7508822d4 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -549,6 +549,7 @@ def _split_by_character(self): offset_col = col.children[0] return cudf.core.column.ListColumn( + data=None, size=len(col), dtype=cudf.ListDtype(col.dtype), mask=col.mask, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index c4c883ca9f9..7d87fc73621 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -928,6 +928,7 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage(): col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64"))) nested_col = col.children[1] empty_inner = type(nested_col)( + data=None, size=nested_col.size, dtype=nested_col.dtype, mask=nested_col.mask, @@ -939,6 +940,7 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage(): ), ) col_empty_offset = type(col)( + data=None, size=col.size, dtype=col.dtype, mask=col.mask,