diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 2ed4298e993..2a4ad60b051 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -37,6 +37,8 @@ class DataFrame: def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None: self.columns = list(columns) self._column_map = {c.name: c for c in self.columns} + if len(self._column_map) != len(self.columns): + raise ValueError("Some columns have overlapping names") self.scalars = list(scalars) if len(scalars) == 0: self.table = plc.Table([c.obj for c in columns]) @@ -89,7 +91,25 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: @classmethod def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: - """Create from a pylibcudf table.""" + """ + Create from a pylibcudf table. + + Parameters + ---------- + table + Pylibcudf table to obtain columns from + names + Names for the columns + + Returns + ------- + New dataframe sharing data with the input table. + + Raises + ------ + ValueError if the number of provided names does not match the + number of columns in the table. + """ # TODO: strict=True when we drop py39 if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") @@ -98,7 +118,24 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: - """Copy sortedness from a dataframe onto self.""" + """ + Copy sortedness from a dataframe onto self. + + Parameters + ---------- + like + The dataframe to copy from + subset + Optional subset of columns from which to copy data. + + Returns + ------- + Self with metadata set. + + Raises + ------ + ValueError if there is a name mismatch between self and like. + """ if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset @@ -112,7 +149,19 @@ def with_columns(self, columns: Sequence[Column]) -> Self: """ Return a new dataframe with extra columns. - Data is shared. + Parameters + ---------- + columns + Columns to add + + Returns + ------- + New dataframe + + Raises + ------ + ValueError if the new columns have overlapping names with the + existing ones. """ return type(self)([*self.columns, *columns], self.scalars) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index d96a6464404..249cc3775f7 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -183,12 +183,21 @@ def do_evaluate( Notes ----- Do not call this function directly, but rather - :func:`evaluate` which handles the mapping lookups. + :meth:`evaluate` which handles the mapping lookups. + + The typed return value of :class:`Column` is not true when + evaluating :class:`Literal` nodes (which instead produce + :class:`Scalar` objects). However, these duck-type to having a + pylibcudf container object inside them, and usually they end + up appearing in binary expressions which pylibcudf handles + appropriately since there are overloads for (column, scalar) + pairs. We don't have to handle (scalar, scalar) in binops + since the polars optimizer has a constant-folding pass. Returns ------- Column representing the evaluation of the expression (or maybe - a scalar, annoying!). + a scalar). Raises ------