Skip to content

Commit

Permalink
Expunge NamedColumn (#16962)
Browse files Browse the repository at this point in the history
Everything in the expression evaluation now operates on columns without names. DataFrame construction takes either a mapping from string-valued names to columns, or a sequence of pairs of names and columns.

This removes some duplicate code in the NamedColumn class (by removing it) where we had to fight the inheritance hierarchy.

- Closes #16272

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: #16962
  • Loading branch information
wence- authored Oct 8, 2024
1 parent 09ed210 commit 219ec0e
Show file tree
Hide file tree
Showing 10 changed files with 209 additions and 240 deletions.
4 changes: 2 additions & 2 deletions python/cudf_polars/cudf_polars/containers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from __future__ import annotations

__all__: list[str] = ["DataFrame", "Column", "NamedColumn"]
__all__: list[str] = ["DataFrame", "Column"]

from cudf_polars.containers.column import Column, NamedColumn
from cudf_polars.containers.column import Column
from cudf_polars.containers.dataframe import DataFrame
110 changes: 40 additions & 70 deletions python/cudf_polars/cudf_polars/containers/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import polars as pl

__all__: list[str] = ["Column", "NamedColumn"]
__all__: list[str] = ["Column"]


class Column:
Expand All @@ -26,6 +26,9 @@ class Column:
order: plc.types.Order
null_order: plc.types.NullOrder
is_scalar: bool
# Optional name, only ever set by evaluation of NamedExpr nodes
# The internal evaluation should not care about the name.
name: str | None

def __init__(
self,
Expand All @@ -34,14 +37,12 @@ def __init__(
is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
order: plc.types.Order = plc.types.Order.ASCENDING,
null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
name: str | None = None,
):
self.obj = column
self.is_scalar = self.obj.size() == 1
if self.obj.size() <= 1:
is_sorted = plc.types.Sorted.YES
self.is_sorted = is_sorted
self.order = order
self.null_order = null_order
self.name = name
self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order)

@functools.cached_property
def obj_scalar(self) -> plc.Scalar:
Expand All @@ -63,9 +64,26 @@ def obj_scalar(self) -> plc.Scalar:
)
return plc.copying.get_element(self.obj, 0)

def rename(self, name: str | None, /) -> Self:
    """
    Produce a shallow copy of this column carrying a different name.

    Parameters
    ----------
    name
        Name to attach to the copy; ``None`` leaves the copy unnamed.

    Returns
    -------
    Shallow copy of self with ``name`` set to the given value.
    """
    # Work on a copy so the name is assigned to the new object, not to self.
    renamed = self.copy()
    renamed.name = name
    return renamed

def sorted_like(self, like: Column, /) -> Self:
"""
Copy sortedness properties from a column onto self.
Return a shallow copy with sortedness from like.
Parameters
----------
Expand All @@ -74,20 +92,23 @@ def sorted_like(self, like: Column, /) -> Self:
Returns
-------
Self with metadata set.
Shallow copy of self with metadata set.
See Also
--------
set_sorted, copy_metadata
"""
return self.set_sorted(
is_sorted=like.is_sorted, order=like.order, null_order=like.null_order
return type(self)(
self.obj,
name=self.name,
is_sorted=like.is_sorted,
order=like.order,
null_order=like.null_order,
)

# TODO: Return Column once #16272 is fixed.
def astype(self, dtype: plc.DataType) -> plc.Column:
def astype(self, dtype: plc.DataType) -> Column:
"""
Return the backing column as the requested dtype.
Cast the column to the requested dtype.
Parameters
----------
Expand All @@ -109,8 +130,10 @@ def astype(self, dtype: plc.DataType) -> plc.Column:
the current one.
"""
if self.obj.type() != dtype:
return plc.unary.cast(self.obj, dtype)
return self.obj
return Column(plc.unary.cast(self.obj, dtype), name=self.name).sorted_like(
self
)
return self

def copy_metadata(self, from_: pl.Series, /) -> Self:
"""
Expand All @@ -129,6 +152,7 @@ def copy_metadata(self, from_: pl.Series, /) -> Self:
--------
set_sorted, sorted_like
"""
self.name = from_.name
if len(from_) <= 1:
return self
ascending = from_.flags["SORTED_ASC"]
Expand Down Expand Up @@ -192,6 +216,7 @@ def copy(self) -> Self:
is_sorted=self.is_sorted,
order=self.order,
null_order=self.null_order,
name=self.name,
)

def mask_nans(self) -> Self:
Expand All @@ -217,58 +242,3 @@ def nan_count(self) -> int:
)
).as_py()
return 0


class NamedColumn(Column):
    """A :class:`Column` that additionally carries a name."""

    name: str

    def __init__(
        self,
        column: plc.Column,
        name: str,
        *,
        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
        order: plc.types.Order = plc.types.Order.ASCENDING,
        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
    ) -> None:
        # The base class stores the column and its sortedness metadata;
        # this subclass only adds the name on top.
        super().__init__(
            column, is_sorted=is_sorted, order=order, null_order=null_order
        )
        self.name = name

    def copy(self, *, new_name: str | None = None) -> Self:
        """
        Make a shallow copy of the column, optionally renaming it.

        Parameters
        ----------
        new_name
            Name for the copy. When ``None`` the current name is kept.

        Returns
        -------
        New column sharing data with self.
        """
        chosen_name = self.name if new_name is None else new_name
        return type(self)(
            self.obj,
            chosen_name,
            is_sorted=self.is_sorted,
            order=self.order,
            null_order=self.null_order,
        )

    def mask_nans(self) -> Self:
        """Return a shallow copy of self with nans masked out."""
        # The inheritance is not set up so that the super-type's
        # mask_nans can be called from here, hence the logic is spelled
        # out again; to be sorted out by a later refactoring.
        if not plc.traits.is_floating_point(self.obj.type()):
            # Non floating point columns are returned as a plain copy.
            return self.copy()
        nulls_before = self.obj.null_count()
        mask, nulls_after = plc.transform.nans_to_nulls(self.obj)
        masked = type(self)(self.obj.with_mask(mask, nulls_after), self.name)
        if nulls_before == nulls_after:
            # Masking added no nulls, so take the sortedness from self.
            return masked.sorted_like(self)
        return masked
Loading

0 comments on commit 219ec0e

Please sign in to comment.