rapidsai · rapids-bot · May 30, 2024 · May 16, 2024 · May 8, 2024 · May 8, 2024
@@ -51,6 +51,9 @@ cdef class DataType:
             self.c_obj == (<DataType>other).c_obj
         )
 
+    def __hash__(self):
+        """Return a hash of this type's (id, scale) pair."""
+        type_key = (self.c_obj.id(), self.c_obj.scale())
+        return hash(type_key)
+
     @staticmethod
     cdef DataType from_libcudf(data_type dt):
         """Create a DataType from a libcudf data_type.

@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Callback for the polars collect function to execute on device."""
+
+from __future__ import annotations
+
+from functools import partial
+from typing import TYPE_CHECKING
+
+import nvtx
+
+from cudf_polars.dsl.translate import translate_ir
+
+if TYPE_CHECKING:
+    import polars as pl
+
+    from cudf_polars.dsl.ir import IR
+
+__all__: list[str] = ["execute_with_cudf"]
+
+
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+) -> pl.DataFrame:
+    """
+    Evaluate a translated plan and return the result as a polars DataFrame.
+
+    Parameters
+    ----------
+    ir
+        The plan to evaluate.
+    with_columns
+        Must be None; any other value is rejected.
+    pyarrow_predicate
+        Must be None; any other value is rejected.
+    n_rows
+        Must be None; any other value is rejected.
+
+    Returns
+    -------
+    The result of ``ir.evaluate`` converted with ``to_polars``.
+
+    Raises
+    ------
+    AssertionError
+        If any of ``with_columns``, ``pyarrow_predicate`` or ``n_rows``
+        is not None.
+    """
+    # Raise explicitly instead of using ``assert`` statements so the
+    # guards still run under ``python -O`` and report which argument
+    # was unexpectedly provided.
+    for label, value in (
+        ("with_columns", with_columns),
+        ("pyarrow_predicate", pyarrow_predicate),
+        ("n_rows", n_rows),
+    ):
+        if value is not None:
+            raise AssertionError(f"{label} must be None, got {value!r}")
+    with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"):
+        return ir.evaluate(cache={}).to_polars()
+
+
+def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None:
+    """
+    Post-optimization callback that tries to run the plan with cudf.
+
+    Parameters
+    ----------
+    nt
+        NodeTraverser
+
+    raise_on_fail
+        If True, re-raise the NotImplementedError from a failed
+        conversion; if False, return without setting a callback.
+
+    Notes
+    -----
+    When translation succeeds the NodeTraverser is mutated through
+    ``set_udf``.
+    """
+    try:
+        with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
+            translated = translate_ir(nt)
+            nt.set_udf(partial(_callback, translated))
+    except NotImplementedError:
+        if not raise_on_fail:
+            return
+        raise
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Containers of concrete data."""
+
+from __future__ import annotations
+
+__all__: list[str] = ["DataFrame", "Column", "Scalar"]
+
+from cudf_polars.containers.column import Column
+from cudf_polars.containers.dataframe import DataFrame
+from cudf_polars.containers.scalar import Scalar
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""A column, with some properties."""
+
+from __future__ import annotations
+
+import functools
+from typing import TYPE_CHECKING
+
+import cudf._lib.pylibcudf as plc
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+__all__: list[str] = ["Column"]
+
+
+class Column:
+    """
+    A pylibcudf column together with a name and sortedness metadata.
+
+    A freshly constructed column is flagged as not sorted, with
+    ascending order and nulls before.
+    """
+
+    obj: plc.Column
+    name: str
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+
+    def __init__(self, column: plc.Column, name: str):
+        self.obj = column
+        self.name = name
+        self.is_sorted = plc.types.Sorted.NO
+        self.order = plc.types.Order.ASCENDING
+        self.null_order = plc.types.NullOrder.BEFORE
+
+    def rename(self, name: str) -> Self:
+        """
+        Return a new column sharing data with a new name.
+
+        Sortedness metadata is copied from self onto the new column.
+        """
+        return type(self)(self.obj, name).with_sorted(like=self)
+
+    def with_sorted(self, *, like: Column) -> Self:
+        """
+        Copy sortedness properties from a column onto self.
+
+        Self is modified in place and returned.
+        """
+        return self.set_sorted(
+            is_sorted=like.is_sorted, order=like.order, null_order=like.null_order
+        )
+
+    def set_sorted(
+        self,
+        *,
+        is_sorted: plc.types.Sorted,
+        order: plc.types.Order,
+        null_order: plc.types.NullOrder,
+    ) -> Self:
+        """
+        Modify sortedness metadata in place.
+
+        Parameters
+        ----------
+        is_sorted
+            Is the column sorted
+        order
+            The order if sorted
+        null_order
+            Where nulls sort, if sorted
+
+        Returns
+        -------
+        Self with metadata set.
+        """
+        self.is_sorted = is_sorted
+        self.order = order
+        self.null_order = null_order
+        return self
+
+    def copy(self) -> Self:
+        """Return a shallow copy of the column, keeping sortedness metadata."""
+        return type(self)(self.obj, self.name).with_sorted(like=self)
+
+    def mask_nans(self) -> Self:
+        """
+        Return a copy of self with nans masked out.
+
+        Raises
+        ------
+        NotImplementedError
+            If the column contains any NaN values.
+        """
+        if self.nan_count > 0:
+            raise NotImplementedError
+        return self.copy()
+
+    @functools.cached_property
+    def nan_count(self) -> int:
+        """
+        Return the number of NaN values in the column.
+
+        Columns whose type is neither FLOAT32 nor FLOAT64 report zero
+        without inspecting the data. The value is cached after the
+        first access.
+        """
+        if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
+            return 0
+        count = plc.interop.to_arrow(
+            plc.reduce.reduce(
+                plc.unary.is_nan(self.obj),
+                plc.aggregation.sum(),
+                # TODO: pylibcudf needs to have a SizeType DataType singleton
+                plc.DataType(plc.TypeId.INT32),
+            )
+        ).as_py()
+        # A null reduction result (e.g. when there is nothing to sum)
+        # converts to None; report that as zero NaNs so callers can
+        # always compare the count numerically.
+        return 0 if count is None else count
@@ -0,0 +1,171 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""A dataframe, with some properties."""
+
+from __future__ import annotations
+
+from functools import cached_property
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+import cudf._lib.pylibcudf as plc
+
+from cudf_polars.containers.column import Column
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence, Set
+
+    from typing_extensions import Self
+
+    import cudf
+
+    from cudf_polars.containers.scalar import Scalar
+
+
+__all__: list[str] = ["DataFrame"]
+
+
+class DataFrame:
+    """
+    A representation of a dataframe.
+
+    Holds a list of named columns and a list of scalars. A pylibcudf
+    table is built from the columns only when there are no scalars;
+    otherwise ``table`` is None.
+    """
+
+    columns: list[Column]
+    scalars: list[Scalar]
+    table: plc.Table | None
+    # Lookup of columns by name, built once at construction.
+    _column_map: dict[str, Column]
+
+    def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None:
+        self.columns = list(columns)
+        self._column_map = {c.name: c for c in self.columns}
+        self.scalars = list(scalars)
+        if not self.scalars:
+            # Build from the materialised list so the table and
+            # ``self.columns`` are guaranteed to agree.
+            self.table = plc.Table([c.obj for c in self.columns])
+        else:
+            self.table = None
+
+    def to_polars(self) -> pl.DataFrame:
+        """Convert to a polars DataFrame (requires a frame without scalars)."""
+        assert len(self.scalars) == 0
+        return pl.from_arrow(
+            plc.interop.to_arrow(
+                self.table,
+                [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
+            )
+        )
+
+    @cached_property
+    def column_names_set(self) -> frozenset[str]:
+        """Return the column names as a set."""
+        return frozenset(c.name for c in self.columns)
+
+    @cached_property
+    def column_names(self) -> list[str]:
+        """Return a list of the column names."""
+        return [c.name for c in self.columns]
+
+    @cached_property
+    def num_columns(self) -> int:
+        """Number of columns."""
+        return len(self.columns)
+
+    @cached_property
+    def num_rows(self) -> int:
+        """
+        Number of rows.
+
+        Raises
+        ------
+        ValueError
+            If the frame holds scalars and therefore has no table.
+        """
+        if self.table is None:
+            raise ValueError("Number of rows of frame with scalars makes no sense")
+        return self.table.num_rows()
+
+    @classmethod
+    def from_cudf(cls, df: cudf.DataFrame) -> Self:
+        """Create from a cudf dataframe."""
+        return cls(
+            [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()],
+            [],
+        )
+
+    @classmethod
+    def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
+        """
+        Create from a pylibcudf table.
+
+        Raises
+        ------
+        ValueError
+            If the number of names differs from the number of columns.
+        """
+        # TODO: strict=True when we drop py39
+        if table.num_columns() != len(names):
+            raise ValueError("Mismatching name and table length.")
+        return cls([Column(c, name) for c, name in zip(table.columns(), names)], [])
+
+    def with_sorted(self, *, like: DataFrame, subset: Set[str] | None = None) -> Self:
+        """
+        Copy sortedness from a dataframe onto self.
+
+        Parameters
+        ----------
+        like
+            Frame to copy sortedness from; its column names must equal
+            those of self, in the same order.
+        subset
+            Names of the columns to update; all columns if None.
+
+        Returns
+        -------
+        Self, with the selected columns updated in place.
+
+        Raises
+        ------
+        ValueError
+            If the column names of the two frames differ.
+        """
+        if like.column_names != self.column_names:
+            raise ValueError("Can only copy from identically named frame")
+        subset = self.column_names_set if subset is None else subset
+        self.columns = [
+            c.with_sorted(like=other) if c.name in subset else c
+            for c, other in zip(self.columns, like.columns)
+        ]
+        return self
+
+    def with_columns(self, columns: Sequence[Column]) -> Self:
+        """
+        Return a new dataframe with extra columns.
+
+        Data is shared.
+        """
+        return type(self)([*self.columns, *columns], self.scalars)
+
+    def discard_columns(self, names: Set[str]) -> Self:
+        """Drop columns by name."""
+        return type(self)(
+            [c for c in self.columns if c.name not in names], self.scalars
+        )
+
+    def select(self, names: Sequence[str]) -> Self:
+        """
+        Select columns by name returning DataFrame.
+
+        Columns appear in the order given by ``names``.
+
+        Raises
+        ------
+        ValueError
+            If any requested name is not a column of this frame.
+        """
+        want = set(names)
+        if not want.issubset(self.column_names_set):
+            raise ValueError("Can't select missing names")
+        return type(self)([self._column_map[name] for name in names], self.scalars)
+
+    def replace_columns(self, *columns: Column) -> Self:
+        """
+        Return a new dataframe with columns replaced by name.
+
+        Raises
+        ------
+        ValueError
+            If any replacement names a column this frame does not have.
+        """
+        new = {c.name: c for c in columns}
+        if not set(new).issubset(self.column_names_set):
+            raise ValueError("Cannot replace with non-existing names")
+        return type(self)([new.get(c.name, c) for c in self.columns], self.scalars)
+
+    def rename_columns(self, mapping: Mapping[str, str]) -> Self:
+        """Rename some columns; names absent from the mapping are kept."""
+        return type(self)(
+            [c.rename(mapping.get(c.name, c.name)) for c in self.columns], self.scalars
+        )
+
+    def select_columns(self, names: Set[str]) -> list[Column]:
+        """Select columns by name, in this frame's column order."""
+        return [c for c in self.columns if c.name in names]
+
+    def filter(self, mask: Column) -> Self:
+        """Return a filtered table given a mask."""
+        table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
+        return type(self).from_table(table, self.column_names).with_sorted(like=self)
+
+    def slice(self, zlice: tuple[int, int] | None) -> Self:
+        """
+        Slice a dataframe.
+
+        Parameters
+        ----------
+        zlice
+            optional, tuple of start and length, negative values of start
+            treated as for python indexing. If not provided, returns self.
+
+        Returns
+        -------
+        New dataframe (if zlice is not None), otherwise self (if it is).
+        """
+        if zlice is None:
+            return self
+        start, length = zlice
+        nrows = self.num_rows
+        if start < 0:
+            start += nrows
+        # Compute the end from the wrapped start, then clamp both
+        # bounds into [0, nrows]. A start that is still negative after
+        # wrapping, or that lies past the end of the frame, then gives
+        # a valid (possibly empty) slice instead of invalid indices.
+        end = max(min(start + length, nrows), 0)
+        start = max(min(start, nrows), 0)
+        (table,) = plc.copying.slice(self.table, [start, end])
+        return type(self).from_table(table, self.column_names).with_sorted(like=self)
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""A scalar, with some properties."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import cudf._lib.pylibcudf as plc
+
+__all__: list[str] = ["Scalar"]
+
+
+class Scalar:
+    """
+    Thin wrapper holding a pylibcudf scalar.
+
+    Slots are reserved for ``obj`` and ``name``; the constructor only
+    assigns ``obj``.
+    """
+
+    obj: plc.Scalar
+    __slots__ = ("obj", "name")
+
+    def __init__(self, scalar: plc.Scalar):
+        self.obj = scalar
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""The domain-specific language (DSL) for the polars executor."""
+
+from __future__ import annotations
+
+__all__: list[str] = []