From b3d0e062203939f4e2d44240f95ee3f9957945aa Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 16 May 2024 17:41:03 +0000
Subject: [PATCH 01/56] Give pylibcudf DataTypes a __hash__

---
 python/cudf/cudf/_lib/pylibcudf/types.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index de10196e289..a5248ad0a1f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -51,6 +51,9 @@ cdef class DataType:
             self.c_obj == (<DataType>other).c_obj
         )
 
+    def __hash__(self):
+        return hash((self.c_obj.id(), self.c_obj.scale()))
+
     @staticmethod
     cdef DataType from_libcudf(data_type dt):
         """Create a DataType from a libcudf data_type.
From 22f6a4f05b86748fc9b09c0d81092d7c17948400 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Wed, 8 May 2024 14:17:25 +0000
Subject: [PATCH 02/56] WIP: Translate polars IR to ours

---
 .../cudf_polars/cudf_polars/dsl/__init__.py   |   8 +
 python/cudf_polars/cudf_polars/dsl/expr.py    | 121 ++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py      | 160 +++++++++++
 .../cudf_polars/cudf_polars/dsl/translate.py  | 261 ++++++++++++++++++
 4 files changed, 550 insertions(+)
 create mode 100644 python/cudf_polars/cudf_polars/dsl/__init__.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expr.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/ir.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/translate.py

diff --git a/python/cudf_polars/cudf_polars/dsl/__init__.py b/python/cudf_polars/cudf_polars/dsl/__init__.py
new file mode 100644
index 00000000000..cdc37f9e437
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""The DSL for the polars executor."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
new file mode 100644
index 00000000000..affc17d3de0
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""
+DSL nodes for the polars expression language.
+
+An expression node is a function, `DataFrame -> Column` or `DataFrame -> Scalar`.
+
+The evaluation context is provided by a LogicalPlan node, and can
+affect the evaluation rule as well as providing the dataframe input.
+In particular, the interpretation of the expression language in a
+`GroupBy` node is groupwise, rather than whole frame.
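+
+As a rough sketch, a polars expression such as ``pl.col("a") + 1`` might
+be represented with the nodes below as something like::
+
+    NamedExpr("a", BinOp(Column("a"), Literal(dtype, 1), op))
+
+where ``op`` is the polars binary operation tag for addition and ``dtype``
+is the datatype of the literal (both placeholders here).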
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +__all__ = [ + "Expr", + "NamedExpr", + "Literal", + "Column", + "BooleanFunction", + "Sort", + "SortBy", + "Gather", + "Filter", + "Window", + "Cast", + "Agg", + "BinOp", +] + + +@dataclass(slots=True) +class Expr: + pass + + +@dataclass(slots=True) +class NamedExpr(Expr): + name: str + value: Expr + + +@dataclass(slots=True) +class Literal(Expr): + dtype: Any + value: Any + + +@dataclass(slots=True) +class Column(Expr): + name: str + + +@dataclass(slots=True) +class Len(Expr): + pass + + +@dataclass(slots=True) +class BooleanFunction(Expr): + name: str + options: Any + arguments: list[Expr] + + +@dataclass(slots=True) +class Sort(Expr): + column: Expr + options: Any + + +@dataclass(slots=True) +class SortBy(Expr): + column: Expr + by: list[Expr] + descending: list[bool] + + +@dataclass(slots=True) +class Gather(Expr): + values: Expr + indices: Expr + + +@dataclass(slots=True) +class Filter(Expr): + values: Expr + mask: Expr + + +@dataclass(slots=True) +class Window(Expr): + agg: Expr + by: None | list[Expr] + options: Any + + +@dataclass(slots=True) +class Cast(Expr): + dtype: Any + column: Expr + + +@dataclass(slots=True) +class Agg(Expr): + column: Expr + name: str + options: Any + + +@dataclass(slots=True) +class BinOp(Expr): + left: Expr + right: Expr + op: Any diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py new file mode 100644 index 00000000000..4009e5ffb04 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +""" +DSL nodes for the LogicalPlan of polars. + +An IR node is either a source, normal, or a sink. 
Respectively they +can be considered as functions: + +- source: `IO () -> DataFrame` +- normal: `DataFrame -> DataFrame` +- sink: `DataFrame -> IO ()` +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from cudf_polars.dsl.expr import Expr + + +__all__ = [ + "IR", + "PythonScan", + "Scan", + "Cache", + "DataFrameScan", + "Select", + "GroupBy", + "Join", + "HStack", + "Distinct", + "Sort", + "Slice", + "Filter", + "Projection", + "MapFunction", + "Union", + "HConcat", + "ExtContext", +] + + +@dataclass(slots=True) +class IR: + schema: dict + + +@dataclass(slots=True) +class PythonScan(IR): + options: Any + predicate: Expr | None + + +@dataclass(slots=True) +class Scan(IR): + typ: Any + paths: list[str] + file_options: Any + predicate: Expr | None + + +@dataclass(slots=True) +class Cache(IR): + key: int + value: IR + + +@dataclass(slots=True) +class DataFrameScan(IR): + df: Any + projection: list[str] + predicate: Expr | None + + +@dataclass(slots=True) +class Select(IR): + df: IR + cse: list[Expr] + expr: list[Expr] + + +@dataclass(slots=True) +class GroupBy(IR): + df: IR + agg_requests: list[Expr] + keys: list[Expr] + options: Any + + +@dataclass(slots=True) +class Join(IR): + left: IR + right: IR + left_on: list[Expr] + right_on: list[Expr] + options: Any + + +@dataclass(slots=True) +class HStack(IR): + df: IR + columns: list[Expr] + + +@dataclass(slots=True) +class Distinct(IR): + df: IR + options: Any + + +@dataclass(slots=True) +class Sort(IR): + df: IR + by: list[Expr] + options: Any + + +@dataclass(slots=True) +class Slice(IR): + df: IR + offset: int + length: int + + +@dataclass(slots=True) +class Filter(IR): + df: IR + mask: Expr + + +@dataclass(slots=True) +class Projection(IR): + df: IR + + +@dataclass(slots=True) +class MapFunction(IR): + df: IR + name: str + options: Any + + +@dataclass(slots=True) +class Union(IR): + dfs: list[IR] + + +@dataclass(slots=True) +class HConcat(IR): + dfs: list[IR] + + +@dataclass(slots=True) +class ExtContext(IR): + df: IR + extra: list[IR] diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py new file mode 100644 index 00000000000..b456e76e99f --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -0,0 +1,261 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Translate polars IR representation to ours.""" + +from __future__ import annotations + +from contextlib import AbstractContextManager, nullcontext +from typing import Any + +from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir + +from cudf_polars.dsl import expr, ir + +__all__ = ["translate_ir", "translate_expr"] + + +class set_node(AbstractContextManager): + __slots__ = ("n", "visitor") + + def __init__(self, visitor, n): + self.visitor = visitor + self.n = n + + def __enter__(self): + n = self.visitor.get_node() + self.visitor.set_node(self.n) + self.n = n + + def __exit__(self, *args): + self.visitor.set_node(self.n) + + +noop_context: nullcontext = nullcontext() + + +def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: + """ + Translate a polars-internal IR node to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Optional node to start traversing from, if not provided uses + current polars-internal node. 
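+        (This is also how translation recurses into child plan nodes.)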
+ + Returns + ------- + Translated IR object + + Raises + ------ + NotImplementedError if we can't translate the nodes due to + unsupported functionality. + """ + ctx: AbstractContextManager = ( + set_node(visitor, n) if n is not None else noop_context + ) + with ctx: + node = visitor.view_current_node() + schema = visitor.get_schema() + if isinstance(node, pl_ir.PythonScan): + return ir.PythonScan( + schema, + node.options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + elif isinstance(node, pl_ir.Scan): + return ir.Scan( + schema, + node.scan_type, + node.paths, + node.file_options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + elif isinstance(node, pl_ir.Cache): + return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + elif isinstance(node, pl_ir.DataFrameScan): + return ir.DataFrameScan( + schema, + node.df, + node.projection, + translate_expr(visitor, n=node.selection) + if node.selection is not None + else None, + ) + elif isinstance(node, pl_ir.Select): + return ir.Select( + schema, + translate_ir(visitor, n=node.input), + [translate_expr(visitor, n=e) for e in node.cse_expr], + [translate_expr(visitor, n=e) for e in node.expr], + ) + elif isinstance(node, pl_ir.GroupBy): + return ir.GroupBy( + schema, + translate_ir(visitor, n=node.input), + [translate_expr(visitor, n=e) for e in node.aggs], + [translate_expr(visitor, n=e) for e in node.keys], + node.options, + ) + elif isinstance(node, pl_ir.Join): + return ir.Join( + schema, + translate_ir(visitor, n=node.input_left), + translate_ir(visitor, n=node.input_right), + [translate_expr(visitor, n=e) for e in node.left_on], + [translate_expr(visitor, n=e) for e in node.right_on], + node.options, + ) + elif isinstance(node, pl_ir.HStack): + return ir.HStack( + schema, + translate_ir(visitor, n=node.input), + [translate_expr(visitor, n=e) for e in node.exprs], + ) + elif isinstance(node, pl_ir.Distinct): + return ir.Distinct( + schema, + translate_ir(visitor, n=node.input), + node.options, + ) + elif isinstance(node, pl_ir.Sort): + return ir.Sort( + schema, + translate_ir(visitor, n=node.input), + [translate_expr(visitor, n=e) for e in node.by_column], + node.sort_options, + ) + elif isinstance(node, pl_ir.Slice): + return ir.Slice( + schema, translate_ir(visitor, n=node.input), node.offset, node.len + ) + elif isinstance(node, pl_ir.Filter): + return ir.Filter( + schema, + translate_ir(visitor, n=node.input), + translate_expr(visitor, n=node.predicate), + ) + elif isinstance(node, pl_ir.SimpleProjection): + return ir.Projection(schema, translate_ir(visitor, n=node.input)) + elif isinstance(node, pl_ir.MapFunction): + name, *options = node.function + return ir.MapFunction( + schema, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), + name, + options, + ) + elif isinstance(node, pl_ir.Union): + return ir.Union(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + elif isinstance(node, pl_ir.HConcat): + return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + elif isinstance(node, pl_ir.ExtContext): + return ir.ExtContext( + schema, + translate_ir(visitor, n=node.input), + [translate_ir(visitor, n=n) for n in node.contexts], + ) + else: + raise NotImplementedError( + f"No handler for LogicalPlan node with {type(node)=}" + ) + + +BOOLEAN_FUNCTIONS: frozenset[str] = frozenset() + + +def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: + """ + 
Translate a polars-internal expression IR into our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Node to translate, either an integer referencing a polars + internal node, or a named expression node. + + Returns + ------- + Translated IR object. + + Raises + ------ + NotImplementedError if any translation fails due to unsupported functionality. + """ + if isinstance(n, pl_expr.PyExprIR): + # TODO: type narrowing didn't work because PyExprIR is Unknown + assert not isinstance(n, int) + return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node)) + node = visitor.view_expression(n) + if isinstance(node, pl_expr.Function): + name, *options = node.function_data + if name in BOOLEAN_FUNCTIONS: + return expr.BooleanFunction( + name, + options, + [translate_expr(visitor, n=n) for n in node.input], + ) + else: + raise NotImplementedError(f"No handler for Expr function node with {name=}") + elif isinstance(node, pl_expr.Window): + # TODO: raise in groupby? + return expr.Window( + translate_expr(visitor, n=node.function), + [translate_expr(visitor, n=n) for n in node.partition_by] + if node.partition_by is not None + else None, + node.options, + ) + elif isinstance(node, pl_expr.Literal): + return expr.Literal(node.dtype, node.value) + elif isinstance(node, pl_expr.Sort): + # TODO: raise in groupby + return expr.Sort(translate_expr(visitor, n=node.expr), node.options) + elif isinstance(node, pl_expr.SortBy): + # TODO: raise in groupby + return expr.SortBy( + translate_expr(visitor, n=node.expr), + [translate_expr(visitor, n=n) for n in node.by], + node.descending, + ) + elif isinstance(node, pl_expr.Gather): + return expr.Gather( + translate_expr(visitor, n=node.expr), + translate_expr(visitor, n=node.idx), + ) + elif isinstance(node, pl_expr.Filter): + return expr.Filter( + translate_expr(visitor, n=node.input), + translate_expr(visitor, n=node.by), + ) + elif isinstance(node, pl_expr.Cast): + return expr.Cast(node.dtype, translate_expr(visitor, n=node.expr)) + elif isinstance(node, pl_expr.Column): + return expr.Column(node.name) + elif isinstance(node, pl_expr.Agg): + return expr.Agg( + translate_expr(visitor, n=node.arguments), + node.name, + node.options, + ) + elif isinstance(node, pl_expr.BinaryExpr): + return expr.BinOp( + translate_expr(visitor, n=node.left), + translate_expr(visitor, n=node.right), + node.op, + ) + elif isinstance(node, pl_expr.Len): + return expr.Len() + else: + raise NotImplementedError(f"No handler for expression node with {type(node)=}") From 8ac43478f47e1010df33a6eefa5fab7842653e24 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 May 2024 16:50:06 +0000 Subject: [PATCH 03/56] Add some container objects --- .../cudf_polars/containers/__init__.py | 8 +++ .../cudf_polars/containers/column.py | 54 +++++++++++++++++++ .../cudf_polars/containers/dataframe.py | 50 +++++++++++++++++ .../cudf_polars/containers/scalar.py | 25 +++++++++ 4 files changed, 137 insertions(+) create mode 100644 python/cudf_polars/cudf_polars/containers/__init__.py create mode 100644 python/cudf_polars/cudf_polars/containers/column.py create mode 100644 python/cudf_polars/cudf_polars/containers/dataframe.py create mode 100644 python/cudf_polars/cudf_polars/containers/scalar.py diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py new file mode 100644 index 00000000000..c8b444389bd --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -0,0 +1,8 
@@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Containers of concrete data.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py new file mode 100644 index 00000000000..1d7f00435cf --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""A column, with some properties.""" + +from __future__ import annotations + +import cudf._lib.pylibcudf as plc + +__all__: list[str] = ["Column"] + + +class Column: + """A column, a name, and sortedness.""" + + __slots__ = ("obj", "name", "is_sorted", "order", "null_order") + obj: plc.Column + name: str + is_sorted: plc.types.Sorted + order: plc.types.Order + null_order: plc.types.NullOrder + + def __init__(self, column: plc.Column, name: str): + self.obj = column + self.name = name + self.is_sorted = plc.types.Sorted.NO + + def set_sorted( + self, + is_sorted: plc.types.Sorted, + order: plc.types.Order, + null_order: plc.types.NullOrder, + ) -> Column: + """ + Return a new column sharing data with sortedness set. + + Parameters + ---------- + is_sorted + Is the column sorted + order + The order if sorted + null_order + Where nulls sort, if sorted + + Returns + ------- + New column sharing data. + """ + obj = Column(self.obj, self.name) + obj.is_sorted = is_sorted + obj.order = order + obj.null_order = null_order + return obj diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py new file mode 100644 index 00000000000..9f368c76626 --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+
+"""A dataframe, with some properties."""
+
+from __future__ import annotations
+
+import itertools
+from typing import TYPE_CHECKING
+
+import cudf._lib.pylibcudf as plc
+
+if TYPE_CHECKING:
+    from cudf_polars.containers.column import Column
+    from cudf_polars.containers.scalar import Scalar
+
+__all__: list[str] = ["DataFrame"]
+
+
+class DataFrame:
+    """A representation of a dataframe."""
+
+    __slots__ = ("columns", "scalars", "names", "scalar_names", "table")
+    columns: list[Column]
+    scalars: list[Scalar]
+    names: dict[str, int]
+    scalar_names: frozenset[str]
+    table: plc.Table | None
+
+    def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None:
+        self.names = dict(zip((c.name for c in columns), itertools.count(0))) | dict(
+            zip((s.name for s in scalars), itertools.count(0))
+        )
+        self.scalar_names = frozenset(s.name for s in scalars)
+        self.columns = columns
+        self.scalars = scalars
+        if len(scalars) == 0:
+            self.table = plc.Table(columns)
+        else:
+            self.table = None
+
+    __iter__ = None
+
+    def __getitem__(self, name: str) -> Column | Scalar:
+        """Return column with given name."""
+        i = self.names[name]
+        if name in self.scalar_names:
+            return self.scalars[i]
+        else:
+            return self.columns[i]
diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py
new file mode 100644
index 00000000000..a9b59a3218c
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/containers/scalar.py
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""A scalar, with some properties."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import cudf._lib.pylibcudf as plc
+
+__all__: list[str] = ["Scalar"]
+
+
+class Scalar:
+    """A scalar, and a name."""
+
+    __slots__ = ("obj", "name")
+    obj: plc.Scalar
+    name: str
+
+    def __init__(self, scalar: plc.Scalar, name: str):
+        self.obj = scalar
+        self.name = name
From 4ab983e017970a8eaa2e60badfa1b831c4f7a716 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Wed, 8 May 2024 16:54:19 +0000
Subject: [PATCH 04/56] WIP: really, fleshing out some evaluation

---
 python/cudf_polars/cudf_polars/dsl/ir.py | 81 ++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 4009e5ffb04..f796e57b0ab 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -17,6 +17,10 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
+from typing_extensions import assert_never
+
+import cudf._lib.pylibcudf as plc
+import cudf_polars.dsl.expr as expr
 
 if TYPE_CHECKING:
     from cudf_polars.dsl.expr import Expr
@@ -54,6 +58,9 @@ class PythonScan(IR):
     options: Any
     predicate: Expr | None
 
+    def evaluate(self):
+        raise NotImplementedError
+
 
 @dataclass(slots=True)
 class Scan(IR):
@@ -62,6 +69,36 @@ class Scan(IR):
     file_options: Any
     predicate: Expr | None
 
+    def __post_init__(self):
+        if self.file_options.n_rows is not None:
+            raise NotImplementedError("row limit in scan")
+        if self.typ not in ("csv", "parquet"):
+            raise NotImplementedError(f"Unhandled scan type: {self.typ}")
+    def evaluate(self):
+        options = self.file_options
+        n_rows = options.n_rows
+        with_columns = options.with_columns
+        row_index = options.row_index
+        assert n_rows is None
+        if self.typ == "csv":
+            df = cudf.concat(
[cudf.read_csv(p, usecols=with_columns) for p in self.paths] + ) + elif self.typ == "parquet": + df = cudf.read_parquet(self.paths, columns=with_columns) + else: + assert_never(self.typ) + if row_index is not None: + name, offset = row_index + dtype = self.schema[name] + index = as_column( + ..., dtype=dtype + ) + + + + + @dataclass(slots=True) class Cache(IR): @@ -90,6 +127,42 @@ class GroupBy(IR): keys: list[Expr] options: Any + @staticmethod + def check_agg(agg: Expr) -> int: + """ + Determine if we can handle an aggregation expression. + + Parameters + ---------- + agg + Expression to check + + Returns + ------- + depth of nesting + + Raises + ------ + NotImplementedError for unsupported expression nodes. + """ + if isinstance(agg, expr.Agg): + if agg.name == "implode": + raise NotImplementedError("implode in groupby") + return 1 + GroupBy.check_agg(agg.column) + elif isinstance(agg, (expr.Len, expr.Column, expr.Literal)): + return 0 + elif isinstance(agg, expr.BinOp): + return max(GroupBy.check_agg(agg.left), GroupBy.check_agg(agg.right)) + elif isinstance(agg, expr.Cast): + return GroupBy.check_agg(agg.column) + else: + raise NotImplementedError(f"No handler for {agg=}") + + def __post_init__(self): + """Check whether all the aggregations are implemented.""" + if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): + raise NotImplementedError("Nested aggregations in groupby") + @dataclass(slots=True) class Join(IR): @@ -99,6 +172,14 @@ class Join(IR): right_on: list[Expr] options: Any + def __post_init__(self): + """Raise for unsupported options.""" + how, coalesce = self.options[0], self.options[-1] + if how == "cross": + raise NotImplementedError("cross join not implemented") + if how == "outer" and not coalesce: + raise NotImplementedError("non-coalescing outer join") + @dataclass(slots=True) class HStack(IR): From 1981a3dce23d4babab20373a7d3bac61097818c9 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 9 May 2024 11:27:09 +0000 Subject: [PATCH 05/56] Flesh out more container stuff --- .../cudf_polars/containers/__init__.py | 6 +- .../cudf_polars/containers/column.py | 28 ++++-- .../cudf_polars/containers/dataframe.py | 73 ++++++++++++++- .../cudf_polars/cudf_polars/utils/__init__.py | 8 ++ .../cudf_polars/cudf_polars/utils/dtypes.py | 89 +++++++++++++++++++ 5 files changed, 192 insertions(+), 12 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/utils/__init__.py create mode 100644 python/cudf_polars/cudf_polars/utils/dtypes.py diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index c8b444389bd..ef9d9ca61b6 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,4 +5,8 @@ from __future__ import annotations -__all__: list[str] = [] +__all__: list[str] = ["DataFrame", "Column", "Scalar"] + +from cudf_polars.containers.column import Column +from cudf_polars.containers.dataframe import DataFrame +from cudf_polars.containers.scalar import Scalar diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 1d7f00435cf..efcd2e0da20 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -5,8 +5,13 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import cudf._lib.pylibcudf as plc +if TYPE_CHECKING: + from typing_extensions import Self + __all__: list[str] = 
["Column"] @@ -25,14 +30,22 @@ def __init__(self, column: plc.Column, name: str): self.name = name self.is_sorted = plc.types.Sorted.NO + def with_metadata(self, *, like: Column) -> Self: + """Copy metadata from a column onto self.""" + self.is_sorted = like.is_sorted + self.order = like.order + self.null_order = like.null_order + return self + def set_sorted( self, + *, is_sorted: plc.types.Sorted, order: plc.types.Order, null_order: plc.types.NullOrder, - ) -> Column: + ) -> Self: """ - Return a new column sharing data with sortedness set. + Modify sortedness metadata in place. Parameters ---------- @@ -45,10 +58,9 @@ def set_sorted( Returns ------- - New column sharing data. + Self with metadata set. """ - obj = Column(self.obj, self.name) - obj.is_sorted = is_sorted - obj.order = order - obj.null_order = null_order - return obj + self.is_sorted = is_sorted + self.order = order + self.null_order = null_order + return self diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 9f368c76626..502817d652b 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -6,13 +6,19 @@ from __future__ import annotations import itertools +from functools import cached_property from typing import TYPE_CHECKING import cudf._lib.pylibcudf as plc +from cudf_polars.containers.column import Column +from cudf_polars.containers.scalar import Scalar + if TYPE_CHECKING: - from cudf_polars.containers.column import Column - from cudf_polars.containers.scalar import Scalar + from typing_extensions import Self + + import cudf + __all__: list[str] = ["DataFrame"] @@ -35,7 +41,7 @@ def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None: self.columns = columns self.scalars = scalars if len(scalars) == 0: - self.table = plc.Table(columns) + self.table = plc.Table([c.obj for c in columns]) else: self.table = None @@ -48,3 +54,64 @@ def __getitem__(self, name: str) -> Column | Scalar: return self.scalars[i] else: return self.columns[i] + + @cached_property + def num_rows(self): + """Number of rows.""" + if self.table is None: + raise ValueError("Number of rows of frame with scalars makes no sense") + return self.table.num_rows() + + @classmethod + def from_cudf(cls, df: cudf.DataFrame) -> Self: + """Create from a cudf dataframe.""" + return cls( + [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()], + [], + ) + + def with_columns(self, *columns: Column | Scalar) -> Self: + """ + Return a new dataframe with extra columns. + + Data is shared. 
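+
+        A sketch of intended use, with hypothetical ``mask`` and
+        ``total`` pylibcudf objects::
+
+            df2 = df.with_columns(Column(mask, "mask"), Scalar(total, "total"))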
+ """ + cols = [c for c in columns if isinstance(c, Column)] + scalars = [c for c in columns if isinstance(c, Scalar)] + return type(self)([*self.columns, *cols], [*self.scalars, *scalars]) + + def discard_columns(self, names: set[str]) -> Self: + """Drop columns by name.""" + return type(self)([c for c in self.columns if c not in names], self.scalars) + + def replace_columns(self, *columns: Column) -> Self: + """Return a new dataframe with columns replaced by name, maintaining order.""" + new = {c.name: c for c in columns} + if set(new).intersection(self.scalar_names): + raise ValueError("Cannot replace scalars") + if not set(new).issubset(self.names): + raise ValueError("Cannot replace with non-existing names") + return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) + + def rename_columns(self, mapping: dict[str, str]) -> Self: + """Rename some columns.""" + new_columns = [ + Column(c, mapping.get(c.name, c.name)).with_metadata(like=c) + for c in self.columns + ] + return type(self)(new_columns, self.scalars) + + def select_columns(self, names: set[str]) -> list[Column]: + """Select columns by name.""" + return [c for c in self.columns if c.name in names] + + def filter(self, mask: Column) -> Self: + """Return a filtered table given a mask.""" + table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj) + return type(self)( + [ + Column(new, old.name).with_metadata(like=old) + for old, new in zip(self.columns, table.columns()) + ], + [], + ) diff --git a/python/cudf_polars/cudf_polars/utils/__init__.py b/python/cudf_polars/cudf_polars/utils/__init__.py new file mode 100644 index 00000000000..6018209e1e8 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Utilities.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py new file mode 100644 index 00000000000..1ac8719b839 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Datatype utilities.""" + +from __future__ import annotations + +from functools import cache + +from typing_extensions import assert_never + +import polars as pl + +import cudf._lib.pylibcudf as plc + + +@cache +def from_polars(dtype: pl.DataType) -> plc.DataType: + """ + Convert a polars datatype to a pylibcudf one. + + Parameters + ---------- + dtype + Polars dtype to convert + + Returns + ------- + Matching pylibcudf DataType object. + + Raises + ------ + NotImplementedError for unsupported conversions. 
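+
+    For example, ``pl.Datetime("us")`` maps to
+    ``plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)``.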
+    """
+    if isinstance(dtype, pl.Boolean):
+        return plc.DataType(plc.TypeId.BOOL8)
+    elif isinstance(dtype, pl.Int8):
+        return plc.DataType(plc.TypeId.INT8)
+    elif isinstance(dtype, pl.Int16):
+        return plc.DataType(plc.TypeId.INT16)
+    elif isinstance(dtype, pl.Int32):
+        return plc.DataType(plc.TypeId.INT32)
+    elif isinstance(dtype, pl.Int64):
+        return plc.DataType(plc.TypeId.INT64)
+    elif isinstance(dtype, pl.UInt8):
+        return plc.DataType(plc.TypeId.UINT8)
+    elif isinstance(dtype, pl.UInt16):
+        return plc.DataType(plc.TypeId.UINT16)
+    elif isinstance(dtype, pl.UInt32):
+        return plc.DataType(plc.TypeId.UINT32)
+    elif isinstance(dtype, pl.UInt64):
+        return plc.DataType(plc.TypeId.UINT64)
+    elif isinstance(dtype, pl.Float32):
+        return plc.DataType(plc.TypeId.FLOAT32)
+    elif isinstance(dtype, pl.Float64):
+        return plc.DataType(plc.TypeId.FLOAT64)
+    elif isinstance(dtype, pl.Date):
+        return plc.DataType(plc.TypeId.TIMESTAMP_DAYS)
+    elif isinstance(dtype, pl.Time):
+        raise NotImplementedError("Time of day dtype not implemented")
+    elif isinstance(dtype, pl.Datetime):
+        if dtype.time_zone is not None:
+            raise NotImplementedError("Time zone support")
+        if dtype.time_unit == "ms":
+            return plc.DataType(plc.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype.time_unit == "us":
+            return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+        elif dtype.time_unit == "ns":
+            return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS)
+        else:
+            assert dtype.time_unit is not None
+            assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.Duration):
+        if dtype.time_unit == "ms":
+            return plc.DataType(plc.TypeId.DURATION_MILLISECONDS)
+        elif dtype.time_unit == "us":
+            return plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+        elif dtype.time_unit == "ns":
+            return plc.DataType(plc.TypeId.DURATION_NANOSECONDS)
+        else:
+            assert dtype.time_unit is not None
+            assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.String):
+        return plc.DataType(plc.TypeId.STRING)
+    elif isinstance(dtype, pl.Null):
+        # TODO: Hopefully
+        return plc.DataType(plc.TypeId.EMPTY)
+    else:
+        raise NotImplementedError(f"{dtype=} conversion not supported")
From 700f0757a4ab3495f039b767c3b228443ffeef05 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 9 May 2024 11:29:50 +0000
Subject: [PATCH 06/56] WIP: More fleshing out evaluation

---
 python/cudf_polars/cudf_polars/dsl/expr.py    |  14 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      | 198 ++++++++++++++++--
 .../cudf_polars/cudf_polars/dsl/translate.py  |   4 +-
 3 files changed, 192 insertions(+), 24 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index affc17d3de0..3ec0223b7a2 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -16,13 +16,16 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Any
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from cudf_polars.containers import Column, DataFrame
 
 __all__ = [
     "Expr",
     "NamedExpr",
     "Literal",
-    "Column",
+    "Col",
     "BooleanFunction",
     "Sort",
     "SortBy",
@@ -37,7 +40,10 @@
 
 @dataclass(slots=True)
 class Expr:
-    pass
+    # TODO: return type is a lie for Literal
+    def evaluate(self, context: DataFrame) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        raise NotImplementedError
 
 
 @dataclass(slots=True)
@@ -53,7 +59,7 @@ class Literal(Expr):
 
 
 @dataclass(slots=True)
-class Column(Expr):
+class Col(Expr):
     name: str
 
 
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py
b/python/cudf_polars/cudf_polars/dsl/ir.py index f796e57b0ab..f7d5b56e637 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -16,13 +16,24 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from functools import cache +from typing import TYPE_CHECKING, Any, Callable + +import pyarrow as pa from typing_extensions import assert_never +import polars as pl + +import cudf import cudf._lib.pylibcudf as plc + import cudf_polars.dsl.expr as expr +from cudf_polars.containers import Column, DataFrame +from cudf_polars.utils import dtypes if TYPE_CHECKING: + from typing import Literal + from cudf_polars.dsl.expr import Expr @@ -52,15 +63,16 @@ class IR: schema: dict + def evaluate(self) -> DataFrame: + """Evaluate and return a dataframe.""" + raise NotImplementedError + @dataclass(slots=True) class PythonScan(IR): options: Any predicate: Expr | None - def evaluate(self): - raise NotImplementedError - @dataclass(slots=True) class Scan(IR): @@ -70,34 +82,49 @@ class Scan(IR): predicate: Expr | None def __post_init__(self): + """Validate preconditions.""" if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") - def evaluate(self): + + def evaluate(self) -> DataFrame: + """Evaluate and return a dataframe.""" options = self.file_options n_rows = options.n_rows with_columns = options.with_columns row_index = options.row_index assert n_rows is None if self.typ == "csv": - df = cudf.concat( - [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + df = DataFrame.from_cudf( + cudf.concat( + [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + ) ) elif self.typ == "parquet": - df = cudf.read_parquet(self.paths, columns=with_columns) + df = DataFrame.from_cudf( + cudf.read_parquet(self.paths, columns=with_columns) + ) else: assert_never(self.typ) if row_index is not None: name, offset = row_index - dtype = self.schema[name] - index = as_column( - ..., dtype=dtype + dtype = dtypes.from_polars(self.schema[name]) + step = plc.interop.from_arrow(pa.scalar(1), data_type=dtype) + init = plc.interop.from_arrow(pa.scalar(offset), data_type=dtype) + index = Column( + plc.filling.sequence(df.num_rows(), init, step), name + ).set_sorted( + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.null_order.AFTER, ) - - - - + df = df.with_columns(index) + if self.predicate is None: + return df + else: + mask = self.predicate.evaluate(df) + return df.filter(mask) @dataclass(slots=True) @@ -112,6 +139,34 @@ class DataFrameScan(IR): projection: list[str] predicate: Expr | None + def evaluate(self) -> DataFrame: + """Evaluate and return a dataframe.""" + pdf = pl.DataFrame._from_pydf(self.df) + if self.projection is not None: + pdf = pdf.select(self.projection) + # TODO: goes away when libcudf supports large strings + table = pdf.to_arrow() + schema = table.schema + for i, field in enumerate(schema): + if field.type == pa.large_string(): + # TODO: Nested types + schema = schema.set(i, pa.field(field.name, pa.string())) + table = table.cast(schema) + df = DataFrame( + [ + Column(col, name) + for name, col in zip( + self.schema.keys(), plc.interop.from_arrow(table).columns() + ) + ], + [], + ) + if self.predicate is not None: + mask = self.predicate.evaluate(df) + return df.filter(mask) + else: + return df + @dataclass(slots=True) class 
Select(IR): @@ -119,6 +174,13 @@ class Select(IR): cse: list[Expr] expr: list[Expr] + def evaluate(self): + """Evaluate and return a dataframe.""" + df = self.df.evaluate() + for e in self.cse: + df = df.with_columns(e.evaluate(df)) + return DataFrame([e.evaluate(df) for e in self.expr], []) + @dataclass(slots=True) class GroupBy(IR): @@ -174,11 +236,109 @@ class Join(IR): def __post_init__(self): """Raise for unsupported options.""" - how, coalesce = self.options[0], self.options[-1] - if how == "cross": + if self.options[0] == "cross": raise NotImplementedError("cross join not implemented") - if how == "outer" and not coalesce: - raise NotImplementedError("non-coalescing outer join") + + @cache + @staticmethod + def _joiners( + how: Literal["inner", "left", "outer", "leftsemi", "leftanti"], + ) -> tuple[ + Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None + ]: + if how == "inner": + return ( + plc.join.inner_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + elif how == "left": + return ( + plc.join.left_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + plc.copying.OutOfBoundsPolicy.NULLIFY, + ) + elif how == "outer": + return ( + plc.join.full_join, + plc.copying.OutOfBoundsPolicy.NULLIFY, + plc.copying.OutOfBoundsPolicy.NULLIFY, + ) + elif how == "leftsemi": + return ( + plc.join.left_semi_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + None, + ) + elif how == "leftanti": + return ( + plc.join.left_anti_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + None, + ) + else: + assert_never(how) + + def evaluate(self) -> DataFrame: + """Evaluate and return a dataframe.""" + left = self.left.evaluate() + right = self.right.evaluate() + left_on = DataFrame([e.evaluate(left) for e in self.left_on], []) + right_on = DataFrame([e.evaluate(right) for e in self.right_on], []) + how, join_nulls, zlice, suffix, coalesce = self.options + null_equality = ( + plc.types.NullEquality.EQUAL + if join_nulls + else plc.types.NullEquality.UNEQUAL + ) + suffix = "_right" if suffix is None else suffix + join_fn, left_policy, right_policy = Join._joiners(how) + if right_policy is None: + # Semi join + lg = join_fn(left_on.table, right_on.table, null_equality) + left = left.replace_columns(*left_on.columns) + table = plc.copying.gather(left.table, lg, left_policy) + result = DataFrame( + [ + Column(c, col.name) + for col, c in zip(left_on.columns, table.columns()) + ], + [], + ) + else: + lg, rg = join_fn(left_on, right_on, null_equality) + left = left.replace_columns(*left_on.columns) + right = right.replace_columns(*right_on.columns) + if coalesce and how != "outer": + right = right.discard_columns(set(right_on.names)) + left = DataFrame( + plc.copying.gather(left.table, lg, left_policy).columns(), [] + ) + right = DataFrame( + plc.copying.gather(right.table, rg, right_policy).columns(), [] + ) + if coalesce and how == "outer": + left.replace_columns( + *( + Column( + plc.replace.replace_nulls(left_col.obj, right_col.obj), + left_col.name, + ) + for left_col, right_col in zip( + left.select_columns(set(left_on.names)), + right.select_columns(set(right_on.names)), + ) + ) + ) + right.discard_columns(set(right_on.names)) + right = right.rename_columns( + {name: f"{name}{suffix}" for name in right.names if name in left.names} + ) + result = left.with_columns(*right.columns) + if zlice is not None: + raise NotImplementedError("slicing") + else: + return result @dataclass(slots=True) diff --git 
a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b456e76e99f..9d6020ee6f4 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -16,6 +16,8 @@ class set_node(AbstractContextManager): + """Run a block with current node set in the visitor.""" + __slots__ = ("n", "visitor") def __init__(self, visitor, n): @@ -242,7 +244,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: elif isinstance(node, pl_expr.Cast): return expr.Cast(node.dtype, translate_expr(visitor, n=node.expr)) elif isinstance(node, pl_expr.Column): - return expr.Column(node.name) + return expr.Col(node.name) elif isinstance(node, pl_expr.Agg): return expr.Agg( translate_expr(visitor, n=node.arguments), From 9c303bc32246e0b8bfc3644e4fa566f29459fa67 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 9 May 2024 17:44:00 +0000 Subject: [PATCH 07/56] WIP: More fleshing out Still need to port the expression eval --- .../cudf_polars/containers/column.py | 15 +- .../cudf_polars/containers/dataframe.py | 69 ++++++-- python/cudf_polars/cudf_polars/dsl/ir.py | 163 ++++++++++++++---- .../cudf_polars/cudf_polars/utils/sorting.py | 44 +++++ 4 files changed, 236 insertions(+), 55 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/utils/sorting.py diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index efcd2e0da20..e34a1a7726e 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -30,12 +30,15 @@ def __init__(self, column: plc.Column, name: str): self.name = name self.is_sorted = plc.types.Sorted.NO - def with_metadata(self, *, like: Column) -> Self: - """Copy metadata from a column onto self.""" - self.is_sorted = like.is_sorted - self.order = like.order - self.null_order = like.null_order - return self + def rename(self, name: str) -> Column: + """Return a new column sharing data with a new name.""" + return type(self)(self.obj, name).with_sorted(like=self) + + def with_sorted(self, *, like: Column) -> Self: + """Copy sortedness properties from a column onto self.""" + return self.set_sorted( + is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + ) def set_sorted( self, diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 502817d652b..c29494debd5 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -55,6 +55,16 @@ def __getitem__(self, name: str) -> Column | Scalar: else: return self.columns[i] + @cached_property + def column_names(self) -> list[str]: + """Return a list of the column names.""" + return [c.name for c in self.columns] + + @cached_property + def num_columns(self): + """Number of columns.""" + return len(self.columns) + @cached_property def num_rows(self): """Number of rows.""" @@ -70,6 +80,22 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: [], ) + @classmethod + def from_table(cls, table: plc.Table, names: list[str]) -> Self: + """Create from a pylibcudf table.""" + if table.num_columns != len(names): + raise ValueError("Mismatching name and table length.") + return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) + + def with_sorted(self, *, like: DataFrame) -> Self: + """Copy sortedness from a dataframe onto self.""" + if 
like.column_names != self.column_names: + raise ValueError("Can only copy from identically named frame") + self.columns = [ + c.with_sorted(like=other) for c, other in zip(self.columns, like.columns) + ] + return self + def with_columns(self, *columns: Column | Scalar) -> Self: """ Return a new dataframe with extra columns. @@ -85,7 +111,7 @@ def discard_columns(self, names: set[str]) -> Self: return type(self)([c for c in self.columns if c not in names], self.scalars) def replace_columns(self, *columns: Column) -> Self: - """Return a new dataframe with columns replaced by name, maintaining order.""" + """Return a new dataframe with columns replaced by name.""" new = {c.name: c for c in columns} if set(new).intersection(self.scalar_names): raise ValueError("Cannot replace scalars") @@ -95,11 +121,9 @@ def replace_columns(self, *columns: Column) -> Self: def rename_columns(self, mapping: dict[str, str]) -> Self: """Rename some columns.""" - new_columns = [ - Column(c, mapping.get(c.name, c.name)).with_metadata(like=c) - for c in self.columns - ] - return type(self)(new_columns, self.scalars) + return type(self)( + [c.rename(mapping.get(c.name, c.name)) for c in self.columns], self.scalars + ) def select_columns(self, names: set[str]) -> list[Column]: """Select columns by name.""" @@ -108,10 +132,29 @@ def select_columns(self, names: set[str]) -> list[Column]: def filter(self, mask: Column) -> Self: """Return a filtered table given a mask.""" table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj) - return type(self)( - [ - Column(new, old.name).with_metadata(like=old) - for old, new in zip(self.columns, table.columns()) - ], - [], - ) + return type(self).from_table(table, self.column_names).with_sorted(like=self) + + def slice(self, zlice: tuple[int, int] | None) -> Self: + """ + Slice a dataframe. + + Parameters + ---------- + zlice + optional, tuple of start and length, negative values of start + treated as for python indexing. If not provided, returns self. + + Returns + ------- + New dataframe (if zlice is not None) other self (if it is) + """ + if zlice is None: + return self + start, length = zlice + if start < 0: + start += self.num_rows + # Polars slice takes an arbitrary positive integer and slice + # to the end of the frame if it is larger. 
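+        # e.g. slicing a 10-row frame with zlice=(-3, 5) clamps to rows [7, 10)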
+ end = min(start + length, self.num_rows) + (table,) = plc.copying.slice(self.table, [start, end]) + return type(self).from_table(table, self.column_names).with_sorted(like=self) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index f7d5b56e637..ce069e1ce5f 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -17,7 +17,7 @@ from dataclasses import dataclass from functools import cache -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Any, Callable, ClassVar import pyarrow as pa from typing_extensions import assert_never @@ -29,7 +29,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame -from cudf_polars.utils import dtypes +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from typing import Literal @@ -63,7 +63,7 @@ class IR: schema: dict - def evaluate(self) -> DataFrame: + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" raise NotImplementedError @@ -88,7 +88,7 @@ def __post_init__(self): if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") - def evaluate(self) -> DataFrame: + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options n_rows = options.n_rows @@ -132,6 +132,13 @@ class Cache(IR): key: int value: IR + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + try: + return cache[self.key] + except KeyError: + return cache.setdefault(self.key, self.value.evaluate(cache=cache)) + @dataclass(slots=True) class DataFrameScan(IR): @@ -139,7 +146,7 @@ class DataFrameScan(IR): projection: list[str] predicate: Expr | None - def evaluate(self) -> DataFrame: + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: @@ -152,14 +159,8 @@ def evaluate(self) -> DataFrame: # TODO: Nested types schema = schema.set(i, pa.field(field.name, pa.string())) table = table.cast(schema) - df = DataFrame( - [ - Column(col, name) - for name, col in zip( - self.schema.keys(), plc.interop.from_arrow(table).columns() - ) - ], - [], + df = DataFrame.from_table( + plc.interop.from_arrow(table), list(self.schema.keys()) ) if self.predicate is not None: mask = self.predicate.evaluate(df) @@ -174,9 +175,9 @@ class Select(IR): cse: list[Expr] expr: list[Expr] - def evaluate(self): + def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" - df = self.df.evaluate() + df = self.df.evaluate(cache=cache) for e in self.cse: df = df.with_columns(e.evaluate(df)) return DataFrame([e.evaluate(df) for e in self.expr], []) @@ -235,7 +236,7 @@ class Join(IR): options: Any def __post_init__(self): - """Raise for unsupported options.""" + """Validate preconditions.""" if self.options[0] == "cross": raise NotImplementedError("cross join not implemented") @@ -279,10 +280,10 @@ def _joiners( else: assert_never(how) - def evaluate(self) -> DataFrame: + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - left = self.left.evaluate() - right = self.right.evaluate() + left = self.left.evaluate(cache=cache) + right = self.right.evaluate(cache=cache) left_on = DataFrame([e.evaluate(left) for e in self.left_on], []) 
right_on = DataFrame([e.evaluate(right) for e in self.right_on], []) how, join_nulls, zlice, suffix, coalesce = self.options @@ -298,24 +299,18 @@ def evaluate(self) -> DataFrame: lg = join_fn(left_on.table, right_on.table, null_equality) left = left.replace_columns(*left_on.columns) table = plc.copying.gather(left.table, lg, left_policy) - result = DataFrame( - [ - Column(c, col.name) - for col, c in zip(left_on.columns, table.columns()) - ], - [], - ) + result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on, right_on, null_equality) left = left.replace_columns(*left_on.columns) right = right.replace_columns(*right_on.columns) if coalesce and how != "outer": right = right.discard_columns(set(right_on.names)) - left = DataFrame( - plc.copying.gather(left.table, lg, left_policy).columns(), [] + left = DataFrame.from_table( + plc.copying.gather(left.table, lg, left_policy), left.column_names ) - right = DataFrame( - plc.copying.gather(right.table, rg, right_policy).columns(), [] + right = DataFrame.from_table( + plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how == "outer": left.replace_columns( @@ -335,10 +330,7 @@ def evaluate(self) -> DataFrame: {name: f"{name}{suffix}" for name in right.names if name in left.names} ) result = left.with_columns(*right.columns) - if zlice is not None: - raise NotImplementedError("slicing") - else: - return result + return result.slice(zlice) @dataclass(slots=True) @@ -346,18 +338,117 @@ class HStack(IR): df: IR columns: list[Expr] + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return df.with_columns(*(c.evaluate(df) for c in self.columns)) + @dataclass(slots=True) class Distinct(IR): df: IR - options: Any + keep: plc.stream_compaction.DuplicateKeepOption + subset: set[str] | None + zlice: tuple[int, int] | None + stable: bool + + _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { + "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + "none": plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, + } + + def __init__(self, schema: dict, df: IR, options: Any): + self.schema = schema + self.df = df + (keep, subset, maintain_order, zlice) = options + self.keep = Distinct._KEEP_MAP[keep] + self.subset = set(subset) if subset is not None else None + self.stable = maintain_order + self.zlice = zlice + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + if self.subset is None: + indices = list(range(df.num_columns)) + else: + indices = [i for i, k in enumerate(df.names) if k in self.subset] + keys_sorted = all(c.is_sorted for c in df.columns) + if keys_sorted: + table = plc.stream_compaction.unique( + df.table, + indices, + self.keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if self.stable + else plc.stream_compaction.distinct + ) + table = distinct( + df.table, + indices, + self.keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + result = DataFrame( + [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], [] + ) + if keys_sorted or self.stable: + result = result.with_sorted(like=df) + return result.slice(self.zlice) @dataclass(slots=True) 
class Sort(IR): df: IR by: list[Expr] - options: Any + do_sort: Callable[..., plc.Table] + zlice: tuple[int, int] | None + order: list[plc.types.Order] + null_order: list[plc.types.NullOrder] + + def __init__(self, schema: dict, df: IR, by: list[Expr], options: Any): + self.schema = schema + self.df = df + self.by = by + stable, nulls_last, descending = options + self.order, self.null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + self.do_sort = ( + plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + ) + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + sort_keys = [k.evaluate(df) for k in self.by] + keys_in_result = [ + i + for k in sort_keys + if (i := df.names.get(k.name)) is not None and k is df.columns[i] + ] + table = self.do_sort( + df.table, + plc.Table([k.obj for k in sort_keys]), + self.order, + self.null_order, + ) + columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)] + # If a sort key is in the result table, set the sortedness property + for idx in keys_in_result: + columns[idx] = columns[idx].set_sorted( + is_sorted=plc.types.Sorted.YES, + order=self.order[idx], + null_order=self.null_order[idx], + ) + return DataFrame(columns, []) @dataclass(slots=True) diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py new file mode 100644 index 00000000000..fed1cd35416 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Sorting utilities.""" + +from __future__ import annotations + +import cudf._lib.pylibcudf as plc + + +def sort_order( + descending: list[bool], *, nulls_last: bool, num_keys: int +) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: + """ + Produce sort order arguments. + + Parameters + ---------- + descending + List indicating order for each column + nulls_last + Should nulls sort last or first? 
+    num_keys
+        Number of sort keys
+
+    Returns
+    -------
+    tuple of column_order and null_precedence
+    suitable for passing to sort routines
+    """
+    # Mimicking polars broadcast handling of descending
+    if num_keys > (n := len(descending)) and n == 1:
+        descending = [descending[0]] * num_keys
+    column_order = [
+        plc.types.Order.DESCENDING if d else plc.types.Order.ASCENDING
+        for d in descending
+    ]
+    null_precedence = []
+    for asc in column_order:
+        if (asc == plc.types.Order.ASCENDING) ^ (not nulls_last):
+            null_precedence.append(plc.types.NullOrder.AFTER)
+        elif (asc == plc.types.Order.ASCENDING) ^ nulls_last:
+            null_precedence.append(plc.types.NullOrder.BEFORE)
+    return column_order, null_precedence
From 688d8ef9a8b79a66f710e7a2528cef92b44479b2 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 10 May 2024 13:24:37 +0000
Subject: [PATCH 08/56] WIP: more implementation

---
 .../cudf_polars/containers/dataframe.py       |  28 +++--
 python/cudf_polars/cudf_polars/dsl/ir.py      | 101 +++++++++++++++++-
 2 files changed, 117 insertions(+), 12 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index c29494debd5..fda4eb3617d 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -55,6 +55,16 @@ def __getitem__(self, name: str) -> Column | Scalar:
         else:
             return self.columns[i]
 
+    @cached_property
+    def column_names_set(self) -> set[str]:
+        """Return the column names as a set."""
+        return {c.name for c in self.columns}
+
     @cached_property
     def column_names(self) -> list[str]:
         """Return a list of the column names."""
@@ -87,28 +93,34 @@ def from_table(cls, table: plc.Table, names: list[str]) -> Self:
             raise ValueError("Mismatching name and table length.")
         return cls([Column(c, name) for c, name in zip(table.columns(), names)], [])
 
-    def with_sorted(self, *, like: DataFrame) -> Self:
+    def with_sorted(self, *, like: DataFrame, subset: set[str] | None = None) -> Self:
         """Copy sortedness from a dataframe onto self."""
         if like.column_names != self.column_names:
             raise ValueError("Can only copy from identically named frame")
+        subset = self.column_names_set if subset is None else subset
         self.columns = [
-            c.with_sorted(like=other) for c, other in zip(self.columns, like.columns)
+            c.with_sorted(like=other) if c.name in subset else c
+            for c, other in zip(self.columns, like.columns)
         ]
         return self
 
-    def with_columns(self, *columns: Column | Scalar) -> Self:
+    def with_columns(self, columns: list[Column]) -> Self:
         """
         Return a new dataframe with extra columns.
 
         Data is shared.
""" - cols = [c for c in columns if isinstance(c, Column)] - scalars = [c for c in columns if isinstance(c, Scalar)] - return type(self)([*self.columns, *cols], [*self.scalars, *scalars]) + return type(self)([*self.columns, *columns], self.scalars) def discard_columns(self, names: set[str]) -> Self: """Drop columns by name.""" - return type(self)([c for c in self.columns if c not in names], self.scalars) + return type(self)( + [c for c in self.columns if c.name not in names], self.scalars + ) + + def select(self, names: set[str]) -> Self: + """Select columns by name returning DataFrame.""" + return type(self)([c for c in self.columns if c.name in names], self.scalars) def replace_columns(self, *columns: Column) -> Self: """Return a new dataframe with columns replaced by name.""" diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index ce069e1ce5f..e0d794e0615 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -119,7 +119,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: order=plc.types.Order.ASCENDING, null_order=plc.types.null_order.AFTER, ) - df = df.with_columns(index) + df = df.with_columns([index]) if self.predicate is None: return df else: @@ -179,7 +179,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) for e in self.cse: - df = df.with_columns(e.evaluate(df)) + df = df.with_columns([e.evaluate(df)]) return DataFrame([e.evaluate(df) for e in self.expr], []) @@ -329,7 +329,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: right = right.rename_columns( {name: f"{name}{suffix}" for name in right.names if name in left.names} ) - result = left.with_columns(*right.columns) + result = left.with_columns(right.columns) return result.slice(zlice) @@ -341,7 +341,7 @@ class HStack(IR): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return df.with_columns(*(c.evaluate(df) for c in self.columns)) + return df.with_columns([c.evaluate(df) for c in self.columns]) @dataclass(slots=True) @@ -457,17 +457,32 @@ class Slice(IR): offset: int length: int + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return df.slice((self.offset, self.length)) + @dataclass(slots=True) class Filter(IR): df: IR mask: Expr + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return df.filter(self.mask.evaluate(df)) + @dataclass(slots=True) class Projection(IR): df: IR + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return df.select(set(self.schema.keys())) + @dataclass(slots=True) class MapFunction(IR): @@ -475,6 +490,84 @@ class MapFunction(IR): name: str options: Any + _NAMES: ClassVar[frozenset[str]] = frozenset( + [ + "drop_nulls", + "rechunk", + "merge_sorted", + "rename", + "explode", + ] + ) + + def __post_init__(self): + """Validate preconditions.""" + if self.name not in MapFunction._NAMES: + raise NotImplementedError(f"Unhandled map function {self.name}") + if self.name == "explode": + (to_explode,) = self.options + if len(to_explode) > 1: + # TODO: straightforward, but need to error check + # polars requires that all 
to-explode columns have the + # same sub-shapes + raise NotImplementedError("Explode with more than one column") + elif self.name == "merge_sorted": + assert isinstance(self.df, Union) + (key_column,) = self.options + if key_column not in self.df.dfs[0].schema: + raise ValueError(f"Key column {key_column} not found") + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + if self.name == "merge_sorted": + # merge_sorted operates on Union inputs + # but if we evaluate the Union then we can't unpick the + # pieces, so we dive inside and evaluate the pieces by hand + assert isinstance(self.df, Union) + first, *rest = (c.evaluate(cache=cache) for c in self.df.dfs) + (key_column,) = self.options + if not all(first.column_names == r.column_names for r in rest): + raise ValueError("DataFrame shapes/column names don't match") + # Already validated that key_column is in column names + index = first.column_names.index(key_column) + return DataFrame.from_table( + plc.merge.merge_sorted( + [first.table, *(df.table for df in rest)], + [index], + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.BEFORE], + ), + first.column_names, + ).with_sorted(like=first, subset={key_column}) + elif self.name == "rechunk": + # No-op in our data model + return self.df.evaluate(cache=cache) + elif self.name == "drop_nulls": + df = self.df.evaluate(cache=cache) + (subset,) = self.options + subset = set(subset) + indices = [i for i, name in enumerate(df.column_names) if name in subset] + return DataFrame.from_table( + plc.stream_compaction.drop_nulls(df.table, indices, len(indices)), + df.column_names, + ).with_sorted(like=df) + elif self.name == "rename": + df = self.df.evaluate(cache=cache) + # final tag is "swapping" which is useful for the + # optimiser (it blocks some pushdown operations) + old, new, _ = self.options + return df.rename_columns(dict(zip(old, new))) + elif self.name == "explode": + df = self.df.evaluate(cache=cache) + ((to_explode,),) = self.options + index = df.column_names.index(to_explode) + subset = df.column_names_set - {to_explode} + return DataFrame.from_table( + plc.lists.explode_outer(df.table, index), df.column_names + ).with_sorted(like=df, subset=subset) + else: + raise AssertionError("Should never be reached") + @dataclass(slots=True) class Union(IR): From f56525aff239ac3f097999436e92540272858a9d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 10 May 2024 13:42:01 +0000 Subject: [PATCH 09/56] WIP: simplify --- .../cudf_polars/containers/dataframe.py | 24 +++++--------- python/cudf_polars/cudf_polars/dsl/ir.py | 31 +++++++++++-------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index fda4eb3617d..8cd2943853e 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,7 +5,6 @@ from __future__ import annotations -import itertools from functools import cached_property from typing import TYPE_CHECKING @@ -14,6 +13,8 @@ from cudf_polars.containers.column import Column if TYPE_CHECKING: + from collections.abc import Sequence + from typing_extensions import Self import cudf @@ -30,14 +31,10 @@ class DataFrame: __slots__ = ("columns", "scalars", "names", "scalar_names", "table") columns: list[Column] scalars: list[Scalar] - names: dict[str, int] scalar_names: frozenset[str] table: plc.Table | None def __init__(self, columns: 
list[Column], scalars: list[Scalar]) -> None: - self.names = dict(zip((c.name for c in columns), itertools.count(0))) | dict( - zip((s.name for s in columns), itertools.count(0)) - ) self.scalar_names = frozenset(s.name for s in scalars) self.columns = columns self.scalars = scalars @@ -48,14 +45,6 @@ def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None: __iter__ = None - def __getitem__(self, name: str) -> Column | Scalar: - """Return column with given name.""" - i = self.names[name] - if name in self.scalar_names: - return self.scalars[i] - else: - return self.columns[i] - @cached_property def column_names_set(self) -> set[str]: """Return the column names as a set.""" @@ -104,7 +93,7 @@ def with_sorted(self, *, like: DataFrame, subset: set[str] | None = None) -> Sel ] return self - def with_columns(self, columns: list[Column]) -> Self: + def with_columns(self, columns: Sequence[Column]) -> Self: """ Return a new dataframe with extra columns. @@ -118,16 +107,17 @@ def discard_columns(self, names: set[str]) -> Self: [c for c in self.columns if c.name not in names], self.scalars ) - def select(self, names: set[str]) -> Self: + def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" - return type(self)([c for c in self.columns if c.name in names], self.scalars) + want = set(names) + return type(self)([c for c in self.columns if c.name in want], self.scalars) def replace_columns(self, *columns: Column) -> Self: """Return a new dataframe with columns replaced by name.""" new = {c.name: c for c in columns} if set(new).intersection(self.scalar_names): raise ValueError("Cannot replace scalars") - if not set(new).issubset(self.names): + if not set(new).issubset(self.column_names_set): raise ValueError("Cannot replace with non-existing names") return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e0d794e0615..a8147549b28 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -61,7 +61,7 @@ @dataclass(slots=True) class IR: - schema: dict + schema: dict[str, Any] def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -305,7 +305,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: left = left.replace_columns(*left_on.columns) right = right.replace_columns(*right_on.columns) if coalesce and how != "outer": - right = right.discard_columns(set(right_on.names)) + right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( plc.copying.gather(left.table, lg, left_policy), left.column_names ) @@ -320,14 +320,18 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: left_col.name, ) for left_col, right_col in zip( - left.select_columns(set(left_on.names)), - right.select_columns(set(right_on.names)), + left.select_columns(left_on.column_names_set), + right.select_columns(right_on.column_names_set), ) ) ) - right.discard_columns(set(right_on.names)) + right.discard_columns(right_on.column_names_set) right = right.rename_columns( - {name: f"{name}{suffix}" for name in right.names if name in left.names} + { + name: f"{name}{suffix}" + for name in right.column_names + if name in left.column_names_set + } ) result = left.with_columns(right.columns) return result.slice(zlice) @@ -374,7 +378,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: if self.subset is 
None: indices = list(range(df.num_columns)) else: - indices = [i for i, k in enumerate(df.names) if k in self.subset] + indices = [i for i, k in enumerate(df.column_names) if k in self.subset] keys_sorted = all(c.is_sorted for c in df.columns) if keys_sorted: table = plc.stream_compaction.unique( @@ -429,10 +433,11 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) sort_keys = [k.evaluate(df) for k in self.by] + names = {c.name: i for i, c in enumerate(df.columns)} keys_in_result = [ i for k in sort_keys - if (i := df.names.get(k.name)) is not None and k is df.columns[i] + if (i := names.get(k.name)) is not None and k is df.columns[i] ] table = self.do_sort( df.table, @@ -442,11 +447,11 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)] # If a sort key is in the result table, set the sortedness property - for idx in keys_in_result: - columns[idx] = columns[idx].set_sorted( + for i in keys_in_result: + columns[i] = columns[i].set_sorted( is_sorted=plc.types.Sorted.YES, - order=self.order[idx], - null_order=self.null_order[idx], + order=self.order[i], + null_order=self.null_order[i], ) return DataFrame(columns, []) @@ -481,7 +486,7 @@ class Projection(IR): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return df.select(set(self.schema.keys())) + return df.select(list(self.schema.keys())) @dataclass(slots=True) From 2cb6f5031d4724075ea5cdcfe2d20807794f048a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 10 May 2024 14:53:06 +0000 Subject: [PATCH 10/56] WIP: Maybe done with eval of plan nodes --- .../cudf_polars/containers/dataframe.py | 23 ++++++------ python/cudf_polars/cudf_polars/dsl/ir.py | 36 +++++++++++++++++++ .../cudf_polars/cudf_polars/dsl/translate.py | 4 ++- 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 8cd2943853e..c30f8c10ca2 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -13,7 +13,7 @@ from cudf_polars.containers.column import Column if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Mapping, Sequence, Set from typing_extensions import Self @@ -34,10 +34,10 @@ class DataFrame: scalar_names: frozenset[str] table: plc.Table | None - def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None: + def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None: self.scalar_names = frozenset(s.name for s in scalars) - self.columns = columns - self.scalars = scalars + self.columns = list(columns) + self.scalars = list(scalars) if len(scalars) == 0: self.table = plc.Table([c.obj for c in columns]) else: @@ -46,9 +46,9 @@ def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None: __iter__ = None @cached_property - def column_names_set(self) -> set[str]: + def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" - return {c.name for c in self.columns} + return frozenset(c.name for c in self.columns) @cached_property def column_names(self) -> list[str]: @@ -76,13 +76,14 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: ) @classmethod - def from_table(cls, table: plc.Table, 
names: list[str]) -> Self: + def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: """Create from a pylibcudf table.""" + # TODO: strict=True when we drop py39 if table.num_columns != len(names): raise ValueError("Mismatching name and table length.") return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) - def with_sorted(self, *, like: DataFrame, subset: set[str] | None = None) -> Self: + def with_sorted(self, *, like: DataFrame, subset: Set[str] | None = None) -> Self: """Copy sortedness from a dataframe onto self.""" if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") @@ -101,7 +102,7 @@ def with_columns(self, columns: Sequence[Column]) -> Self: """ return type(self)([*self.columns, *columns], self.scalars) - def discard_columns(self, names: set[str]) -> Self: + def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" return type(self)( [c for c in self.columns if c.name not in names], self.scalars @@ -121,13 +122,13 @@ def replace_columns(self, *columns: Column) -> Self: raise ValueError("Cannot replace with non-existing names") return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) - def rename_columns(self, mapping: dict[str, str]) -> Self: + def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" return type(self)( [c.rename(mapping.get(c.name, c.name)) for c in self.columns], self.scalars ) - def select_columns(self, names: set[str]) -> list[Column]: + def select_columns(self, names: Set[str]) -> list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index a8147549b28..078ad3e884c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -486,6 +486,7 @@ class Projection(IR): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) + # This can reorder things. return df.select(list(self.schema.keys())) @@ -577,14 +578,49 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Union(IR): dfs: list[IR] + zlice: tuple[int, int] | None + + def __post_init__(self): + """Validated preconditions.""" + schema = self.dfs[0].schema + if not all(s == schema for s in self.dfs[1:]): + raise ValueError("Schema mismatch") + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + dfs = [df.evaluate(cache=cache) for df in self.dfs] + return DataFrame.from_table( + plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names + ).slice(self.zlice) @dataclass(slots=True) class HConcat(IR): dfs: list[IR] + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + dfs = [df.evaluate(cache=cache) for df in self.dfs] + columns, scalars = zip(*((df.columns, df.scalars) for df in dfs)) + return DataFrame(columns, scalars) + @dataclass(slots=True) class ExtContext(IR): df: IR extra: list[IR] + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + # TODO: polars optimizer doesn't do projection pushdown + # through extcontext AFAICT. + df = self.df.evaluate(cache=cache) + # extra contexts are added in order, if they have any + # overlapping column names, those are ignored. 
+ names = df.column_names_set.copy() + # TODO: scalars + for ir in self.extra: + extra = ir.evaluate(cache=cache).discard_columns(names) + names |= extra.column_names_set + df = df.with_columns(extra.columns) + return df diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 9d6020ee6f4..ff634948663 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -157,7 +157,9 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: options, ) elif isinstance(node, pl_ir.Union): - return ir.Union(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + return ir.Union( + schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + ) elif isinstance(node, pl_ir.HConcat): return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) elif isinstance(node, pl_ir.ExtContext): From c3e0a9207de94df026655234bbe0b3f7373b4ed6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 13 May 2024 14:52:06 +0000 Subject: [PATCH 11/56] WIP: expression evaluation --- .../cudf_polars/containers/dataframe.py | 6 +- .../cudf_polars/containers/scalar.py | 4 +- python/cudf_polars/cudf_polars/dsl/expr.py | 169 +++++++++++++++++- python/cudf_polars/cudf_polars/dsl/ir.py | 14 +- .../cudf_polars/cudf_polars/dsl/translate.py | 17 +- 5 files changed, 187 insertions(+), 23 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index c30f8c10ca2..2c05cee9dea 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -28,15 +28,13 @@ class DataFrame: """A representation of a dataframe.""" - __slots__ = ("columns", "scalars", "names", "scalar_names", "table") columns: list[Column] scalars: list[Scalar] - scalar_names: frozenset[str] table: plc.Table | None def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None: - self.scalar_names = frozenset(s.name for s in scalars) self.columns = list(columns) + self._column_map = {c.name: c for c in self.columns} self.scalars = list(scalars) if len(scalars) == 0: self.table = plc.Table([c.obj for c in columns]) @@ -116,8 +114,6 @@ def select(self, names: Sequence[str]) -> Self: def replace_columns(self, *columns: Column) -> Self: """Return a new dataframe with columns replaced by name.""" new = {c.name: c for c in columns} - if set(new).intersection(self.scalar_names): - raise ValueError("Cannot replace scalars") if not set(new).issubset(self.column_names_set): raise ValueError("Cannot replace with non-existing names") return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py index a9b59a3218c..fc97d0fd9c2 100644 --- a/python/cudf_polars/cudf_polars/containers/scalar.py +++ b/python/cudf_polars/cudf_polars/containers/scalar.py @@ -18,8 +18,6 @@ class Scalar: __slots__ = ("obj", "name") obj: plc.Scalar - name: str - def __init__(self, scalar: plc.Column, name: str): + def __init__(self, scalar: plc.Scalar): self.obj = scalar - self.name = name diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 3ec0223b7a2..376651f4124 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -15,11 +15,22 @@ from __future__ 
import annotations +import enum from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from enum import IntEnum +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa + +from polars.polars import _expr_nodes as pl_expr + +import cudf._lib.pylibcudf as plc + +from cudf_polars.containers import Column, Scalar +from cudf_polars.utils import sorting if TYPE_CHECKING: - from cudf_polars.containers import Column, DataFrame + from cudf_polars.containers import DataFrame __all__ = [ "Expr", @@ -38,10 +49,18 @@ ] +class ExecutionContext(IntEnum): + FRAME = enum.auto() + GROUPBY = enum.auto() + ROLLING = enum.auto() + + @dataclass(slots=True) class Expr: # TODO: return type is a lie for Literal - def evaluate(self, context: DataFrame) -> Column: + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: """Evaluate this expression given a dataframe for context.""" raise NotImplementedError @@ -51,21 +70,45 @@ class NamedExpr(Expr): name: str value: Expr + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + return Column(self.value.evaluate(df, context=context), self.name) + @dataclass(slots=True) class Literal(Expr): - dtype: Any + dtype: plc.Datatype value: Any + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + obj = plc.interop.from_arrow(pa.scalar(self.value), data_type=self.dtype) + return Scalar(obj) # type: ignore + @dataclass(slots=True) class Col(Expr): name: str + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + return df._column_map[self.name] + @dataclass(slots=True) class Len(Expr): - pass + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # TODO: type is wrong + return df.num_rows @dataclass(slots=True) @@ -80,12 +123,43 @@ class Sort(Expr): column: Expr options: Any + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column = self.column.evaluate(df, context=context) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + [descending], nulls_last=nulls_last, num_keys=1 + ) + do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort + table = do_sort(plc.Table([column], order, null_order)) + return Column(table.columns()[0], column.name).set_sorted( + is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0] + ) + @dataclass(slots=True) class SortBy(Expr): column: Expr by: list[Expr] - descending: list[bool] + options: Any + + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column = self.column.evaluate(df, context=context) + by = [b.evaluate(df, context=context) for b in self.by] + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(self.by) + ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table 
= do_sort( + plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order + ) + return Column(table.columns()[0], column.name) @dataclass(slots=True) @@ -93,12 +167,47 @@ class Gather(Expr): values: Expr indices: Expr + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values = self.values.evaluate(df, context=context) + indices = self.indices.evaluate(df, context=context) + lo, hi = plc.reduce.minmax(indices.obj) + lo = plc.interop.to_arrow(lo).as_py() + hi = plc.interop.to_arrow(hi).as_py() + n = df.num_rows + if hi >= n or lo < -n: + raise ValueError("gather indices are out of bounds") + if indices.obj.null_count(): + bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY + obj = plc.replace.replace_nulls( + indices.obj, + plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()), + ) + else: + bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK + obj = indices.obj + table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) + return Column(table.columns()[0], values.name) + @dataclass(slots=True) class Filter(Expr): values: Expr mask: Expr + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values = self.values.evaluate(df, context=context) + mask = self.mask.evaluate(df, context=context) + table = plc.stream_compaction.apply_boolean_mask( + plc.Table([values.obj]), mask.obj + ) + return Column(table.columns()[0], values.name).with_sorted(like=values) + @dataclass(slots=True) class Window(Expr): @@ -109,9 +218,18 @@ class Window(Expr): @dataclass(slots=True) class Cast(Expr): - dtype: Any + dtype: plc.DataType column: Expr + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column = self.column.evaluate(df, context=context) + return Column(plc.unary.cast(column, self.dtype), column.name).with_sorted( + like=column + ) + @dataclass(slots=True) class Agg(Expr): @@ -124,4 +242,39 @@ class Agg(Expr): class BinOp(Expr): left: Expr right: Expr - op: Any + op: plc.binaryop.BinaryOperator + dtype: plc.DataType + + _MAPPING: ClassVar[dict[pl_expr.PyOperator, plc.binaryop.BinaryOperator]] = { + pl_expr.PyOperator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.PyOperator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.PyOperator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.PyOperator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.PyOperator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.PyOperator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.PyOperator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.PyOperator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.PyOperator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.PyOperator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.PyOperator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.PyOperator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.PyOperator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.PyOperator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.PyOperator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.PyOperator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.PyOperator.Or: 
plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.PyOperator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.PyOperator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.PyOperator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + } + + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + left = self.left.evaluate(df, context=context) + right = self.right.evaluate(df, context=context) + return Column( + plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), + left.name, + ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 078ad3e884c..70e7d20bd22 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -29,7 +29,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame -from cudf_polars.utils import dtypes, sorting +from cudf_polars.utils import sorting if TYPE_CHECKING: from typing import Literal @@ -61,7 +61,7 @@ @dataclass(slots=True) class IR: - schema: dict[str, Any] + schema: dict[str, plc.DataType] def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -109,7 +109,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: assert_never(self.typ) if row_index is not None: name, offset = row_index - dtype = dtypes.from_polars(self.schema[name]) + dtype = self.schema[name] step = plc.interop.from_arrow(pa.scalar(1), data_type=dtype) init = plc.interop.from_arrow(pa.scalar(offset), data_type=dtype) index = Column( @@ -120,6 +120,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: null_order=plc.types.null_order.AFTER, ) df = df.with_columns([index]) + assert all( + c.obj.data_type() == dtype + for c, dtype in zip(df.columns, self.schema.values()) + ) if self.predicate is None: return df else: @@ -162,6 +166,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: df = DataFrame.from_table( plc.interop.from_arrow(table), list(self.schema.keys()) ) + assert all( + c.obj.data_type() == dtype + for c, dtype in zip(df.columns, self.schema.values()) + ) if self.predicate is not None: mask = self.predicate.evaluate(df) return df.filter(mask) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index ff634948663..95f705199d4 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -11,6 +11,7 @@ from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir +from cudf_polars.utils import dtypes __all__ = ["translate_ir", "translate_expr"] @@ -62,7 +63,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: ) with ctx: node = visitor.view_current_node() - schema = visitor.get_schema() + schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} if isinstance(node, pl_ir.PythonScan): return ir.PythonScan( schema, @@ -222,7 +223,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: node.options, ) elif isinstance(node, pl_expr.Literal): - return expr.Literal(node.dtype, node.value) + return expr.Literal(dtypes.from_polars(node.dtype), node.value) elif isinstance(node, pl_expr.Sort): # TODO: raise in groupby return expr.Sort(translate_expr(visitor, n=node.expr), node.options) @@ -244,7 +245,13 
@@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: translate_expr(visitor, n=node.by), ) elif isinstance(node, pl_expr.Cast): - return expr.Cast(node.dtype, translate_expr(visitor, n=node.expr)) + inner = translate_expr(visitor, n=node.expr) + # Push casts into literals so we can handle Cast(Literal(Null)) + dtype = dtypes.from_polars(node.dtype) + if isinstance(inner, expr.Literal): + return expr.Literal(dtype, inner.value) + else: + return expr.Cast(dtype, inner) elif isinstance(node, pl_expr.Column): return expr.Col(node.name) elif isinstance(node, pl_expr.Agg): @@ -257,7 +264,9 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: return expr.BinOp( translate_expr(visitor, n=node.left), translate_expr(visitor, n=node.right), - node.op, + expr.BinOp._MAPPING[node.op], + # TODO: Should lay dtype onto every node, but visitor.get_dtype is O(n) not O(1) + dtypes.from_polars(visitor.get_dtype(n)), ) elif isinstance(node, pl_expr.Len): return expr.Len() From ec4562c26f3812cbc46a5d28c56243d5d06bd69f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 13 May 2024 17:02:17 +0000 Subject: [PATCH 12/56] WIP: some more --- python/cudf_polars/cudf_polars/dsl/expr.py | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 376651f4124..1f9488f4884 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -30,6 +30,8 @@ from cudf_polars.utils import sorting if TYPE_CHECKING: + from typing import Callable + from cudf_polars.containers import DataFrame __all__ = [ @@ -237,6 +239,31 @@ class Agg(Expr): name: str options: Any + _MAPPING: ClassVar[dict[str, Callable[..., plc.aggregation.Aggregation]]] = { + "min": plc.aggregation.min, + "max": plc.aggregation.max, + "median": plc.aggregation.median, + "nunique": plc.aggregation.nunique, + "first": lambda: plc.aggregation.nth_element(0), + "last": lambda: plc.aggregation.nth_element(-1), # TODO: check + "mean": plc.aggregation.mean, + "sum": plc.aggregation.sum, + "count": lambda include_null: plc.aggregation.count( + plc.types.NullPolicy.INCLUDE + if include_null + else plc.types.NullPolicy.EXCLUDE + ), + "std": plc.aggregation.std, + "var": plc.aggregation.var, + "agg_groups": lambda: None, + } + + def evaluate( + self, df, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + raise NotImplementedError("Agg") + @dataclass(slots=True) class BinOp(Expr): From f21cd5707e3ee1c6bc4a7747d307d6b72c0d5a41 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 14 May 2024 13:06:04 +0000 Subject: [PATCH 13/56] WIP: some agg expr stuff --- .../cudf_polars/containers/column.py | 25 +++++++++ python/cudf_polars/cudf_polars/dsl/expr.py | 52 ++++++++++++++++++- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index e34a1a7726e..9c3b2114602 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -5,6 +5,7 @@ from __future__ import annotations +import functools from typing import TYPE_CHECKING import cudf._lib.pylibcudf as plc @@ -67,3 +68,27 @@ def set_sorted( self.order = order self.null_order = null_order return self + + def copy(self) -> Self: + """Return a shallow copy of the 
column.""" + return type(self)(self.obj, self.name).with_sorted(like=self) + + def mask_nans(self) -> Self: + """Return a copy of self with nans masked out.""" + if self.nan_count > 0: + raise NotImplementedError + else: + return self.copy() + + @functools.cached_property + def nan_count(self) -> int: + """Return the number of NaN values in the column.""" + if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): + return 0 + else: + return plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 1f9488f4884..a3b14f79368 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -254,10 +254,60 @@ class Agg(Expr): else plc.types.NullPolicy.EXCLUDE ), "std": plc.aggregation.std, - "var": plc.aggregation.var, + "var": plc.aggregation.variance, "agg_groups": lambda: None, } + def _min(self, column: Column, *, propagate_nans: bool) -> plc.Column: + if propagate_nans and column.nan_count > 0: + return plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=column.obj.type() + ), + 1, + ) + if column.nan_count > 0: + column = column.mask_nans() + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.min(), column.obj.type()), 1 + ) + + def _max(self, column: Column, *, propagate_nans: bool) -> plc.Column: + if propagate_nans and column.nan_count > 0: + return plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=column.obj.type() + ), + 1, + ) + if column.nan_count > 0: + column = column.mask_nans() + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.max(), column.obj.type()), 1 + ) + + def _median(self, column: Column) -> plc.Column: + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.median(), column.obj.type()), + 1, + ) + + def _first(self, column: Column) -> plc.Column: + return plc.copying.slice(column.obj, [0, 1])[0] + + def _last(self, column: Column) -> plc.Column: + n = column.obj.size() + return plc.copying.slice(column.obj, [n - 1, n])[0] + + def _mean(self, column: Column) -> plc.Column: + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.mean(), column.obj.type()), + 1, + ) + + def _nunique(self, column: Column) -> Column: + return plc.Col + def evaluate( self, df, *, context: ExecutionContext = ExecutionContext.FRAME ) -> Column: From 1f5a49031a83bf4d5a3fd96a7860e7dfc2cd3844 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 14 May 2024 14:04:23 +0000 Subject: [PATCH 14/56] Bla --- python/cudf_polars/cudf_polars/dsl/expr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index a3b14f79368..7d9beb202aa 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -306,7 +306,10 @@ def _mean(self, column: Column) -> plc.Column: ) def _nunique(self, column: Column) -> Column: - return plc.Col + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE), ), + 1, + ) def evaluate( self, df, *, context: ExecutionContext = ExecutionContext.FRAME From 31a3d5eefe0192536fd005b24060b0a9201ed3f6 Mon 
Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 15 May 2024 17:00:52 +0000 Subject: [PATCH 15/56] More fixes --- python/cudf_polars/cudf_polars/callback.py | 50 +++++ .../cudf_polars/containers/column.py | 2 + .../cudf_polars/containers/dataframe.py | 14 +- python/cudf_polars/cudf_polars/dsl/expr.py | 194 +++++++++++++----- python/cudf_polars/cudf_polars/dsl/ir.py | 14 +- .../cudf_polars/cudf_polars/dsl/translate.py | 86 ++++---- 6 files changed, 258 insertions(+), 102 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/callback.py diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py new file mode 100644 index 00000000000..4d7b63cd705 --- /dev/null +++ b/python/cudf_polars/cudf_polars/callback.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Callback for the polars collect function to execute on device.""" + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING + +from cudf_polars.dsl.translate import translate_ir + +if TYPE_CHECKING: + import polars as pl + + from cudf_polars.dsl.ir import IR + +__all__: list[str] = ["execute_with_cudf"] + + +def _callback( + ir: IR, + with_columns: list[str] | None, + pyarrow_predicate: str | None, + n_rows: int | None, +) -> pl.DataFrame: + assert with_columns is None + assert pyarrow_predicate is None + assert n_rows is None + return ir.evaluate(cache={}).to_polars() + + +def execute_with_cudf(nt) -> None: + """ + A post optimization callback that attempts to execute the plan with cudf. + + Parameters + ---------- + nt + NodeTraverser + + The NodeTraverser is mutated if the libcudf executor can handle the plan. 
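+
+    A sketch of intended use (assuming polars exposes this hook as a
+    ``post_opt_callback`` keyword to ``collect``; ``q`` is a LazyFrame):
+
+    >>> q.collect(post_opt_callback=execute_with_cudf)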
+ """ + try: + callback = partial(_callback, translate_ir(nt)) + except NotImplementedError: + return + + nt.set_udf(callback) + return diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 9c3b2114602..a853680b18b 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -30,6 +30,8 @@ def __init__(self, column: plc.Column, name: str): self.obj = column self.name = name self.is_sorted = plc.types.Sorted.NO + self.order = plc.types.Order.ASCENDING + self.null_order = plc.types.NullOrder.BEFORE def rename(self, name: str) -> Column: """Return a new column sharing data with a new name.""" diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 2c05cee9dea..e5dd757690a 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -8,6 +8,8 @@ from functools import cached_property from typing import TYPE_CHECKING +import polars as pl + import cudf._lib.pylibcudf as plc from cudf_polars.containers.column import Column @@ -43,6 +45,16 @@ def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None __iter__ = None + def to_polars(self) -> pl.DataFrame: + """Convert to a polars DataFrame.""" + assert len(self.scalars) == 0 + return pl.from_arrow( + plc.interop.to_arrow( + self.table, + [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + ) + ) + @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" @@ -77,7 +89,7 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: """Create from a pylibcudf table.""" # TODO: strict=True when we drop py39 - if table.num_columns != len(names): + if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 7d9beb202aa..86510ee4894 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -18,6 +18,7 @@ import enum from dataclasses import dataclass from enum import IntEnum +from functools import partial from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa @@ -59,6 +60,8 @@ class ExecutionContext(IntEnum): @dataclass(slots=True) class Expr: + dtype: plc.DataType + # TODO: return type is a lie for Literal def evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME @@ -76,12 +79,11 @@ def evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME ) -> Column: """Evaluate this expression given a dataframe for context.""" - return Column(self.value.evaluate(df, context=context), self.name) + return Column(self.value.evaluate(df, context=context).obj, self.name) @dataclass(slots=True) class Literal(Expr): - dtype: plc.Datatype value: Any def evaluate( @@ -135,7 +137,7 @@ def evaluate( [descending], nulls_last=nulls_last, num_keys=1 ) do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort - table = do_sort(plc.Table([column], order, null_order)) + table = do_sort(plc.Table([column.obj]), order, null_order) return Column(table.columns()[0], column.name).set_sorted( is_sorted=plc.types.Sorted.YES, 
order=order[0], null_order=null_order[0] ) @@ -228,7 +230,7 @@ def evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" column = self.column.evaluate(df, context=context) - return Column(plc.unary.cast(column, self.dtype), column.name).with_sorted( + return Column(plc.unary.cast(column.obj, self.dtype), column.name).with_sorted( like=column ) @@ -236,86 +238,171 @@ def evaluate( @dataclass(slots=True) class Agg(Expr): column: Expr + op: Callable[..., plc.Column] name: str - options: Any - _MAPPING: ClassVar[dict[str, Callable[..., plc.aggregation.Aggregation]]] = { - "min": plc.aggregation.min, - "max": plc.aggregation.max, - "median": plc.aggregation.median, - "nunique": plc.aggregation.nunique, - "first": lambda: plc.aggregation.nth_element(0), - "last": lambda: plc.aggregation.nth_element(-1), # TODO: check - "mean": plc.aggregation.mean, - "sum": plc.aggregation.sum, - "count": lambda include_null: plc.aggregation.count( - plc.types.NullPolicy.INCLUDE - if include_null - else plc.types.NullPolicy.EXCLUDE - ), - "std": plc.aggregation.std, - "var": plc.aggregation.variance, - "agg_groups": lambda: None, - } + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "nunique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + "agg_groups", + ] + ) + + def __init__( + self, dtype: plc.DataType, column: Expr, name: str, options: Any + ) -> None: + if name not in Agg._SUPPORTED: + raise NotImplementedError(f"Unsupported aggregation {name}") + self.dtype = dtype + self.column = column + self.name = name + op = getattr(self, f"_{name}") + if name in {"min", "max"}: + op = partial(op, propagate_nans=options) + elif name in {"std", "var"}: + op = partial(op, ddof=options) + self.op = op + + def _std(self, column: Column, *, ddof: int) -> Column: + # TODO: handle nans + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, plc.aggregation.std(ddof=ddof), self.dtype + ), + 1, + ), + column.name, + ) - def _min(self, column: Column, *, propagate_nans: bool) -> plc.Column: - if propagate_nans and column.nan_count > 0: - return plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=column.obj.type() + def _var(self, column: Column, *, ddof: int) -> Column: + # TODO: handle nans + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, plc.aggregation.variance(ddof=ddof), self.dtype ), 1, + ), + column.name, + ) + + def _sum(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.sum(), self.dtype), 1 + ), + column.name, + ) + + def _count(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, + plc.aggregation.count(plc.types.NullPolicy.EXCLUDE), + self.dtype, + ), + 1, + ), + column.name, + ) + + def _min(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=self.dtype + ), + 1, + ), + column.name, ) if column.nan_count > 0: column = column.mask_nans() - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.min(), column.obj.type()), 1 + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.min(), self.dtype), 1 + ), + column.name, ) - def _max(self, column: Column, *, propagate_nans: bool) -> plc.Column: + def 
_max(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: - return plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=column.obj.type() + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=self.dtype + ), + 1, ), - 1, + column.name, ) if column.nan_count > 0: column = column.mask_nans() - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.max(), column.obj.type()), 1 + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.max(), self.dtype), 1 + ), + column.name, ) - def _median(self, column: Column) -> plc.Column: - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.median(), column.obj.type()), - 1, + def _median(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.median(), self.dtype), + 1, + ), + column.name, ) - def _first(self, column: Column) -> plc.Column: - return plc.copying.slice(column.obj, [0, 1])[0] + def _first(self, column: Column) -> Column: + return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name) - def _last(self, column: Column) -> plc.Column: + def _last(self, column: Column) -> Column: n = column.obj.size() - return plc.copying.slice(column.obj, [n - 1, n])[0] + return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) - def _mean(self, column: Column) -> plc.Column: - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.mean(), column.obj.type()), - 1, + def _mean(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.mean(), self.dtype), + 1, + ), + column.name, ) def _nunique(self, column: Column) -> Column: - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE), ), - 1, + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, + plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE), + self.dtype, + ), + 1, + ), + column.name, ) def evaluate( self, df, *, context: ExecutionContext = ExecutionContext.FRAME ) -> Column: """Evaluate this expression given a dataframe for context.""" - raise NotImplementedError("Agg") + if context is not ExecutionContext.FRAME: + raise NotImplementedError(f"Agg in context {context}") + return self.op(self.column.evaluate(df, context=context)) @dataclass(slots=True) @@ -323,7 +410,6 @@ class BinOp(Expr): left: Expr right: Expr op: plc.binaryop.BinaryOperator - dtype: plc.DataType _MAPPING: ClassVar[dict[pl_expr.PyOperator, plc.binaryop.BinaryOperator]] = { pl_expr.PyOperator.Eq: plc.binaryop.BinaryOperator.EQUAL, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 70e7d20bd22..ab49fecff25 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -120,10 +120,13 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: null_order=plc.types.null_order.AFTER, ) df = df.with_columns([index]) - assert all( - c.obj.data_type() == dtype - for c, dtype in zip(df.columns, self.schema.values()) - ) + # TODO: should be true, but not the case until we get + # cudf-classic out of the loop for IO since it converts date32 + # to datetime. 
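+        # (Re-enable the commented-out check below once that is fixed.)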
+        # assert all(
+        #     c.obj.type() == dtype
+        #     for c, dtype in zip(df.columns, self.schema.values())
+        # )
         if self.predicate is None:
             return df
         else:
@@ -167,8 +170,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             plc.interop.from_arrow(table), list(self.schema.keys())
         )
         assert all(
-            c.obj.data_type() == dtype
-            for c, dtype in zip(df.columns, self.schema.values())
+            c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values())
         )
         if self.predicate is not None:
             mask = self.predicate.evaluate(df)
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 95f705199d4..fe7902fdcc0 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -94,35 +94,36 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
             else None,
         )
     elif isinstance(node, pl_ir.Select):
-        return ir.Select(
-            schema,
-            translate_ir(visitor, n=node.input),
-            [translate_expr(visitor, n=e) for e in node.cse_expr],
-            [translate_expr(visitor, n=e) for e in node.expr],
-        )
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr]
+            exprs = [translate_expr(visitor, n=e) for e in node.expr]
+        return ir.Select(schema, inp, cse_exprs, exprs)
     elif isinstance(node, pl_ir.GroupBy):
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            aggs = [translate_expr(visitor, n=e) for e in node.aggs]
+            keys = [translate_expr(visitor, n=e) for e in node.keys]
         return ir.GroupBy(
             schema,
-            translate_ir(visitor, n=node.input),
-            [translate_expr(visitor, n=e) for e in node.aggs],
-            [translate_expr(visitor, n=e) for e in node.keys],
+            inp,
+            aggs,
+            keys,
             node.options,
         )
     elif isinstance(node, pl_ir.Join):
-        return ir.Join(
-            schema,
-            translate_ir(visitor, n=node.input_left),
-            translate_ir(visitor, n=node.input_right),
-            [translate_expr(visitor, n=e) for e in node.left_on],
-            [translate_expr(visitor, n=e) for e in node.right_on],
-            node.options,
-        )
+        with set_node(visitor, node.input_left):
+            inp_left = translate_ir(visitor, n=None)
+            left_on = [translate_expr(visitor, n=e) for e in node.left_on]
+        with set_node(visitor, node.input_right):
+            inp_right = translate_ir(visitor, n=None)
+            right_on = [translate_expr(visitor, n=e) for e in node.right_on]
+        return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options)
     elif isinstance(node, pl_ir.HStack):
-        return ir.HStack(
-            schema,
-            translate_ir(visitor, n=node.input),
-            [translate_expr(visitor, n=e) for e in node.exprs],
-        )
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            exprs = [translate_expr(visitor, n=e) for e in node.exprs]
+        return ir.HStack(schema, inp, exprs)
     elif isinstance(node, pl_ir.Distinct):
         return ir.Distinct(
             schema,
             translate_ir(visitor, n=node.input),
             node.options,
         )
     elif isinstance(node, pl_ir.Sort):
-        return ir.Sort(
-            schema,
-            translate_ir(visitor, n=node.input),
-            [translate_expr(visitor, n=e) for e in node.by_column],
-            node.sort_options,
-        )
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            by = [translate_expr(visitor, n=e) for e in node.by_column]
+        return ir.Sort(schema, inp, by, node.sort_options)
     elif isinstance(node, pl_ir.Slice):
         return ir.Slice(
             schema, translate_ir(visitor, n=node.input), node.offset, node.len
         )
     elif isinstance(node, pl_ir.Filter):
-        return ir.Filter(
-            schema,
-            translate_ir(visitor, n=node.input),
-            translate_expr(visitor, n=node.predicate),
-        )
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            mask = translate_expr(visitor, n=node.predicate)
+        return ir.Filter(schema, inp, mask)
     elif isinstance(node, pl_ir.SimpleProjection):
         return ir.Projection(schema, translate_ir(visitor, n=node.input))
     elif isinstance(node, pl_ir.MapFunction):
@@ -201,12 +199,15 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr:
     if isinstance(n, pl_expr.PyExprIR):
         # TODO: type narrowing didn't work because PyExprIR is Unknown
         assert not isinstance(n, int)
-        return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node))
+        e = translate_expr(visitor, n=n.node)
+        return expr.NamedExpr(e.dtype, n.output_name, e)
     node = visitor.view_expression(n)
+    dtype = dtypes.from_polars(visitor.get_dtype(n))
     if isinstance(node, pl_expr.Function):
         name, *options = node.function_data
         if name in BOOLEAN_FUNCTIONS:
             return expr.BooleanFunction(
+                dtype,
                 name,
                 options,
                 [translate_expr(visitor, n=n) for n in node.input],
             )
     elif isinstance(node, pl_expr.Window):
         # TODO: raise in groupby?
         return expr.Window(
+            dtype,
             translate_expr(visitor, n=node.function),
             [translate_expr(visitor, n=n) for n in node.partition_by]
             if node.partition_by is not None
             else None,
             node.options,
         )
     elif isinstance(node, pl_expr.Literal):
-        return expr.Literal(dtypes.from_polars(node.dtype), node.value)
+        return expr.Literal(dtype, node.value)
     elif isinstance(node, pl_expr.Sort):
         # TODO: raise in groupby
-        return expr.Sort(translate_expr(visitor, n=node.expr), node.options)
+        return expr.Sort(dtype, translate_expr(visitor, n=node.expr), node.options)
     elif isinstance(node, pl_expr.SortBy):
         # TODO: raise in groupby
         return expr.SortBy(
+            dtype,
             translate_expr(visitor, n=node.expr),
             [translate_expr(visitor, n=n) for n in node.by],
             node.descending,
         )
     elif isinstance(node, pl_expr.Gather):
         return expr.Gather(
+            dtype,
             translate_expr(visitor, n=node.expr),
             translate_expr(visitor, n=node.idx),
         )
     elif isinstance(node, pl_expr.Filter):
         return expr.Filter(
+            dtype,
             translate_expr(visitor, n=node.input),
             translate_expr(visitor, n=node.by),
         )
     elif isinstance(node, pl_expr.Cast):
         inner = translate_expr(visitor, n=node.expr)
         # Push casts into literals so we can handle Cast(Literal(Null))
-        dtype = dtypes.from_polars(node.dtype)
         if isinstance(inner, expr.Literal):
             return expr.Literal(dtype, inner.value)
         else:
             return expr.Cast(dtype, inner)
     elif isinstance(node, pl_expr.Column):
-        return expr.Col(node.name)
+        return expr.Col(dtype, node.name)
     elif isinstance(node, pl_expr.Agg):
         return expr.Agg(
+            dtype,
             translate_expr(visitor, n=node.arguments),
             node.name,
             node.options,
         )
     elif isinstance(node, pl_expr.BinaryExpr):
         return expr.BinOp(
+            dtype,
             translate_expr(visitor, n=node.left),
             translate_expr(visitor, n=node.right),
             expr.BinOp._MAPPING[node.op],
-            # TODO: Should lay dtype onto every node, but visitor.get_dtype is O(n) not O(1)
-            dtypes.from_polars(visitor.get_dtype(n)),
         )
     elif isinstance(node, pl_expr.Len):
-        return expr.Len()
+        return expr.Len(dtype)
     else:
         raise NotImplementedError(f"No handler for expression node with {type(node)=}")

From cda34e05d0623017e4fac885f6e38fca9bcb1d71 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 16 May 2024 17:40:47 +0000
Subject: [PATCH
16/56] WIP: More working --- python/cudf_polars/cudf_polars/callback.py | 9 +- .../cudf_polars/containers/column.py | 1 - python/cudf_polars/cudf_polars/dsl/expr.py | 381 ++++++++++++------ python/cudf_polars/cudf_polars/dsl/ir.py | 112 ++++- .../cudf_polars/cudf_polars/dsl/translate.py | 28 +- .../cudf_polars/cudf_polars/utils/dtypes.py | 3 +- .../cudf_polars/cudf_polars/utils/sorting.py | 7 +- 7 files changed, 391 insertions(+), 150 deletions(-) diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 4d7b63cd705..b598e1442ce 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -27,7 +27,11 @@ def _callback( assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - return ir.evaluate(cache={}).to_polars() + try: + return ir.evaluate(cache={}).to_polars() + except Exception as e: + print("Unable to evaluate", e) + raise def execute_with_cudf(nt) -> None: @@ -43,7 +47,8 @@ def execute_with_cudf(nt) -> None: """ try: callback = partial(_callback, translate_ir(nt)) - except NotImplementedError: + except NotImplementedError as e: + print("Unable to translate", e) return nt.set_udf(callback) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index a853680b18b..73db1c34b48 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -19,7 +19,6 @@ class Column: """A column, a name, and sortedness.""" - __slots__ = ("obj", "name", "is_sorted", "order", "null_order") obj: plc.Column name: str is_sorted: plc.types.Sorted diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 86510ee4894..41df85dcb73 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -19,7 +19,7 @@ from dataclasses import dataclass from enum import IntEnum from functools import partial -from typing import TYPE_CHECKING, Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa @@ -58,80 +58,161 @@ class ExecutionContext(IntEnum): ROLLING = enum.auto() -@dataclass(slots=True) +class AggInfo(NamedTuple): + requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] + + +@dataclass(slots=True, unsafe_hash=True) class Expr: dtype: plc.DataType # TODO: return type is a lie for Literal def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" raise NotImplementedError + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + raise NotImplementedError -@dataclass(slots=True) + +def with_mapping(fn): + """Decorate a callback that takes an expression mapping to use it.""" + + def look( + self, + df: DataFrame, + *, + context=ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ): + """Look up the self in the mapping before evaluating it.""" + if mapping is None: + return fn(self, df, context=context, mapping=mapping) + else: + try: + return mapping[self] + except KeyError: + return fn(self, df, context=context, mapping=mapping) + + return look + + +@dataclass(slots=True, unsafe_hash=True) class NamedExpr(Expr): name: str 
value: Expr + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - return Column(self.value.evaluate(df, context=context).obj, self.name) + return Column( + self.value.evaluate(df, context=context, mapping=mapping).obj, self.name + ) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return self.value.collect_agg(depth=depth) -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) # TODO: won't work for list literals class Literal(Expr): value: Any + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - obj = plc.interop.from_arrow(pa.scalar(self.value), data_type=self.dtype) + # TODO: obey dtype + obj = plc.interop.from_arrow(pa.scalar(self.value)) return Scalar(obj) # type: ignore + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + raise NotImplementedError("Literal in groupby") -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) class Col(Expr): name: str + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" return df._column_map[self.name] + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([(self, plc.aggregation.collect_list(), self)]) -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) class Len(Expr): + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - # TODO: type is wrong + # TODO: type is wrong, and dtype return df.num_rows + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: polars returns a uint, not an int for count + return AggInfo( + [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] + ) -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) class BooleanFunction(Expr): name: str options: Any - arguments: list[Expr] + arguments: tuple[Expr, ...] 
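# A standalone illustration (names are mine) of why the argument
# containers above switch from list to tuple: `unsafe_hash=True` makes
# the dataclass hash a tuple of its fields, so every field must itself
# be hashable -- a list-valued field only fails at hash time.
from dataclasses import dataclass


@dataclass(slots=True, unsafe_hash=True)
class Node:
    args: tuple[int, ...]


hash(Node((1, 2)))  # fine: tuples hash
# hash(Node([1, 2])) raises TypeError: unhashable type: 'list'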
-@dataclass(slots=True) +@dataclass(slots=True, unsafe_hash=True) class Sort(Expr): column: Expr - options: Any + options: tuple[bool, bool, bool] + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context) + column = self.column.evaluate(df, context=context, mapping=mapping) (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( [descending], nulls_last=nulls_last, num_keys=1 @@ -142,19 +223,29 @@ def evaluate( is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0] ) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented sort post-hoc + raise NotImplementedError("Sort in groupby") -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) class SortBy(Expr): column: Expr - by: list[Expr] - options: Any + by: tuple[Expr, ...] + options: tuple[bool, bool, tuple[bool]] + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context) - by = [b.evaluate(df, context=context) for b in self.by] + column = self.column.evaluate(df, context=context, mapping=mapping) + by = [b.evaluate(df, context=context, mapping=mapping) for b in self.by] (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( descending, nulls_last=nulls_last, num_keys=len(self.by) @@ -165,18 +256,28 @@ def evaluate( ) return Column(table.columns()[0], column.name) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented sort post-hoc + raise NotImplementedError("SortBy in groupby") + @dataclass(slots=True) class Gather(Expr): values: Expr indices: Expr + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - values = self.values.evaluate(df, context=context) - indices = self.indices.evaluate(df, context=context) + values = self.values.evaluate(df, context=context, mapping=mapping) + indices = self.indices.evaluate(df, context=context, mapping=mapping) lo, hi = plc.reduce.minmax(indices.obj) lo = plc.interop.to_arrow(lo).as_py() hi = plc.interop.to_arrow(hi).as_py() @@ -195,28 +296,43 @@ def evaluate( table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) return Column(table.columns()[0], values.name) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented gather. 
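+        # (i.e. sort rows so each group is a contiguous segment, then
+        # gather with indices interpreted relative to that segment)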
+ raise NotImplementedError("Gather in groupby") + @dataclass(slots=True) class Filter(Expr): values: Expr mask: Expr + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - values = self.values.evaluate(df, context=context) - mask = self.mask.evaluate(df, context=context) + values = self.values.evaluate(df, context=context, mapping=mapping) + mask = self.mask.evaluate(df, context=context, mapping=mapping) table = plc.stream_compaction.apply_boolean_mask( plc.Table([values.obj]), mask.obj ) return Column(table.columns()[0], values.name).with_sorted(like=values) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented filter + raise NotImplementedError("Filter in groupby") + @dataclass(slots=True) class Window(Expr): agg: Expr - by: None | list[Expr] + by: None | tuple[Expr, ...] options: Any @@ -225,21 +341,32 @@ class Cast(Expr): dtype: plc.DataType column: Expr + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context) + column = self.column.evaluate(df, context=context, mapping=mapping) return Column(plc.unary.cast(column.obj, self.dtype), column.name).with_sorted( like=column ) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented filter + return self.column.collect_agg(depth=depth) + @dataclass(slots=True) class Agg(Expr): column: Expr op: Callable[..., plc.Column] name: str + request: plc.aggregation.Aggregation _SUPPORTED: ClassVar[frozenset[str]] = frozenset( [ @@ -254,10 +381,29 @@ class Agg(Expr): "count", "std", "var", - "agg_groups", ] ) + def __eq__(self, other): + """Return whether this Agg is equal to another.""" + return type(self) == type(other) and (self.column, self.name) == ( + other.column, + other.name, + ) + + def __hash__(self): + """Return a hash.""" + return hash((self.column, self.name)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError("Nested aggregations in groupby") + ((expr, _, _),) = self.column.collect_agg(depth=depth + 1).requests + if self.request is None: + raise NotImplementedError(f"Aggregation {self.name} in groupby") + return AggInfo([(expr, self.request, self)]) + def __init__( self, dtype: plc.DataType, column: Expr, name: str, options: Any ) -> None: @@ -266,53 +412,47 @@ def __init__( self.dtype = dtype self.column = column self.name = name - op = getattr(self, f"_{name}") - if name in {"min", "max"}: + # TODO: nan handling in groupby case + if name == "min": + req = plc.aggregation.min() + elif name == "max": + req = plc.aggregation.max() + elif name == "median": + req = plc.aggregation.median() + elif name == "nunique": + req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) + elif name == "first" or name == "last": + req = None + elif name == 
"mean": + req = plc.aggregation.mean() + elif name == "sum": + req = plc.aggregation.sum() + elif name == "std": + # TODO: handle nans + req = plc.aggregation.std(ddof=options) + elif name == "var": + # TODO: handle nans + req = plc.aggregation.variance(ddof=options) + elif name == "count": + req = plc.aggregation.count(null_policy=plc.types.NullPolicy.EXCLUDE) + else: + raise NotImplementedError + self.request = req + op = getattr(self, f"_{name}", None) + if op is None: + op = partial(self._reduce, request=req) + elif name in {"min", "max"}: op = partial(op, propagate_nans=options) - elif name in {"std", "var"}: - op = partial(op, ddof=options) + else: + raise AssertionError self.op = op - def _std(self, column: Column, *, ddof: int) -> Column: - # TODO: handle nans - return Column( - plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, plc.aggregation.std(ddof=ddof), self.dtype - ), - 1, - ), - column.name, - ) - - def _var(self, column: Column, *, ddof: int) -> Column: - # TODO: handle nans - return Column( - plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, plc.aggregation.variance(ddof=ddof), self.dtype - ), - 1, - ), - column.name, - ) - - def _sum(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.sum(), self.dtype), 1 - ), - column.name, - ) - - def _count(self, column: Column) -> Column: + def _reduce( + self, column: Column, *, request: plc.aggregation.Aggregation + ) -> Column: return Column( plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, - plc.aggregation.count(plc.types.NullPolicy.EXCLUDE), - self.dtype, - ), + plc.reduce.reduce(column.obj, request, self.dtype), 1, ), column.name, @@ -331,12 +471,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column: ) if column.nan_count > 0: column = column.mask_nans() - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.min(), self.dtype), 1 - ), - column.name, - ) + return self._reduce(column, request=plc.aggregation.min()) def _max(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: @@ -351,21 +486,7 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column: ) if column.nan_count > 0: column = column.mask_nans() - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.max(), self.dtype), 1 - ), - column.name, - ) - - def _median(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.median(), self.dtype), - 1, - ), - column.name, - ) + return self._reduce(column, request=plc.aggregation.max()) def _first(self, column: Column) -> Column: return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name) @@ -374,38 +495,21 @@ def _last(self, column: Column) -> Column: n = column.obj.size() return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) - def _mean(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.mean(), self.dtype), - 1, - ), - column.name, - ) - - def _nunique(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, - plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE), - self.dtype, - ), - 1, - ), - column.name, - ) - + @with_mapping def evaluate( - self, df, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df, + *, + context: 
ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" if context is not ExecutionContext.FRAME: raise NotImplementedError(f"Agg in context {context}") - return self.op(self.column.evaluate(df, context=context)) + return self.op(self.column.evaluate(df, context=context, mapping=mapping)) -@dataclass(slots=True) +@dataclass(slots=True, unsafe_hash=True) class BinOp(Expr): left: Expr right: Expr @@ -434,13 +538,34 @@ class BinOp(Expr): pl_expr.PyOperator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, } + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - left = self.left.evaluate(df, context=context) - right = self.right.evaluate(df, context=context) + left = self.left.evaluate(df, context=context, mapping=mapping) + right = self.right.evaluate(df, context=context, mapping=mapping) return Column( plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), - left.name, + "what", ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, + # This recurses to check if we have nested aggs + # groupby construction has checked that we don't have + # nested aggs, so stop the recursion and return ourselves + # for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + left_info = self.left.collect_agg(depth=depth) + right_info = self.right.collect_agg(depth=depth) + return AggInfo( + [*left_info.requests, *right_info.requests], + ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index ab49fecff25..f9f6369426d 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,10 +15,12 @@ from __future__ import annotations +import types from dataclasses import dataclass from functools import cache from typing import TYPE_CHECKING, Any, Callable, ClassVar +import nvtx import pyarrow as pa from typing_extensions import assert_never @@ -88,6 +90,7 @@ def __post_init__(self): if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") + @nvtx.annotate(message="Scan", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options @@ -153,6 +156,7 @@ class DataFrameScan(IR): projection: list[str] predicate: Expr | None + @nvtx.annotate(message="from_dataframe", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) @@ -185,6 +189,7 @@ class Select(IR): cse: list[Expr] expr: list[Expr] + @nvtx.annotate(message="Select", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -193,11 +198,42 @@ def evaluate(self, *, cache: dict[int, DataFrame]): return DataFrame([e.evaluate(df) for e in self.expr], []) -@dataclass(slots=True) +def placeholder_column(n: int): + """ + Produce a placeholder pylibcudf column with NO BACKING DATA. 
+ + Parameters + ---------- + n + Number of rows the column will advertise + + Returns + ------- + pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. + + Notes + ----- + This is used to avoid allocating data for count aggregations. + """ + return plc.Column( + plc.DataType(plc.TypeId.INT8), + n, + plc.gpumemoryview( + types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) + ), + None, + 0, + 0, + [], + ) + + +@dataclass(slots=False) class GroupBy(IR): df: IR agg_requests: list[Expr] keys: list[Expr] + maintain_order: bool options: Any @staticmethod @@ -218,11 +254,13 @@ def check_agg(agg: Expr) -> int: ------ NotImplementedError for unsupported expression nodes. """ - if isinstance(agg, expr.Agg): + if isinstance(agg, expr.NamedExpr): + return GroupBy.check_agg(agg.value) + elif isinstance(agg, expr.Agg): if agg.name == "implode": raise NotImplementedError("implode in groupby") return 1 + GroupBy.check_agg(agg.column) - elif isinstance(agg, (expr.Len, expr.Column, expr.Literal)): + elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): return 0 elif isinstance(agg, expr.BinOp): return max(GroupBy.check_agg(agg.left), GroupBy.check_agg(agg.right)) @@ -233,8 +271,51 @@ def check_agg(agg: Expr) -> int: def __post_init__(self): """Check whether all the aggregations are implemented.""" + if self.maintain_order: + raise NotImplementedError("Maintaining order in groupby") if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") + self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + + @nvtx.annotate(message="GroupBy", domain="cudf_polars") + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + keys = [k.evaluate(df) for k in self.keys] + # TODO: use sorted information, need to expose column_order + # and null_precedence in pylibcudf groupby constructor + # sorted = ( + # plc.types.Sorted.YES + # if all(k.is_sorted for k in keys) + # else plc.types.Sorted.NO + # ) + grouper = plc.groupby.GroupBy( + plc.Table([k.obj for k in keys]), + null_handling=plc.types.NullPolicy.INCLUDE, + ) + # TODO: uniquify + requests = [] + replacements = [] + for info in self.agg_infos: + for pre_eval, req, rep in info.requests: + if pre_eval is None: + col = placeholder_column(df.num_rows) + else: + col = pre_eval.evaluate(df).obj + requests.append(plc.groupby.GroupByRequest(col, [req])) + replacements.append(rep) + group_keys, raw_tables = grouper.aggregate(requests) + raw_columns = [] + for i, table in enumerate(raw_tables): + (column,) = table.columns() + raw_columns.append(Column(column, f"column{i}")) + mapping = dict(zip(replacements, raw_columns)) + result_keys = [Column(gk, k.name) for gk, k in zip(group_keys.columns(), keys)] + result_subs = DataFrame(raw_columns, []) + results = [ + req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests + ] + return DataFrame([*result_keys, *results], []) @dataclass(slots=True) @@ -290,6 +371,7 @@ def _joiners( else: assert_never(how) + @nvtx.annotate(message="Join", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -311,7 +393,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: - lg, rg = 
join_fn(left_on, right_on, null_equality) + lg, rg = join_fn(left_on.table, right_on.table, null_equality) left = left.replace_columns(*left_on.columns) right = right.replace_columns(*right_on.columns) if coalesce and how != "outer": @@ -352,6 +434,7 @@ class HStack(IR): df: IR columns: list[Expr] + @nvtx.annotate(message="HStack", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -382,6 +465,7 @@ def __init__(self, schema: dict, df: IR, options: Any): self.stable = maintain_order self.zlice = zlice + @nvtx.annotate(message="Distinct", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -427,10 +511,18 @@ class Sort(IR): order: list[plc.types.Order] null_order: list[plc.types.NullOrder] - def __init__(self, schema: dict, df: IR, by: list[Expr], options: Any): + def __init__( + self, + schema: dict, + df: IR, + by: list[Expr], + options: Any, + zlice: tuple[int, int] | None, + ): self.schema = schema self.df = df self.by = by + self.zlice = zlice stable, nulls_last, descending = options self.order, self.null_order = sorting.sort_order( descending, nulls_last=nulls_last, num_keys=len(by) @@ -439,6 +531,7 @@ def __init__(self, schema: dict, df: IR, by: list[Expr], options: Any): plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key ) + @nvtx.annotate(message="Sort", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -463,7 +556,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: order=self.order[i], null_order=self.null_order[i], ) - return DataFrame(columns, []) + return DataFrame(columns, []).slice(self.zlice) @dataclass(slots=True) @@ -472,6 +565,7 @@ class Slice(IR): offset: int length: int + @nvtx.annotate(message="Slice", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -483,6 +577,7 @@ class Filter(IR): df: IR mask: Expr + @nvtx.annotate(message="Filter", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -493,6 +588,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: class Projection(IR): df: IR + @nvtx.annotate(message="Projection", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -533,6 +629,7 @@ def __post_init__(self): if key_column not in self.df.dfs[0].schema: raise ValueError(f"Key column {key_column} not found") + @nvtx.annotate(message="MapFunction", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" if self.name == "merge_sorted": @@ -596,6 +693,7 @@ def __post_init__(self): if not all(s == schema for s in self.dfs[1:]): raise ValueError("Schema mismatch") + @nvtx.annotate(message="Union", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -608,6 +706,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: class HConcat(IR): dfs: list[IR] + 
@nvtx.annotate(message="HConcat", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -620,6 +719,7 @@ class ExtContext(IR): df: IR extra: list[IR] + @nvtx.annotate(message="ExtContext", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" # TODO: polars optimizer doesn't do projection pushdown diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index fe7902fdcc0..62ccc09b2ff 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -8,6 +8,8 @@ from contextlib import AbstractContextManager, nullcontext from typing import Any +import nvtx + from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir @@ -37,6 +39,7 @@ def __exit__(self, *args): noop_context: nullcontext = nullcontext() +@nvtx.annotate(domain="cudf_polars") def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: """ Translate a polars-internal IR node to our representation. @@ -109,6 +112,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: inp, aggs, keys, + node.maintain_order, node.options, ) elif isinstance(node, pl_ir.Join): @@ -120,8 +124,8 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: right_on = [translate_expr(visitor, n=e) for e in node.right_on] return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) elif isinstance(node, pl_ir.HStack): - with set_node(visitor, n=None): - inp = translate_ir(visitor, n=node.input) + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) exprs = [translate_expr(visitor, n=e) for e in node.exprs] return ir.HStack(schema, inp, exprs) elif isinstance(node, pl_ir.Distinct): @@ -131,17 +135,17 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: node.options, ) elif isinstance(node, pl_ir.Sort): - with set_node(visitor, n=None): - inp = translate_ir(visitor, n=node.input) + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) by = [translate_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options) + return ir.Sort(schema, inp, by, node.sort_options, node.slice) elif isinstance(node, pl_ir.Slice): return ir.Slice( schema, translate_ir(visitor, n=node.input), node.offset, node.len ) elif isinstance(node, pl_ir.Filter): - with set_node(visitor, n=None): - inp = translate_ir(visitor, n=node.input) + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) mask = translate_expr(visitor, n=node.predicate) return ir.Filter(schema, inp, mask) elif isinstance(node, pl_ir.SimpleProjection): @@ -176,6 +180,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: BOOLEAN_FUNCTIONS: frozenset[str] = frozenset() +@nvtx.annotate(domain="cudf_polars") def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. 
@@ -210,7 +215,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: dtype, name, options, - [translate_expr(visitor, n=n) for n in node.input], + tuple(translate_expr(visitor, n=n) for n in node.input), ) else: raise NotImplementedError(f"No handler for Expr function node with {name=}") @@ -219,7 +224,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: return expr.Window( dtype, translate_expr(visitor, n=node.function), - [translate_expr(visitor, n=n) for n in node.partition_by] + tuple(translate_expr(visitor, n=n) for n in node.partition_by) if node.partition_by is not None else None, node.options, @@ -231,11 +236,12 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: return expr.Sort(dtype, translate_expr(visitor, n=node.expr), node.options) elif isinstance(node, pl_expr.SortBy): # TODO: raise in groupby + stable, nulls_last, descending = node.sort_options return expr.SortBy( dtype, translate_expr(visitor, n=node.expr), - [translate_expr(visitor, n=n) for n in node.by], - node.descending, + tuple(translate_expr(visitor, n=n) for n in node.by), + (stable, nulls_last, tuple(descending)), ) elif isinstance(node, pl_expr.Gather): return expr.Gather( diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 1ac8719b839..45adbdc842c 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -32,7 +32,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: ------ NotImplementedError for unsupported conversions. """ - if isinstance(dtype, pl.Int8): + if isinstance(dtype, pl.Boolean): return plc.DataType(plc.TypeId.BOOL8) elif isinstance(dtype, pl.Int8): return plc.DataType(plc.TypeId.INT8) @@ -86,4 +86,5 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) else: + breakpoint() raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index fed1cd35416..b3ecfdd3dd4 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -5,11 +5,16 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import cudf._lib.pylibcudf as plc +if TYPE_CHECKING: + from collections.abc import Sequence + def sort_order( - descending: list[bool], *, nulls_last: bool, num_keys: int + descending: Sequence[bool], *, nulls_last: bool, num_keys: int ) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: """ Produce sort order arguments. From 235575d8ce9c9d65a68e42f3493361c82c087e9e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 08:40:12 +0000 Subject: [PATCH 17/56] Expr objects are no longer dataclasses This is easier for handling implementing hash, etc... 
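In outline, the pattern the diff below adopts looks like this (a minimal
sketch; the real classes also cache a repr and carry more slots):

class Node:
    __slots__ = ("dtype", "hash_value")
    children: tuple["Node", ...] = ()
    # Non-child constructor data; subclasses extend this tuple.
    _non_child = ("dtype",)

    def __init__(self, dtype) -> None:
        self.dtype = dtype

    def _ctor_arguments(self):
        # (*non-child data, *children) is enough to rebuild the node.
        return (*(getattr(self, a) for a in self._non_child), *self.children)

    def __hash__(self):
        # hash_value is an unfilled slot until first use, so an
        # AttributeError means "not computed yet": compute once, cache.
        try:
            return self.hash_value
        except AttributeError:
            self.hash_value = hash((type(self), self._ctor_arguments()))
            return self.hash_value

The caching matters because expression nodes are hashed repeatedly once
they are used as dictionary keys, e.g. in the groupby substitution mapping.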
---
 python/cudf_polars/cudf_polars/dsl/expr.py | 351 ++++++++++++------
 python/cudf_polars/cudf_polars/dsl/ir.py | 10 +-
 .../cudf_polars/cudf_polars/dsl/translate.py | 32 +-
 .../cudf_polars/cudf_polars/utils/dtypes.py | 2 +
 4 files changed, 269 insertions(+), 126 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 41df85dcb73..fbc5404d129 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -16,7 +16,6 @@
 from __future__ import annotations

 import enum
-from dataclasses import dataclass
 from enum import IntEnum
 from functools import partial
 from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple
@@ -31,7 +30,7 @@
 from cudf_polars.utils import sorting

 if TYPE_CHECKING:
-    from typing import Callable
+    from collections.abc import Sequence

     from cudf_polars.containers import DataFrame

@@ -45,7 +44,8 @@
     "SortBy",
     "Gather",
     "Filter",
-    "Window",
+    "RollingWindow",
+    "GroupedRollingWindow",
     "Cast",
     "Agg",
     "BinOp",
@@ -62,9 +62,86 @@ class AggInfo(NamedTuple):
     requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]]


-@dataclass(slots=True, unsafe_hash=True)
 class Expr:
+    __slots__ = ("dtype", "hash_value", "repr_value")
+    #: Data type of the expression
     dtype: plc.DataType
+    #: caching slot for the hash of the expression
+    hash_value: int
+    #: caching slot for repr of the expression
+    repr_value: str
+    #: Children of the expression
+    children: tuple[Expr, ...] = ()
+    #: Names of non-child data (not Exprs) for reconstruction
+    _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
+
+    # Constructor must take arguments in order (*_non_child, *children)
+    def __init__(self, dtype: plc.DataType) -> None:
+        self.dtype = dtype
+
+    def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def get_hash(self) -> int:
+        """
+        Return the hash of this expr.
+
+        Override this in subclasses, rather than __hash__.
+
+        Returns
+        -------
+        The integer hash value.
+        """
+        return hash((type(self), self._ctor_arguments(self.children)))
+
+    def __hash__(self):
+        """Hash of an expression with caching."""
+        try:
+            return self.hash_value
+        except AttributeError:
+            self.hash_value = self.get_hash()
+            return self.hash_value
+
+    def is_equal(self, other: Any) -> bool:
+        """
+        Equality of two expressions.
+
+        Override this in subclasses, rather than __eq__.
+
+        Parameters
+        ----------
+        other
+            object to compare to
+
+        Returns
+        -------
+        True if the two expressions are equal, false otherwise.
+ """ + if type(self) is not type(other): + return False + return self._ctor_arguments(self.children) == other._ctor_arguments( + other.children + ) + + def __eq__(self, other): + """Equality of expressions.""" + if type(self) != type(other) or hash(self) != hash(other): + return False + else: + return self.is_equal(other) + + def __ne__(self, other): + """Inequality of expressions.""" + return not self.__eq__(other) + + def __repr__(self): + """String representation of an expression with caching.""" + try: + return self.repr_value + except AttributeError: + args = ", ".join(f"{arg}" for arg in self._ctor_arguments(self.children)) + self.repr_value = f"{type(self)}({args})" + return self.repr_value # TODO: return type is a lie for Literal def evaluate( @@ -104,10 +181,14 @@ def look( return look -@dataclass(slots=True, unsafe_hash=True) class NamedExpr(Expr): - name: str - value: Expr + __slots__ = ("name", "children") + _non_child = ("dtype", "name") + + def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None: + super().__init__(dtype) + self.name = name + self.children = (value,) @with_mapping def evaluate( @@ -118,18 +199,25 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + (child,) = self.children return Column( - self.value.evaluate(df, context=context, mapping=mapping).obj, self.name + child.evaluate(df, context=context, mapping=mapping).obj, self.name ) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" - return self.value.collect_agg(depth=depth) + (value,) = self.children + return value.collect_agg(depth=depth) -@dataclass(slots=True, unsafe_hash=True) # TODO: won't work for list literals class Literal(Expr): - value: Any + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Scalar + + def __init__(self, dtype: plc.DataType, value: Any) -> None: + super().__init__(dtype) + self.value = pa.scalar(value) @with_mapping def evaluate( @@ -141,7 +229,7 @@ def evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" # TODO: obey dtype - obj = plc.interop.from_arrow(pa.scalar(self.value)) + obj = plc.interop.from_arrow(self.value) return Scalar(obj) # type: ignore def collect_agg(self, *, depth: int) -> AggInfo: @@ -149,10 +237,15 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Literal in groupby") -@dataclass(slots=True, unsafe_hash=True) class Col(Expr): + __slots__ = ("name",) + _non_child = ("dtype", "name") name: str + def __init__(self, dtype: plc.DataType, name: str) -> None: + self.dtype = dtype + self.name = name + @with_mapping def evaluate( self, @@ -169,7 +262,6 @@ def collect_agg(self, *, depth: int) -> AggInfo: return AggInfo([(self, plc.aggregation.collect_list(), self)]) -@dataclass(slots=True, unsafe_hash=True) class Len(Expr): @with_mapping def evaluate( @@ -191,17 +283,27 @@ def collect_agg(self, *, depth: int) -> AggInfo: ) -@dataclass(slots=True, unsafe_hash=True) class BooleanFunction(Expr): - name: str - options: Any - arguments: tuple[Expr, ...] 
+ __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + + def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr): + super().__init__(dtype) + self.options = options + self.name = name + self.children = tuple(children) -@dataclass(slots=True, unsafe_hash=True) class Sort(Expr): - column: Expr - options: tuple[bool, bool, bool] + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + + def __init__( + self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr + ): + super().__init__(dtype) + self.options = options + self.children = (column,) @with_mapping def evaluate( @@ -212,7 +314,8 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context, mapping=mapping) + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( [descending], nulls_last=nulls_last, num_keys=1 @@ -229,11 +332,20 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Sort in groupby") -@dataclass(slots=True, unsafe_hash=True) class SortBy(Expr): - column: Expr - by: tuple[Expr, ...] - options: tuple[bool, bool, tuple[bool]] + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + + def __init__( + self, + dtype: plc.DataType, + options: tuple[bool, bool, tuple[bool]], + column: Expr, + *by: Expr, + ): + super().__init__(dtype) + self.options = options + self.children = (column, *by) @with_mapping def evaluate( @@ -244,11 +356,13 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context, mapping=mapping) - by = [b.evaluate(df, context=context, mapping=mapping) for b in self.by] + column, *by = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(self.by) + descending, nulls_last=nulls_last, num_keys=len(by) ) do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key table = do_sort( @@ -262,10 +376,13 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("SortBy in groupby") -@dataclass(slots=True) class Gather(Expr): - values: Expr - indices: Expr + __slots__ = ("children",) + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + super().__init__(dtype) + self.children = (values, indices) @with_mapping def evaluate( @@ -276,8 +393,10 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - values = self.values.evaluate(df, context=context, mapping=mapping) - indices = self.indices.evaluate(df, context=context, mapping=mapping) + values, indices = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) lo, hi = plc.reduce.minmax(indices.obj) lo = plc.interop.to_arrow(lo).as_py() hi = plc.interop.to_arrow(hi).as_py() @@ -302,10 +421,13 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Gather in groupby") -@dataclass(slots=True) class Filter(Expr): - values: Expr - mask: Expr + __slots__ = ("children",) + 
_non_child = ("dtype",)
+
+    def __init__(self, dtype: plc.DataType, values: Expr, mask: Expr):
+        super().__init__(dtype)
+        self.children = (values, mask)

     @with_mapping
     def evaluate(
@@ -316,8 +438,10 @@ def evaluate(
         mapping: dict[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        values = self.values.evaluate(df, context=context, mapping=mapping)
-        mask = self.mask.evaluate(df, context=context, mapping=mapping)
+        values, mask = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
         table = plc.stream_compaction.apply_boolean_mask(
             plc.Table([values.obj]), mask.obj
         )
@@ -329,17 +453,33 @@ def collect_agg(self, *, depth: int) -> AggInfo:
         raise NotImplementedError("Filter in groupby")


-@dataclass(slots=True)
-class Window(Expr):
-    agg: Expr
-    by: None | tuple[Expr, ...]
-    options: Any
+class RollingWindow(Expr):
+    __slots__ = ("options", "children")
+    _non_child = ("dtype", "options")
+
+    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr):
+        super().__init__(dtype)
+        self.options = options
+        self.children = (agg,)
+
+
+class GroupedRollingWindow(Expr):
+    __slots__ = ("options", "children")
+    _non_child = ("dtype", "options")
+
+    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr):
+        super().__init__(dtype)
+        self.options = options
+        self.children = (agg, *by)


-@dataclass(slots=True)
 class Cast(Expr):
-    dtype: plc.DataType
-    column: Expr
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+
+    def __init__(self, dtype: plc.DataType, value: Expr):
+        super().__init__(dtype)
+        self.children = (value,)

     @with_mapping
     def evaluate(
@@ -350,7 +490,8 @@ def evaluate(
         mapping: dict[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        column = self.column.evaluate(df, context=context, mapping=mapping)
+        (child,) = self.children
+        column = child.evaluate(df, context=context, mapping=mapping)
         return Column(plc.unary.cast(column.obj, self.dtype), column.name).with_sorted(
             like=column
         )
@@ -358,60 +499,23 @@ def evaluate(

     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
         # TODO: Could do with sort-based groupby and segmented filter
-        return self.column.collect_agg(depth=depth)
+        (child,) = self.children
+        return child.collect_agg(depth=depth)


-@dataclass(slots=True)
 class Agg(Expr):
-    column: Expr
-    op: Callable[..., plc.Column]
-    name: str
-    request: plc.aggregation.Aggregation
-
-    _SUPPORTED: ClassVar[frozenset[str]] = frozenset(
-        [
-            "min",
-            "max",
-            "median",
-            "nunique",
-            "first",
-            "last",
-            "mean",
-            "sum",
-            "count",
-            "std",
-            "var",
-        ]
-    )
-
-    def __eq__(self, other):
-        """Return whether this Agg is equal to another."""
-        return type(self) == type(other) and (self.column, self.name) == (
-            other.column,
-            other.name,
-        )
-
-    def __hash__(self):
-        """Return a hash."""
-        return hash((self.column, self.name))
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth >= 1:
-            raise NotImplementedError("Nested aggregations in groupby")
-        ((expr, _, _),) = self.column.collect_agg(depth=depth + 1).requests
-        if self.request is None:
-            raise NotImplementedError(f"Aggregation {self.name} in groupby")
-        return AggInfo([(expr, self.request, self)])
+    __slots__ = ("name", "options", "op", "request", "children")
+    _non_child = ("dtype", "name", "options")

     def __init__(
-        self, dtype: plc.DataType,
column: Expr, name: str, options: Any + self, dtype: plc.DataType, name: str, options: Any, value: Expr ) -> None: - if name not in Agg._SUPPORTED: - raise NotImplementedError(f"Unsupported aggregation {name}") - self.dtype = dtype - self.column = column + super().__init__(dtype) self.name = name + self.options = options + self.children = (value,) + if name not in Agg._SUPPORTED: + raise NotImplementedError(f"Unsupported aggregation {name=}") # TODO: nan handling in groupby case if name == "min": req = plc.aggregation.min() @@ -447,6 +551,32 @@ def __init__( raise AssertionError self.op = op + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "nunique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + ] + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError("Nested aggregations in groupby") + (child,) = self.children + ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests + if self.request is None: + raise NotImplementedError(f"Aggregation {self.name} in groupby") + return AggInfo([(expr, self.request, self)]) + def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation ) -> Column: @@ -506,14 +636,24 @@ def evaluate( """Evaluate this expression given a dataframe for context.""" if context is not ExecutionContext.FRAME: raise NotImplementedError(f"Agg in context {context}") - return self.op(self.column.evaluate(df, context=context, mapping=mapping)) + (child,) = self.children + return self.op(child.evaluate(df, context=context, mapping=mapping)) -@dataclass(slots=True, unsafe_hash=True) class BinOp(Expr): - left: Expr - right: Expr - op: plc.binaryop.BinaryOperator + __slots__ = ("op", "children") + _non_child = ("dtype", "op") + + def __init__( + self, + dtype: plc.DataType, + op: plc.binaryop.BinaryOperator, + left: Expr, + right: Expr, + ) -> None: + super().__init__(dtype) + self.op = op + self.children = (left, right) _MAPPING: ClassVar[dict[pl_expr.PyOperator, plc.binaryop.BinaryOperator]] = { pl_expr.PyOperator.Eq: plc.binaryop.BinaryOperator.EQUAL, @@ -547,8 +687,10 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - left = self.left.evaluate(df, context=context, mapping=mapping) - right = self.right.evaluate(df, context=context, mapping=mapping) + left, right = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) return Column( plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), "what", @@ -564,8 +706,9 @@ def collect_agg(self, *, depth: int) -> AggInfo: # for pre-eval return AggInfo([(self, plc.aggregation.collect_list(), self)]) else: - left_info = self.left.collect_agg(depth=depth) - right_info = self.right.collect_agg(depth=depth) + left_info, right_info = ( + child.collect_agg(depth=depth) for child in self.children + ) return AggInfo( [*left_info.requests, *right_info.requests], ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index f9f6369426d..29336e7cdba 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -254,18 +254,14 @@ def check_agg(agg: Expr) -> int: ------ NotImplementedError for unsupported expression nodes. 
""" - if isinstance(agg, expr.NamedExpr): - return GroupBy.check_agg(agg.value) + if isinstance(agg, (expr.NamedExpr, expr.BinOp, expr.Cast)): + return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): if agg.name == "implode": raise NotImplementedError("implode in groupby") - return 1 + GroupBy.check_agg(agg.column) + return 1 + max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): return 0 - elif isinstance(agg, expr.BinOp): - return max(GroupBy.check_agg(agg.left), GroupBy.check_agg(agg.right)) - elif isinstance(agg, expr.Cast): - return GroupBy.check_agg(agg.column) else: raise NotImplementedError(f"No handler for {agg=}") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 62ccc09b2ff..c51f548b111 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -215,33 +215,35 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: dtype, name, options, - tuple(translate_expr(visitor, n=n) for n in node.input), + *(translate_expr(visitor, n=n) for n in node.input), ) else: raise NotImplementedError(f"No handler for Expr function node with {name=}") elif isinstance(node, pl_expr.Window): # TODO: raise in groupby? - return expr.Window( - dtype, - translate_expr(visitor, n=node.function), - tuple(translate_expr(visitor, n=n) for n in node.partition_by) - if node.partition_by is not None - else None, - node.options, - ) + if node.partition_by is None: + return expr.RollingWindow( + dtype, node.options, translate_expr(visitor, n=node.function) + ) + else: + return expr.GroupedRollingWindow( + dtype, + node.options, + translate_expr(visitor, n=node.function), + *(translate_expr(visitor, n=n) for n in node.partition_by), + ) elif isinstance(node, pl_expr.Literal): return expr.Literal(dtype, node.value) elif isinstance(node, pl_expr.Sort): # TODO: raise in groupby - return expr.Sort(dtype, translate_expr(visitor, n=node.expr), node.options) + return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) elif isinstance(node, pl_expr.SortBy): # TODO: raise in groupby - stable, nulls_last, descending = node.sort_options return expr.SortBy( dtype, + node.sort_options, translate_expr(visitor, n=node.expr), - tuple(translate_expr(visitor, n=n) for n in node.by), - (stable, nulls_last, tuple(descending)), + *(translate_expr(visitor, n=n) for n in node.by), ) elif isinstance(node, pl_expr.Gather): return expr.Gather( @@ -267,16 +269,16 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: elif isinstance(node, pl_expr.Agg): return expr.Agg( dtype, - translate_expr(visitor, n=node.arguments), node.name, node.options, + translate_expr(visitor, n=node.arguments), ) elif isinstance(node, pl_expr.BinaryExpr): return expr.BinOp( dtype, + expr.BinOp._MAPPING[node.op], translate_expr(visitor, n=node.left), translate_expr(visitor, n=node.right), - expr.BinOp._MAPPING[node.op], ) elif isinstance(node, pl_expr.Len): return expr.Len(dtype) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 45adbdc842c..f3303fbbce2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -85,6 +85,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: elif isinstance(dtype, pl.Null): # TODO: Hopefully return 
plc.DataType(plc.TypeId.EMPTY) + elif isinstance(dtype, pl.List): + return plc.DataType(plc.TypeId.LIST) else: breakpoint() raise NotImplementedError(f"{dtype=} conversion not supported") From e158de6c8ff4a7b95a9ba83c127398d64c7be416 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 08:50:12 +0000 Subject: [PATCH 18/56] No recursive nvtx annotations --- python/cudf_polars/cudf_polars/callback.py | 8 ++++++-- python/cudf_polars/cudf_polars/dsl/expr.py | 8 ++++---- python/cudf_polars/cudf_polars/dsl/ir.py | 16 ---------------- python/cudf_polars/cudf_polars/dsl/translate.py | 4 ---- 4 files changed, 10 insertions(+), 26 deletions(-) diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index b598e1442ce..38d80bb417e 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -8,6 +8,8 @@ from functools import partial from typing import TYPE_CHECKING +import nvtx + from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -28,7 +30,8 @@ def _callback( assert pyarrow_predicate is None assert n_rows is None try: - return ir.evaluate(cache={}).to_polars() + with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + return ir.evaluate(cache={}).to_polars() except Exception as e: print("Unable to evaluate", e) raise @@ -46,7 +49,8 @@ def execute_with_cudf(nt) -> None: The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: - callback = partial(_callback, translate_ir(nt)) + with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + callback = partial(_callback, translate_ir(nt)) except NotImplementedError as e: print("Unable to translate", e) return diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fbc5404d129..33266c6634a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -139,8 +139,8 @@ def __repr__(self): try: return self.repr_value except AttributeError: - args = ", ".join(f"{arg}" for arg in self._ctor_arguments(self.children)) - self.repr_value = f"{type(self)}({args})" + args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) + self.repr_value = f"{type(self).__name__}({args})" return self.repr_value # TODO: return type is a lie for Literal @@ -168,8 +168,8 @@ def look( *, context=ExecutionContext.FRAME, mapping: dict[Expr, Column] | None = None, - ): - """Look up the self in the mapping before evaluating it.""" + ) -> Column: + """Look up self in the mapping before evaluating it.""" if mapping is None: return fn(self, df, context=context, mapping=mapping) else: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 29336e7cdba..e2bc3b7bf44 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -20,7 +20,6 @@ from functools import cache from typing import TYPE_CHECKING, Any, Callable, ClassVar -import nvtx import pyarrow as pa from typing_extensions import assert_never @@ -90,7 +89,6 @@ def __post_init__(self): if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") - @nvtx.annotate(message="Scan", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options @@ -156,7 +154,6 @@ class DataFrameScan(IR): projection: list[str] predicate: Expr | None - 
@nvtx.annotate(message="from_dataframe", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) @@ -189,7 +186,6 @@ class Select(IR): cse: list[Expr] expr: list[Expr] - @nvtx.annotate(message="Select", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -273,7 +269,6 @@ def __post_init__(self): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - @nvtx.annotate(message="GroupBy", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -367,7 +362,6 @@ def _joiners( else: assert_never(how) - @nvtx.annotate(message="Join", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -430,7 +424,6 @@ class HStack(IR): df: IR columns: list[Expr] - @nvtx.annotate(message="HStack", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -461,7 +454,6 @@ def __init__(self, schema: dict, df: IR, options: Any): self.stable = maintain_order self.zlice = zlice - @nvtx.annotate(message="Distinct", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -527,7 +519,6 @@ def __init__( plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key ) - @nvtx.annotate(message="Sort", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -561,7 +552,6 @@ class Slice(IR): offset: int length: int - @nvtx.annotate(message="Slice", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -573,7 +563,6 @@ class Filter(IR): df: IR mask: Expr - @nvtx.annotate(message="Filter", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -584,7 +573,6 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: class Projection(IR): df: IR - @nvtx.annotate(message="Projection", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -625,7 +613,6 @@ def __post_init__(self): if key_column not in self.df.dfs[0].schema: raise ValueError(f"Key column {key_column} not found") - @nvtx.annotate(message="MapFunction", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" if self.name == "merge_sorted": @@ -689,7 +676,6 @@ def __post_init__(self): if not all(s == schema for s in self.dfs[1:]): raise ValueError("Schema mismatch") - @nvtx.annotate(message="Union", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -702,7 +688,6 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> 
DataFrame: class HConcat(IR): dfs: list[IR] - @nvtx.annotate(message="HConcat", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -715,7 +700,6 @@ class ExtContext(IR): df: IR extra: list[IR] - @nvtx.annotate(message="ExtContext", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" # TODO: polars optimizer doesn't do projection pushdown diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index c51f548b111..37fb599c35d 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -8,8 +8,6 @@ from contextlib import AbstractContextManager, nullcontext from typing import Any -import nvtx - from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir @@ -39,7 +37,6 @@ def __exit__(self, *args): noop_context: nullcontext = nullcontext() -@nvtx.annotate(domain="cudf_polars") def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: """ Translate a polars-internal IR node to our representation. @@ -180,7 +177,6 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: BOOLEAN_FUNCTIONS: frozenset[str] = frozenset() -@nvtx.annotate(domain="cudf_polars") def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. From b4003910ca5c19ab8fdb51a48dabd31a5b452cf5 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 09:08:17 +0000 Subject: [PATCH 19/56] Testing infrastructure --- python/cudf_polars/cudf_polars/callback.py | 11 ++- .../cudf_polars/testing/__init__.py | 8 ++ .../cudf_polars/testing/asserts.py | 76 +++++++++++++++++++ 3 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/testing/__init__.py create mode 100644 python/cudf_polars/cudf_polars/testing/asserts.py diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 38d80bb417e..ed473e0ad0e 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -37,7 +37,7 @@ def _callback( raise -def execute_with_cudf(nt) -> None: +def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -46,13 +46,18 @@ def execute_with_cudf(nt) -> None: nt NodeTraverser + raise_on_fail + Should conversion raise an exception rather than continuing + without setting a callback. + The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): callback = partial(_callback, translate_ir(nt)) - except NotImplementedError as e: - print("Unable to translate", e) + except NotImplementedError: + if raise_on_fail: + raise return nt.set_udf(callback) diff --git a/python/cudf_polars/cudf_polars/testing/__init__.py b/python/cudf_polars/cudf_polars/testing/__init__.py new file mode 100644 index 00000000000..d0147e713f9 --- /dev/null +++ b/python/cudf_polars/cudf_polars/testing/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+
+"""Testing utilities for cudf_polars."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
new file mode 100644
index 00000000000..a6e26a6425c
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Device-aware assertions."""
+
+from __future__ import annotations
+
+from functools import partial
+from typing import TYPE_CHECKING
+
+from polars.testing.asserts import assert_frame_equal
+
+from cudf_polars.callback import execute_with_cudf
+
+if TYPE_CHECKING:
+    import polars as pl
+
+__all__: list[str] = ["assert_gpu_result_equal"]
+
+
+def assert_gpu_result_equal(
+    lazydf: pl.LazyFrame,
+    *,
+    check_row_order: bool = True,
+    check_column_order: bool = True,
+    check_dtype: bool = True,
+    check_exact: bool = True,
+    rtol: float = 1e-05,
+    atol: float = 1e-08,
+    categorical_as_str: bool = False,
+):
+    """
+    Assert that collection of a lazyframe on GPU produces correct results.
+
+    Parameters
+    ----------
+    lazydf
+        Frame to collect.
+    check_row_order
+        Expect rows to be in the same order.
+    check_column_order
+        Expect columns to be in the same order.
+    check_dtype
+        Expect dtypes to match.
+    check_exact
+        Require exact equality for floats; if `False`, compare
+        using rtol and atol.
+    rtol
+        Relative tolerance for float comparisons.
+    atol
+        Absolute tolerance for float comparisons.
+    categorical_as_str
+        Cast categoricals to strings before comparing.
+
+    Raises
+    ------
+    AssertionError
+        If the GPU and CPU collection do not match.
+    NotImplementedError
+        If GPU collection failed in some way.
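+
+    Examples
+    --------
+    A minimal usage sketch (the frame here is purely illustrative):
+
+    >>> import polars as pl
+    >>> q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)
+    >>> assert_gpu_result_equal(q)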
+ """ + expect = lazydf.collect() + got = lazydf.collect( + post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) + ) + assert_frame_equal( + expect, + got, + check_row_order=check_row_order, + check_column_order=check_column_order, + check_dtype=check_dtype, + check_exact=check_exact, + rtol=rtol, + atol=atol, + categorical_as_str=categorical_as_str, + ) From 7f04985483621fcbaf849dadf4518751f8628068 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 09:11:52 +0000 Subject: [PATCH 20/56] Add basic tests --- python/cudf_polars/pyproject.toml | 3 + python/cudf_polars/tests/test_basic.py | 239 ++++++++++++++++++++ python/cudf_polars/tests/test_distinct.py | 25 ++ python/cudf_polars/tests/test_extcontext.py | 19 ++ python/cudf_polars/tests/test_filter.py | 20 ++ python/cudf_polars/tests/test_hconcat.py | 19 ++ python/cudf_polars/tests/test_hstack.py | 19 ++ python/cudf_polars/tests/test_join.py | 64 ++++++ python/cudf_polars/tests/test_slice.py | 34 +++ python/cudf_polars/tests/test_sort.py | 42 ++++ python/cudf_polars/tests/test_union.py | 24 ++ 11 files changed, 508 insertions(+) create mode 100644 python/cudf_polars/tests/test_basic.py create mode 100644 python/cudf_polars/tests/test_distinct.py create mode 100644 python/cudf_polars/tests/test_extcontext.py create mode 100644 python/cudf_polars/tests/test_filter.py create mode 100644 python/cudf_polars/tests/test_hconcat.py create mode 100644 python/cudf_polars/tests/test_hstack.py create mode 100644 python/cudf_polars/tests/test_join.py create mode 100644 python/cudf_polars/tests/test_slice.py create mode 100644 python/cudf_polars/tests/test_sort.py create mode 100644 python/cudf_polars/tests/test_union.py diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 00fde6c0e05..f5d29202961 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -129,6 +129,9 @@ ignore = [ ] fixable = ["ALL"] +[tool.ruff.lint.per-file-ignores] +"**/tests/test_*.py" = ["D", "INP"] + [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style fixture-parentheses = false diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py new file mode 100644 index 00000000000..094f1bc3490 --- /dev/null +++ b/python/cudf_polars/tests/test_basic.py @@ -0,0 +1,239 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import operator +from datetime import datetime + +import numpy as np +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture +def ldf_datetime(): + dates = [ + "2020-01-01 13:45:48", + "2020-01-01 16:42:13", + "2020-01-01 16:45:09", + "2020-01-02 18:12:48", + "2020-01-03 19:45:32", + "2020-01-08 23:16:43", + ] + return ( + pl.DataFrame({"dt": dates, "a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 1, 2, 2, 2]}) + .with_columns(pl.col("dt").str.strptime(pl.Datetime).set_sorted()) + .lazy() + ) + + +@pytest.fixture +def df(): + return pl.DataFrame( + { + "int_key1": np.repeat(np.arange(10), 10), + "int_key2": np.tile(np.arange(10), 10), + "str_key1": np.repeat(list("ABCDEFGHIJ"), 10), + "int_val": np.random.randint(100, size=100), + "float_val": np.random.rand(100), + } + ) + + +@pytest.fixture +def ldf(df): + return df.lazy() + + +@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) +@pytest.mark.parametrize( + "op", [operator.add, operator.sub, operator.mul, operator.truediv] +) +def test_binaryops(op, dtype): + df = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": [1, 2, 3, 4, 5], + } + ).lazy() + + dtype = pl.datatypes.numpy_char_code_to_dtype(dtype) + df = df.with_columns(pl.col("a").cast(dtype)).with_columns(pl.col("b").cast(dtype)) + result = df.with_columns(op(pl.col("a"), pl.col("b"))) + assert_gpu_result_equal(result) + + +def test_scan_parquet(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + df.write_parquet(tmp_path / "example.parquet") + ldf = pl.scan_parquet(tmp_path / "example.parquet") + assert_gpu_result_equal(ldf) + + +def test_rolling(ldf_datetime): + out = ldf_datetime.rolling(index_column="dt", period="2d").agg( + [ + pl.sum("a").alias("sum_a"), + pl.min("a").alias("min_a"), + pl.max("a").alias("max_a"), + ] + ) + assert_gpu_result_equal(out) + + +def test_groupby_rolling(ldf_datetime): + out = ldf_datetime.rolling(index_column="dt", period="2d", group_by="b").agg( + [ + pl.sum("a").alias("sum_a"), + pl.min("a").alias("min_a"), + pl.max("a").alias("max_a"), + ] + ) + assert_gpu_result_equal(out) + + +def test_rolling_expression(ldf_datetime): + out = ldf_datetime.with_columns( + sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + min_a=pl.min("a").rolling(index_column="dt", period="2d"), + max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ) + assert_gpu_result_equal(out) + + +def test_datetime_comparison(ldf_datetime): + out = ldf_datetime.filter( + pl.col("dt") > datetime.fromisoformat("2020-01-01 16:45:09") + ) + assert_gpu_result_equal(out) + + +@pytest.fixture +def null_data(): + return pl.DataFrame( + { + "a": [1, 2, None, 4, None], + } + ).lazy() + + +def test_drop_nulls(null_data): + result = null_data.drop_nulls() + assert_gpu_result_equal(result) + + +@pytest.mark.parametrize("how", ["inner", "left", "semi", "outer_coalesce"]) +def test_join(df: pl.DataFrame, how): + pl.set_random_seed(42) + # Sample eagerly since we haven't implemented it yet. 
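+    # (pl.set_random_seed above keeps the two draws reproducible, so
+    # sampling eagerly does not make this test flaky.)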
+ ldf1 = df.sample(n=50).lazy() + ldf2 = df.sample(n=50).lazy() + + out = ldf1.join(ldf2, on=["int_key1", "int_key2"], how=how) + assert_gpu_result_equal(out, check_row_order=False) + + +def test_sort(ldf): + for col in ldf.columns: + out = ldf.sort(by=col) + assert_gpu_result_equal(out) + + +def test_filter(ldf): + out = ldf.filter(pl.col("int_key1") > pl.col("int_key2")) + assert_gpu_result_equal(out) + + +@pytest.mark.parametrize( + "agg", + [ + "sum", + "min", + "max", + "mean", + # TODO: first/last get turned into slice of the Scan + "first", + "last", + "count", + "median", + ], +) +def test_agg(df, agg): + ldf = ( + df.cast( + {key: pl.Float64 for key in df.columns if ("int" in key or "float" in key)} + ) + .select(list(filter(lambda c: "str" not in c, df.columns))) + .lazy() + ) + out = getattr(ldf, agg)() + assert_gpu_result_equal(out, check_dtype=agg != "count", check_exact=False) + + +@pytest.mark.parametrize("keep", ["first", "last", "none"]) +@pytest.mark.parametrize("subset", [None, "keys"]) +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("maintain_order", [False, True]) +def test_unique(ldf: pl.LazyFrame, keep, subset, sort, maintain_order): + if subset is not None: + subset = list(filter(lambda c: "key" in c, ldf.columns)) + sort_by = subset + else: + sort_by = ldf.columns + if sort: + ldf = ldf.sort(*sort_by) + out = ldf.unique( + subset, + keep=keep, + maintain_order=maintain_order, + ) + assert_gpu_result_equal(out, check_row_order=maintain_order) + + +def test_selection(ldf: pl.LazyFrame): + k = pl.col("int_key1") + v = pl.col("int_val") + # groupby stops predicate pushdown + out = ldf.group_by(k).agg(v.sum()).filter(k * 2 > v) + assert_gpu_result_equal(out) + + +def test_concat_vertical(ldf): + out = pl.concat([ldf, ldf]) + assert_gpu_result_equal(out) + + +def test_concat_horizontal(ldf): + # Have to split the columns in two to avoid the same column names + left_columns = ldf.columns[: len(ldf.columns) // 2] + right_columns = ldf.columns[len(ldf.columns) // 2 :] + out = pl.concat( + [ldf.select(left_columns), ldf.select(right_columns)], how="horizontal" + ) + assert_gpu_result_equal(out) + + +def test_groupby(ldf): + out = ldf.group_by("int_key1").agg(pl.col("float_val").sum()) + assert_gpu_result_equal(out, check_row_order=False, check_exact=False) + + +def test_expr_function(ldf): + out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted( + pl.col("int_key1") + ) + # TODO: Fix the underlying dtype + assert_gpu_result_equal(out, check_dtype=False) + + +def test_filter_expr(ldf): + out = ldf.select(pl.col("int_key1").filter(pl.col("int_key2") > 4)) + assert_gpu_result_equal(out) + + +def test_gather_expr(ldf): + out = ldf.select(pl.col("int_key1").gather(pl.col("int_key2"))) + assert_gpu_result_equal(out) diff --git a/python/cudf_polars/tests/test_distinct.py b/python/cudf_polars/tests/test_distinct.py new file mode 100644 index 00000000000..e0fa089cee2 --- /dev/null +++ b/python/cudf_polars/tests/test_distinct.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("subset", [None, ["a"], ["a", "b"], ["b", "c"]]) +@pytest.mark.parametrize("keep", ["any", "none", "first", "last"]) +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +def test_distinct(subset, keep, maintain_order): + ldf = pl.DataFrame( + { + "a": [1, 2, 1, 3, 5, None, None], + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + "c": [True, True, True, True, False, False, True], + } + ).lazy() + + query = ldf.unique(subset=subset, keep=keep, maintain_order=maintain_order) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py new file mode 100644 index 00000000000..c5481d0ccbd --- /dev/null +++ b/python/cudf_polars/tests/test_extcontext.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_extcontext(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c")) + query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c")) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_filter.py b/python/cudf_polars/tests/test_filter.py new file mode 100644 index 00000000000..783403d764c --- /dev/null +++ b/python/cudf_polars/tests/test_filter.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_filter(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + # group-by is just to avoid the filter being pushed into the scan. + query = ldf.group_by(pl.col("a")).agg(pl.col("b").sum()).filter(pl.col("b") < 1) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_hconcat.py b/python/cudf_polars/tests/test_hconcat.py new file mode 100644 index 00000000000..46cbb21b25a --- /dev/null +++ b/python/cudf_polars/tests/test_hconcat.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_hconcat(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c")) + query = pl.concat([ldf, ldf2], how="horizontal") + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_hstack.py b/python/cudf_polars/tests/test_hstack.py new file mode 100644 index 00000000000..731c036bc88 --- /dev/null +++ b/python/cudf_polars/tests/test_hstack.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_hstack(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ldf.with_columns(pl.col("a") + pl.col("b")) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py new file mode 100644 index 00000000000..9432824a34c --- /dev/null +++ b/python/cudf_polars/tests/test_join.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "how", + [ + "inner", + "left", + pytest.param( + "outer", + marks=pytest.mark.xfail(reason="non-coalescing join not implemented"), + ), + "semi", + "anti", + pytest.param( + "cross", + marks=pytest.mark.xfail(reason="cross join not implemented"), + ), + "outer_coalesce", + ], +) +@pytest.mark.parametrize( + "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"] +) +@pytest.mark.parametrize( + "join_expr", + [ + pl.col("a"), + pytest.param( + pl.col("a") * 2, + marks=pytest.mark.xfail(reason="Taking key columns from wrong table"), + ), + pytest.param( + [pl.col("a"), pl.col("a") + 1], + marks=pytest.mark.xfail(reason="Taking key columns from wrong table"), + ), + ["c", "a"], + ], +) +def test_join(how, join_nulls, join_expr): + left = pl.DataFrame( + { + "a": [1, 2, 3, 1, None], + "b": [1, 2, 3, 4, 5], + "c": [2, 3, 4, 5, 6], + } + ).lazy() + right = pl.DataFrame( + { + "a": [1, 4, 3, 7, None, None], + "c": [2, 3, 4, 5, 6, 7], + } + ).lazy() + + query = left.join(right, on=join_expr, how=how, join_nulls=join_nulls) + assert_gpu_result_equal(query, check_row_order=False) diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py new file mode 100644 index 00000000000..6c918a89e33 --- /dev/null +++ b/python/cudf_polars/tests/test_slice.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "offset", + [0, 1, 2], +) +@pytest.mark.parametrize( + "len", + [0, 2, 12], +) +def test_slice(offset, len): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ( + ldf.group_by(pl.col("a")) + .agg(pl.col("b").sum()) + .sort(by=pl.col("a")) + .slice(offset, len) + ) + assert_gpu_result_equal(query, check_row_order=False) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py new file mode 100644 index 00000000000..ecc02efd967 --- /dev/null +++ b/python/cudf_polars/tests/test_sort.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "sort_keys", + [ + (pl.col("a"),), + pytest.param( + (pl.col("d").abs(),), + marks=pytest.mark.xfail(reason="abs not yet implemented"), + ), + (pl.col("a"), pl.col("d")), + (pl.col("b"),), + ], +) +@pytest.mark.parametrize("nulls_last", [False, True]) +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +def test_sort(sort_keys, nulls_last, maintain_order): + ldf = pl.DataFrame( + { + "a": [1, 2, 1, 3, 5, None, None], + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + "c": [True, True, True, True, False, False, True], + "d": [1, 2, -1, 10, 6, -1, -7], + } + ).lazy() + + query = ldf.sort( + *sort_keys, + descending=True, + nulls_last=nulls_last, + maintain_order=maintain_order, + ) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py new file mode 100644 index 00000000000..8a6e015e4db --- /dev/null +++ b/python/cudf_polars/tests/test_union.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.xfail(reason="Need handling of null scalars that are cast") +def test_union(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a")) + query = pl.concat([ldf, ldf2], how="diagonal") + # Plan for this produces a `None`.astype(Int64) which we don't + # handle correctly right now + assert_gpu_result_equal(query) From 233c1be771bdee9688fca30349b3debebd9cce5f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 10:02:54 +0000 Subject: [PATCH 21/56] All tests passing (or at least xfailing appropriately) --- .../cudf_polars/containers/column.py | 14 ++++++----- python/cudf_polars/cudf_polars/dsl/expr.py | 16 +++++++++++- python/cudf_polars/cudf_polars/dsl/ir.py | 16 ++++++++---- python/cudf_polars/tests/test_basic.py | 8 ++++-- python/cudf_polars/tests/test_join.py | 25 +++++++++---------- 5 files changed, 52 insertions(+), 27 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 73db1c34b48..7784febf2e8 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -87,9 +87,11 @@ def nan_count(self) -> int: if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): return 0 else: - return plc.reduce.reduce( - plc.unary.is_nan(self.obj), - plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), - ) + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) + ).as_py() diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 33266c6634a..cefe9922f64 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -538,7 
+538,7 @@ def __init__( # TODO: handle nans req = plc.aggregation.variance(ddof=options) elif name == "count": - req = plc.aggregation.count(null_policy=plc.types.NullPolicy.EXCLUDE) + req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) else: raise NotImplementedError self.request = req @@ -547,6 +547,8 @@ def __init__( op = partial(self._reduce, request=req) elif name in {"min", "max"}: op = partial(op, propagate_nans=options) + elif name == "count": + pass else: raise AssertionError self.op = op @@ -588,6 +590,18 @@ def _reduce( column.name, ) + def _count(self, column: Column) -> Column: + # TODO: dtype handling + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(column.obj.size() - column.obj.null_count()), + ), + 1, + ), + column.name, + ) + def _min(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: return Column( diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e2bc3b7bf44..9ac6cd6d51b 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,6 +15,7 @@ from __future__ import annotations +import itertools import types from dataclasses import dataclass from functools import cache @@ -263,8 +264,10 @@ def check_agg(agg: Expr) -> int: def __post_init__(self): """Check whether all the aggregations are implemented.""" - if self.maintain_order: + if self.options.rolling is None and self.maintain_order: raise NotImplementedError("Maintaining order in groupby") + if self.options.rolling: + raise NotImplementedError("rolling window/groupby") if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] @@ -395,7 +398,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how == "outer": - left.replace_columns( + left = left.replace_columns( *( Column( plc.replace.replace_nulls(left_col.obj, right_col.obj), @@ -407,7 +410,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) ) ) - right.discard_columns(right_on.column_names_set) + right = right.discard_columns(right_on.column_names_set) right = right.rename_columns( { name: f"{name}{suffix}" @@ -673,7 +676,7 @@ class Union(IR): def __post_init__(self): """Validated preconditions.""" schema = self.dfs[0].schema - if not all(s == schema for s in self.dfs[1:]): + if not all(s.schema == schema for s in self.dfs[1:]): raise ValueError("Schema mismatch") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @@ -692,7 +695,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] columns, scalars = zip(*((df.columns, df.scalars) for df in dfs)) - return DataFrame(columns, scalars) + return DataFrame( + list(itertools.chain.from_iterable(columns)), + list(itertools.chain.from_iterable(scalars)), + ) @dataclass(slots=True) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 094f1bc3490..c7123513cd2 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -73,6 +73,7 @@ def test_scan_parquet(tmp_path): assert_gpu_result_equal(ldf) +@pytest.mark.xfail(reason="Rolling window not yet implemented") def 
test_rolling(ldf_datetime): out = ldf_datetime.rolling(index_column="dt", period="2d").agg( [ @@ -84,6 +85,7 @@ def test_rolling(ldf_datetime): assert_gpu_result_equal(out) +@pytest.mark.xfail(reason="Grouped rolling window not yet implemented") def test_groupby_rolling(ldf_datetime): out = ldf_datetime.rolling(index_column="dt", period="2d", group_by="b").agg( [ @@ -95,6 +97,7 @@ def test_groupby_rolling(ldf_datetime): assert_gpu_result_equal(out) +@pytest.mark.xfail(reason="Rolling expression not yet implemented") def test_rolling_expression(ldf_datetime): out = ldf_datetime.with_columns( sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), @@ -120,6 +123,7 @@ def null_data(): ).lazy() +@pytest.mark.xfail(reason="Boolean function not yet implemented") def test_drop_nulls(null_data): result = null_data.drop_nulls() assert_gpu_result_equal(result) @@ -221,12 +225,12 @@ def test_groupby(ldf): assert_gpu_result_equal(out, check_row_order=False, check_exact=False) +@pytest.mark.xfail(reason="arg_where not yet implemented") def test_expr_function(ldf): out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted( pl.col("int_key1") ) - # TODO: Fix the underlying dtype - assert_gpu_result_equal(out, check_dtype=False) + assert_gpu_result_equal(out) def test_filter_expr(ldf): diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 9432824a34c..9ba513023da 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -14,10 +14,7 @@ [ "inner", "left", - pytest.param( - "outer", - marks=pytest.mark.xfail(reason="non-coalescing join not implemented"), - ), + "outer", "semi", "anti", pytest.param( @@ -34,18 +31,20 @@ "join_expr", [ pl.col("a"), - pytest.param( - pl.col("a") * 2, - marks=pytest.mark.xfail(reason="Taking key columns from wrong table"), - ), - pytest.param( - [pl.col("a"), pl.col("a") + 1], - marks=pytest.mark.xfail(reason="Taking key columns from wrong table"), - ), + pl.col("a") * 2, + [pl.col("a"), pl.col("a") + 1], ["c", "a"], ], ) -def test_join(how, join_nulls, join_expr): +def test_join(request, how, join_nulls, join_expr): + request.applymarker( + pytest.mark.xfail( + how == "outer_coalesce" + and isinstance(join_expr, list) + and not isinstance(join_expr[0], str), + reason="https://github.com/pola-rs/polars/issues/16289", + ) + ) left = pl.DataFrame( { "a": [1, 2, 3, 1, None], From 3a3ad2db76c0d8f21f84e6f462b54e0a86d9d630 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 09:54:00 +0000 Subject: [PATCH 22/56] Handle string functions and boolean functions and add some docs --- python/cudf_polars/cudf_polars/dsl/expr.py | 172 +++++++++++++++--- python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- .../cudf_polars/cudf_polars/dsl/translate.py | 12 +- 3 files changed, 151 insertions(+), 35 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index cefe9922f64..3e7fc4bffc8 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -40,6 +40,7 @@ "Literal", "Col", "BooleanFunction", + "StringFunction", "Sort", "SortBy", "Gather", @@ -63,17 +64,30 @@ class AggInfo(NamedTuple): class Expr: + """ + An abstract expression object. + + This contains a (potentially empty) tuple of child expressions, + along with non-child data. 
For uniform reconstruction and + implementation of hashing and equality schemes, child classes need + to provide a certain amount of metadata when they are defined. + Specifically, the ``_non_child`` attribute must list, in-order, + the names of the slots that are passed to the constructor. The + constructor must take arguments in the order ``(*_non_child, + *children).`` + """ + __slots__ = ("dtype", "hash_value", "repr_value") - #: Data type of the expression dtype: plc.DataType - #: caching slot for the hash of the expression + """Data type of the expression.""" hash_value: int - #: caching slot for repr of the expression + """Caching slot for the hash of the expression.""" repr_value: str - #: Children of the expression + """Caching slot for repr of the expression.""" children: tuple[Expr, ...] = () - #: Names of non-child data (not Exprs) for reconstruction + """Children of the expression.""" _non_child: ClassVar[tuple[str, ...]] = ("dtype",) + """Names of non-child data (not Exprs) for reconstruction.""" # Constructor must take arguments in order (*_non_child, *children) def __init__(self, dtype: plc.DataType) -> None: @@ -151,18 +165,61 @@ def evaluate( context: ExecutionContext = ExecutionContext.FRAME, mapping: dict[Expr, Column] | None = None, ) -> Column: - """Evaluate this expression given a dataframe for context.""" + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Returns + ------- + Column representing the evaluation of the expression (or maybe + a scalar, annoying!). + + Raises + ------ + NotImplementedError if we couldn't evaluate the expression. + Ideally all these are returned during translation to the IR, + but for now we are not perfect. + """ raise NotImplementedError def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" + """ + Collect information about aggregations in groupbys. + + Parameters + ---------- + depth + The depth of aggregating (reduction or sampling) + expressions we are currently at. + + Returns + ------- + Aggregation info describing the expression to aggregate in the + groupby. + + Raises + ------ + NotImplementedError if we can't currently perform the + aggregation request (for example nested aggregations like + ``a.max().min()``). 
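+
+        Notes
+        -----
+        ``GroupBy.__post_init__`` calls this method with ``depth=0`` for
+        each aggregation request, so unsupported aggregations are
+        rejected while translating the plan rather than mid-evaluation.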
+ """ raise NotImplementedError def with_mapping(fn): """Decorate a callback that takes an expression mapping to use it.""" - def look( + def _( self, df: DataFrame, *, @@ -178,7 +235,7 @@ def look( except KeyError: return fn(self, df, context=context, mapping=mapping) - return look + return _ class NamedExpr(Expr): @@ -293,6 +350,61 @@ def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr self.name = name self.children = tuple(children) + @with_mapping + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + if self.name == pl_expr.BooleanFunction.IsNull: + return Column(plc.unary.is_null(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNotNull: + return Column(plc.unary.is_valid(column.obj), column.name) + else: + raise NotImplementedError(f"BooleanFunction {self.name}") + + +class StringFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.StringFunction, + options: Any, + *children: Expr, + ): + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + + @with_mapping + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + if self.name == pl_expr.StringFunction.Lowercase: + return Column(plc.strings.case.to_lower(column.obj), column.name) + elif self.name == pl_expr.StringFunction.Uppercase: + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + return Column(plc.strings.case.to_upper(column.obj), column.name) + else: + raise NotImplementedError(f"StringFunction {self.name}") + class Sort(Expr): __slots__ = ("options", "children") @@ -669,27 +781,27 @@ def __init__( self.op = op self.children = (left, right) - _MAPPING: ClassVar[dict[pl_expr.PyOperator, plc.binaryop.BinaryOperator]] = { - pl_expr.PyOperator.Eq: plc.binaryop.BinaryOperator.EQUAL, - pl_expr.PyOperator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, - pl_expr.PyOperator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, - pl_expr.PyOperator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, - pl_expr.PyOperator.Lt: plc.binaryop.BinaryOperator.LESS, - pl_expr.PyOperator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, - pl_expr.PyOperator.Gt: plc.binaryop.BinaryOperator.GREATER, - pl_expr.PyOperator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, - pl_expr.PyOperator.Plus: plc.binaryop.BinaryOperator.ADD, - pl_expr.PyOperator.Minus: plc.binaryop.BinaryOperator.SUB, - pl_expr.PyOperator.Multiply: plc.binaryop.BinaryOperator.MUL, - pl_expr.PyOperator.Divide: plc.binaryop.BinaryOperator.DIV, - pl_expr.PyOperator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, - pl_expr.PyOperator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, - pl_expr.PyOperator.Modulus: plc.binaryop.BinaryOperator.PYMOD, - pl_expr.PyOperator.And: plc.binaryop.BinaryOperator.BITWISE_AND, - pl_expr.PyOperator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, - pl_expr.PyOperator.Xor: 
plc.binaryop.BinaryOperator.BITWISE_XOR, - pl_expr.PyOperator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, - pl_expr.PyOperator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, } @with_mapping diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9ac6cd6d51b..37eddb9b408 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -309,7 +309,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame([*result_keys, *results], []) + return DataFrame([*result_keys, *results], []).slice(self.options.slice) @dataclass(slots=True) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 37fb599c35d..430534bf6bd 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -174,9 +174,6 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: ) -BOOLEAN_FUNCTIONS: frozenset[str] = frozenset() - - def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. 
@@ -206,7 +203,14 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: dtype = dtypes.from_polars(visitor.get_dtype(n)) if isinstance(node, pl_expr.Function): name, *options = node.function_data - if name in BOOLEAN_FUNCTIONS: + if isinstance(name, pl_expr.StringFunction): + return expr.StringFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, pl_expr.BooleanFunction): return expr.BooleanFunction( dtype, name, From dd6efaafe9d42efdb04125b434ea0ee636bbfe88 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 12:05:22 +0000 Subject: [PATCH 23/56] Flesh out more boolean functions --- python/cudf_polars/cudf_polars/dsl/expr.py | 131 ++++++++++++++++++++- 1 file changed, 128 insertions(+), 3 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 3e7fc4bffc8..f82ca78fb4f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -17,7 +17,7 @@ import enum from enum import IntEnum -from functools import partial +from functools import partial, reduce from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa @@ -350,6 +350,47 @@ def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr self.name = name self.children = tuple(children) + def __post_init__(self): + """Validate preconditions.""" + if ( + self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) + and not self.options[0] + ): + # With ignore_nulls == False, polars uses Kleene logic + raise NotImplementedError(f"Kleene logic for {self.name}") + if self.name in ( + pl_expr.BooleanFunction.IsFinite, + pl_expr.BooleanFunction.IsInfinite, + pl_expr.BooleanFunction.IsBetween, + pl_expr.BooleanFunction.IsIn, + ): + raise NotImplementedError(f"{self.name}") + + @staticmethod + def _distinct( + column: Column, + *, + keep: plc.stream_compaction.DuplicateKeepOption, + source_value: plc.Scalar, + target_value: plc.Scalar, + ) -> Column: + table = plc.Table([column.obj]) + indices = plc.stream_compaction.distinct_indices( + table, + keep, + # TODO: polars doesn't expose options for these + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + return Column( + plc.copying.scatter( + [source_value], + indices, + plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), + ).columns()[0], + column.name, + ) + @with_mapping def evaluate( self, @@ -359,12 +400,96 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.BooleanFunction.Any: + (column,) = columns + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 + ) + elif self.name == pl_expr.BooleanFunction.All: + (column,) = columns + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 + ) if self.name == pl_expr.BooleanFunction.IsNull: + (column,) = columns return Column(plc.unary.is_null(column.obj), column.name) elif self.name == pl_expr.BooleanFunction.IsNotNull: + (column,) = columns return Column(plc.unary.is_valid(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNan: + # 
TODO: copy over null mask since is_nan(null) => null in polars + (column,) = columns + return Column(plc.unary.is_nan(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNotNan: + # TODO: copy over null mask since is_not_nan(null) => null in polars + (column,) = columns + return Column(plc.unary.is_not_nan(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsUnique: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsDuplicated: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + ) + elif self.name == pl_expr.AllHorizontal: + name = columns[0].name + if any(c.obj.null_count() > 0 for c in columns): + raise NotImplementedError("Kleene logic for all_horizontal") + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.BITWISE_AND, + output_type=self.dtype, + ), + (c.obj for c in columns), + ), + name, + ) + elif self.name == pl_expr.AnyHorizontal: + name = columns[0].name + if any(c.obj.null_count() > 0 for c in columns): + raise NotImplementedError("Kleene logic for any_horizontal") + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.BITWISE_OR, + output_type=self.dtype, + ), + (c.obj for c in columns), + ), + name, + ) else: raise NotImplementedError(f"BooleanFunction {self.name}") From e279a2f2cb5ce64b1eeb7d81217d14443f70eb89 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 12:39:01 +0000 Subject: [PATCH 24/56] More fixes --- python/cudf_polars/cudf_polars/dsl/expr.py | 53 +++++++++++++++++----- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index f82ca78fb4f..3773bba8632 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -348,10 +348,7 @@ def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr super().__init__(dtype) self.options = options self.name = name - self.children = tuple(children) - - def __post_init__(self): - """Validate preconditions.""" + self.children = children if ( self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) and not self.options[0] @@ -361,7 +358,6 @@ def __post_init__(self): if self.name in ( pl_expr.BooleanFunction.IsFinite, pl_expr.BooleanFunction.IsInfinite, - pl_expr.BooleanFunction.IsBetween, 
pl_expr.BooleanFunction.IsIn, ): raise NotImplementedError(f"{self.name}") @@ -460,7 +456,7 @@ def evaluate( source_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 target_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 ) - elif self.name == pl_expr.AllHorizontal: + elif self.name == pl_expr.BooleanFunction.AllHorizontal: name = columns[0].name if any(c.obj.null_count() > 0 for c in columns): raise NotImplementedError("Kleene logic for all_horizontal") @@ -475,7 +471,7 @@ def evaluate( ), name, ) - elif self.name == pl_expr.AnyHorizontal: + elif self.name == pl_expr.BooleanFunction.AnyHorizontal: name = columns[0].name if any(c.obj.null_count() > 0 for c in columns): raise NotImplementedError("Kleene logic for any_horizontal") @@ -490,6 +486,34 @@ def evaluate( ), name, ) + elif self.name == pl_expr.BooleanFunction.IsBetween: + column, lo, hi = columns + closed = self.options + if closed == pl_expr.ClosedInterval.None_: + left = plc.binaryop.BinaryOperator.GREATER + right = plc.binaryop.BinaryOperator.LESS + elif closed == pl_expr.ClosedInterval.Left: + left = plc.binaryop.BinaryOperator.GREATER_EQUAL + right = plc.binaryop.BinaryOperator.LESS + elif closed == pl_expr.ClosedInterval.Right: + left = plc.binaryop.BinaryOperator.GREATER + right = plc.binaryop.BinaryOperator.LESS_EQUAL + else: + left = plc.binaryop.BinaryOperator.GREATER_EQUAL + right = plc.binaryop.BinaryOperator.LESS_EQUAL + return Column( + plc.binaryop.binary_operation( + plc.binaryop.binary_operation( + column.obj, lo.obj, left, output_type=self.dtype + ), + plc.binaryop.binary_operation( + column.obj, hi.obj, right, output_type=self.dtype + ), + plc.binaryop.BinaryOperator.LOGICAL_AND, + self.dtype, + ), + column.name, + ) else: raise NotImplementedError(f"BooleanFunction {self.name}") @@ -519,14 +543,21 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns return Column(plc.strings.case.to_lower(column.obj), column.name) elif self.name == pl_expr.StringFunction.Uppercase: - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) + (column,) = columns return Column(plc.strings.case.to_upper(column.obj), column.name) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column( + plc.strings.find.ends_with(column.obj, suffix.obj), column.name + ) else: raise NotImplementedError(f"StringFunction {self.name}") From bdd6ee38727b86b0a9fd821954f985037b9f40b4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 12:44:40 +0000 Subject: [PATCH 25/56] Simplify --- python/cudf_polars/cudf_polars/dsl/expr.py | 42 ++++++++++++++-------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 3773bba8632..819582b98f9 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -387,6 +387,30 @@ def _distinct( column.name, ) + _BETWEEN_OPS: ClassVar[ + dict[ + pl_expr.ClosedInterval, + tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], + ] + ] = { + pl_expr.ClosedInterval.None_: ( + plc.binaryop.BinaryOperator.GREATER, + 
plc.binaryop.BinaryOperator.LESS, + ), + pl_expr.ClosedInterval.Left: ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS, + ), + pl_expr.ClosedInterval.Right: ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + pl_expr.ClosedInterval.Both: ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + } + @with_mapping def evaluate( self, @@ -488,26 +512,14 @@ def evaluate( ) elif self.name == pl_expr.BooleanFunction.IsBetween: column, lo, hi = columns - closed = self.options - if closed == pl_expr.ClosedInterval.None_: - left = plc.binaryop.BinaryOperator.GREATER - right = plc.binaryop.BinaryOperator.LESS - elif closed == pl_expr.ClosedInterval.Left: - left = plc.binaryop.BinaryOperator.GREATER_EQUAL - right = plc.binaryop.BinaryOperator.LESS - elif closed == pl_expr.ClosedInterval.Right: - left = plc.binaryop.BinaryOperator.GREATER - right = plc.binaryop.BinaryOperator.LESS_EQUAL - else: - left = plc.binaryop.BinaryOperator.GREATER_EQUAL - right = plc.binaryop.BinaryOperator.LESS_EQUAL + lop, rop = self._BETWEEN_OPS[self.options] return Column( plc.binaryop.binary_operation( plc.binaryop.binary_operation( - column.obj, lo.obj, left, output_type=self.dtype + column.obj, lo.obj, lop, output_type=self.dtype ), plc.binaryop.binary_operation( - column.obj, hi.obj, right, output_type=self.dtype + column.obj, hi.obj, rop, output_type=self.dtype ), plc.binaryop.BinaryOperator.LOGICAL_AND, self.dtype, From c06b980fe0c906b5b1cc94e189d6f9f6533c87a0 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 14:47:23 +0000 Subject: [PATCH 26/56] More fixes --- .../cudf_polars/containers/dataframe.py | 4 +++- python/cudf_polars/cudf_polars/dsl/expr.py | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index e5dd757690a..dba4c9f6c2c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -121,7 +121,9 @@ def discard_columns(self, names: Set[str]) -> Self: def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" want = set(names) - return type(self)([c for c in self.columns if c.name in want], self.scalars) + if not want.issubset(self.column_names_set): + raise ValueError("Can't select missing names") + return type(self)([self._column_map[name] for name in names], self.scalars) def replace_columns(self, *columns: Column) -> Self: """Return a new dataframe with columns replaced by name.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 819582b98f9..7ff4a359940 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -512,7 +512,8 @@ def evaluate( ) elif self.name == pl_expr.BooleanFunction.IsBetween: column, lo, hi = columns - lop, rop = self._BETWEEN_OPS[self.options] + (closed,) = self.options + lop, rop = self._BETWEEN_OPS[closed] return Column( plc.binaryop.binary_operation( plc.binaryop.binary_operation( @@ -545,6 +546,13 @@ def __init__( self.options = options self.name = name self.children = children + if self.name not in ( + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Uppercase, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.StartsWith, + ): + raise NotImplementedError(f"String function 
{self.name}") @with_mapping def evaluate( @@ -570,6 +578,11 @@ def evaluate( return Column( plc.strings.find.ends_with(column.obj, suffix.obj), column.name ) + elif self.name == pl_expr.StringFunction.StartsWith: + column, suffix = columns + return Column( + plc.strings.find.starts_with(column.obj, suffix.obj), column.name + ) else: raise NotImplementedError(f"StringFunction {self.name}") From 3b17c719f05b9479c6536c3bd9c64fb7a7d3914a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 15:12:18 +0000 Subject: [PATCH 27/56] xfail strict in cudf_polars tests --- python/cudf_polars/pyproject.toml | 3 +++ python/cudf_polars/tests/test_basic.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f5d29202961..3619e32e140 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -51,6 +51,9 @@ version = {file = "cudf_polars/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] +[tool.pytest.ini_options] +xfail_strict = true + [tool.ruff] line-length = 88 indent-width = 4 diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index c7123513cd2..2b16dac8d84 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -123,7 +123,6 @@ def null_data(): ).lazy() -@pytest.mark.xfail(reason="Boolean function not yet implemented") def test_drop_nulls(null_data): result = null_data.drop_nulls() assert_gpu_result_equal(result) From 19db751fe491fe58d3b0a7e03162a73923808158 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 15:37:30 +0000 Subject: [PATCH 28/56] Overview doc, simplify callback --- python/cudf_polars/cudf_polars/callback.py | 14 +- .../cudf_polars/cudf_polars/dsl/translate.py | 5 +- python/cudf_polars/docs/overview.md | 174 ++++++++++++++++++ 3 files changed, 181 insertions(+), 12 deletions(-) create mode 100644 python/cudf_polars/docs/overview.md diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index ed473e0ad0e..aabb8498ce2 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -29,12 +29,8 @@ def _callback( assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - try: - with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): - return ir.evaluate(cache={}).to_polars() - except Exception as e: - print("Unable to evaluate", e) - raise + with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + return ir.evaluate(cache={}).to_polars() def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: @@ -54,11 +50,7 @@ def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - callback = partial(_callback, translate_ir(nt)) + nt.set_udf(partial(_callback, translate_ir(nt))) except NotImplementedError: if raise_on_fail: raise - return - - nt.set_udf(callback) - return diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 430534bf6bd..17518f62806 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -21,7 +21,7 @@ class set_node(AbstractContextManager): __slots__ = ("n", "visitor") - def __init__(self, visitor, n): + def __init__(self, visitor, n: int): self.visitor = visitor self.n = n @@ -94,6 +94,9 
@@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: else None, ) elif isinstance(node, pl_ir.Select): + # We translate the expressions (which are executed with + # reference to the input node) with the input node active + # so that dtype resolution works correctly. with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md new file mode 100644 index 00000000000..c07b1592130 --- /dev/null +++ b/python/cudf_polars/docs/overview.md @@ -0,0 +1,174 @@ +# Getting started + +You will need: + +1. Rust development environment. If you use the rapids [combined + devcontainer](https://github.com/rapidsai/devcontainers/), add + `"./features/src/rust": {"version": "latest", "profile": "default"},` to your + preferred configuration. Or else, use + [rustup](https://www.rust-lang.org/tools/install) +2. A [cudf development + environment](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment). + The combined devcontainer works, or whatever your favourite approach is. + +> ![NOTE] These instructions will get simpler as we merge code in. + +## Installing polars + +We will need to build polars from source. Until things settle down, +live at `HEAD`. + +```sh +git clone https://github.com/pola-rs/polars +cd polars +``` + +We will install build dependencies in the same environment that we created for +building cudf. Note that polars offers a `make build` command that sets up a +separate virtual environment, but we don't want to do that right now. So in the +polars clone: + +```sh +# cudf environment (conda or pip) is active +pip install --upgrade uv +uv pip install --upgrade -r py-polars/requirements-dev.txt +``` + +Now we have the necessary machinery to build polars +```sh +cd py-polars +# build in debug mode, best option for development/debugging +maturin develop -m Cargo.toml +``` + +For benchmarking purposes we should build in release mode +```sh +RUSTFLAGS='-C target-cpu=native' maturin develop -m Cargo.toml --release +``` + +After any update of the polars code, we need to rerun the `maturin` build +command. + +## Installing the cudf polars executor + +The executor for the polars logical plan lives in the cudf repo, in +`python/cudf_polars`. Build cudf as normal and then install the +`cudf_polars` package in editable mode: + +```sh +cd cudf/python/cudf_polars +pip install --no-deps -e . +``` + +You should now be able to run the tests in the `cudf_polars` package: +```sh +pytest -v tests +``` + +# Executor design + +The polars `LazyFrame.collect` functionality offers a +"post-optimization" callback that may be used by a third party library +to replace a (or more, though we only replace a single node) in the +optimized logical plan with a Python callback that is to deliver the +result of evaluating the plan. This splits the execution of the plan +into two phases. First, a symbolic phase which translates to our +internal representation (IR). Second, an execution phase which executes +using our IR. + +The translation phase receives the a low-level Rust `NodeTraverse` +object which delivers Python representations of the plan nodes (and +expressions) one at a time. During translation, we endeavour to raise +`NotImplementedError` for any unsupported functionality. 
This way, if +we can't execute something, we just don't modify the logical plan at +all: if we can translate the IR, it is assumed that evaluation will +later succeed. + +The usage of the cudf-based executor is therefore, at present: + +```python +from cudf_polars.callback import execute_with_cudf + +result = q.collect(post_opt_callback=execute_with_cudf) +``` + +This should either transparently run on the GPU and deliver a polars +dataframe, or else fail (but be handled) and just run the normal CPU +execution. + +## Adding a handler for a new plan node + +Plan node definitions live in `cudf_polars/dsl/ir.py`, these are +`dataclasses` that inherit from the base `IR` node. The evaluation of +a plan node is done by implementing the `evaluate` method. + +To translate the plan node, add a case handler in `translate_ir` which +lives in `cudf_polars/dsl/translate.py`. + +As well as child nodes that are plans, most plan nodes contain child +expressions, which should be transformed using the input to the plan as a +context. The translation of expressions is handled via +`translate_expr` in `cudf_poalrs/dsl/translate.py`. So that data-type +resolution is performed correctly any expression should be translated +with the correct plan node "active" in the visitor. For example, when +translating a `Join` node, the left keys (expressions) should be +translated with the left input active (and right keys with right +input). To facilitate this, use the `set_node` context manager. + +## Adding a handler for a new expression node + +Adding a handle for an expression node is very similar to a plan node. +Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit +from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it +is simpler for us to implement efficient hashing, repr, and equality if we +can write that ourselves. + +Every expression consists of two types of data: +1. child data (other `Expr`s) +2. non-child data (anything other than an `Expr`) +The generic implementations of special methods in the base `Expr` base +class require that the subclasses advertise which arguments to the +constructor are non-child in a `_non_child` class slot. The +constructor should then take arguments: +```python +def __init__(self, *non_child_data: Any, *children: Expr): +``` +Read the docstrings in the `Expr` class for more details. In +particular, one needs to be careful to ensure that an `Expr` hashes +correctly. + +Expressions are evaluated by implementing an `evaluate` method, this +takes a `DataFrame` as context (this provides columns), along with an +`ExecutionContext` parameter (indicating what context we're evaluating +this expression in, currently unused), and a `mapping` from +expressions to evaluated `Column`s: this enables a simple form of +expression rewriting during evaluation of expressions that is used in +evaluation of groupby-aggregations. To reduce boilerplate for lookup +in the mappings dictionary use the `@with_mapping` decorator. + +To simplify state tracking, all columns should be considered immutable +on construction. This matches the "functional" description coming from +the logical plan in any case, so is reasonably natural. + +# Containers + +Containers should be constructed as relatively lightweight objects +around their pylibcudf counterparts. We have three (in +`cudf_polars/containers/`): + +1. Scalar (a wrapper around a pylibcudf Scalar) +2. Column (a wrapper around a pylibcudf Column) +3. 
DataFrame (a wrapper around a pylibcudf Table) + +The interfaces offered by these are somewhat in flux, but broadly +speaking, a `DataFrame` is just a list of `Column`s which each hold +data plus a string `name`, along with a collection of `Scalar`s (this +might go away). + +The columns keep track of metadata (for example, whether or not they +are sorted). + +We offer some utility methods for transferring metadata when +constructing new dataframes and columns, both `DataFrame` and `Column` +offer a `with_metadata(*, like: Self)` call which copies metadata from +the template. From 146327cc0f51a51ac949d5dcdcef059b336de1a1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 16:14:35 +0000 Subject: [PATCH 29/56] Docstrings for plan nodes. --- python/cudf_polars/cudf_polars/dsl/ir.py | 141 ++++++++++++++++++++++- 1 file changed, 140 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 37eddb9b408..71e2ab7941c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -63,25 +63,64 @@ @dataclass(slots=True) class IR: + """Abstract plan node, representing an unevaluated dataframe.""" + schema: dict[str, plc.DataType] + """Mapping from column names to their data types.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: - """Evaluate and return a dataframe.""" + """ + Evaluate the node and return a dataframe. + + Parameters + ---------- + cache + Mapping from cached node ids to constructed DataFrames. + Used to implement evaluation of the `Cache` node. + + Returns + ------- + DataFrame (on device) representing the evaluation of this plan + node. + + Raises + ------ + NotImplementedError if we couldn't evaluate things. Ideally + this should not occur, since the translation phase should pick + up things that we cannot handle. + """ raise NotImplementedError @dataclass(slots=True) class PythonScan(IR): + """Representation of input from a python function.""" + options: Any + """Arbitrary options.""" predicate: Expr | None + """Filter to apply to the constructed dataframe before returning it.""" @dataclass(slots=True) class Scan(IR): + """Input from files.""" + typ: Any + """What type of file are we reading? Parquet, CSV, etc...""" paths: list[str] + """List of paths to read from.""" file_options: Any + """Options for reading the file. + + Attributes are: + - ``with_columns: list[str]`` of projected columns to return. + - ``n_rows: int``: Number of rows to read. + - ``row_index: tuple[name, offset] | None``: Add an integer index + column with given name. + """ predicate: Expr | None + """Mask to apply to the read dataframe.""" def __post_init__(self): """Validate preconditions.""" @@ -138,8 +177,16 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Cache(IR): + """ + Return a cached plan node. + + Used for CSE at the plan level. + """ + key: int + """The cache key.""" value: IR + """The unevaluated node to cache.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -151,9 +198,18 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class DataFrameScan(IR): + """ + Input from an existing polars DataFrame. 
+ + This typically arises from ``q.collect().lazy()`` + """ + df: Any + """Polars LazyFrame object.""" projection: list[str] + """List of columns to project out.""" predicate: Expr | None + """Mask to apply.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -183,9 +239,18 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Select(IR): + """Produce a new dataframe selecting given expressions from an input.""" + df: IR + """Input dataframe.""" cse: list[Expr] + """ + List of common subexpressions that will appear in the selected expressions. + + These must be evaluated before the returned expressions. + """ expr: list[Expr] + """List of expressions to evaluate to form the new dataframe.""" def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" @@ -227,11 +292,18 @@ def placeholder_column(n: int): @dataclass(slots=False) class GroupBy(IR): + """Perform a groupby.""" + df: IR + """Input dataframe.""" agg_requests: list[Expr] + """List of expressions to evaluate groupwise.""" keys: list[Expr] + """List of expressions forming the keys.""" maintain_order: bool + """Should the order of the input dataframe be maintained?""" options: Any + """Options controlling style of groupby.""" @staticmethod def check_agg(agg: Expr) -> int: @@ -314,11 +386,25 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Join(IR): + """A join of two dataframes.""" + left: IR + """Left frame.""" right: IR + """Right frame.""" left_on: list[Expr] + """List of expressions used as keys in the left frame.""" right_on: list[Expr] + """List of expressions used as keys in the right frame.""" options: Any + """ + tuple of options: + - how: join type + - join_nulls: do nulls compare equal? + - slice: optional slice to perform after joining. 
+ - suffix: string suffix for right columns if names match + - coalesce: should key columns be coalesced (only makes sense for outer joins) + """ def __post_init__(self): """Validate preconditions.""" @@ -424,8 +510,12 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class HStack(IR): + """Add new columns to a dataframe.""" + df: IR + """Input dataframe.""" columns: list[Expr] + """List of expressions to produce new columns.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -435,11 +525,18 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Distinct(IR): + """Produce a new dataframe with distinct rows.""" + df: IR + """Input dataframe.""" keep: plc.stream_compaction.DuplicateKeepOption + """Which rows to keep.""" subset: set[str] | None + """Which columns to inspect when computing distinct rows.""" zlice: tuple[int, int] | None + """Optional slice to perform after compaction.""" stable: bool + """Should order be preserved?""" _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, @@ -495,12 +592,20 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Sort(IR): + """Sort a dataframe.""" + df: IR + """Input.""" by: list[Expr] + """List of expressions to produce sort keys.""" do_sort: Callable[..., plc.Table] + """pylibcudf sorting function.""" zlice: tuple[int, int] | None + """Optional slice to apply after sorting.""" order: list[plc.types.Order] + """Order keys should be sorted in.""" null_order: list[plc.types.NullOrder] + """Where nulls sort to.""" def __init__( self, @@ -551,9 +656,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Slice(IR): + """Slice a dataframe.""" + df: IR + """Input.""" offset: int + """Start of the slice.""" length: int + """Length of the slice.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -563,8 +673,12 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Filter(IR): + """Filter a dataframe with a boolean mask.""" + df: IR + """Input.""" mask: Expr + """Expression evaluating to a mask.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -574,7 +688,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Projection(IR): + """Select a subset of columns from a dataframe.""" + df: IR + """Input.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -585,9 +702,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class MapFunction(IR): + """Apply some function to a dataframe.""" + df: IR + """Input.""" name: str + """Function name.""" options: Any + """Arbitrary options, interpreted per function.""" _NAMES: ClassVar[frozenset[str]] = frozenset( [ @@ -670,8 +792,12 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Union(IR): + """Concatenate dataframes vertically.""" + dfs: list[IR] + """List of inputs.""" zlice: tuple[int, int] | None + """Optional slice to apply after concatenation.""" def __post_init__(self): """Validated preconditions.""" @@ -681,6 +807,7 @@ def __post_init__(self): def 
evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" + # TODO: only evaluate what we need if we have a slice dfs = [df.evaluate(cache=cache) for df in self.dfs] return DataFrame.from_table( plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names @@ -689,7 +816,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class HConcat(IR): + """Concatenate dataframes horizontally.""" + dfs: list[IR] + """List of inputs.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -703,8 +833,17 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class ExtContext(IR): + """ + Concatenate dataframes horizontally. + + This is similar to HConcat, but is used only to temporarily + introduce new dataframes into an expression context. + """ + df: IR + """Input.""" extra: list[IR] + """List of extra inputs.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" From e81a1e125b0771cf4324732561d6c9df074140de Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 16:25:00 +0000 Subject: [PATCH 30/56] ClosedInterval will be a string --- python/cudf_polars/cudf_polars/dsl/expr.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 7ff4a359940..2fc16be8f6b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -32,6 +32,8 @@ if TYPE_CHECKING: from collections.abc import Sequence + import polars.type_aliases as pl_types + from cudf_polars.containers import DataFrame __all__ = [ @@ -389,23 +391,23 @@ def _distinct( _BETWEEN_OPS: ClassVar[ dict[ - pl_expr.ClosedInterval, + pl_types.ClosedInterval, tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], ] ] = { - pl_expr.ClosedInterval.None_: ( + "none": ( plc.binaryop.BinaryOperator.GREATER, plc.binaryop.BinaryOperator.LESS, ), - pl_expr.ClosedInterval.Left: ( + "left": ( plc.binaryop.BinaryOperator.GREATER_EQUAL, plc.binaryop.BinaryOperator.LESS, ), - pl_expr.ClosedInterval.Right: ( + "right": ( plc.binaryop.BinaryOperator.GREATER, plc.binaryop.BinaryOperator.LESS_EQUAL, ), - pl_expr.ClosedInterval.Both: ( + "both": ( plc.binaryop.BinaryOperator.GREATER_EQUAL, plc.binaryop.BinaryOperator.LESS_EQUAL, ), From 98281e8f5958ec55bd56cd8d8af87016dbbf19d7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 May 2024 12:06:46 +0100 Subject: [PATCH 31/56] Small fixes from code review Co-authored-by: Vyas Ramasubramani --- python/cudf_polars/cudf_polars/utils/dtypes.py | 1 - python/cudf_polars/docs/overview.md | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index f3303fbbce2..911c391c063 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -88,5 +88,4 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: elif isinstance(dtype, pl.List): return plc.DataType(plc.TypeId.LIST) else: - breakpoint() raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index c07b1592130..e2562959141 100644 --- a/python/cudf_polars/docs/overview.md +++ 
b/python/cudf_polars/docs/overview.md @@ -69,7 +69,7 @@ pytest -v tests The polars `LazyFrame.collect` functionality offers a "post-optimization" callback that may be used by a third party library -to replace a (or more, though we only replace a single node) in the +to replace a node (or more, though we only replace a single node) in the optimized logical plan with a Python callback that is to deliver the result of evaluating the plan. This splits the execution of the plan into two phases. First, a symbolic phase which translates to our @@ -108,7 +108,7 @@ lives in `cudf_polars/dsl/translate.py`. As well as child nodes that are plans, most plan nodes contain child expressions, which should be transformed using the input to the plan as a context. The translation of expressions is handled via -`translate_expr` in `cudf_poalrs/dsl/translate.py`. So that data-type +`translate_expr` in `cudf_polars/dsl/translate.py`. So that data-type resolution is performed correctly any expression should be translated with the correct plan node "active" in the visitor. For example, when translating a `Join` node, the left keys (expressions) should be @@ -137,13 +137,13 @@ Read the docstrings in the `Expr` class for more details. In particular, one needs to be careful to ensure that an `Expr` hashes correctly. -Expressions are evaluated by implementing an `evaluate` method, this -takes a `DataFrame` as context (this provides columns), along with an +Expressions are evaluated by implementing an `evaluate` method that +takes a `DataFrame` as context (this provides columns) along with an `ExecutionContext` parameter (indicating what context we're evaluating -this expression in, currently unused), and a `mapping` from -expressions to evaluated `Column`s: this enables a simple form of +this expression in, currently unused) and a `mapping` from +expressions to evaluated `Column`s. This approach enables a simple form of expression rewriting during evaluation of expressions that is used in -evaluation of groupby-aggregations. To reduce boilerplate for lookup +evaluation of, for example, groupby-aggregations. To reduce boilerplate for lookup in the mappings dictionary use the `@with_mapping` decorator. 
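+
+For example (an illustrative sketch only: `parent`, `child`, and `pre`
+are placeholders for an expression, one of its sub-expressions, and a
+pre-computed groupwise `Column` for that sub-expression):
+
+```python
+# Wherever `child` occurs inside `parent`, the already-evaluated `pre`
+# is picked up from the mapping instead of being recomputed.
+parent.evaluate(df, mapping={child: pre})
+```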
To simplify state tracking, all columns should be considered immutable From 3a1ac86131275aa98e803993dc5fb7bc56888675 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 May 2024 11:52:12 +0000 Subject: [PATCH 32/56] Dedent some assertions --- python/cudf_polars/cudf_polars/utils/dtypes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 911c391c063..51379433c03 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -67,9 +67,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) elif dtype.time_unit == "ns": return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS) - else: - assert dtype.time_unit is not None - assert_never(dtype.time_unit) + assert dtype.time_unit is not None + assert_never(dtype.time_unit) elif isinstance(dtype, pl.Duration): if dtype.time_unit == "ms": return plc.DataType(plc.TypeId.DURATION_MILLISECONDS) @@ -77,9 +76,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: return plc.DataType(plc.TypeId.DURATION_MICROSECONDS) elif dtype.time_unit == "ns": return plc.DataType(plc.TypeId.DURATION_NANOSECONDS) - else: - assert dtype.time_unit is not None - assert_never(dtype.time_unit) + assert dtype.time_unit is not None + assert_never(dtype.time_unit) elif isinstance(dtype, pl.String): return plc.DataType(plc.TypeId.STRING) elif isinstance(dtype, pl.Null): From f0686a29e8e404fe780dfb772f7a871f76402f76 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 14:59:55 +0000 Subject: [PATCH 33/56] More fixes in review --- .../cudf_polars/containers/column.py | 20 +-- .../cudf_polars/containers/dataframe.py | 6 +- python/cudf_polars/cudf_polars/dsl/expr.py | 164 +++++++++--------- python/cudf_polars/cudf_polars/dsl/ir.py | 24 +-- 4 files changed, 96 insertions(+), 118 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 7784febf2e8..9ca5b7f0310 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -78,20 +78,18 @@ def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" if self.nan_count > 0: raise NotImplementedError - else: - return self.copy() + return self.copy() @functools.cached_property def nan_count(self) -> int: """Return the number of NaN values in the column.""" if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): return 0 - else: - return plc.interop.to_arrow( - plc.reduce.reduce( - plc.unary.is_nan(self.obj), - plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), - ) - ).as_py() + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) + ).as_py() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index dba4c9f6c2c..aa2f412f694 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -43,8 +43,6 @@ def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None else: self.table = None - __iter__ = None - def 
to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" assert len(self.scalars) == 0 @@ -66,12 +64,12 @@ def column_names(self) -> list[str]: return [c.name for c in self.columns] @cached_property - def num_columns(self): + def num_columns(self) -> int: """Number of columns.""" return len(self.columns) @cached_property - def num_rows(self): + def num_rows(self) -> int: """Number of rows.""" if self.table is None: raise ValueError("Number of rows of frame with scalars makes no sense") diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 2fc16be8f6b..fc91bef726a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -79,12 +79,12 @@ class Expr: *children).`` """ - __slots__ = ("dtype", "hash_value", "repr_value") + __slots__ = ("dtype", "_hash_value", "_repr_value") dtype: plc.DataType """Data type of the expression.""" - hash_value: int + _hash_value: int """Caching slot for the hash of the expression.""" - repr_value: str + _repr_value: str """Caching slot for repr of the expression.""" children: tuple[Expr, ...] = () """Children of the expression.""" @@ -113,10 +113,10 @@ def get_hash(self) -> int: def __hash__(self): """Hash of an expression with caching.""" try: - return self.hash_value + return self._hash_value except AttributeError: - self.hash_value = self.get_hash() - return self.hash_value + self._hash_value = self.get_hash() + return self._hash_value def is_equal(self, other: Any) -> bool: """ @@ -153,20 +153,58 @@ def __ne__(self, other): def __repr__(self): """String representation of an expression with caching.""" try: - return self.repr_value + return self._repr_value except AttributeError: args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self.repr_value = f"{type(self).__name__}({args})" - return self.repr_value + self._repr_value = f"{type(self).__name__}({args})" + return self._repr_value + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: # TODO: return type is a lie for Literal + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Do not call this function directly, but rather + :func:`evaluate` which handles the mapping lookups. + + Returns + ------- + Column representing the evaluation of the expression (or maybe + a scalar, annoying!). + + Raises + ------ + NotImplementedError if we couldn't evaluate the expression. + Ideally all these are returned during translation to the IR, + but for now we are not perfect. + """ + raise NotImplementedError(f"Evaluation of {type(self).__name__}") - # TODO: return type is a lie for Literal def evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, mapping: dict[Expr, Column] | None = None, - ) -> Column: + ) -> Column: # TODO: return type is a lie for Literal """ Evaluate this expression given a dataframe for context. @@ -181,6 +219,12 @@ def evaluate( override the evaluation of a given expression if we're performing a simple rewritten evaluation. 
+        Notes
+        -----
+        Individual subclasses should implement :meth:`do_evaluate`,
+        this method provides logic to handle lookups in the
+        substitution mapping.
+
         Returns
         -------
         Column representing the evaluation of the expression (or maybe
@@ -192,7 +236,12 @@ def evaluate(
             Ideally all these are returned during translation to the IR,
             but for now we are not perfect.
         """
-        raise NotImplementedError
+        if mapping is None:
+            return self.do_evaluate(df, context=context, mapping=mapping)
+        try:
+            return mapping[self]
+        except KeyError:
+            return self.do_evaluate(df, context=context, mapping=mapping)

     def collect_agg(self, *, depth: int) -> AggInfo:
         """
@@ -215,29 +264,9 @@ def collect_agg(self, *, depth: int) -> AggInfo:
             aggregation request (for example nested aggregations like
             ``a.max().min()``).
         """
-        raise NotImplementedError
-
-
-def with_mapping(fn):
-    """Decorate a callback that takes an expression mapping to use it."""
-
-    def _(
-        self,
-        df: DataFrame,
-        *,
-        context=ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
-    ) -> Column:
-        """Look up self in the mapping before evaluating it."""
-        if mapping is None:
-            return fn(self, df, context=context, mapping=mapping)
-        else:
-            try:
-                return mapping[self]
-            except KeyError:
-                return fn(self, df, context=context, mapping=mapping)
-
-    return _
+        raise NotImplementedError(
+            f"Collecting aggregation info for {type(self).__name__}"
+        )


 class NamedExpr(Expr):
@@ -249,8 +278,7 @@ def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None:
         self.name = name
         self.children = (value,)

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -278,8 +306,7 @@ def __init__(self, dtype: plc.DataType, value: Any) -> None:
         super().__init__(dtype)
         self.value = pa.scalar(value)

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -291,10 +318,6 @@ def evaluate(
         obj = plc.interop.from_arrow(self.value)
         return Scalar(obj)  # type: ignore

-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        raise NotImplementedError("Literal in groupby")
-

 class Col(Expr):
     __slots__ = ("name",)
@@ -305,8 +328,7 @@ def __init__(self, dtype: plc.DataType, name: str) -> None:
         self.dtype = dtype
         self.name = name

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -322,8 +344,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:


 class Len(Expr):
-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -332,7 +353,7 @@ def evaluate(
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # TODO: type is wrong, and dtype
-        return df.num_rows
+        return df.num_rows  # type: ignore

     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
@@ -413,8 +434,7 @@ def _distinct(
         ),
     }

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -556,8 +576,7 @@ def __init__(
         ):
             raise NotImplementedError(f"String function {self.name}")

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -600,8 +619,7 @@ def __init__(
         self.options = options
         self.children = (column,)

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -621,11 +639,6 @@ def evaluate(
             is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0]
         )

-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        # TODO: Could do with sort-based groupby and segmented
sort post-hoc - raise NotImplementedError("Sort in groupby") - class SortBy(Expr): __slots__ = ("options", "children") @@ -642,8 +655,7 @@ def __init__( self.options = options self.children = (column, *by) - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, @@ -665,11 +677,6 @@ def evaluate( ) return Column(table.columns()[0], column.name) - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented sort post-hoc - raise NotImplementedError("SortBy in groupby") - class Gather(Expr): __slots__ = ("children",) @@ -679,8 +686,7 @@ def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): super().__init__(dtype) self.children = (values, indices) - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, @@ -710,11 +716,6 @@ def evaluate( table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) return Column(table.columns()[0], values.name) - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented gather. - raise NotImplementedError("Gather in groupby") - class Filter(Expr): __slots__ = ("children",) @@ -724,8 +725,7 @@ def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): super().__init__(dtype) self.children = (values, indices) - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, @@ -742,11 +742,6 @@ def evaluate( ) return Column(table.columns()[0], values.name).with_sorted(like=values) - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented filter - raise NotImplementedError("Filter in groupby") - class RollingWindow(Expr): __slots__ = ("options", "children") @@ -776,8 +771,7 @@ def __init__(self, dtype: plc.DataType, value: Expr): super().__init__(dtype) self.children = (value,) - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, @@ -934,8 +928,7 @@ def _last(self, column: Column) -> Column: n = column.obj.size() return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) - @with_mapping - def evaluate( + def do_evaluate( self, df, *, @@ -987,8 +980,7 @@ def __init__( pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, } - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 71e2ab7941c..61a3fb87ee6 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -154,7 +154,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: step = plc.interop.from_arrow(pa.scalar(1), data_type=dtype) init = plc.interop.from_arrow(pa.scalar(offset), data_type=dtype) index = Column( - plc.filling.sequence(df.num_rows(), init, step), name + plc.filling.sequence(df.num_rows, init, step), name ).set_sorted( is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, @@ -836,8 +836,7 @@ class ExtContext(IR): """ Concatenate dataframes horizontally. - This is similar to HConcat, but is used only to temporarily - introduce new dataframes into an expression context. + Prefer HConcat, since this is going to be deprecated on the polars side. 
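+
+    Constructing this node therefore raises ``NotImplementedError``
+    (see ``__post_init__`` below).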
""" df: IR @@ -845,17 +844,8 @@ class ExtContext(IR): extra: list[IR] """List of extra inputs.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: - """Evaluate and return a dataframe.""" - # TODO: polars optimizer doesn't do projection pushdown - # through extcontext AFAICT. - df = self.df.evaluate(cache=cache) - # extra contexts are added in order, if they have any - # overlapping column names, those are ignored. - names = df.column_names_set.copy() - # TODO: scalars - for ir in self.extra: - extra = ir.evaluate(cache=cache).discard_columns(names) - names |= extra.column_names_set - df = df.with_columns(extra.columns) - return df + def __post_init__(self): + """Validate preconditions.""" + raise NotImplementedError( + "ExtContext will be deprecated, use horizontal concat instead." + ) From 8d25f3a61a45bdcbda44d13d520fc094f0d55a72 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 15:40:33 +0000 Subject: [PATCH 34/56] Singledispatch for translation --- .../cudf_polars/cudf_polars/dsl/translate.py | 500 +++++++++++------- python/cudf_polars/tests/test_extcontext.py | 6 +- 2 files changed, 307 insertions(+), 199 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 17518f62806..f90a08e3b53 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -6,10 +6,13 @@ from __future__ import annotations from contextlib import AbstractContextManager, nullcontext +from functools import singledispatch from typing import Any from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +import cudf._lib.pylibcudf as plc # noqa: TCH002, singledispatch register needs this name defined. + from cudf_polars.dsl import expr, ir from cudf_polars.utils import dtypes @@ -37,6 +40,171 @@ def __exit__(self, *args): noop_context: nullcontext = nullcontext() +@singledispatch +def _translate_ir(node: Any, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + raise NotImplementedError(f"Translation for {type(node).__name__}") + + +@_translate_ir.register +def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.PythonScan( + schema, + node.options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Scan( + schema, + node.scan_type, + node.paths, + node.file_options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Cache, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + + +@_translate_ir.register +def _( + node: pl_ir.DataFrameScan, visitor: Any, schema: dict[str, plc.DataType] +) -> ir.IR: + return ir.DataFrameScan( + schema, + node.df, + node.projection, + translate_expr(visitor, n=node.selection) + if node.selection is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + # We translate the expressions (which are executed with + # reference to the input node) with the input node active + # so that dtype resolution works correctly. 
+ with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] + exprs = [translate_expr(visitor, n=e) for e in node.expr] + return ir.Select(schema, inp, cse_exprs, exprs) + + +@_translate_ir.register +def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + aggs = [translate_expr(visitor, n=e) for e in node.aggs] + keys = [translate_expr(visitor, n=e) for e in node.keys] + return ir.GroupBy( + schema, + inp, + aggs, + keys, + node.maintain_order, + node.options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input_left): + inp_left = translate_ir(visitor, n=None) + left_on = [translate_expr(visitor, n=e) for e in node.left_on] + with set_node(visitor, node.input_right): + inp_right = translate_ir(visitor, n=None) + right_on = [translate_expr(visitor, n=e) for e in node.right_on] + return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) + + +@_translate_ir.register +def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + exprs = [translate_expr(visitor, n=e) for e in node.exprs] + return ir.HStack(schema, inp, exprs) + + +@_translate_ir.register +def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Distinct( + schema, + translate_ir(visitor, n=node.input), + node.options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + by = [translate_expr(visitor, n=e) for e in node.by_column] + return ir.Sort(schema, inp, by, node.sort_options, node.slice) + + +@_translate_ir.register +def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) + + +@_translate_ir.register +def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + mask = translate_expr(visitor, n=node.predicate) + return ir.Filter(schema, inp, mask) + + +@_translate_ir.register +def _( + node: pl_ir.SimpleProjection, visitor: Any, schema: dict[str, plc.DataType] +) -> ir.IR: + return ir.Projection(schema, translate_ir(visitor, n=node.input)) + + +@_translate_ir.register +def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + name, *options = node.function + return ir.MapFunction( + schema, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), + name, + options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Union, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Union( + schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + ) + + +@_translate_ir.register +def _(node: pl_ir.HConcat, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + + +@_translate_ir.register +def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.ExtContext( + schema, + translate_ir(visitor, n=node.input), + [translate_ir(visitor, n=n) for n 
in node.contexts], + ) + + def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: """ Translate a polars-internal IR node to our representation. @@ -64,117 +232,134 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: with ctx: node = visitor.view_current_node() schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} - if isinstance(node, pl_ir.PythonScan): - return ir.PythonScan( - schema, - node.options, - translate_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, - ) - elif isinstance(node, pl_ir.Scan): - return ir.Scan( - schema, - node.scan_type, - node.paths, - node.file_options, - translate_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, - ) - elif isinstance(node, pl_ir.Cache): - return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) - elif isinstance(node, pl_ir.DataFrameScan): - return ir.DataFrameScan( - schema, - node.df, - node.projection, - translate_expr(visitor, n=node.selection) - if node.selection is not None - else None, - ) - elif isinstance(node, pl_ir.Select): - # We translate the expressions (which are executed with - # reference to the input node) with the input node active - # so that dtype resolution works correctly. - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] - exprs = [translate_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, cse_exprs, exprs) - elif isinstance(node, pl_ir.GroupBy): - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - aggs = [translate_expr(visitor, n=e) for e in node.aggs] - keys = [translate_expr(visitor, n=e) for e in node.keys] - return ir.GroupBy( - schema, - inp, - aggs, - keys, - node.maintain_order, - node.options, - ) - elif isinstance(node, pl_ir.Join): - with set_node(visitor, node.input_left): - inp_left = translate_ir(visitor, n=None) - left_on = [translate_expr(visitor, n=e) for e in node.left_on] - with set_node(visitor, node.input_right): - inp_right = translate_ir(visitor, n=None) - right_on = [translate_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) - elif isinstance(node, pl_ir.HStack): - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs) - elif isinstance(node, pl_ir.Distinct): - return ir.Distinct( - schema, - translate_ir(visitor, n=node.input), - node.options, - ) - elif isinstance(node, pl_ir.Sort): - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - by = [translate_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options, node.slice) - elif isinstance(node, pl_ir.Slice): - return ir.Slice( - schema, translate_ir(visitor, n=node.input), node.offset, node.len - ) - elif isinstance(node, pl_ir.Filter): - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - mask = translate_expr(visitor, n=node.predicate) - return ir.Filter(schema, inp, mask) - elif isinstance(node, pl_ir.SimpleProjection): - return ir.Projection(schema, translate_ir(visitor, n=node.input)) - elif isinstance(node, pl_ir.MapFunction): - name, *options = node.function - return ir.MapFunction( - schema, - # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), - name, - options, - ) - elif 
isinstance(node, pl_ir.Union): - return ir.Union( - schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options - ) - elif isinstance(node, pl_ir.HConcat): - return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) - elif isinstance(node, pl_ir.ExtContext): - return ir.ExtContext( - schema, - translate_ir(visitor, n=node.input), - [translate_ir(visitor, n=n) for n in node.contexts], - ) - else: - raise NotImplementedError( - f"No handler for LogicalPlan node with {type(node)=}" - ) + return _translate_ir(node, visitor, schema) + + +@singledispatch +def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr: + raise NotImplementedError(f"Translation for {type(node).__name__}") + + +@_translate_expr.register +def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr: + e = translate_expr(visitor, n=node.node) + return expr.NamedExpr(dtype, node.output_name, e) + + +@_translate_expr.register +def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: + name, *options = node.function_data + if isinstance(name, pl_expr.StringFunction): + return expr.StringFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, pl_expr.BooleanFunction): + return expr.BooleanFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + else: + raise NotImplementedError(f"No handler for Expr function node with {name=}") + + +@_translate_expr.register +def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: + # TODO: raise in groupby? + if node.partition_by is None: + return expr.RollingWindow( + dtype, node.options, translate_expr(visitor, n=node.function) + ) + else: + return expr.GroupedRollingWindow( + dtype, + node.options, + translate_expr(visitor, n=node.function), + *(translate_expr(visitor, n=n) for n in node.partition_by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Literal(dtype, node.value) + + +@_translate_expr.register +def _(node: pl_expr.Sort, visitor: Any, dtype: plc.DataType) -> expr.Expr: + # TODO: raise in groupby + return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) + + +@_translate_expr.register +def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.SortBy( + dtype, + node.sort_options, + translate_expr(visitor, n=node.expr), + *(translate_expr(visitor, n=n) for n in node.by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Gather( + dtype, + translate_expr(visitor, n=node.expr), + translate_expr(visitor, n=node.idx), + ) + + +@_translate_expr.register +def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Filter( + dtype, + translate_expr(visitor, n=node.input), + translate_expr(visitor, n=node.by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: + inner = translate_expr(visitor, n=node.expr) + # Push casts into literals so we can handle Cast(Literal(Null)) + if isinstance(inner, expr.Literal): + return expr.Literal(dtype, inner.value) + else: + return expr.Cast(dtype, inner) + + +@_translate_expr.register +def _(node: pl_expr.Column, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Col(dtype, node.name) + + +@_translate_expr.register +def _(node: 
pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Agg( + dtype, + node.name, + node.options, + translate_expr(visitor, n=node.arguments), + ) + + +@_translate_expr.register +def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.BinOp( + dtype, + expr.BinOp._MAPPING[node.op], + translate_expr(visitor, n=node.left), + translate_expr(visitor, n=node.right), + ) + + +@_translate_expr.register +def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Len(dtype) def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: @@ -198,92 +383,11 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: NotImplementedError if any translation fails due to unsupported functionality. """ if isinstance(n, pl_expr.PyExprIR): - # TODO: type narrowing didn't work because PyExprIR is Unknown + # TODO: type narrowing doesn't rule out int since PyExprIR is Unknown assert not isinstance(n, int) - e = translate_expr(visitor, n=n.node) - return expr.NamedExpr(e.dtype, n.output_name, e) - node = visitor.view_expression(n) - dtype = dtypes.from_polars(visitor.get_dtype(n)) - if isinstance(node, pl_expr.Function): - name, *options = node.function_data - if isinstance(name, pl_expr.StringFunction): - return expr.StringFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) - elif isinstance(name, pl_expr.BooleanFunction): - return expr.BooleanFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) - else: - raise NotImplementedError(f"No handler for Expr function node with {name=}") - elif isinstance(node, pl_expr.Window): - # TODO: raise in groupby? - if node.partition_by is None: - return expr.RollingWindow( - dtype, node.options, translate_expr(visitor, n=node.function) - ) - else: - return expr.GroupedRollingWindow( - dtype, - node.options, - translate_expr(visitor, n=node.function), - *(translate_expr(visitor, n=n) for n in node.partition_by), - ) - elif isinstance(node, pl_expr.Literal): - return expr.Literal(dtype, node.value) - elif isinstance(node, pl_expr.Sort): - # TODO: raise in groupby - return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) - elif isinstance(node, pl_expr.SortBy): - # TODO: raise in groupby - return expr.SortBy( - dtype, - node.sort_options, - translate_expr(visitor, n=node.expr), - *(translate_expr(visitor, n=n) for n in node.by), - ) - elif isinstance(node, pl_expr.Gather): - return expr.Gather( - dtype, - translate_expr(visitor, n=node.expr), - translate_expr(visitor, n=node.idx), - ) - elif isinstance(node, pl_expr.Filter): - return expr.Filter( - dtype, - translate_expr(visitor, n=node.input), - translate_expr(visitor, n=node.by), - ) - elif isinstance(node, pl_expr.Cast): - inner = translate_expr(visitor, n=node.expr) - # Push casts into literals so we can handle Cast(Literal(Null)) - if isinstance(inner, expr.Literal): - return expr.Literal(dtype, inner.value) - else: - return expr.Cast(dtype, inner) - elif isinstance(node, pl_expr.Column): - return expr.Col(dtype, node.name) - elif isinstance(node, pl_expr.Agg): - return expr.Agg( - dtype, - node.name, - node.options, - translate_expr(visitor, n=node.arguments), - ) - elif isinstance(node, pl_expr.BinaryExpr): - return expr.BinOp( - dtype, - expr.BinOp._MAPPING[node.op], - translate_expr(visitor, n=node.left), - translate_expr(visitor, n=node.right), - ) - elif isinstance(node, pl_expr.Len): - 
return expr.Len(dtype) + node = n + dtype = dtypes.from_polars(visitor.get_dtype(node.node)) else: - raise NotImplementedError(f"No handler for expression node with {type(node)=}") + node = visitor.view_expression(n) + dtype = dtypes.from_polars(visitor.get_dtype(n)) + return _translate_expr(node, visitor, dtype) diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py index c5481d0ccbd..9daf88b4338 100644 --- a/python/cudf_polars/tests/test_extcontext.py +++ b/python/cudf_polars/tests/test_extcontext.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import pytest + import polars as pl from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -16,4 +18,6 @@ def test_extcontext(): ).lazy() ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c")) query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c")) - assert_gpu_result_equal(query) + with pytest.raises(pl.exceptions.ComputeError): + # ExtContext to be deprecated so we're not implementing it. + assert_gpu_result_equal(query) From 90fca6d78a172e8321ffe8c22778e9efe039daea Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 15:46:08 +0000 Subject: [PATCH 35/56] Spell out DSL --- python/cudf_polars/cudf_polars/dsl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/__init__.py b/python/cudf_polars/cudf_polars/dsl/__init__.py index cdc37f9e437..804c5ada566 100644 --- a/python/cudf_polars/cudf_polars/dsl/__init__.py +++ b/python/cudf_polars/cudf_polars/dsl/__init__.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 -"""The DSL for the polars executor.""" +"""The domain-specific language (DSL) for the polars executor.""" from __future__ import annotations From 0f82d0f55fd448486d42f6e26d01acf7767b7c90 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 15:48:31 +0000 Subject: [PATCH 36/56] Avoid double import --- python/cudf_polars/cudf_polars/dsl/ir.py | 30 +++++++++++------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 61a3fb87ee6..bc8e7d1a764 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -36,8 +36,6 @@ if TYPE_CHECKING: from typing import Literal - from cudf_polars.dsl.expr import Expr - __all__ = [ "IR", @@ -98,7 +96,7 @@ class PythonScan(IR): options: Any """Arbitrary options.""" - predicate: Expr | None + predicate: expr.Expr | None """Filter to apply to the constructed dataframe before returning it.""" @@ -119,7 +117,7 @@ class Scan(IR): - ``row_index: tuple[name, offset] | None``: Add an integer index column with given name. """ - predicate: Expr | None + predicate: expr.Expr | None """Mask to apply to the read dataframe.""" def __post_init__(self): @@ -208,7 +206,7 @@ class DataFrameScan(IR): """Polars LazyFrame object.""" projection: list[str] """List of columns to project out.""" - predicate: Expr | None + predicate: expr.Expr | None """Mask to apply.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @@ -243,13 +241,13 @@ class Select(IR): df: IR """Input dataframe.""" - cse: list[Expr] + cse: list[expr.Expr] """ List of common subexpressions that will appear in the selected expressions. These must be evaluated before the returned expressions. 
""" - expr: list[Expr] + expr: list[expr.Expr] """List of expressions to evaluate to form the new dataframe.""" def evaluate(self, *, cache: dict[int, DataFrame]): @@ -296,9 +294,9 @@ class GroupBy(IR): df: IR """Input dataframe.""" - agg_requests: list[Expr] + agg_requests: list[expr.Expr] """List of expressions to evaluate groupwise.""" - keys: list[Expr] + keys: list[expr.Expr] """List of expressions forming the keys.""" maintain_order: bool """Should the order of the input dataframe be maintained?""" @@ -306,7 +304,7 @@ class GroupBy(IR): """Options controlling style of groupby.""" @staticmethod - def check_agg(agg: Expr) -> int: + def check_agg(agg: expr.Expr) -> int: """ Determine if we can handle an aggregation expression. @@ -392,9 +390,9 @@ class Join(IR): """Left frame.""" right: IR """Right frame.""" - left_on: list[Expr] + left_on: list[expr.Expr] """List of expressions used as keys in the left frame.""" - right_on: list[Expr] + right_on: list[expr.Expr] """List of expressions used as keys in the right frame.""" options: Any """ @@ -514,7 +512,7 @@ class HStack(IR): df: IR """Input dataframe.""" - columns: list[Expr] + columns: list[expr.Expr] """List of expressions to produce new columns.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @@ -596,7 +594,7 @@ class Sort(IR): df: IR """Input.""" - by: list[Expr] + by: list[expr.Expr] """List of expressions to produce sort keys.""" do_sort: Callable[..., plc.Table] """pylibcudf sorting function.""" @@ -611,7 +609,7 @@ def __init__( self, schema: dict, df: IR, - by: list[Expr], + by: list[expr.Expr], options: Any, zlice: tuple[int, int] | None, ): @@ -677,7 +675,7 @@ class Filter(IR): df: IR """Input.""" - mask: Expr + mask: expr.Expr """Expression evaluating to a mask.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: From f5683e70f9f7a6aa5afe71cc56a495a6292bfcc8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 16:00:29 +0000 Subject: [PATCH 37/56] Docs fixes --- python/cudf_polars/docs/overview.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index e2562959141..cbf012f5881 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -133,18 +133,18 @@ constructor should then take arguments: ```python def __init__(self, *non_child_data: Any, *children: Expr): ``` -Read the docstrings in the `Expr` class for more details. In -particular, one needs to be careful to ensure that an `Expr` hashes -correctly. +Read the docstrings in the `Expr` class for more details. -Expressions are evaluated by implementing an `evaluate` method that +Expressions are evaluated by implementing a `do_evaluate` method that takes a `DataFrame` as context (this provides columns) along with an `ExecutionContext` parameter (indicating what context we're evaluating this expression in, currently unused) and a `mapping` from expressions to evaluated `Column`s. This approach enables a simple form of expression rewriting during evaluation of expressions that is used in -evaluation of, for example, groupby-aggregations. To reduce boilerplate for lookup -in the mappings dictionary use the `@with_mapping` decorator. +evaluation of, for example, groupby-aggregations. To perform the +evaluation, one should use the base class (generic) `evaluate` method +which handles the boilerplate for looking up in the substitution +`mapping`. 
To simplify state tracking, all columns should be considered immutable on construction. This matches the "functional" description coming from From 34aac9a9c9b8db373a0ad086ea32acaede4dc857 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 10:21:33 +0000 Subject: [PATCH 38/56] Split scan tests out into separate file --- python/cudf_polars/cudf_polars/dsl/ir.py | 12 ++- python/cudf_polars/tests/test_basic.py | 7 -- python/cudf_polars/tests/test_scan.py | 98 ++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 14 deletions(-) create mode 100644 python/cudf_polars/tests/test_scan.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index bc8e7d1a764..a7c5d48064c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -130,10 +130,8 @@ def __post_init__(self): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options - n_rows = options.n_rows with_columns = options.with_columns row_index = options.row_index - assert n_rows is None if self.typ == "csv": df = DataFrame.from_cudf( cudf.concat( @@ -148,17 +146,17 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: assert_never(self.typ) if row_index is not None: name, offset = row_index - dtype = self.schema[name] - step = plc.interop.from_arrow(pa.scalar(1), data_type=dtype) - init = plc.interop.from_arrow(pa.scalar(offset), data_type=dtype) + # TODO: dtype + step = plc.interop.from_arrow(pa.scalar(1)) + init = plc.interop.from_arrow(pa.scalar(offset)) index = Column( plc.filling.sequence(df.num_rows, init, step), name ).set_sorted( is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, - null_order=plc.types.null_order.AFTER, + null_order=plc.types.NullOrder.AFTER, ) - df = df.with_columns([index]) + df = DataFrame([index, *df.columns], []) # TODO: should be true, but not the case until we get # cudf-classic out of the loop for IO since it converts date32 # to datetime. diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 2b16dac8d84..ccf107d68db 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -66,13 +66,6 @@ def test_binaryops(op, dtype): assert_gpu_result_equal(result) -def test_scan_parquet(tmp_path): - df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - df.write_parquet(tmp_path / "example.parquet") - ldf = pl.scan_parquet(tmp_path / "example.parquet") - assert_gpu_result_equal(ldf) - - @pytest.mark.xfail(reason="Rolling window not yet implemented") def test_rolling(ldf_datetime): out = ldf_datetime.rolling(index_column="dt", period="2d").agg( diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py new file mode 100644 index 00000000000..b75e1bdef10 --- /dev/null +++ b/python/cudf_polars/tests/test_scan.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(
+    params=[
+        (None, None),
+        pytest.param(
+            ("row-index", 0),
+            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
+        ),
+        pytest.param(
+            ("index", 10),
+            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
+        ),
+    ],
+    ids=["no-row-index", "zero-offset-row-index", "offset-row-index"],
+)
+def row_index(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        (None, 0),
+        pytest.param(
+            (2, 1), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+        ),
+        pytest.param(
+            (3, 0), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+        ),
+    ],
+    ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"],
+)
+def n_rows_skip_rows(request):
+    return request.param
+
+
+@pytest.fixture(params=["csv", "parquet"])
+def df(request, tmp_path, row_index, n_rows_skip_rows):
+    df = pl.DataFrame(
+        {
+            "a": [1, 2, 3, None],
+            "b": ["ẅ", "x", "y", "z"],
+            "c": [None, None, 4, 5],
+        }
+    )
+    name, offset = row_index
+    n_rows, skip_rows = n_rows_skip_rows
+    if request.param == "csv":
+        df.write_csv(tmp_path / "file.csv")
+        return pl.scan_csv(
+            tmp_path / "file.csv",
+            row_index_name=name,
+            row_index_offset=offset,
+            skip_rows_after_header=skip_rows,
+            n_rows=n_rows,
+        )
+    else:
+        df.write_parquet(tmp_path / "file.pq")
+        # parquet doesn't have a skip_rows argument
+        return pl.scan_parquet(
+            tmp_path / "file.pq",
+            row_index_name=name,
+            row_index_offset=offset,
+            n_rows=n_rows,
+        )
+
+
+@pytest.fixture(params=[None, ["a"], ["b", "a"]], ids=["all", "subset", "reordered"])
+def columns(request, row_index):
+    name, _ = row_index
+    if name is not None and request.param is not None:
+        return [*request.param, name]
+    return request.param
+
+
+@pytest.fixture(
+    params=[None, pl.col("c").is_not_null()], ids=["no-mask", "c-is-not-null"]
+)
+def mask(request):
+    return request.param
+
+
+def test_scan(df, columns, mask):
+    q = df
+    if mask is not None:
+        q = q.filter(mask)
+    if columns is not None:
+        # Select from the (possibly filtered) query, not the input frame.
+        q = q.select(*columns)
+    assert_gpu_result_equal(q)

From 74e382403be60368c8a2902472469d0905599800 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 24 May 2024 12:05:48 +0000
Subject: [PATCH 39/56] Build out groupby test and fix one bug

---
 python/cudf_polars/cudf_polars/dsl/expr.py | 12 +++-
 python/cudf_polars/tests/test_groupby.py   | 78 ++++++++++++++++++++++
 2 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf_polars/tests/test_groupby.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index fc91bef726a..4f128122f82 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -1001,7 +1001,6 @@ def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
         if depth == 1:
             # inside aggregation, need to pre-evaluate,
-            # This recurses to check if we have nested aggs
             # groupby construction has checked that we don't have
             # nested aggs, so stop the recursion and return ourselves
             # for pre-eval
@@ -1010,6 +1009,17 @@ def collect_agg(self, *, depth: int) -> AggInfo:
             left_info, right_info = (
                 child.collect_agg(depth=depth) for child in self.children
             )
+            requests = [*left_info.requests, *right_info.requests]
+            # TODO: Hack, if there were no reductions inside this
+            # binary expression then
we want to pre-evaluate and + # collect ourselves. Otherwise we want to collect the + # aggregations inside and post-evaluate. This is a bad way + # of checking that we are in case 1. + if all( + agg.kind() == plc.aggregation.Kind.COLLECT_LIST + for _, agg, _ in requests + ): + return AggInfo([(self, plc.aggregation.collect_list(), self)]) return AggInfo( [*left_info.requests, *right_info.requests], ) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py new file mode 100644 index 00000000000..d06a7ecf105 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture +def df(): + return pl.LazyFrame( + { + "key1": [1, 1, 1, 2, 3, 1, 4, 6, 7], + "key2": [2, 2, 2, 2, 6, 1, 4, 6, 8], + "int": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "float": [7.0, 1, 2, 3, 4, 5, 6, 7, 8], + } + ) + + +@pytest.fixture( + params=[ + ["key1"], + ["key2"], + [pl.col("key1") * pl.col("key2")], + ["key1", "key2"], + [pl.col("key1") == pl.col("key2")], + ["key2", pl.col("key1") == pl.lit(1, dtype=pl.Int64)], + ], + ids=lambda keys: "-".join(map(str, keys)), +) +def keys(request): + return request.param + + +@pytest.fixture( + params=[ + ["int"], + ["float", "int"], + [pl.col("float") + pl.col("int")], + [pl.col("float").max() - pl.col("int").min()], + [pl.col("float").mean(), pl.col("int").std()], + ], + ids=lambda aggs: "-".join(map(str, aggs)), +) +def exprs(request): + return request.param + + +@pytest.fixture( + params=[ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="Maintaining order in groupby not implemented" + ), + ), + ], + ids=["no_maintain_order", "maintain_order"], +) +def maintain_order(request): + return request.param + + +def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): + q = df.group_by(*keys, maintain_order=maintain_order).agg(*exprs) + + if not maintain_order: + sort_keys = list(q.schema.keys())[: len(keys)] + q = q.sort(*sort_keys) + # from cudf_polars.dsl.translate import translate_ir + # ir = translate_ir(q._ldf.visit()) + # from IPython import embed; embed() + assert_gpu_result_equal(q, check_exact=False) From b77c573eaac4531dd9c0d2ce9e818a687441444d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 12:06:14 +0000 Subject: [PATCH 40/56] Split out a few more tests --- python/cudf_polars/tests/test_basic.py | 20 -------------------- python/cudf_polars/tests/test_slice.py | 2 +- python/cudf_polars/tests/test_union.py | 13 +++++++++++++ 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index ccf107d68db..db813226281 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -197,26 +197,6 @@ def test_selection(ldf: pl.LazyFrame): assert_gpu_result_equal(out) -def test_concat_vertical(ldf): - out = pl.concat([ldf, ldf]) - assert_gpu_result_equal(out) - - -def test_concat_horizontal(ldf): - # Have to split the columns in two to avoid the same column names - left_columns = ldf.columns[: len(ldf.columns) // 2] - right_columns = ldf.columns[len(ldf.columns) // 2 :] - out = pl.concat( - [ldf.select(left_columns), ldf.select(right_columns)], how="horizontal" - ) - 
assert_gpu_result_equal(out) - - -def test_groupby(ldf): - out = ldf.group_by("int_key1").agg(pl.col("float_val").sum()) - assert_gpu_result_equal(out, check_row_order=False, check_exact=False) - - @pytest.mark.xfail(reason="arg_where not yet implemented") def test_expr_function(ldf): out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted( diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py index 6c918a89e33..d27e91302ba 100644 --- a/python/cudf_polars/tests/test_slice.py +++ b/python/cudf_polars/tests/test_slice.py @@ -31,4 +31,4 @@ def test_slice(offset, len): .sort(by=pl.col("a")) .slice(offset, len) ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 8a6e015e4db..2c85bb15a55 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -22,3 +22,16 @@ def test_union(): # Plan for this produces a `None`.astype(Int64) which we don't # handle correctly right now assert_gpu_result_equal(query) + + +def test_concat_vertical(): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ) + ldf2 = ldf.select(pl.col("a"), pl.col("b") * 2 + pl.col("a")) + q = pl.concat([ldf, ldf2], how="vertical") + + assert_gpu_result_equal(q) From 4b7dd6e2f3c0dcee7df5ec8e5cd89a54e7d528db Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 13:47:56 +0000 Subject: [PATCH 41/56] Move expression tests to subdirectory --- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/{ => expressions}/test_filter.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename python/cudf_polars/tests/{ => expressions}/test_filter.py (100%) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 3619e32e140..baaf46f6a2b 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -133,7 +133,7 @@ ignore = [ fixable = ["ALL"] [tool.ruff.lint.per-file-ignores] -"**/tests/test_*.py" = ["D", "INP"] +"**/tests/**/test_*.py" = ["D", "INP"] [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style diff --git a/python/cudf_polars/tests/test_filter.py b/python/cudf_polars/tests/expressions/test_filter.py similarity index 100% rename from python/cudf_polars/tests/test_filter.py rename to python/cudf_polars/tests/expressions/test_filter.py From 3aefc569524879f04d13f6bff57aa43704da1048 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 13:51:11 +0000 Subject: [PATCH 42/56] Migrate agg tests --- python/cudf_polars/cudf_polars/dsl/expr.py | 10 ++- .../cudf_polars/tests/expressions/test_agg.py | 63 +++++++++++++++++++ python/cudf_polars/tests/test_basic.py | 44 ------------- 3 files changed, 70 insertions(+), 47 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_agg.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 4f128122f82..df8260e4627 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -800,6 +800,9 @@ def __init__( self, dtype: plc.DataType, name: str, options: Any, value: Expr ) -> None: super().__init__(dtype) + # TODO: fix polars name + if name == "nunique": + name = "n_unique" self.name = name self.options = options self.children = (value,) @@ -812,7 +815,8 @@ def __init__( req = 
plc.aggregation.max() elif name == "median": req = plc.aggregation.median() - elif name == "nunique": + elif name == "n_unique": + # TODO: datatype of result req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) elif name == "first" or name == "last": req = None @@ -836,7 +840,7 @@ def __init__( op = partial(self._reduce, request=req) elif name in {"min", "max"}: op = partial(op, propagate_nans=options) - elif name == "count": + elif name in {"count", "first", "last"}: pass else: raise AssertionError @@ -847,7 +851,7 @@ def __init__( "min", "max", "median", - "nunique", + "n_unique", "first", "last", "mean", diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py new file mode 100644 index 00000000000..c792ae64f74 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.dsl import expr +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +def agg(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +def dtype(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no-nulls", "with-nulls"]) +def with_nulls(request): + return request.param + + +@pytest.fixture( + params=[ + False, + pytest.param(True, marks=pytest.mark.xfail(reason="No handler for set_sorted")), + ], + ids=["unsorted", "sorted"], +) +def is_sorted(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls, is_sorted): + values = [-10, 4, 5, 2, 3, 6, 8, 9, 4, 4, 5, 2, 3, 7, 3, 6, -10, -11] + if with_nulls: + values = [None if v % 5 == 0 else v for v in values] + + if is_sorted: + values = sorted(values, key=lambda x: -1000 if x is None else x) + + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) + if is_sorted: + return df.set_sorted("a") + return df + + +def test_agg(df, agg): + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtype = agg not in {"count", "n_unique", "median"} + if not check_dtype and q.schema["a"] != pl.Float64: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtype=check_dtype, check_exact=False) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index db813226281..c877a14ff57 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -138,37 +138,6 @@ def test_sort(ldf): assert_gpu_result_equal(out) -def test_filter(ldf): - out = ldf.filter(pl.col("int_key1") > pl.col("int_key2")) - assert_gpu_result_equal(out) - - -@pytest.mark.parametrize( - "agg", - [ - "sum", - "min", - "max", - "mean", - # TODO: first/last get turned into slice of the Scan - "first", - "last", - "count", - "median", - ], -) -def test_agg(df, agg): - ldf = ( - df.cast( - {key: pl.Float64 for key in df.columns if ("int" in key or "float" in key)} - ) - .select(list(filter(lambda c: "str" not in c, df.columns))) - .lazy() - ) - out = getattr(ldf, agg)() - assert_gpu_result_equal(out, check_dtype=agg != "count", check_exact=False) - - @pytest.mark.parametrize("keep", ["first", "last", "none"]) @pytest.mark.parametrize("subset", [None, "keys"]) 
@pytest.mark.parametrize("sort", [False, True])
@@ -189,14 +158,6 @@ def test_unique(ldf: pl.LazyFrame, keep, subset, sort, maintain_order):
     assert_gpu_result_equal(out, check_row_order=maintain_order)
 
 
-def test_selection(ldf: pl.LazyFrame):
-    k = pl.col("int_key1")
-    v = pl.col("int_val")
-    # groupby stops predicate pushdown
-    out = ldf.group_by(k).agg(v.sum()).filter(k * 2 > v)
-    assert_gpu_result_equal(out)
-
-
 @pytest.mark.xfail(reason="arg_where not yet implemented")
 def test_expr_function(ldf):
     out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted(
@@ -205,11 +166,6 @@ def test_expr_function(ldf):
     assert_gpu_result_equal(out)
 
 
-def test_filter_expr(ldf):
-    out = ldf.select(pl.col("int_key1").filter(pl.col("int_key2") > 4))
-    assert_gpu_result_equal(out)
-
-
 def test_gather_expr(ldf):
     out = ldf.select(pl.col("int_key1").gather(pl.col("int_key2")))
     assert_gpu_result_equal(out)

From 22805a632757b0e18365180cb4816caf02b5e751 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 24 May 2024 13:53:42 +0000
Subject: [PATCH 43/56] Joins and sorts already tested elsewhere

---
 python/cudf_polars/tests/test_basic.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py
index c877a14ff57..2853c5b8c33 100644
--- a/python/cudf_polars/tests/test_basic.py
+++ b/python/cudf_polars/tests/test_basic.py
@@ -121,23 +121,6 @@ def test_drop_nulls(null_data):
     assert_gpu_result_equal(result)
 
 
-@pytest.mark.parametrize("how", ["inner", "left", "semi", "outer_coalesce"])
-def test_join(df: pl.DataFrame, how):
-    pl.set_random_seed(42)
-    # Sample eagerly since we haven't implemented it yet.
-    ldf1 = df.sample(n=50).lazy()
-    ldf2 = df.sample(n=50).lazy()
-
-    out = ldf1.join(ldf2, on=["int_key1", "int_key2"], how=how)
-    assert_gpu_result_equal(out, check_row_order=False)
-
-
-def test_sort(ldf):
-    for col in ldf.columns:
-        out = ldf.sort(by=col)
-        assert_gpu_result_equal(out)

From d8745f6e36ed36f7311b301471be6f00c5fd9e7f Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 24 May 2024 14:13:30 +0000
Subject: [PATCH 44/56] Better distinct test and fix bug

---
 python/cudf_polars/cudf_polars/dsl/ir.py   | 11 +++++----
 python/cudf_polars/tests/test_basic.py     | 28 -----------------------
 python/cudf_polars/tests/test_distinct.py  |  9 ++++++--
 3 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index a7c5d48064c..7f26bc892ec 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -557,7 +557,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             indices = list(range(df.num_columns))
         else:
             indices = [i for i, k in enumerate(df.column_names) if k in self.subset]
-        keys_sorted = all(c.is_sorted for c in df.columns)
+        keys_sorted = all(df.columns[i].is_sorted for i in indices)
         if keys_sorted:
             table = plc.stream_compaction.unique(
                 df.table,
@@ -628,10 +628,11 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         df = self.df.evaluate(cache=cache)
         sort_keys = [k.evaluate(df) for k in self.by]
         names = {c.name: i for i, c in enumerate(df.columns)}
+        # TODO: More robust identification here.
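        # The identity comparison below means only sort keys that simply
        # re-reference an existing column (e.g. pl.col("a")) are recognised;
        # a computed key such as pl.col("a") * 2 produces a fresh column
        # object, so it is not identified and no sortedness metadata is
        # attached to the corresponding result column.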
keys_in_result = [ i for k in sort_keys - if (i := names.get(k.name)) is not None and k is df.columns[i] + if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj ] table = self.do_sort( df.table, @@ -641,11 +642,11 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)] # If a sort key is in the result table, set the sortedness property - for i in keys_in_result: + for k, i in enumerate(keys_in_result): columns[i] = columns[i].set_sorted( is_sorted=plc.types.Sorted.YES, - order=self.order[i], - null_order=self.null_order[i], + order=self.order[k], + null_order=self.null_order[k], ) return DataFrame(columns, []).slice(self.zlice) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 2853c5b8c33..0c7cbc6a6be 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -121,34 +121,6 @@ def test_drop_nulls(null_data): assert_gpu_result_equal(result) -@pytest.mark.parametrize("keep", ["first", "last", "none"]) -@pytest.mark.parametrize("subset", [None, "keys"]) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("maintain_order", [False, True]) -def test_unique(ldf: pl.LazyFrame, keep, subset, sort, maintain_order): - if subset is not None: - subset = list(filter(lambda c: "key" in c, ldf.columns)) - sort_by = subset - else: - sort_by = ldf.columns - if sort: - ldf = ldf.sort(*sort_by) - out = ldf.unique( - subset, - keep=keep, - maintain_order=maintain_order, - ) - assert_gpu_result_equal(out, check_row_order=maintain_order) - - -@pytest.mark.xfail(reason="arg_where not yet implemented") -def test_expr_function(ldf): - out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted( - pl.col("int_key1") - ) - assert_gpu_result_equal(out) - - def test_gather_expr(ldf): out = ldf.select(pl.col("int_key1").gather(pl.col("int_key2"))) assert_gpu_result_equal(out) diff --git a/python/cudf_polars/tests/test_distinct.py b/python/cudf_polars/tests/test_distinct.py index e0fa089cee2..d42c4a96f5a 100644 --- a/python/cudf_polars/tests/test_distinct.py +++ b/python/cudf_polars/tests/test_distinct.py @@ -9,10 +9,11 @@ from cudf_polars.testing.asserts import assert_gpu_result_equal -@pytest.mark.parametrize("subset", [None, ["a"], ["a", "b"], ["b", "c"]]) +@pytest.mark.parametrize("subset", [None, ["a"], ["a", "b"], ["b", "c"], ["c", "a"]]) @pytest.mark.parametrize("keep", ["any", "none", "first", "last"]) @pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) -def test_distinct(subset, keep, maintain_order): +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_distinct(subset, keep, maintain_order, pre_sorted): ldf = pl.DataFrame( { "a": [1, 2, 1, 3, 5, None, None], @@ -20,6 +21,10 @@ def test_distinct(subset, keep, maintain_order): "c": [True, True, True, True, False, False, True], } ).lazy() + if pre_sorted: + keys = ["a", "b", "c"] if subset is None else subset + descending = False if len(keys) == 1 else [False, True, True][: len(keys)] + ldf = ldf.sort(*keys, descending=descending) query = ldf.unique(subset=subset, keep=keep, maintain_order=maintain_order) assert_gpu_result_equal(query, check_row_order=maintain_order) From eb6626e575ee485a761ebda3a483349f54dcd623 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 15:03:38 +0000 Subject: [PATCH 45/56] More exhaustive binop tests --- 
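Note: the xfail marker added below guards integer division by zero, where
polars (assumed CPU behaviour) yields null for integral dtypes but inf
once the result has been promoted to a float:

    >>> import polars as pl
    >>> df = pl.DataFrame({"a": [1], "b": [0]})
    >>> df.select(pl.col("a") // pl.col("b")).item() is None
    True
    >>> df.select(pl.col("a") / pl.col("b")).item()
    inf

The GPU result is not expected to reproduce that null, hence the xfail.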
.../tests/expressions/test_numeric_binops.py | 106 ++++++++++++++++++ python/cudf_polars/tests/test_basic.py | 19 ---- 2 files changed, 106 insertions(+), 19 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_numeric_binops.py diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py new file mode 100644 index 00000000000..548aebf0875 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + +dtypes = [ + pl.Int8, + pl.Int16, + pl.Int64, + pl.UInt8, + pl.UInt64, + pl.Float32, + pl.Float64, +] + + +@pytest.fixture(params=dtypes) +def ltype(request): + return request.param + + +@pytest.fixture(params=dtypes) +def rtype(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) +def with_nulls(request): + return request.param + + +@pytest.fixture( + params=[ + pl.Expr.eq, + pl.Expr.eq_missing, + pl.Expr.ne, + pl.Expr.ne_missing, + pl.Expr.lt, + pl.Expr.le, + pl.Expr.gt, + pl.Expr.ge, + pl.Expr.add, + pl.Expr.sub, + pl.Expr.mul, + pl.Expr.truediv, + pl.Expr.floordiv, + pl.Expr.mod, + ], + ids=lambda fn: fn.__name__, +) +def binop(request): + return request.param + + +@pytest.fixture +def df(request, ltype, rtype, with_nulls, binop): + a = [1, 2, 3, 5, 8] + if with_nulls: + a[2] = None + a[-1] = None + b = [10, 20, 30, 50, 0] + if with_nulls: + b[1] = None + b[3] = None + b[-1] = None + + lkind = ( + "i" + if ltype.is_signed_integer() + else ("u" if ltype.is_unsigned_integer() else "f") + ) + rkind = ( + "i" + if rtype.is_signed_integer() + else ("u" if rtype.is_unsigned_integer() else "f") + ) + if ( + not with_nulls + and binop.__name__ in {"floordiv", "mod"} + # This catches the case where the result is not promoted to float. 
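+        # i.e. both operands are integral with the same signedness, or
+        # signed and unsigned are mixed without UInt64 involved (mixing
+        # in UInt64 promotes the result to a float dtype, where division
+        # by zero behaves the same on CPU and GPU).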
+ and ( + (lkind == rkind and lkind in {"i", "u"}) + or ({lkind, rkind} == {"i", "u"} and pl.UInt64 not in {ltype, rtype}) + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="Polars nullifies division by zero for integral types" + ) + ) + + return pl.LazyFrame({"a": a, "b": b}, schema={"a": ltype, "b": rtype}) + + +def test_numeric_binop(df, binop): + left = pl.col("a") + right = pl.col("b") + + q = df.select(binop(left, right)) + + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 0c7cbc6a6be..606fd9a1c90 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import operator from datetime import datetime import numpy as np @@ -48,24 +47,6 @@ def ldf(df): return df.lazy() -@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) -@pytest.mark.parametrize( - "op", [operator.add, operator.sub, operator.mul, operator.truediv] -) -def test_binaryops(op, dtype): - df = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [1, 2, 3, 4, 5], - } - ).lazy() - - dtype = pl.datatypes.numpy_char_code_to_dtype(dtype) - df = df.with_columns(pl.col("a").cast(dtype)).with_columns(pl.col("b").cast(dtype)) - result = df.with_columns(op(pl.col("a"), pl.col("b"))) - assert_gpu_result_equal(result) - - @pytest.mark.xfail(reason="Rolling window not yet implemented") def test_rolling(ldf_datetime): out = ldf_datetime.rolling(index_column="dt", period="2d").agg( From 246ff6af7447333da9243fe2d5da59419bfe9bcf Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 15:06:20 +0000 Subject: [PATCH 46/56] Migrate basic gather test --- .../tests/expressions/test_gather.py | 19 +++++++++++++++++++ python/cudf_polars/tests/test_basic.py | 5 ----- 2 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_gather.py diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py new file mode 100644 index 00000000000..df33e19a0b6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_gather(): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [0, 3, 1, 5, 6, 1, 0], + } + ) + + query = ldf.select(pl.col("a").gather(pl.col("b"))) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 606fd9a1c90..5484e9a5277 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -100,8 +100,3 @@ def null_data(): def test_drop_nulls(null_data): result = null_data.drop_nulls() assert_gpu_result_equal(result) - - -def test_gather_expr(ldf): - out = ldf.select(pl.col("int_key1").gather(pl.col("int_key2"))) - assert_gpu_result_equal(out) From 26c5994c847a1f2f0a5253f41002fea10926bc33 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 15:06:41 +0000 Subject: [PATCH 47/56] Basic tests now covered elsewhere, or unimplemented functionality --- python/cudf_polars/tests/test_basic.py | 102 ------------------------- 1 file changed, 102 deletions(-) delete mode 100644 python/cudf_polars/tests/test_basic.py diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py deleted file mode 100644 index 5484e9a5277..00000000000 --- a/python/cudf_polars/tests/test_basic.py +++ /dev/null @@ -1,102 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -from datetime import datetime - -import numpy as np -import pytest - -import polars as pl - -from cudf_polars.testing.asserts import assert_gpu_result_equal - - -@pytest.fixture -def ldf_datetime(): - dates = [ - "2020-01-01 13:45:48", - "2020-01-01 16:42:13", - "2020-01-01 16:45:09", - "2020-01-02 18:12:48", - "2020-01-03 19:45:32", - "2020-01-08 23:16:43", - ] - return ( - pl.DataFrame({"dt": dates, "a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 1, 2, 2, 2]}) - .with_columns(pl.col("dt").str.strptime(pl.Datetime).set_sorted()) - .lazy() - ) - - -@pytest.fixture -def df(): - return pl.DataFrame( - { - "int_key1": np.repeat(np.arange(10), 10), - "int_key2": np.tile(np.arange(10), 10), - "str_key1": np.repeat(list("ABCDEFGHIJ"), 10), - "int_val": np.random.randint(100, size=100), - "float_val": np.random.rand(100), - } - ) - - -@pytest.fixture -def ldf(df): - return df.lazy() - - -@pytest.mark.xfail(reason="Rolling window not yet implemented") -def test_rolling(ldf_datetime): - out = ldf_datetime.rolling(index_column="dt", period="2d").agg( - [ - pl.sum("a").alias("sum_a"), - pl.min("a").alias("min_a"), - pl.max("a").alias("max_a"), - ] - ) - assert_gpu_result_equal(out) - - -@pytest.mark.xfail(reason="Grouped rolling window not yet implemented") -def test_groupby_rolling(ldf_datetime): - out = ldf_datetime.rolling(index_column="dt", period="2d", group_by="b").agg( - [ - pl.sum("a").alias("sum_a"), - pl.min("a").alias("min_a"), - pl.max("a").alias("max_a"), - ] - ) - assert_gpu_result_equal(out) - - -@pytest.mark.xfail(reason="Rolling expression not yet implemented") -def test_rolling_expression(ldf_datetime): - out = ldf_datetime.with_columns( - sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), - min_a=pl.min("a").rolling(index_column="dt", period="2d"), - max_a=pl.max("a").rolling(index_column="dt", period="2d"), - ) - assert_gpu_result_equal(out) - - -def test_datetime_comparison(ldf_datetime): 
- out = ldf_datetime.filter( - pl.col("dt") > datetime.fromisoformat("2020-01-01 16:45:09") - ) - assert_gpu_result_equal(out) - - -@pytest.fixture -def null_data(): - return pl.DataFrame( - { - "a": [1, 2, None, 4, None], - } - ).lazy() - - -def test_drop_nulls(null_data): - result = null_data.drop_nulls() - assert_gpu_result_equal(result) From 00628b02026292200bca82e630e5c2d24f917804 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 28 May 2024 15:12:33 +0000 Subject: [PATCH 48/56] Update join for new names --- python/cudf_polars/cudf_polars/dsl/expr.py | 4 ++-- python/cudf_polars/cudf_polars/dsl/ir.py | 16 ++++++++++----- .../cudf_polars/cudf_polars/dsl/translate.py | 1 + python/cudf_polars/tests/test_join.py | 20 +++++++------------ 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index df8260e4627..92b26518f5b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -367,7 +367,7 @@ class BooleanFunction(Expr): __slots__ = ("name", "options", "children") _non_child = ("dtype", "name", "options") - def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr): + def __init__(self, dtype: plc.DataType, name: str, options: tuple, *children: Expr): super().__init__(dtype) self.options = options self.name = name @@ -561,7 +561,7 @@ def __init__( self, dtype: plc.DataType, name: pl_expr.StringFunction, - options: Any, + options: tuple, *children: Expr, ): super().__init__(dtype) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 7f26bc892ec..c4dd2efac71 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -392,7 +392,13 @@ class Join(IR): """List of expressions used as keys in the left frame.""" right_on: list[expr.Expr] """List of expressions used as keys in the right frame.""" - options: Any + options: tuple[ + Literal["inner", "left", "full", "leftsemi", "leftanti"], + bool, + tuple[int, int] | None, + str | None, + bool, + ] """ tuple of options: - how: join type @@ -410,7 +416,7 @@ def __post_init__(self): @cache @staticmethod def _joiners( - how: Literal["inner", "left", "outer", "leftsemi", "leftanti"], + how: Literal["inner", "left", "full", "leftsemi", "leftanti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -426,7 +432,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "outer": + elif how == "full": return ( plc.join.full_join, plc.copying.OutOfBoundsPolicy.NULLIFY, @@ -471,7 +477,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: lg, rg = join_fn(left_on.table, right_on.table, null_equality) left = left.replace_columns(*left_on.columns) right = right.replace_columns(*right_on.columns) - if coalesce and how != "outer": + if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( plc.copying.gather(left.table, lg, left_policy), left.column_names @@ -479,7 +485,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: right = DataFrame.from_table( plc.copying.gather(right.table, rg, right_policy), right.column_names ) - if coalesce and how == "outer": + if coalesce and how != "inner": left = left.replace_columns( *( Column( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py 
b/python/cudf_polars/cudf_polars/dsl/translate.py index f90a08e3b53..187fbce20dd 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -249,6 +249,7 @@ def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data + options = tuple(options) if isinstance(name, pl_expr.StringFunction): return expr.StringFunction( dtype, diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 9ba513023da..f4a4704f3cc 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -14,16 +14,16 @@ [ "inner", "left", - "outer", "semi", "anti", pytest.param( "cross", marks=pytest.mark.xfail(reason="cross join not implemented"), ), - "outer_coalesce", + "full", ], ) +@pytest.mark.parametrize("coalesce", [False, True]) @pytest.mark.parametrize( "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"] ) @@ -32,19 +32,11 @@ [ pl.col("a"), pl.col("a") * 2, - [pl.col("a"), pl.col("a") + 1], + [pl.col("a"), pl.col("c") + 1], ["c", "a"], ], ) -def test_join(request, how, join_nulls, join_expr): - request.applymarker( - pytest.mark.xfail( - how == "outer_coalesce" - and isinstance(join_expr, list) - and not isinstance(join_expr[0], str), - reason="https://github.com/pola-rs/polars/issues/16289", - ) - ) +def test_join(how, coalesce, join_nulls, join_expr): left = pl.DataFrame( { "a": [1, 2, 3, 1, None], @@ -59,5 +51,7 @@ def test_join(request, how, join_nulls, join_expr): } ).lazy() - query = left.join(right, on=join_expr, how=how, join_nulls=join_nulls) + query = left.join( + right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce + ) assert_gpu_result_equal(query, check_row_order=False) From 47df8e27138a93a92e292a8c20bf3be73adb4b6f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 29 May 2024 15:52:43 +0000 Subject: [PATCH 49/56] Dataframe copy --- python/cudf_polars/cudf_polars/containers/dataframe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index aa2f412f694..c595ea93673 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -43,6 +43,10 @@ def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None else: self.table = None + def copy(self) -> Self: + """Return a shallow copy of self.""" + return type(self)(self.columns, self.scalars) + def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" assert len(self.scalars) == 0 From 215732372cb5ca968427880362dafb05cb7a9fb4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 29 May 2024 15:54:13 +0000 Subject: [PATCH 50/56] Fix handling of CSE in Select and HStack --- python/cudf_polars/cudf_polars/dsl/ir.py | 9 ++++++- .../cudf_polars/cudf_polars/dsl/translate.py | 27 ++++++++++++++----- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index c4dd2efac71..d6c8d15a0d2 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -516,13 +516,20 @@ class HStack(IR): df: IR """Input dataframe.""" + cse: list[expr.Expr] + """ + List of common subexpressions that will 
appear in the selected expressions. + + These must be evaluated before the returned expressions. + """ columns: list[expr.Expr] """List of expressions to produce new columns.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return df.with_columns([c.evaluate(df) for c in self.columns]) + ctx = df.copy().with_columns([e.evaluate(df) for e in self.cse]) + return df.with_columns([c.evaluate(ctx) for c in self.columns]) @dataclass(slots=True) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 187fbce20dd..2d4f76fccc2 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -90,13 +90,13 @@ def _( @_translate_ir.register def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: - # We translate the expressions (which are executed with - # reference to the input node) with the input node active - # so that dtype resolution works correctly. with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] - exprs = [translate_expr(visitor, n=e) for e in node.expr] + # Special-case carveout in get_dtype for Select means we should + # translate these expressions with the Select node active (even + # though they refer to the input node). + cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] + exprs = [translate_expr(visitor, n=e) for e in node.expr] return ir.Select(schema, inp, cse_exprs, exprs) @@ -131,8 +131,21 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - exprs = [translate_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs) + # Like Select, there is a special-case carveout in get_dtype for + # HStack, so we translate these expressions with HStack Select + # node active. + cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs] + exprs = [translate_expr(visitor, n=e) for e in node.exprs] + return ir.HStack(schema, inp, cse_exprs, exprs) + + +@_translate_ir.register +def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + exprs = [translate_expr(visitor, n=e) for e in node.expr] + # Reduce is just a Select where all outputs are a single row. + return ir.Select(schema, inp, [], exprs) @_translate_ir.register From 6d324cbbcb64d7d7270266537ef129fa2c0fdf45 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 10:17:39 +0000 Subject: [PATCH 51/56] Adapt to polars-side changes dtype-determination is now simpler. 
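For reference, the set_node helper that these translation hooks lean on is
a small context manager that makes a node temporarily "active" in the
visitor, so that get_dtype resolves against that node's schema. A sketch
(assuming the visitor exposes get_node/set_node; the real API may differ):

    from contextlib import contextmanager

    @contextmanager
    def set_node(visitor, n):
        """Run a block with node `n` active, restoring the old node after."""
        old = visitor.get_node()
        visitor.set_node(n)
        try:
            yield visitor
        finally:
            visitor.set_node(old)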
--- python/cudf_polars/cudf_polars/dsl/ir.py | 22 +++++++++++++++++-- .../cudf_polars/cudf_polars/dsl/translate.py | 22 ++++++++----------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index d6c8d15a0d2..6da5d937b0c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -251,8 +251,26 @@ class Select(IR): def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - for e in self.cse: - df = df.with_columns([e.evaluate(df)]) + df = df.with_columns([e.evaluate(df) for e in self.cse]) + return DataFrame([e.evaluate(df) for e in self.expr], []) + + +@dataclass(slots=True) +class Reduce(IR): + """ + Produce a new dataframe selecting given expressions from an input. + + This is a special case of :class:`Select` where all outputs are a single row. + """ + + df: IR + """Input dataframe.""" + expr: list[expr.Expr] + """List of expressions to evaluate to form the new dataframe.""" + + def evaluate(self, *, cache: dict[int, DataFrame]): + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) return DataFrame([e.evaluate(df) for e in self.expr], []) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 2d4f76fccc2..b3d0edf183f 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -92,9 +92,6 @@ def _( def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - # Special-case carveout in get_dtype for Select means we should - # translate these expressions with the Select node active (even - # though they refer to the input node). cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] exprs = [translate_expr(visitor, n=e) for e in node.expr] return ir.Select(schema, inp, cse_exprs, exprs) @@ -104,8 +101,8 @@ def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - aggs = [translate_expr(visitor, n=e) for e in node.aggs] - keys = [translate_expr(visitor, n=e) for e in node.keys] + aggs = [translate_expr(visitor, n=e) for e in node.aggs] + keys = [translate_expr(visitor, n=e) for e in node.keys] return ir.GroupBy( schema, inp, @@ -118,6 +115,9 @@ def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir. @_translate_ir.register def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + # Join key dtypes are dependent on the schema of the left and + # right inputs, so these must be translated with the relevant + # input active. with set_node(visitor, node.input_left): inp_left = translate_ir(visitor, n=None) left_on = [translate_expr(visitor, n=e) for e in node.left_on] @@ -131,9 +131,6 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - # Like Select, there is a special-case carveout in get_dtype for - # HStack, so we translate these expressions with HStack Select - # node active. 
cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs] exprs = [translate_expr(visitor, n=e) for e in node.exprs] return ir.HStack(schema, inp, cse_exprs, exprs) @@ -143,9 +140,8 @@ def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - exprs = [translate_expr(visitor, n=e) for e in node.expr] - # Reduce is just a Select where all outputs are a single row. - return ir.Select(schema, inp, [], exprs) + exprs = [translate_expr(visitor, n=e) for e in node.expr] + return ir.Reduce(schema, inp, exprs) @_translate_ir.register @@ -161,7 +157,7 @@ def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - by = [translate_expr(visitor, n=e) for e in node.by_column] + by = [translate_expr(visitor, n=e) for e in node.by_column] return ir.Sort(schema, inp, by, node.sort_options, node.slice) @@ -174,7 +170,7 @@ def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - mask = translate_expr(visitor, n=node.predicate) + mask = translate_expr(visitor, n=node.predicate) return ir.Filter(schema, inp, mask) From 786730a3754bbfd8d5e4d581d63f33e24ec181bd Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 10:18:04 +0000 Subject: [PATCH 52/56] A few more tests --- python/cudf_polars/tests/test_hstack.py | 13 +++++++++ python/cudf_polars/tests/test_select.py | 38 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 python/cudf_polars/tests/test_select.py diff --git a/python/cudf_polars/tests/test_hstack.py b/python/cudf_polars/tests/test_hstack.py index 731c036bc88..b8c97f4607f 100644 --- a/python/cudf_polars/tests/test_hstack.py +++ b/python/cudf_polars/tests/test_hstack.py @@ -17,3 +17,16 @@ def test_hstack(): query = ldf.with_columns(pl.col("a") + pl.col("b")) assert_gpu_result_equal(query) + + +def test_hstack_with_cse(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + expr = pl.col("a") + pl.col("b") + query = ldf.with_columns(expr.alias("c"), expr.alias("d") * 2) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py new file mode 100644 index 00000000000..503edef152e --- /dev/null +++ b/python/cudf_polars/tests/test_select.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_select(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ldf.select( + pl.col("a") + pl.col("b"), (pl.col("a") * 2 + pl.col("b")).alias("d") + ) + + assert_gpu_result_equal(query) + + +def test_select_reduce(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ldf.select( + (pl.col("a") + pl.col("b")).max(), + (pl.col("a") * 2 + pl.col("b")).alias("d").mean(), + ) + + assert_gpu_result_equal(query) From 2773b0bc1dd32e947ffd9858668223b1f9e56bc7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 10:46:05 +0000 Subject: [PATCH 53/56] Update for rapids-build-backend --- dependencies.yaml | 2 +- python/cudf_polars/pyproject.toml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 8bfa3190b3d..38ec30a8033 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -603,7 +603,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=0.20.24 + - polars>=0.20.30 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 4b64ec62830..49ecd7080b9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cudf==24.8.*,>=0.0.0a0", - "polars>=0.20.24", + "polars>=0.20.30", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -181,3 +181,5 @@ docstring-code-format = true build-backend = "setuptools.build_meta" commit-file = "cudf_polars/GIT_COMMIT" dependencies-file = "../../dependencies.yaml" +# Pure python +disable-cuda = true From 62f6455651739ada52faf82b3f52fff9a7f6e307 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 10:48:04 +0000 Subject: [PATCH 54/56] Rename with_sorted to sorted_like --- python/cudf_polars/cudf_polars/containers/column.py | 6 +++--- python/cudf_polars/cudf_polars/containers/dataframe.py | 10 ++++++---- python/cudf_polars/cudf_polars/dsl/expr.py | 6 +++--- python/cudf_polars/cudf_polars/dsl/ir.py | 8 ++++---- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 9ca5b7f0310..a139927acab 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -34,9 +34,9 @@ def __init__(self, column: plc.Column, name: str): def rename(self, name: str) -> Column: """Return a new column sharing data with a new name.""" - return type(self)(self.obj, name).with_sorted(like=self) + return type(self)(self.obj, name).sorted_like(self) - def with_sorted(self, *, like: Column) -> Self: + def sorted_like(self, like: Column, /) -> Self: """Copy sortedness properties from a column onto self.""" return self.set_sorted( is_sorted=like.is_sorted, order=like.order, null_order=like.null_order @@ -72,7 +72,7 @@ def set_sorted( def copy(self) -> Self: """Return a shallow copy of the column.""" - return type(self)(self.obj, self.name).with_sorted(like=self) + return type(self)(self.obj, self.name).sorted_like(self) def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index c595ea93673..0762724d555 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -95,13 +95,15 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: raise ValueError("Mismatching name and table length.") return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) - def with_sorted(self, *, like: DataFrame, subset: Set[str] | None = None) -> Self: + def sorted_like( + self, like: DataFrame, /, *, subset: Set[str] | None = None + ) -> Self: """Copy sortedness from a dataframe onto self.""" if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset self.columns = [ - c.with_sorted(like=other) if c.name in subset else c + c.sorted_like(other) if c.name in subset else c for c, other in zip(self.columns, like.columns) ] return self @@ -147,7 +149,7 @@ def select_columns(self, names: Set[str]) -> list[Column]: def filter(self, mask: Column) -> Self: """Return a filtered table given a mask.""" table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj) - return type(self).from_table(table, self.column_names).with_sorted(like=self) + return type(self).from_table(table, self.column_names).sorted_like(self) def slice(self, zlice: tuple[int, int] | None) -> Self: """ @@ -172,4 +174,4 @@ def slice(self, zlice: tuple[int, int] | None) -> Self: # to the end of the frame if it is larger. 
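        # Worked example: with num_rows == 5, zlice == (-2, 5) gives
        # start == 5 - 2 == 3 and end == min(3 + 5, 5) == 5, i.e. just
        # the final two rows of the frame.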
end = min(start + length, self.num_rows) (table,) = plc.copying.slice(self.table, [start, end]) - return type(self).from_table(table, self.column_names).with_sorted(like=self) + return type(self).from_table(table, self.column_names).sorted_like(self) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 92b26518f5b..d96a6464404 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -740,7 +740,7 @@ def do_evaluate( table = plc.stream_compaction.apply_boolean_mask( plc.Table([values.obj]), mask.obj ) - return Column(table.columns()[0], values.name).with_sorted(like=values) + return Column(table.columns()[0], values.name).sorted_like(values) class RollingWindow(Expr): @@ -781,8 +781,8 @@ def do_evaluate( """Evaluate this expression given a dataframe for context.""" (child,) = self.children column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype), column.name).with_sorted( - like=column + return Column(plc.unary.cast(column.obj, self.dtype), column.name).sorted_like( + column ) def collect_agg(self, *, depth: int) -> AggInfo: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 6da5d937b0c..d630b40f600 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -613,7 +613,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], [] ) if keys_sorted or self.stable: - result = result.with_sorted(like=df) + result = result.sorted_like(df) return result.slice(self.zlice) @@ -787,7 +787,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: [plc.types.NullOrder.BEFORE], ), first.column_names, - ).with_sorted(like=first, subset={key_column}) + ).sorted_like(first, subset={key_column}) elif self.name == "rechunk": # No-op in our data model return self.df.evaluate(cache=cache) @@ -799,7 +799,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return DataFrame.from_table( plc.stream_compaction.drop_nulls(df.table, indices, len(indices)), df.column_names, - ).with_sorted(like=df) + ).sorted_like(df) elif self.name == "rename": df = self.df.evaluate(cache=cache) # final tag is "swapping" which is useful for the @@ -813,7 +813,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: subset = df.column_names_set - {to_explode} return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names - ).with_sorted(like=df, subset=subset) + ).sorted_like(df, subset=subset) else: raise AssertionError("Should never be reached") From a1f579f3a14f7b31c6984d101c9b694e4f93d077 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 11:20:01 +0000 Subject: [PATCH 55/56] Column.copy takes an optional new_name argument This removes the need for rename. 
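A minimal usage sketch of the new spelling (illustrative only, not part
of the diff below; `col` stands for any existing Column instance):

    renamed = col.copy(new_name="b")  # shallow copy sharing data, renamed
    copied = col.copy()               # shallow copy keeping the old name

Both forms delegate to sorted_like, so sortedness metadata is carried
over to the new column, just as rename previously did.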
--- .../cudf_polars/containers/column.py | 40 +++++++++++++++---- .../cudf_polars/containers/dataframe.py | 2 +- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index a139927acab..49034b5f5c8 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -32,12 +32,23 @@ def __init__(self, column: plc.Column, name: str): self.order = plc.types.Order.ASCENDING self.null_order = plc.types.NullOrder.BEFORE - def rename(self, name: str) -> Column: - """Return a new column sharing data with a new name.""" - return type(self)(self.obj, name).sorted_like(self) - def sorted_like(self, like: Column, /) -> Self: - """Copy sortedness properties from a column onto self.""" + """ + Copy sortedness properties from a column onto self. + + Parameters + ---------- + like + The column to copy sortedness metadata from. + + Returns + ------- + Self with metadata set. + + See Also + -------- + set_sorted + """ return self.set_sorted( is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) @@ -70,9 +81,22 @@ def set_sorted( self.null_order = null_order return self - def copy(self) -> Self: - """Return a shallow copy of the column.""" - return type(self)(self.obj, self.name).sorted_like(self) + def copy(self, *, new_name: str | None = None) -> Self: + """ + Return a shallow copy of the column. + + Parameters + ---------- + new_name + Optional new name for the copied column. + + Returns + ------- + New column sharing data with self. + """ + return type(self)( + self.obj, self.name if new_name is None else new_name + ).sorted_like(self) def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 0762724d555..2ed4298e993 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -139,7 +139,7 @@ def replace_columns(self, *columns: Column) -> Self: def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" return type(self)( - [c.rename(mapping.get(c.name, c.name)) for c in self.columns], self.scalars + [c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars ) def select_columns(self, names: Set[str]) -> list[Column]: From 1240b629a70b32ba06d8bf86ac71a0806bee99fe Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 11:20:25 +0000 Subject: [PATCH 56/56] Expand docstrings --- .../cudf_polars/containers/dataframe.py | 52 +++++++++++++++++-- python/cudf_polars/cudf_polars/dsl/expr.py | 13 ++++- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 2ed4298e993..de21a280020 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -89,7 +89,25 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: @classmethod def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: - """Create from a pylibcudf table.""" + """ + Create from a pylibcudf table. + + Parameters + ---------- + table + Pylibcudf table to obtain columns from + names + Names for the columns + + Returns + ------- + New dataframe sharing data with the input table. 
+ + Raises + ------ + ValueError if the number of provided names does not match the + number of columns in the table. + """ # TODO: strict=True when we drop py39 if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") @@ -98,7 +116,24 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: - """Copy sortedness from a dataframe onto self.""" + """ + Copy sortedness from a dataframe onto self. + + Parameters + ---------- + like + The dataframe to copy from + subset + Optional subset of columns from which to copy data. + + Returns + ------- + Self with metadata set. + + Raises + ------ + ValueError if there is a name mismatch between self and like. + """ if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset @@ -112,7 +147,18 @@ def with_columns(self, columns: Sequence[Column]) -> Self: """ Return a new dataframe with extra columns. - Data is shared. + Parameters + ---------- + columns + Columns to add + + Returns + ------- + New dataframe + + Notes + ----- + If column names overlap, newer names replace older ones. """ return type(self)([*self.columns, *columns], self.scalars) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index d96a6464404..249cc3775f7 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -183,12 +183,21 @@ def do_evaluate( Notes ----- Do not call this function directly, but rather - :func:`evaluate` which handles the mapping lookups. + :meth:`evaluate` which handles the mapping lookups. + + The typed return value of :class:`Column` is not true when + evaluating :class:`Literal` nodes (which instead produce + :class:`Scalar` objects). However, these duck-type to having a + pylibcudf container object inside them, and usually they end + up appearing in binary expressions which pylibcudf handles + appropriately since there are overloads for (column, scalar) + pairs. We don't have to handle (scalar, scalar) in binops + since the polars optimizer has a constant-folding pass. Returns ------- Column representing the evaluation of the expression (or maybe - a scalar, annoying!). + a scalar). Raises ------