From b3d0e062203939f4e2d44240f95ee3f9957945aa Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 16 May 2024 17:41:03 +0000
Subject: [PATCH 01/56] Give pylibcudf DataTypes a __hash__

---
 python/cudf/cudf/_lib/pylibcudf/types.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index de10196e289..a5248ad0a1f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -51,6 +51,9 @@ cdef class DataType:
             self.c_obj == (<DataType>other).c_obj
         )
 
+    def __hash__(self):
+        return hash((self.c_obj.id(), self.c_obj.scale()))
+
     @staticmethod
     cdef DataType from_libcudf(data_type dt):
         """Create a DataType from a libcudf data_type.
From 22f6a4f05b86748fc9b09c0d81092d7c17948400 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Wed, 8 May 2024 14:17:25 +0000
Subject: [PATCH 02/56] WIP: Translate polars IR to ours

---
 .../cudf_polars/cudf_polars/dsl/__init__.py   |   8 +
 python/cudf_polars/cudf_polars/dsl/expr.py    | 121 ++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py      | 160 +++++++++++
 .../cudf_polars/cudf_polars/dsl/translate.py  | 261 ++++++++++++++++++
 4 files changed, 550 insertions(+)
 create mode 100644 python/cudf_polars/cudf_polars/dsl/__init__.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expr.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/ir.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/translate.py

diff --git a/python/cudf_polars/cudf_polars/dsl/__init__.py b/python/cudf_polars/cudf_polars/dsl/__init__.py
new file mode 100644
index 00000000000..cdc37f9e437
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""The DSL for the polars executor."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
new file mode 100644
index 00000000000..affc17d3de0
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""
+DSL nodes for the polars expression language.
+
+An expression node is a function, `DataFrame -> Column` or `DataFrame -> Scalar`.
+
+The evaluation context is provided by a LogicalPlan node, and can
+affect the evaluation rule as well as providing the dataframe input.
+In particular, the interpretation of the expression language in a
+`GroupBy` node is groupwise, rather than whole frame.
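+
+As a rough sketch, a polars expression such as ``pl.col("a") + 1`` might
+be represented with the nodes below as something like::
+
+    NamedExpr("a", BinOp(Column("a"), Literal(dtype, 1), op))
+
+where ``op`` is the polars binary operation tag for addition and ``dtype``
+is the datatype of the literal (both placeholders here).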
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +__all__ = [ + "Expr", + "NamedExpr", + "Literal", + "Column", + "BooleanFunction", + "Sort", + "SortBy", + "Gather", + "Filter", + "Window", + "Cast", + "Agg", + "BinOp", +] + + +@dataclass(slots=True) +class Expr: + pass + + +@dataclass(slots=True) +class NamedExpr(Expr): + name: str + value: Expr + + +@dataclass(slots=True) +class Literal(Expr): + dtype: Any + value: Any + + +@dataclass(slots=True) +class Column(Expr): + name: str + + +@dataclass(slots=True) +class Len(Expr): + pass + + +@dataclass(slots=True) +class BooleanFunction(Expr): + name: str + options: Any + arguments: list[Expr] + + +@dataclass(slots=True) +class Sort(Expr): + column: Expr + options: Any + + +@dataclass(slots=True) +class SortBy(Expr): + column: Expr + by: list[Expr] + descending: list[bool] + + +@dataclass(slots=True) +class Gather(Expr): + values: Expr + indices: Expr + + +@dataclass(slots=True) +class Filter(Expr): + values: Expr + mask: Expr + + +@dataclass(slots=True) +class Window(Expr): + agg: Expr + by: None | list[Expr] + options: Any + + +@dataclass(slots=True) +class Cast(Expr): + dtype: Any + column: Expr + + +@dataclass(slots=True) +class Agg(Expr): + column: Expr + name: str + options: Any + + +@dataclass(slots=True) +class BinOp(Expr): + left: Expr + right: Expr + op: Any diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py new file mode 100644 index 00000000000..4009e5ffb04 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +""" +DSL nodes for the LogicalPlan of polars. + +An IR node is either a source, normal, or a sink. 
Respectively they +can be considered as functions: + +- source: `IO () -> DataFrame` +- normal: `DataFrame -> DataFrame` +- sink: `DataFrame -> IO ()` +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from cudf_polars.dsl.expr import Expr + + +__all__ = [ + "IR", + "PythonScan", + "Scan", + "Cache", + "DataFrameScan", + "Select", + "GroupBy", + "Join", + "HStack", + "Distinct", + "Sort", + "Slice", + "Filter", + "Projection", + "MapFunction", + "Union", + "HConcat", + "ExtContext", +] + + +@dataclass(slots=True) +class IR: + schema: dict + + +@dataclass(slots=True) +class PythonScan(IR): + options: Any + predicate: Expr | None + + +@dataclass(slots=True) +class Scan(IR): + typ: Any + paths: list[str] + file_options: Any + predicate: Expr | None + + +@dataclass(slots=True) +class Cache(IR): + key: int + value: IR + + +@dataclass(slots=True) +class DataFrameScan(IR): + df: Any + projection: list[str] + predicate: Expr | None + + +@dataclass(slots=True) +class Select(IR): + df: IR + cse: list[Expr] + expr: list[Expr] + + +@dataclass(slots=True) +class GroupBy(IR): + df: IR + agg_requests: list[Expr] + keys: list[Expr] + options: Any + + +@dataclass(slots=True) +class Join(IR): + left: IR + right: IR + left_on: list[Expr] + right_on: list[Expr] + options: Any + + +@dataclass(slots=True) +class HStack(IR): + df: IR + columns: list[Expr] + + +@dataclass(slots=True) +class Distinct(IR): + df: IR + options: Any + + +@dataclass(slots=True) +class Sort(IR): + df: IR + by: list[Expr] + options: Any + + +@dataclass(slots=True) +class Slice(IR): + df: IR + offset: int + length: int + + +@dataclass(slots=True) +class Filter(IR): + df: IR + mask: Expr + + +@dataclass(slots=True) +class Projection(IR): + df: IR + + +@dataclass(slots=True) +class MapFunction(IR): + df: IR + name: str + options: Any + + +@dataclass(slots=True) +class Union(IR): + dfs: list[IR] + + +@dataclass(slots=True) +class HConcat(IR): + dfs: list[IR] + + +@dataclass(slots=True) +class ExtContext(IR): + df: IR + extra: list[IR] diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py new file mode 100644 index 00000000000..b456e76e99f --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -0,0 +1,261 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Translate polars IR representation to ours.""" + +from __future__ import annotations + +from contextlib import AbstractContextManager, nullcontext +from typing import Any + +from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir + +from cudf_polars.dsl import expr, ir + +__all__ = ["translate_ir", "translate_expr"] + + +class set_node(AbstractContextManager): + __slots__ = ("n", "visitor") + + def __init__(self, visitor, n): + self.visitor = visitor + self.n = n + + def __enter__(self): + n = self.visitor.get_node() + self.visitor.set_node(self.n) + self.n = n + + def __exit__(self, *args): + self.visitor.set_node(self.n) + + +noop_context: nullcontext = nullcontext() + + +def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: + """ + Translate a polars-internal IR node to our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Optional node to start traversing from, if not provided uses + current polars-internal node. 
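+        (This is also how translation recurses into child plan nodes.)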
+ + Returns + ------- + Translated IR object + + Raises + ------ + NotImplementedError if we can't translate the nodes due to + unsupported functionality. + """ + ctx: AbstractContextManager = ( + set_node(visitor, n) if n is not None else noop_context + ) + with ctx: + node = visitor.view_current_node() + schema = visitor.get_schema() + if isinstance(node, pl_ir.PythonScan): + return ir.PythonScan( + schema, + node.options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + elif isinstance(node, pl_ir.Scan): + return ir.Scan( + schema, + node.scan_type, + node.paths, + node.file_options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + elif isinstance(node, pl_ir.Cache): + return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + elif isinstance(node, pl_ir.DataFrameScan): + return ir.DataFrameScan( + schema, + node.df, + node.projection, + translate_expr(visitor, n=node.selection) + if node.selection is not None + else None, + ) + elif isinstance(node, pl_ir.Select): + return ir.Select( + schema, + translate_ir(visitor, n=node.input), + [translate_expr(visitor, n=e) for e in node.cse_expr], + [translate_expr(visitor, n=e) for e in node.expr], + ) + elif isinstance(node, pl_ir.GroupBy): + return ir.GroupBy( + schema, + translate_ir(visitor, n=node.input), + [translate_expr(visitor, n=e) for e in node.aggs], + [translate_expr(visitor, n=e) for e in node.keys], + node.options, + ) + elif isinstance(node, pl_ir.Join): + return ir.Join( + schema, + translate_ir(visitor, n=node.input_left), + translate_ir(visitor, n=node.input_right), + [translate_expr(visitor, n=e) for e in node.left_on], + [translate_expr(visitor, n=e) for e in node.right_on], + node.options, + ) + elif isinstance(node, pl_ir.HStack): + return ir.HStack( + schema, + translate_ir(visitor, n=node.input), + [translate_expr(visitor, n=e) for e in node.exprs], + ) + elif isinstance(node, pl_ir.Distinct): + return ir.Distinct( + schema, + translate_ir(visitor, n=node.input), + node.options, + ) + elif isinstance(node, pl_ir.Sort): + return ir.Sort( + schema, + translate_ir(visitor, n=node.input), + [translate_expr(visitor, n=e) for e in node.by_column], + node.sort_options, + ) + elif isinstance(node, pl_ir.Slice): + return ir.Slice( + schema, translate_ir(visitor, n=node.input), node.offset, node.len + ) + elif isinstance(node, pl_ir.Filter): + return ir.Filter( + schema, + translate_ir(visitor, n=node.input), + translate_expr(visitor, n=node.predicate), + ) + elif isinstance(node, pl_ir.SimpleProjection): + return ir.Projection(schema, translate_ir(visitor, n=node.input)) + elif isinstance(node, pl_ir.MapFunction): + name, *options = node.function + return ir.MapFunction( + schema, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), + name, + options, + ) + elif isinstance(node, pl_ir.Union): + return ir.Union(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + elif isinstance(node, pl_ir.HConcat): + return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + elif isinstance(node, pl_ir.ExtContext): + return ir.ExtContext( + schema, + translate_ir(visitor, n=node.input), + [translate_ir(visitor, n=n) for n in node.contexts], + ) + else: + raise NotImplementedError( + f"No handler for LogicalPlan node with {type(node)=}" + ) + + +BOOLEAN_FUNCTIONS: frozenset[str] = frozenset() + + +def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: + """ + 
Translate a polars-internal expression IR into our representation. + + Parameters + ---------- + visitor + Polars NodeTraverser object + n + Node to translate, either an integer referencing a polars + internal node, or a named expression node. + + Returns + ------- + Translated IR object. + + Raises + ------ + NotImplementedError if any translation fails due to unsupported functionality. + """ + if isinstance(n, pl_expr.PyExprIR): + # TODO: type narrowing didn't work because PyExprIR is Unknown + assert not isinstance(n, int) + return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node)) + node = visitor.view_expression(n) + if isinstance(node, pl_expr.Function): + name, *options = node.function_data + if name in BOOLEAN_FUNCTIONS: + return expr.BooleanFunction( + name, + options, + [translate_expr(visitor, n=n) for n in node.input], + ) + else: + raise NotImplementedError(f"No handler for Expr function node with {name=}") + elif isinstance(node, pl_expr.Window): + # TODO: raise in groupby? + return expr.Window( + translate_expr(visitor, n=node.function), + [translate_expr(visitor, n=n) for n in node.partition_by] + if node.partition_by is not None + else None, + node.options, + ) + elif isinstance(node, pl_expr.Literal): + return expr.Literal(node.dtype, node.value) + elif isinstance(node, pl_expr.Sort): + # TODO: raise in groupby + return expr.Sort(translate_expr(visitor, n=node.expr), node.options) + elif isinstance(node, pl_expr.SortBy): + # TODO: raise in groupby + return expr.SortBy( + translate_expr(visitor, n=node.expr), + [translate_expr(visitor, n=n) for n in node.by], + node.descending, + ) + elif isinstance(node, pl_expr.Gather): + return expr.Gather( + translate_expr(visitor, n=node.expr), + translate_expr(visitor, n=node.idx), + ) + elif isinstance(node, pl_expr.Filter): + return expr.Filter( + translate_expr(visitor, n=node.input), + translate_expr(visitor, n=node.by), + ) + elif isinstance(node, pl_expr.Cast): + return expr.Cast(node.dtype, translate_expr(visitor, n=node.expr)) + elif isinstance(node, pl_expr.Column): + return expr.Column(node.name) + elif isinstance(node, pl_expr.Agg): + return expr.Agg( + translate_expr(visitor, n=node.arguments), + node.name, + node.options, + ) + elif isinstance(node, pl_expr.BinaryExpr): + return expr.BinOp( + translate_expr(visitor, n=node.left), + translate_expr(visitor, n=node.right), + node.op, + ) + elif isinstance(node, pl_expr.Len): + return expr.Len() + else: + raise NotImplementedError(f"No handler for expression node with {type(node)=}") From 8ac43478f47e1010df33a6eefa5fab7842653e24 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 May 2024 16:50:06 +0000 Subject: [PATCH 03/56] Add some container objects --- .../cudf_polars/containers/__init__.py | 8 +++ .../cudf_polars/containers/column.py | 54 +++++++++++++++++++ .../cudf_polars/containers/dataframe.py | 50 +++++++++++++++++ .../cudf_polars/containers/scalar.py | 25 +++++++++ 4 files changed, 137 insertions(+) create mode 100644 python/cudf_polars/cudf_polars/containers/__init__.py create mode 100644 python/cudf_polars/cudf_polars/containers/column.py create mode 100644 python/cudf_polars/cudf_polars/containers/dataframe.py create mode 100644 python/cudf_polars/cudf_polars/containers/scalar.py diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py new file mode 100644 index 00000000000..c8b444389bd --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -0,0 +1,8 
@@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Containers of concrete data.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py new file mode 100644 index 00000000000..1d7f00435cf --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""A column, with some properties.""" + +from __future__ import annotations + +import cudf._lib.pylibcudf as plc + +__all__: list[str] = ["Column"] + + +class Column: + """A column, a name, and sortedness.""" + + __slots__ = ("obj", "name", "is_sorted", "order", "null_order") + obj: plc.Column + name: str + is_sorted: plc.types.Sorted + order: plc.types.Order + null_order: plc.types.NullOrder + + def __init__(self, column: plc.Column, name: str): + self.obj = column + self.name = name + self.is_sorted = plc.types.Sorted.NO + + def set_sorted( + self, + is_sorted: plc.types.Sorted, + order: plc.types.Order, + null_order: plc.types.NullOrder, + ) -> Column: + """ + Return a new column sharing data with sortedness set. + + Parameters + ---------- + is_sorted + Is the column sorted + order + The order if sorted + null_order + Where nulls sort, if sorted + + Returns + ------- + New column sharing data. + """ + obj = Column(self.obj, self.name) + obj.is_sorted = is_sorted + obj.order = order + obj.null_order = null_order + return obj diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py new file mode 100644 index 00000000000..9f368c76626 --- /dev/null +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+
+"""A dataframe, with some properties."""
+
+from __future__ import annotations
+
+import itertools
+from typing import TYPE_CHECKING
+
+import cudf._lib.pylibcudf as plc
+
+if TYPE_CHECKING:
+    from cudf_polars.containers.column import Column
+    from cudf_polars.containers.scalar import Scalar
+
+__all__: list[str] = ["DataFrame"]
+
+
+class DataFrame:
+    """A representation of a dataframe."""
+
+    __slots__ = ("columns", "scalars", "names", "scalar_names", "table")
+    columns: list[Column]
+    scalars: list[Scalar]
+    names: dict[str, int]
+    scalar_names: frozenset[str]
+    table: plc.Table | None
+
+    def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None:
+        self.names = dict(zip((c.name for c in columns), itertools.count(0))) | dict(
+            zip((s.name for s in scalars), itertools.count(0))
+        )
+        self.scalar_names = frozenset(s.name for s in scalars)
+        self.columns = columns
+        self.scalars = scalars
+        if len(scalars) == 0:
+            self.table = plc.Table(columns)
+        else:
+            self.table = None
+
+    __iter__ = None
+
+    def __getitem__(self, name: str) -> Column | Scalar:
+        """Return column with given name."""
+        i = self.names[name]
+        if name in self.scalar_names:
+            return self.scalars[i]
+        else:
+            return self.columns[i]
diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py
new file mode 100644
index 00000000000..a9b59a3218c
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/containers/scalar.py
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""A scalar, with some properties."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import cudf._lib.pylibcudf as plc
+
+__all__: list[str] = ["Scalar"]
+
+
+class Scalar:
+    """A scalar, and a name."""
+
+    __slots__ = ("obj", "name")
+    obj: plc.Scalar
+    name: str
+
+    def __init__(self, scalar: plc.Scalar, name: str):
+        self.obj = scalar
+        self.name = name
From 4ab983e017970a8eaa2e60badfa1b831c4f7a716 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Wed, 8 May 2024 16:54:19 +0000
Subject: [PATCH 04/56] WIP: really, fleshing out some evaluation

---
 python/cudf_polars/cudf_polars/dsl/ir.py | 81 ++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 4009e5ffb04..f796e57b0ab 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -17,6 +17,10 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
+from typing_extensions import assert_never
+
+import cudf._lib.pylibcudf as plc
+import cudf_polars.dsl.expr as expr
 
 if TYPE_CHECKING:
     from cudf_polars.dsl.expr import Expr
@@ -54,6 +58,9 @@ class PythonScan(IR):
     options: Any
     predicate: Expr | None
 
+    def evaluate(self):
+        raise NotImplementedError
+
 
 @dataclass(slots=True)
 class Scan(IR):
@@ -62,6 +69,36 @@ class Scan(IR):
     file_options: Any
     predicate: Expr | None
 
+    def __post_init__(self):
+        if self.file_options.n_rows is not None:
+            raise NotImplementedError("row limit in scan")
+        if self.typ not in ("csv", "parquet"):
+            raise NotImplementedError(f"Unhandled scan type: {self.typ}")
+    def evaluate(self):
+        options = self.file_options
+        n_rows = options.n_rows
+        with_columns = options.with_columns
+        row_index = options.row_index
+        assert n_rows is None
+        if self.typ == "csv":
+            df = cudf.concat(
[cudf.read_csv(p, usecols=with_columns) for p in self.paths] + ) + elif self.typ == "parquet": + df = cudf.read_parquet(self.paths, columns=with_columns) + else: + assert_never(self.typ) + if row_index is not None: + name, offset = row_index + dtype = self.schema[name] + index = as_column( + ..., dtype=dtype + ) + + + + + @dataclass(slots=True) class Cache(IR): @@ -90,6 +127,42 @@ class GroupBy(IR): keys: list[Expr] options: Any + @staticmethod + def check_agg(agg: Expr) -> int: + """ + Determine if we can handle an aggregation expression. + + Parameters + ---------- + agg + Expression to check + + Returns + ------- + depth of nesting + + Raises + ------ + NotImplementedError for unsupported expression nodes. + """ + if isinstance(agg, expr.Agg): + if agg.name == "implode": + raise NotImplementedError("implode in groupby") + return 1 + GroupBy.check_agg(agg.column) + elif isinstance(agg, (expr.Len, expr.Column, expr.Literal)): + return 0 + elif isinstance(agg, expr.BinOp): + return max(GroupBy.check_agg(agg.left), GroupBy.check_agg(agg.right)) + elif isinstance(agg, expr.Cast): + return GroupBy.check_agg(agg.column) + else: + raise NotImplementedError(f"No handler for {agg=}") + + def __post_init__(self): + """Check whether all the aggregations are implemented.""" + if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): + raise NotImplementedError("Nested aggregations in groupby") + @dataclass(slots=True) class Join(IR): @@ -99,6 +172,14 @@ class Join(IR): right_on: list[Expr] options: Any + def __post_init__(self): + """Raise for unsupported options.""" + how, coalesce = self.options[0], self.options[-1] + if how == "cross": + raise NotImplementedError("cross join not implemented") + if how == "outer" and not coalesce: + raise NotImplementedError("non-coalescing outer join") + @dataclass(slots=True) class HStack(IR): From 1981a3dce23d4babab20373a7d3bac61097818c9 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 9 May 2024 11:27:09 +0000 Subject: [PATCH 05/56] Flesh out more container stuff --- .../cudf_polars/containers/__init__.py | 6 +- .../cudf_polars/containers/column.py | 28 ++++-- .../cudf_polars/containers/dataframe.py | 73 ++++++++++++++- .../cudf_polars/cudf_polars/utils/__init__.py | 8 ++ .../cudf_polars/cudf_polars/utils/dtypes.py | 89 +++++++++++++++++++ 5 files changed, 192 insertions(+), 12 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/utils/__init__.py create mode 100644 python/cudf_polars/cudf_polars/utils/dtypes.py diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index c8b444389bd..ef9d9ca61b6 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,4 +5,8 @@ from __future__ import annotations -__all__: list[str] = [] +__all__: list[str] = ["DataFrame", "Column", "Scalar"] + +from cudf_polars.containers.column import Column +from cudf_polars.containers.dataframe import DataFrame +from cudf_polars.containers.scalar import Scalar diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 1d7f00435cf..efcd2e0da20 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -5,8 +5,13 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import cudf._lib.pylibcudf as plc +if TYPE_CHECKING: + from typing_extensions import Self + __all__: list[str] = 
["Column"] @@ -25,14 +30,22 @@ def __init__(self, column: plc.Column, name: str): self.name = name self.is_sorted = plc.types.Sorted.NO + def with_metadata(self, *, like: Column) -> Self: + """Copy metadata from a column onto self.""" + self.is_sorted = like.is_sorted + self.order = like.order + self.null_order = like.null_order + return self + def set_sorted( self, + *, is_sorted: plc.types.Sorted, order: plc.types.Order, null_order: plc.types.NullOrder, - ) -> Column: + ) -> Self: """ - Return a new column sharing data with sortedness set. + Modify sortedness metadata in place. Parameters ---------- @@ -45,10 +58,9 @@ def set_sorted( Returns ------- - New column sharing data. + Self with metadata set. """ - obj = Column(self.obj, self.name) - obj.is_sorted = is_sorted - obj.order = order - obj.null_order = null_order - return obj + self.is_sorted = is_sorted + self.order = order + self.null_order = null_order + return self diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 9f368c76626..502817d652b 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -6,13 +6,19 @@ from __future__ import annotations import itertools +from functools import cached_property from typing import TYPE_CHECKING import cudf._lib.pylibcudf as plc +from cudf_polars.containers.column import Column +from cudf_polars.containers.scalar import Scalar + if TYPE_CHECKING: - from cudf_polars.containers.column import Column - from cudf_polars.containers.scalar import Scalar + from typing_extensions import Self + + import cudf + __all__: list[str] = ["DataFrame"] @@ -35,7 +41,7 @@ def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None: self.columns = columns self.scalars = scalars if len(scalars) == 0: - self.table = plc.Table(columns) + self.table = plc.Table([c.obj for c in columns]) else: self.table = None @@ -48,3 +54,64 @@ def __getitem__(self, name: str) -> Column | Scalar: return self.scalars[i] else: return self.columns[i] + + @cached_property + def num_rows(self): + """Number of rows.""" + if self.table is None: + raise ValueError("Number of rows of frame with scalars makes no sense") + return self.table.num_rows() + + @classmethod + def from_cudf(cls, df: cudf.DataFrame) -> Self: + """Create from a cudf dataframe.""" + return cls( + [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()], + [], + ) + + def with_columns(self, *columns: Column | Scalar) -> Self: + """ + Return a new dataframe with extra columns. + + Data is shared. 
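+
+        A sketch of intended use, with hypothetical ``mask`` and
+        ``total`` pylibcudf objects::
+
+            df2 = df.with_columns(Column(mask, "mask"), Scalar(total, "total"))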
+ """ + cols = [c for c in columns if isinstance(c, Column)] + scalars = [c for c in columns if isinstance(c, Scalar)] + return type(self)([*self.columns, *cols], [*self.scalars, *scalars]) + + def discard_columns(self, names: set[str]) -> Self: + """Drop columns by name.""" + return type(self)([c for c in self.columns if c not in names], self.scalars) + + def replace_columns(self, *columns: Column) -> Self: + """Return a new dataframe with columns replaced by name, maintaining order.""" + new = {c.name: c for c in columns} + if set(new).intersection(self.scalar_names): + raise ValueError("Cannot replace scalars") + if not set(new).issubset(self.names): + raise ValueError("Cannot replace with non-existing names") + return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) + + def rename_columns(self, mapping: dict[str, str]) -> Self: + """Rename some columns.""" + new_columns = [ + Column(c, mapping.get(c.name, c.name)).with_metadata(like=c) + for c in self.columns + ] + return type(self)(new_columns, self.scalars) + + def select_columns(self, names: set[str]) -> list[Column]: + """Select columns by name.""" + return [c for c in self.columns if c.name in names] + + def filter(self, mask: Column) -> Self: + """Return a filtered table given a mask.""" + table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj) + return type(self)( + [ + Column(new, old.name).with_metadata(like=old) + for old, new in zip(self.columns, table.columns()) + ], + [], + ) diff --git a/python/cudf_polars/cudf_polars/utils/__init__.py b/python/cudf_polars/cudf_polars/utils/__init__.py new file mode 100644 index 00000000000..6018209e1e8 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Utilities.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py new file mode 100644 index 00000000000..1ac8719b839 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Datatype utilities.""" + +from __future__ import annotations + +from functools import cache + +from typing_extensions import assert_never + +import polars as pl + +import cudf._lib.pylibcudf as plc + + +@cache +def from_polars(dtype: pl.DataType) -> plc.DataType: + """ + Convert a polars datatype to a pylibcudf one. + + Parameters + ---------- + dtype + Polars dtype to convert + + Returns + ------- + Matching pylibcudf DataType object. + + Raises + ------ + NotImplementedError for unsupported conversions. 
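+
+    For example, ``pl.Datetime("us")`` maps to
+    ``plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)``.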
+    """
+    if isinstance(dtype, pl.Boolean):
+        return plc.DataType(plc.TypeId.BOOL8)
+    elif isinstance(dtype, pl.Int8):
+        return plc.DataType(plc.TypeId.INT8)
+    elif isinstance(dtype, pl.Int16):
+        return plc.DataType(plc.TypeId.INT16)
+    elif isinstance(dtype, pl.Int32):
+        return plc.DataType(plc.TypeId.INT32)
+    elif isinstance(dtype, pl.Int64):
+        return plc.DataType(plc.TypeId.INT64)
+    elif isinstance(dtype, pl.UInt8):
+        return plc.DataType(plc.TypeId.UINT8)
+    elif isinstance(dtype, pl.UInt16):
+        return plc.DataType(plc.TypeId.UINT16)
+    elif isinstance(dtype, pl.UInt32):
+        return plc.DataType(plc.TypeId.UINT32)
+    elif isinstance(dtype, pl.UInt64):
+        return plc.DataType(plc.TypeId.UINT64)
+    elif isinstance(dtype, pl.Float32):
+        return plc.DataType(plc.TypeId.FLOAT32)
+    elif isinstance(dtype, pl.Float64):
+        return plc.DataType(plc.TypeId.FLOAT64)
+    elif isinstance(dtype, pl.Date):
+        return plc.DataType(plc.TypeId.TIMESTAMP_DAYS)
+    elif isinstance(dtype, pl.Time):
+        raise NotImplementedError("Time of day dtype not implemented")
+    elif isinstance(dtype, pl.Datetime):
+        if dtype.time_zone is not None:
+            raise NotImplementedError("Time zone support")
+        if dtype.time_unit == "ms":
+            return plc.DataType(plc.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype.time_unit == "us":
+            return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+        elif dtype.time_unit == "ns":
+            return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS)
+        else:
+            assert dtype.time_unit is not None
+            assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.Duration):
+        if dtype.time_unit == "ms":
+            return plc.DataType(plc.TypeId.DURATION_MILLISECONDS)
+        elif dtype.time_unit == "us":
+            return plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+        elif dtype.time_unit == "ns":
+            return plc.DataType(plc.TypeId.DURATION_NANOSECONDS)
+        else:
+            assert dtype.time_unit is not None
+            assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.String):
+        return plc.DataType(plc.TypeId.STRING)
+    elif isinstance(dtype, pl.Null):
+        # TODO: Hopefully
+        return plc.DataType(plc.TypeId.EMPTY)
+    else:
+        raise NotImplementedError(f"{dtype=} conversion not supported")
From 700f0757a4ab3495f039b767c3b228443ffeef05 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 9 May 2024 11:29:50 +0000
Subject: [PATCH 06/56] WIP: More fleshing out evaluation

---
 python/cudf_polars/cudf_polars/dsl/expr.py    |  14 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      | 198 ++++++++++++++++--
 .../cudf_polars/cudf_polars/dsl/translate.py  |   4 +-
 3 files changed, 192 insertions(+), 24 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index affc17d3de0..3ec0223b7a2 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -16,13 +16,16 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Any
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from cudf_polars.containers import Column, DataFrame
 
 __all__ = [
     "Expr",
     "NamedExpr",
     "Literal",
-    "Column",
+    "Col",
     "BooleanFunction",
     "Sort",
     "SortBy",
@@ -37,7 +40,10 @@
 
 @dataclass(slots=True)
 class Expr:
-    pass
+    # TODO: return type is a lie for Literal
+    def evaluate(self, context: DataFrame) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        raise NotImplementedError
 
 
 @dataclass(slots=True)
@@ -53,7 +59,7 @@ class Literal(Expr):
 
 
 @dataclass(slots=True)
-class Column(Expr):
+class Col(Expr):
     name: str
 
 
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py
b/python/cudf_polars/cudf_polars/dsl/ir.py index f796e57b0ab..f7d5b56e637 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -16,13 +16,24 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from functools import cache +from typing import TYPE_CHECKING, Any, Callable + +import pyarrow as pa from typing_extensions import assert_never +import polars as pl + +import cudf import cudf._lib.pylibcudf as plc + import cudf_polars.dsl.expr as expr +from cudf_polars.containers import Column, DataFrame +from cudf_polars.utils import dtypes if TYPE_CHECKING: + from typing import Literal + from cudf_polars.dsl.expr import Expr @@ -52,15 +63,16 @@ class IR: schema: dict + def evaluate(self) -> DataFrame: + """Evaluate and return a dataframe.""" + raise NotImplementedError + @dataclass(slots=True) class PythonScan(IR): options: Any predicate: Expr | None - def evaluate(self): - raise NotImplementedError - @dataclass(slots=True) class Scan(IR): @@ -70,34 +82,49 @@ class Scan(IR): predicate: Expr | None def __post_init__(self): + """Validate preconditions.""" if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") - def evaluate(self): + + def evaluate(self) -> DataFrame: + """Evaluate and return a dataframe.""" options = self.file_options n_rows = options.n_rows with_columns = options.with_columns row_index = options.row_index assert n_rows is None if self.typ == "csv": - df = cudf.concat( - [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + df = DataFrame.from_cudf( + cudf.concat( + [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + ) ) elif self.typ == "parquet": - df = cudf.read_parquet(self.paths, columns=with_columns) + df = DataFrame.from_cudf( + cudf.read_parquet(self.paths, columns=with_columns) + ) else: assert_never(self.typ) if row_index is not None: name, offset = row_index - dtype = self.schema[name] - index = as_column( - ..., dtype=dtype + dtype = dtypes.from_polars(self.schema[name]) + step = plc.interop.from_arrow(pa.scalar(1), data_type=dtype) + init = plc.interop.from_arrow(pa.scalar(offset), data_type=dtype) + index = Column( + plc.filling.sequence(df.num_rows(), init, step), name + ).set_sorted( + is_sorted=plc.types.Sorted.YES, + order=plc.types.Order.ASCENDING, + null_order=plc.types.null_order.AFTER, ) - - - - + df = df.with_columns(index) + if self.predicate is None: + return df + else: + mask = self.predicate.evaluate(df) + return df.filter(mask) @dataclass(slots=True) @@ -112,6 +139,34 @@ class DataFrameScan(IR): projection: list[str] predicate: Expr | None + def evaluate(self) -> DataFrame: + """Evaluate and return a dataframe.""" + pdf = pl.DataFrame._from_pydf(self.df) + if self.projection is not None: + pdf = pdf.select(self.projection) + # TODO: goes away when libcudf supports large strings + table = pdf.to_arrow() + schema = table.schema + for i, field in enumerate(schema): + if field.type == pa.large_string(): + # TODO: Nested types + schema = schema.set(i, pa.field(field.name, pa.string())) + table = table.cast(schema) + df = DataFrame( + [ + Column(col, name) + for name, col in zip( + self.schema.keys(), plc.interop.from_arrow(table).columns() + ) + ], + [], + ) + if self.predicate is not None: + mask = self.predicate.evaluate(df) + return df.filter(mask) + else: + return df + @dataclass(slots=True) class 
Select(IR): @@ -119,6 +174,13 @@ class Select(IR): cse: list[Expr] expr: list[Expr] + def evaluate(self): + """Evaluate and return a dataframe.""" + df = self.df.evaluate() + for e in self.cse: + df = df.with_columns(e.evaluate(df)) + return DataFrame([e.evaluate(df) for e in self.expr], []) + @dataclass(slots=True) class GroupBy(IR): @@ -174,11 +236,109 @@ class Join(IR): def __post_init__(self): """Raise for unsupported options.""" - how, coalesce = self.options[0], self.options[-1] - if how == "cross": + if self.options[0] == "cross": raise NotImplementedError("cross join not implemented") - if how == "outer" and not coalesce: - raise NotImplementedError("non-coalescing outer join") + + @cache + @staticmethod + def _joiners( + how: Literal["inner", "left", "outer", "leftsemi", "leftanti"], + ) -> tuple[ + Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None + ]: + if how == "inner": + return ( + plc.join.inner_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + elif how == "left": + return ( + plc.join.left_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + plc.copying.OutOfBoundsPolicy.NULLIFY, + ) + elif how == "outer": + return ( + plc.join.full_join, + plc.copying.OutOfBoundsPolicy.NULLIFY, + plc.copying.OutOfBoundsPolicy.NULLIFY, + ) + elif how == "leftsemi": + return ( + plc.join.left_semi_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + None, + ) + elif how == "leftanti": + return ( + plc.join.left_anti_join, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + None, + ) + else: + assert_never(how) + + def evaluate(self) -> DataFrame: + """Evaluate and return a dataframe.""" + left = self.left.evaluate() + right = self.right.evaluate() + left_on = DataFrame([e.evaluate(left) for e in self.left_on], []) + right_on = DataFrame([e.evaluate(right) for e in self.right_on], []) + how, join_nulls, zlice, suffix, coalesce = self.options + null_equality = ( + plc.types.NullEquality.EQUAL + if join_nulls + else plc.types.NullEquality.UNEQUAL + ) + suffix = "_right" if suffix is None else suffix + join_fn, left_policy, right_policy = Join._joiners(how) + if right_policy is None: + # Semi join + lg = join_fn(left_on.table, right_on.table, null_equality) + left = left.replace_columns(*left_on.columns) + table = plc.copying.gather(left.table, lg, left_policy) + result = DataFrame( + [ + Column(c, col.name) + for col, c in zip(left_on.columns, table.columns()) + ], + [], + ) + else: + lg, rg = join_fn(left_on, right_on, null_equality) + left = left.replace_columns(*left_on.columns) + right = right.replace_columns(*right_on.columns) + if coalesce and how != "outer": + right = right.discard_columns(set(right_on.names)) + left = DataFrame( + plc.copying.gather(left.table, lg, left_policy).columns(), [] + ) + right = DataFrame( + plc.copying.gather(right.table, rg, right_policy).columns(), [] + ) + if coalesce and how == "outer": + left.replace_columns( + *( + Column( + plc.replace.replace_nulls(left_col.obj, right_col.obj), + left_col.name, + ) + for left_col, right_col in zip( + left.select_columns(set(left_on.names)), + right.select_columns(set(right_on.names)), + ) + ) + ) + right.discard_columns(set(right_on.names)) + right = right.rename_columns( + {name: f"{name}{suffix}" for name in right.names if name in left.names} + ) + result = left.with_columns(*right.columns) + if zlice is not None: + raise NotImplementedError("slicing") + else: + return result @dataclass(slots=True) diff --git 
a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b456e76e99f..9d6020ee6f4 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -16,6 +16,8 @@ class set_node(AbstractContextManager): + """Run a block with current node set in the visitor.""" + __slots__ = ("n", "visitor") def __init__(self, visitor, n): @@ -242,7 +244,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: elif isinstance(node, pl_expr.Cast): return expr.Cast(node.dtype, translate_expr(visitor, n=node.expr)) elif isinstance(node, pl_expr.Column): - return expr.Column(node.name) + return expr.Col(node.name) elif isinstance(node, pl_expr.Agg): return expr.Agg( translate_expr(visitor, n=node.arguments), From 9c303bc32246e0b8bfc3644e4fa566f29459fa67 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 9 May 2024 17:44:00 +0000 Subject: [PATCH 07/56] WIP: More fleshing out Still need to port the expression eval --- .../cudf_polars/containers/column.py | 15 +- .../cudf_polars/containers/dataframe.py | 69 ++++++-- python/cudf_polars/cudf_polars/dsl/ir.py | 163 ++++++++++++++---- .../cudf_polars/cudf_polars/utils/sorting.py | 44 +++++ 4 files changed, 236 insertions(+), 55 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/utils/sorting.py diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index efcd2e0da20..e34a1a7726e 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -30,12 +30,15 @@ def __init__(self, column: plc.Column, name: str): self.name = name self.is_sorted = plc.types.Sorted.NO - def with_metadata(self, *, like: Column) -> Self: - """Copy metadata from a column onto self.""" - self.is_sorted = like.is_sorted - self.order = like.order - self.null_order = like.null_order - return self + def rename(self, name: str) -> Column: + """Return a new column sharing data with a new name.""" + return type(self)(self.obj, name).with_sorted(like=self) + + def with_sorted(self, *, like: Column) -> Self: + """Copy sortedness properties from a column onto self.""" + return self.set_sorted( + is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + ) def set_sorted( self, diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 502817d652b..c29494debd5 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -55,6 +55,16 @@ def __getitem__(self, name: str) -> Column | Scalar: else: return self.columns[i] + @cached_property + def column_names(self) -> list[str]: + """Return a list of the column names.""" + return [c.name for c in self.columns] + + @cached_property + def num_columns(self): + """Number of columns.""" + return len(self.columns) + @cached_property def num_rows(self): """Number of rows.""" @@ -70,6 +80,22 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: [], ) + @classmethod + def from_table(cls, table: plc.Table, names: list[str]) -> Self: + """Create from a pylibcudf table.""" + if table.num_columns != len(names): + raise ValueError("Mismatching name and table length.") + return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) + + def with_sorted(self, *, like: DataFrame) -> Self: + """Copy sortedness from a dataframe onto self.""" + if 
like.column_names != self.column_names: + raise ValueError("Can only copy from identically named frame") + self.columns = [ + c.with_sorted(like=other) for c, other in zip(self.columns, like.columns) + ] + return self + def with_columns(self, *columns: Column | Scalar) -> Self: """ Return a new dataframe with extra columns. @@ -85,7 +111,7 @@ def discard_columns(self, names: set[str]) -> Self: return type(self)([c for c in self.columns if c not in names], self.scalars) def replace_columns(self, *columns: Column) -> Self: - """Return a new dataframe with columns replaced by name, maintaining order.""" + """Return a new dataframe with columns replaced by name.""" new = {c.name: c for c in columns} if set(new).intersection(self.scalar_names): raise ValueError("Cannot replace scalars") @@ -95,11 +121,9 @@ def replace_columns(self, *columns: Column) -> Self: def rename_columns(self, mapping: dict[str, str]) -> Self: """Rename some columns.""" - new_columns = [ - Column(c, mapping.get(c.name, c.name)).with_metadata(like=c) - for c in self.columns - ] - return type(self)(new_columns, self.scalars) + return type(self)( + [c.rename(mapping.get(c.name, c.name)) for c in self.columns], self.scalars + ) def select_columns(self, names: set[str]) -> list[Column]: """Select columns by name.""" @@ -108,10 +132,29 @@ def select_columns(self, names: set[str]) -> list[Column]: def filter(self, mask: Column) -> Self: """Return a filtered table given a mask.""" table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj) - return type(self)( - [ - Column(new, old.name).with_metadata(like=old) - for old, new in zip(self.columns, table.columns()) - ], - [], - ) + return type(self).from_table(table, self.column_names).with_sorted(like=self) + + def slice(self, zlice: tuple[int, int] | None) -> Self: + """ + Slice a dataframe. + + Parameters + ---------- + zlice + optional, tuple of start and length, negative values of start + treated as for python indexing. If not provided, returns self. + + Returns + ------- + New dataframe (if zlice is not None) other self (if it is) + """ + if zlice is None: + return self + start, length = zlice + if start < 0: + start += self.num_rows + # Polars slice takes an arbitrary positive integer and slice + # to the end of the frame if it is larger. 
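+        # e.g. slicing a 10-row frame with zlice=(-3, 5) clamps to rows [7, 10)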
+ end = min(start + length, self.num_rows) + (table,) = plc.copying.slice(self.table, [start, end]) + return type(self).from_table(table, self.column_names).with_sorted(like=self) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index f7d5b56e637..ce069e1ce5f 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -17,7 +17,7 @@ from dataclasses import dataclass from functools import cache -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Any, Callable, ClassVar import pyarrow as pa from typing_extensions import assert_never @@ -29,7 +29,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame -from cudf_polars.utils import dtypes +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from typing import Literal @@ -63,7 +63,7 @@ class IR: schema: dict - def evaluate(self) -> DataFrame: + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" raise NotImplementedError @@ -88,7 +88,7 @@ def __post_init__(self): if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") - def evaluate(self) -> DataFrame: + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options n_rows = options.n_rows @@ -132,6 +132,13 @@ class Cache(IR): key: int value: IR + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + try: + return cache[self.key] + except KeyError: + return cache.setdefault(self.key, self.value.evaluate(cache=cache)) + @dataclass(slots=True) class DataFrameScan(IR): @@ -139,7 +146,7 @@ class DataFrameScan(IR): projection: list[str] predicate: Expr | None - def evaluate(self) -> DataFrame: + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: @@ -152,14 +159,8 @@ def evaluate(self) -> DataFrame: # TODO: Nested types schema = schema.set(i, pa.field(field.name, pa.string())) table = table.cast(schema) - df = DataFrame( - [ - Column(col, name) - for name, col in zip( - self.schema.keys(), plc.interop.from_arrow(table).columns() - ) - ], - [], + df = DataFrame.from_table( + plc.interop.from_arrow(table), list(self.schema.keys()) ) if self.predicate is not None: mask = self.predicate.evaluate(df) @@ -174,9 +175,9 @@ class Select(IR): cse: list[Expr] expr: list[Expr] - def evaluate(self): + def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" - df = self.df.evaluate() + df = self.df.evaluate(cache=cache) for e in self.cse: df = df.with_columns(e.evaluate(df)) return DataFrame([e.evaluate(df) for e in self.expr], []) @@ -235,7 +236,7 @@ class Join(IR): options: Any def __post_init__(self): - """Raise for unsupported options.""" + """Validate preconditions.""" if self.options[0] == "cross": raise NotImplementedError("cross join not implemented") @@ -279,10 +280,10 @@ def _joiners( else: assert_never(how) - def evaluate(self) -> DataFrame: + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - left = self.left.evaluate() - right = self.right.evaluate() + left = self.left.evaluate(cache=cache) + right = self.right.evaluate(cache=cache) left_on = DataFrame([e.evaluate(left) for e in self.left_on], []) 
right_on = DataFrame([e.evaluate(right) for e in self.right_on], []) how, join_nulls, zlice, suffix, coalesce = self.options @@ -298,24 +299,18 @@ def evaluate(self) -> DataFrame: lg = join_fn(left_on.table, right_on.table, null_equality) left = left.replace_columns(*left_on.columns) table = plc.copying.gather(left.table, lg, left_policy) - result = DataFrame( - [ - Column(c, col.name) - for col, c in zip(left_on.columns, table.columns()) - ], - [], - ) + result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on, right_on, null_equality) left = left.replace_columns(*left_on.columns) right = right.replace_columns(*right_on.columns) if coalesce and how != "outer": right = right.discard_columns(set(right_on.names)) - left = DataFrame( - plc.copying.gather(left.table, lg, left_policy).columns(), [] + left = DataFrame.from_table( + plc.copying.gather(left.table, lg, left_policy), left.column_names ) - right = DataFrame( - plc.copying.gather(right.table, rg, right_policy).columns(), [] + right = DataFrame.from_table( + plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how == "outer": left.replace_columns( @@ -335,10 +330,7 @@ def evaluate(self) -> DataFrame: {name: f"{name}{suffix}" for name in right.names if name in left.names} ) result = left.with_columns(*right.columns) - if zlice is not None: - raise NotImplementedError("slicing") - else: - return result + return result.slice(zlice) @dataclass(slots=True) @@ -346,18 +338,117 @@ class HStack(IR): df: IR columns: list[Expr] + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return df.with_columns(*(c.evaluate(df) for c in self.columns)) + @dataclass(slots=True) class Distinct(IR): df: IR - options: Any + keep: plc.stream_compaction.DuplicateKeepOption + subset: set[str] | None + zlice: tuple[int, int] | None + stable: bool + + _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { + "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + "none": plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, + } + + def __init__(self, schema: dict, df: IR, options: Any): + self.schema = schema + self.df = df + (keep, subset, maintain_order, zlice) = options + self.keep = Distinct._KEEP_MAP[keep] + self.subset = set(subset) if subset is not None else None + self.stable = maintain_order + self.zlice = zlice + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + if self.subset is None: + indices = list(range(df.num_columns)) + else: + indices = [i for i, k in enumerate(df.names) if k in self.subset] + keys_sorted = all(c.is_sorted for c in df.columns) + if keys_sorted: + table = plc.stream_compaction.unique( + df.table, + indices, + self.keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if self.stable + else plc.stream_compaction.distinct + ) + table = distinct( + df.table, + indices, + self.keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + result = DataFrame( + [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], [] + ) + if keys_sorted or self.stable: + result = result.with_sorted(like=df) + return result.slice(self.zlice) @dataclass(slots=True) 
class Sort(IR): df: IR by: list[Expr] - options: Any + do_sort: Callable[..., plc.Table] + zlice: tuple[int, int] | None + order: list[plc.types.Order] + null_order: list[plc.types.NullOrder] + + def __init__(self, schema: dict, df: IR, by: list[Expr], options: Any): + self.schema = schema + self.df = df + self.by = by + stable, nulls_last, descending = options + self.order, self.null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + self.do_sort = ( + plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + ) + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + sort_keys = [k.evaluate(df) for k in self.by] + keys_in_result = [ + i + for k in sort_keys + if (i := df.names.get(k.name)) is not None and k is df.columns[i] + ] + table = self.do_sort( + df.table, + plc.Table([k.obj for k in sort_keys]), + self.order, + self.null_order, + ) + columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)] + # If a sort key is in the result table, set the sortedness property + for idx in keys_in_result: + columns[idx] = columns[idx].set_sorted( + is_sorted=plc.types.Sorted.YES, + order=self.order[idx], + null_order=self.null_order[idx], + ) + return DataFrame(columns, []) @dataclass(slots=True) diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py new file mode 100644 index 00000000000..fed1cd35416 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Sorting utilities.""" + +from __future__ import annotations + +import cudf._lib.pylibcudf as plc + + +def sort_order( + descending: list[bool], *, nulls_last: bool, num_keys: int +) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: + """ + Produce sort order arguments. + + Parameters + ---------- + descending + List indicating order for each column + nulls_last + Should nulls sort last or first? 
+    num_keys
+        Number of sort keys
+
+    Returns
+    -------
+    tuple of column_order and null_precedence
+    suitable for passing to sort routines
+    """
+    # Mimicking polars broadcast handling of descending
+    if num_keys > (n := len(descending)) and n == 1:
+        descending = [descending[0]] * num_keys
+    column_order = [
+        plc.types.Order.DESCENDING if d else plc.types.Order.ASCENDING
+        for d in descending
+    ]
+    null_precedence = []
+    for asc in column_order:
+        if (asc == plc.types.Order.ASCENDING) ^ (not nulls_last):
+            null_precedence.append(plc.types.NullOrder.AFTER)
+        elif (asc == plc.types.Order.ASCENDING) ^ nulls_last:
+            null_precedence.append(plc.types.NullOrder.BEFORE)
+    return column_order, null_precedence
From 688d8ef9a8b79a66f710e7a2528cef92b44479b2 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 10 May 2024 13:24:37 +0000
Subject: [PATCH 08/56] WIP: more implementation

---
 .../cudf_polars/containers/dataframe.py       |  28 +++--
 python/cudf_polars/cudf_polars/dsl/ir.py      | 101 +++++++++++++++++-
 2 files changed, 117 insertions(+), 12 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index c29494debd5..fda4eb3617d 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -55,6 +55,16 @@ def __getitem__(self, name: str) -> Column | Scalar:
         else:
             return self.columns[i]
 
+    @cached_property
+    def column_names_set(self) -> set[str]:
+        """Return the column names as a set."""
+        return {c.name for c in self.columns}
+
     @cached_property
     def column_names(self) -> list[str]:
         """Return a list of the column names."""
@@ -87,28 +93,34 @@ def from_table(cls, table: plc.Table, names: list[str]) -> Self:
             raise ValueError("Mismatching name and table length.")
         return cls([Column(c, name) for c, name in zip(table.columns(), names)], [])
 
-    def with_sorted(self, *, like: DataFrame) -> Self:
+    def with_sorted(self, *, like: DataFrame, subset: set[str] | None = None) -> Self:
         """Copy sortedness from a dataframe onto self."""
         if like.column_names != self.column_names:
             raise ValueError("Can only copy from identically named frame")
+        subset = self.column_names_set if subset is None else subset
         self.columns = [
-            c.with_sorted(like=other) for c, other in zip(self.columns, like.columns)
+            c.with_sorted(like=other) if c.name in subset else c
+            for c, other in zip(self.columns, like.columns)
         ]
         return self
 
-    def with_columns(self, *columns: Column | Scalar) -> Self:
+    def with_columns(self, columns: list[Column]) -> Self:
         """
         Return a new dataframe with extra columns.
 
         Data is shared.
""" - cols = [c for c in columns if isinstance(c, Column)] - scalars = [c for c in columns if isinstance(c, Scalar)] - return type(self)([*self.columns, *cols], [*self.scalars, *scalars]) + return type(self)([*self.columns, *columns], self.scalars) def discard_columns(self, names: set[str]) -> Self: """Drop columns by name.""" - return type(self)([c for c in self.columns if c not in names], self.scalars) + return type(self)( + [c for c in self.columns if c.name not in names], self.scalars + ) + + def select(self, names: set[str]) -> Self: + """Select columns by name returning DataFrame.""" + return type(self)([c for c in self.columns if c.name in names], self.scalars) def replace_columns(self, *columns: Column) -> Self: """Return a new dataframe with columns replaced by name.""" diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index ce069e1ce5f..e0d794e0615 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -119,7 +119,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: order=plc.types.Order.ASCENDING, null_order=plc.types.null_order.AFTER, ) - df = df.with_columns(index) + df = df.with_columns([index]) if self.predicate is None: return df else: @@ -179,7 +179,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) for e in self.cse: - df = df.with_columns(e.evaluate(df)) + df = df.with_columns([e.evaluate(df)]) return DataFrame([e.evaluate(df) for e in self.expr], []) @@ -329,7 +329,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: right = right.rename_columns( {name: f"{name}{suffix}" for name in right.names if name in left.names} ) - result = left.with_columns(*right.columns) + result = left.with_columns(right.columns) return result.slice(zlice) @@ -341,7 +341,7 @@ class HStack(IR): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return df.with_columns(*(c.evaluate(df) for c in self.columns)) + return df.with_columns([c.evaluate(df) for c in self.columns]) @dataclass(slots=True) @@ -457,17 +457,32 @@ class Slice(IR): offset: int length: int + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return df.slice((self.offset, self.length)) + @dataclass(slots=True) class Filter(IR): df: IR mask: Expr + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return df.filter(self.mask.evaluate(df)) + @dataclass(slots=True) class Projection(IR): df: IR + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + return df.select(set(self.schema.keys())) + @dataclass(slots=True) class MapFunction(IR): @@ -475,6 +490,84 @@ class MapFunction(IR): name: str options: Any + _NAMES: ClassVar[frozenset[str]] = frozenset( + [ + "drop_nulls", + "rechunk", + "merge_sorted", + "rename", + "explode", + ] + ) + + def __post_init__(self): + """Validate preconditions.""" + if self.name not in MapFunction._NAMES: + raise NotImplementedError(f"Unhandled map function {self.name}") + if self.name == "explode": + (to_explode,) = self.options + if len(to_explode) > 1: + # TODO: straightforward, but need to error check + # polars requires that all 
to-explode columns have the + # same sub-shapes + raise NotImplementedError("Explode with more than one column") + elif self.name == "merge_sorted": + assert isinstance(self.df, Union) + (key_column,) = self.options + if key_column not in self.df.dfs[0].schema: + raise ValueError(f"Key column {key_column} not found") + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + if self.name == "merge_sorted": + # merge_sorted operates on Union inputs + # but if we evaluate the Union then we can't unpick the + # pieces, so we dive inside and evaluate the pieces by hand + assert isinstance(self.df, Union) + first, *rest = (c.evaluate(cache=cache) for c in self.df.dfs) + (key_column,) = self.options + if not all(first.column_names == r.column_names for r in rest): + raise ValueError("DataFrame shapes/column names don't match") + # Already validated that key_column is in column names + index = first.column_names.index(key_column) + return DataFrame.from_table( + plc.merge.merge_sorted( + [first.table, *(df.table for df in rest)], + [index], + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.BEFORE], + ), + first.column_names, + ).with_sorted(like=first, subset={key_column}) + elif self.name == "rechunk": + # No-op in our data model + return self.df.evaluate(cache=cache) + elif self.name == "drop_nulls": + df = self.df.evaluate(cache=cache) + (subset,) = self.options + subset = set(subset) + indices = [i for i, name in enumerate(df.column_names) if name in subset] + return DataFrame.from_table( + plc.stream_compaction.drop_nulls(df.table, indices, len(indices)), + df.column_names, + ).with_sorted(like=df) + elif self.name == "rename": + df = self.df.evaluate(cache=cache) + # final tag is "swapping" which is useful for the + # optimiser (it blocks some pushdown operations) + old, new, _ = self.options + return df.rename_columns(dict(zip(old, new))) + elif self.name == "explode": + df = self.df.evaluate(cache=cache) + ((to_explode,),) = self.options + index = df.column_names.index(to_explode) + subset = df.column_names_set - {to_explode} + return DataFrame.from_table( + plc.lists.explode_outer(df.table, index), df.column_names + ).with_sorted(like=df, subset=subset) + else: + raise AssertionError("Should never be reached") + @dataclass(slots=True) class Union(IR): From f56525aff239ac3f097999436e92540272858a9d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 10 May 2024 13:42:01 +0000 Subject: [PATCH 09/56] WIP: simplify --- .../cudf_polars/containers/dataframe.py | 24 +++++--------- python/cudf_polars/cudf_polars/dsl/ir.py | 31 +++++++++++-------- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index fda4eb3617d..8cd2943853e 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,7 +5,6 @@ from __future__ import annotations -import itertools from functools import cached_property from typing import TYPE_CHECKING @@ -14,6 +13,8 @@ from cudf_polars.containers.column import Column if TYPE_CHECKING: + from collections.abc import Sequence + from typing_extensions import Self import cudf @@ -30,14 +31,10 @@ class DataFrame: __slots__ = ("columns", "scalars", "names", "scalar_names", "table") columns: list[Column] scalars: list[Scalar] - names: dict[str, int] scalar_names: frozenset[str] table: plc.Table | None def __init__(self, columns: 
list[Column], scalars: list[Scalar]) -> None: - self.names = dict(zip((c.name for c in columns), itertools.count(0))) | dict( - zip((s.name for s in columns), itertools.count(0)) - ) self.scalar_names = frozenset(s.name for s in scalars) self.columns = columns self.scalars = scalars @@ -48,14 +45,6 @@ def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None: __iter__ = None - def __getitem__(self, name: str) -> Column | Scalar: - """Return column with given name.""" - i = self.names[name] - if name in self.scalar_names: - return self.scalars[i] - else: - return self.columns[i] - @cached_property def column_names_set(self) -> set[str]: """Return the column names as a set.""" @@ -104,7 +93,7 @@ def with_sorted(self, *, like: DataFrame, subset: set[str] | None = None) -> Sel ] return self - def with_columns(self, columns: list[Column]) -> Self: + def with_columns(self, columns: Sequence[Column]) -> Self: """ Return a new dataframe with extra columns. @@ -118,16 +107,17 @@ def discard_columns(self, names: set[str]) -> Self: [c for c in self.columns if c.name not in names], self.scalars ) - def select(self, names: set[str]) -> Self: + def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" - return type(self)([c for c in self.columns if c.name in names], self.scalars) + want = set(names) + return type(self)([c for c in self.columns if c.name in want], self.scalars) def replace_columns(self, *columns: Column) -> Self: """Return a new dataframe with columns replaced by name.""" new = {c.name: c for c in columns} if set(new).intersection(self.scalar_names): raise ValueError("Cannot replace scalars") - if not set(new).issubset(self.names): + if not set(new).issubset(self.column_names_set): raise ValueError("Cannot replace with non-existing names") return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e0d794e0615..a8147549b28 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -61,7 +61,7 @@ @dataclass(slots=True) class IR: - schema: dict + schema: dict[str, Any] def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -305,7 +305,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: left = left.replace_columns(*left_on.columns) right = right.replace_columns(*right_on.columns) if coalesce and how != "outer": - right = right.discard_columns(set(right_on.names)) + right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( plc.copying.gather(left.table, lg, left_policy), left.column_names ) @@ -320,14 +320,18 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: left_col.name, ) for left_col, right_col in zip( - left.select_columns(set(left_on.names)), - right.select_columns(set(right_on.names)), + left.select_columns(left_on.column_names_set), + right.select_columns(right_on.column_names_set), ) ) ) - right.discard_columns(set(right_on.names)) + right.discard_columns(right_on.column_names_set) right = right.rename_columns( - {name: f"{name}{suffix}" for name in right.names if name in left.names} + { + name: f"{name}{suffix}" + for name in right.column_names + if name in left.column_names_set + } ) result = left.with_columns(right.columns) return result.slice(zlice) @@ -374,7 +378,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: if self.subset is 
None: indices = list(range(df.num_columns)) else: - indices = [i for i, k in enumerate(df.names) if k in self.subset] + indices = [i for i, k in enumerate(df.column_names) if k in self.subset] keys_sorted = all(c.is_sorted for c in df.columns) if keys_sorted: table = plc.stream_compaction.unique( @@ -429,10 +433,11 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) sort_keys = [k.evaluate(df) for k in self.by] + names = {c.name: i for i, c in enumerate(df.columns)} keys_in_result = [ i for k in sort_keys - if (i := df.names.get(k.name)) is not None and k is df.columns[i] + if (i := names.get(k.name)) is not None and k is df.columns[i] ] table = self.do_sort( df.table, @@ -442,11 +447,11 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)] # If a sort key is in the result table, set the sortedness property - for idx in keys_in_result: - columns[idx] = columns[idx].set_sorted( + for i in keys_in_result: + columns[i] = columns[i].set_sorted( is_sorted=plc.types.Sorted.YES, - order=self.order[idx], - null_order=self.null_order[idx], + order=self.order[i], + null_order=self.null_order[i], ) return DataFrame(columns, []) @@ -481,7 +486,7 @@ class Projection(IR): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return df.select(set(self.schema.keys())) + return df.select(list(self.schema.keys())) @dataclass(slots=True) From 2cb6f5031d4724075ea5cdcfe2d20807794f048a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 10 May 2024 14:53:06 +0000 Subject: [PATCH 10/56] WIP: Maybe done with eval of plan nodes --- .../cudf_polars/containers/dataframe.py | 23 ++++++------ python/cudf_polars/cudf_polars/dsl/ir.py | 36 +++++++++++++++++++ .../cudf_polars/cudf_polars/dsl/translate.py | 4 ++- 3 files changed, 51 insertions(+), 12 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 8cd2943853e..c30f8c10ca2 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -13,7 +13,7 @@ from cudf_polars.containers.column import Column if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Mapping, Sequence, Set from typing_extensions import Self @@ -34,10 +34,10 @@ class DataFrame: scalar_names: frozenset[str] table: plc.Table | None - def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None: + def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None: self.scalar_names = frozenset(s.name for s in scalars) - self.columns = columns - self.scalars = scalars + self.columns = list(columns) + self.scalars = list(scalars) if len(scalars) == 0: self.table = plc.Table([c.obj for c in columns]) else: @@ -46,9 +46,9 @@ def __init__(self, columns: list[Column], scalars: list[Scalar]) -> None: __iter__ = None @cached_property - def column_names_set(self) -> set[str]: + def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" - return {c.name for c in self.columns} + return frozenset(c.name for c in self.columns) @cached_property def column_names(self) -> list[str]: @@ -76,13 +76,14 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: ) @classmethod - def from_table(cls, table: plc.Table, 
names: list[str]) -> Self: + def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: """Create from a pylibcudf table.""" + # TODO: strict=True when we drop py39 if table.num_columns != len(names): raise ValueError("Mismatching name and table length.") return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) - def with_sorted(self, *, like: DataFrame, subset: set[str] | None = None) -> Self: + def with_sorted(self, *, like: DataFrame, subset: Set[str] | None = None) -> Self: """Copy sortedness from a dataframe onto self.""" if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") @@ -101,7 +102,7 @@ def with_columns(self, columns: Sequence[Column]) -> Self: """ return type(self)([*self.columns, *columns], self.scalars) - def discard_columns(self, names: set[str]) -> Self: + def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" return type(self)( [c for c in self.columns if c.name not in names], self.scalars @@ -121,13 +122,13 @@ def replace_columns(self, *columns: Column) -> Self: raise ValueError("Cannot replace with non-existing names") return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) - def rename_columns(self, mapping: dict[str, str]) -> Self: + def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" return type(self)( [c.rename(mapping.get(c.name, c.name)) for c in self.columns], self.scalars ) - def select_columns(self, names: set[str]) -> list[Column]: + def select_columns(self, names: Set[str]) -> list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index a8147549b28..078ad3e884c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -486,6 +486,7 @@ class Projection(IR): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) + # This can reorder things. return df.select(list(self.schema.keys())) @@ -577,14 +578,49 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Union(IR): dfs: list[IR] + zlice: tuple[int, int] | None + + def __post_init__(self): + """Validated preconditions.""" + schema = self.dfs[0].schema + if not all(s == schema for s in self.dfs[1:]): + raise ValueError("Schema mismatch") + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + dfs = [df.evaluate(cache=cache) for df in self.dfs] + return DataFrame.from_table( + plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names + ).slice(self.zlice) @dataclass(slots=True) class HConcat(IR): dfs: list[IR] + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + dfs = [df.evaluate(cache=cache) for df in self.dfs] + columns, scalars = zip(*((df.columns, df.scalars) for df in dfs)) + return DataFrame(columns, scalars) + @dataclass(slots=True) class ExtContext(IR): df: IR extra: list[IR] + + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + # TODO: polars optimizer doesn't do projection pushdown + # through extcontext AFAICT. + df = self.df.evaluate(cache=cache) + # extra contexts are added in order, if they have any + # overlapping column names, those are ignored. 
+ names = df.column_names_set.copy() + # TODO: scalars + for ir in self.extra: + extra = ir.evaluate(cache=cache).discard_columns(names) + names |= extra.column_names_set + df = df.with_columns(extra.columns) + return df diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 9d6020ee6f4..ff634948663 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -157,7 +157,9 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: options, ) elif isinstance(node, pl_ir.Union): - return ir.Union(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + return ir.Union( + schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + ) elif isinstance(node, pl_ir.HConcat): return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) elif isinstance(node, pl_ir.ExtContext): From c3e0a9207de94df026655234bbe0b3f7373b4ed6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 13 May 2024 14:52:06 +0000 Subject: [PATCH 11/56] WIP: expression evaluation --- .../cudf_polars/containers/dataframe.py | 6 +- .../cudf_polars/containers/scalar.py | 4 +- python/cudf_polars/cudf_polars/dsl/expr.py | 169 +++++++++++++++++- python/cudf_polars/cudf_polars/dsl/ir.py | 14 +- .../cudf_polars/cudf_polars/dsl/translate.py | 17 +- 5 files changed, 187 insertions(+), 23 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index c30f8c10ca2..2c05cee9dea 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -28,15 +28,13 @@ class DataFrame: """A representation of a dataframe.""" - __slots__ = ("columns", "scalars", "names", "scalar_names", "table") columns: list[Column] scalars: list[Scalar] - scalar_names: frozenset[str] table: plc.Table | None def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None: - self.scalar_names = frozenset(s.name for s in scalars) self.columns = list(columns) + self._column_map = {c.name: c for c in self.columns} self.scalars = list(scalars) if len(scalars) == 0: self.table = plc.Table([c.obj for c in columns]) @@ -116,8 +114,6 @@ def select(self, names: Sequence[str]) -> Self: def replace_columns(self, *columns: Column) -> Self: """Return a new dataframe with columns replaced by name.""" new = {c.name: c for c in columns} - if set(new).intersection(self.scalar_names): - raise ValueError("Cannot replace scalars") if not set(new).issubset(self.column_names_set): raise ValueError("Cannot replace with non-existing names") return type(self)([new.get(c.name, c) for c in self.columns], self.scalars) diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py index a9b59a3218c..fc97d0fd9c2 100644 --- a/python/cudf_polars/cudf_polars/containers/scalar.py +++ b/python/cudf_polars/cudf_polars/containers/scalar.py @@ -18,8 +18,6 @@ class Scalar: __slots__ = ("obj", "name") obj: plc.Scalar - name: str - def __init__(self, scalar: plc.Column, name: str): + def __init__(self, scalar: plc.Scalar): self.obj = scalar - self.name = name diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 3ec0223b7a2..376651f4124 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -15,11 +15,22 @@ from __future__ 
import annotations +import enum from dataclasses import dataclass -from typing import TYPE_CHECKING, Any +from enum import IntEnum +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa + +from polars.polars import _expr_nodes as pl_expr + +import cudf._lib.pylibcudf as plc + +from cudf_polars.containers import Column, Scalar +from cudf_polars.utils import sorting if TYPE_CHECKING: - from cudf_polars.containers import Column, DataFrame + from cudf_polars.containers import DataFrame __all__ = [ "Expr", @@ -38,10 +49,18 @@ ] +class ExecutionContext(IntEnum): + FRAME = enum.auto() + GROUPBY = enum.auto() + ROLLING = enum.auto() + + @dataclass(slots=True) class Expr: # TODO: return type is a lie for Literal - def evaluate(self, context: DataFrame) -> Column: + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: """Evaluate this expression given a dataframe for context.""" raise NotImplementedError @@ -51,21 +70,45 @@ class NamedExpr(Expr): name: str value: Expr + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + return Column(self.value.evaluate(df, context=context), self.name) + @dataclass(slots=True) class Literal(Expr): - dtype: Any + dtype: plc.Datatype value: Any + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + obj = plc.interop.from_arrow(pa.scalar(self.value), data_type=self.dtype) + return Scalar(obj) # type: ignore + @dataclass(slots=True) class Col(Expr): name: str + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + return df._column_map[self.name] + @dataclass(slots=True) class Len(Expr): - pass + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # TODO: type is wrong + return df.num_rows @dataclass(slots=True) @@ -80,12 +123,43 @@ class Sort(Expr): column: Expr options: Any + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column = self.column.evaluate(df, context=context) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + [descending], nulls_last=nulls_last, num_keys=1 + ) + do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort + table = do_sort(plc.Table([column], order, null_order)) + return Column(table.columns()[0], column.name).set_sorted( + is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0] + ) + @dataclass(slots=True) class SortBy(Expr): column: Expr by: list[Expr] - descending: list[bool] + options: Any + + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column = self.column.evaluate(df, context=context) + by = [b.evaluate(df, context=context) for b in self.by] + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(self.by) + ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table 
= do_sort( + plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order + ) + return Column(table.columns()[0], column.name) @dataclass(slots=True) @@ -93,12 +167,47 @@ class Gather(Expr): values: Expr indices: Expr + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values = self.values.evaluate(df, context=context) + indices = self.indices.evaluate(df, context=context) + lo, hi = plc.reduce.minmax(indices.obj) + lo = plc.interop.to_arrow(lo).as_py() + hi = plc.interop.to_arrow(hi).as_py() + n = df.num_rows + if hi >= n or lo < -n: + raise ValueError("gather indices are out of bounds") + if indices.obj.null_count(): + bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY + obj = plc.replace.replace_nulls( + indices.obj, + plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()), + ) + else: + bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK + obj = indices.obj + table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) + return Column(table.columns()[0], values.name) + @dataclass(slots=True) class Filter(Expr): values: Expr mask: Expr + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values = self.values.evaluate(df, context=context) + mask = self.mask.evaluate(df, context=context) + table = plc.stream_compaction.apply_boolean_mask( + plc.Table([values.obj]), mask.obj + ) + return Column(table.columns()[0], values.name).with_sorted(like=values) + @dataclass(slots=True) class Window(Expr): @@ -109,9 +218,18 @@ class Window(Expr): @dataclass(slots=True) class Cast(Expr): - dtype: Any + dtype: plc.DataType column: Expr + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column = self.column.evaluate(df, context=context) + return Column(plc.unary.cast(column, self.dtype), column.name).with_sorted( + like=column + ) + @dataclass(slots=True) class Agg(Expr): @@ -124,4 +242,39 @@ class Agg(Expr): class BinOp(Expr): left: Expr right: Expr - op: Any + op: plc.binaryop.BinaryOperator + dtype: plc.DataType + + _MAPPING: ClassVar[dict[pl_expr.PyOperator, plc.binaryop.BinaryOperator]] = { + pl_expr.PyOperator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.PyOperator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.PyOperator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.PyOperator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.PyOperator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.PyOperator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.PyOperator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.PyOperator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.PyOperator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.PyOperator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.PyOperator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.PyOperator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.PyOperator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.PyOperator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.PyOperator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.PyOperator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.PyOperator.Or: 
plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.PyOperator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.PyOperator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.PyOperator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + } + + def evaluate( + self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + left = self.left.evaluate(df, context=context) + right = self.right.evaluate(df, context=context) + return Column( + plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), + left.name, + ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 078ad3e884c..70e7d20bd22 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -29,7 +29,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame -from cudf_polars.utils import dtypes, sorting +from cudf_polars.utils import sorting if TYPE_CHECKING: from typing import Literal @@ -61,7 +61,7 @@ @dataclass(slots=True) class IR: - schema: dict[str, Any] + schema: dict[str, plc.DataType] def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -109,7 +109,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: assert_never(self.typ) if row_index is not None: name, offset = row_index - dtype = dtypes.from_polars(self.schema[name]) + dtype = self.schema[name] step = plc.interop.from_arrow(pa.scalar(1), data_type=dtype) init = plc.interop.from_arrow(pa.scalar(offset), data_type=dtype) index = Column( @@ -120,6 +120,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: null_order=plc.types.null_order.AFTER, ) df = df.with_columns([index]) + assert all( + c.obj.data_type() == dtype + for c, dtype in zip(df.columns, self.schema.values()) + ) if self.predicate is None: return df else: @@ -162,6 +166,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: df = DataFrame.from_table( plc.interop.from_arrow(table), list(self.schema.keys()) ) + assert all( + c.obj.data_type() == dtype + for c, dtype in zip(df.columns, self.schema.values()) + ) if self.predicate is not None: mask = self.predicate.evaluate(df) return df.filter(mask) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index ff634948663..95f705199d4 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -11,6 +11,7 @@ from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir +from cudf_polars.utils import dtypes __all__ = ["translate_ir", "translate_expr"] @@ -62,7 +63,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: ) with ctx: node = visitor.view_current_node() - schema = visitor.get_schema() + schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} if isinstance(node, pl_ir.PythonScan): return ir.PythonScan( schema, @@ -222,7 +223,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: node.options, ) elif isinstance(node, pl_expr.Literal): - return expr.Literal(node.dtype, node.value) + return expr.Literal(dtypes.from_polars(node.dtype), node.value) elif isinstance(node, pl_expr.Sort): # TODO: raise in groupby return expr.Sort(translate_expr(visitor, n=node.expr), node.options) @@ -244,7 +245,13 
@@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: translate_expr(visitor, n=node.by), ) elif isinstance(node, pl_expr.Cast): - return expr.Cast(node.dtype, translate_expr(visitor, n=node.expr)) + inner = translate_expr(visitor, n=node.expr) + # Push casts into literals so we can handle Cast(Literal(Null)) + dtype = dtypes.from_polars(node.dtype) + if isinstance(inner, expr.Literal): + return expr.Literal(dtype, inner.value) + else: + return expr.Cast(dtype, inner) elif isinstance(node, pl_expr.Column): return expr.Col(node.name) elif isinstance(node, pl_expr.Agg): @@ -257,7 +264,9 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: return expr.BinOp( translate_expr(visitor, n=node.left), translate_expr(visitor, n=node.right), - node.op, + expr.BinOp._MAPPING[node.op], + # TODO: Should lay dtype onto every node, but visitor.get_dtype is O(n) not O(1) + dtypes.from_polars(visitor.get_dtype(n)), ) elif isinstance(node, pl_expr.Len): return expr.Len() From ec4562c26f3812cbc46a5d28c56243d5d06bd69f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 13 May 2024 17:02:17 +0000 Subject: [PATCH 12/56] WIP: some more --- python/cudf_polars/cudf_polars/dsl/expr.py | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 376651f4124..1f9488f4884 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -30,6 +30,8 @@ from cudf_polars.utils import sorting if TYPE_CHECKING: + from typing import Callable + from cudf_polars.containers import DataFrame __all__ = [ @@ -237,6 +239,31 @@ class Agg(Expr): name: str options: Any + _MAPPING: ClassVar[dict[str, Callable[..., plc.aggregation.Aggregation]]] = { + "min": plc.aggregation.min, + "max": plc.aggregation.max, + "median": plc.aggregation.median, + "nunique": plc.aggregation.nunique, + "first": lambda: plc.aggregation.nth_element(0), + "last": lambda: plc.aggregation.nth_element(-1), # TODO: check + "mean": plc.aggregation.mean, + "sum": plc.aggregation.sum, + "count": lambda include_null: plc.aggregation.count( + plc.types.NullPolicy.INCLUDE + if include_null + else plc.types.NullPolicy.EXCLUDE + ), + "std": plc.aggregation.std, + "var": plc.aggregation.var, + "agg_groups": lambda: None, + } + + def evaluate( + self, df, *, context: ExecutionContext = ExecutionContext.FRAME + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + raise NotImplementedError("Agg") + @dataclass(slots=True) class BinOp(Expr): From f21cd5707e3ee1c6bc4a7747d307d6b72c0d5a41 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 14 May 2024 13:06:04 +0000 Subject: [PATCH 13/56] WIP: some agg expr stuff --- .../cudf_polars/containers/column.py | 25 +++++++++ python/cudf_polars/cudf_polars/dsl/expr.py | 52 ++++++++++++++++++- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index e34a1a7726e..9c3b2114602 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -5,6 +5,7 @@ from __future__ import annotations +import functools from typing import TYPE_CHECKING import cudf._lib.pylibcudf as plc @@ -67,3 +68,27 @@ def set_sorted( self.order = order self.null_order = null_order return self + + def copy(self) -> Self: + """Return a shallow copy of the 
column.""" + return type(self)(self.obj, self.name).with_sorted(like=self) + + def mask_nans(self) -> Self: + """Return a copy of self with nans masked out.""" + if self.nan_count > 0: + raise NotImplementedError + else: + return self.copy() + + @functools.cached_property + def nan_count(self) -> int: + """Return the number of NaN values in the column.""" + if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): + return 0 + else: + return plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 1f9488f4884..a3b14f79368 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -254,10 +254,60 @@ class Agg(Expr): else plc.types.NullPolicy.EXCLUDE ), "std": plc.aggregation.std, - "var": plc.aggregation.var, + "var": plc.aggregation.variance, "agg_groups": lambda: None, } + def _min(self, column: Column, *, propagate_nans: bool) -> plc.Column: + if propagate_nans and column.nan_count > 0: + return plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=column.obj.type() + ), + 1, + ) + if column.nan_count > 0: + column = column.mask_nans() + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.min(), column.obj.type()), 1 + ) + + def _max(self, column: Column, *, propagate_nans: bool) -> plc.Column: + if propagate_nans and column.nan_count > 0: + return plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=column.obj.type() + ), + 1, + ) + if column.nan_count > 0: + column = column.mask_nans() + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.max(), column.obj.type()), 1 + ) + + def _median(self, column: Column) -> plc.Column: + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.median(), column.obj.type()), + 1, + ) + + def _first(self, column: Column) -> plc.Column: + return plc.copying.slice(column.obj, [0, 1])[0] + + def _last(self, column: Column) -> plc.Column: + n = column.obj.size() + return plc.copying.slice(column.obj, [n - 1, n])[0] + + def _mean(self, column: Column) -> plc.Column: + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.mean(), column.obj.type()), + 1, + ) + + def _nunique(self, column: Column) -> Column: + return plc.Col + def evaluate( self, df, *, context: ExecutionContext = ExecutionContext.FRAME ) -> Column: From 1f5a49031a83bf4d5a3fd96a7860e7dfc2cd3844 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 14 May 2024 14:04:23 +0000 Subject: [PATCH 14/56] Bla --- python/cudf_polars/cudf_polars/dsl/expr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index a3b14f79368..7d9beb202aa 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -306,7 +306,10 @@ def _mean(self, column: Column) -> plc.Column: ) def _nunique(self, column: Column) -> Column: - return plc.Col + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE), ), + 1, + ) def evaluate( self, df, *, context: ExecutionContext = ExecutionContext.FRAME From 31a3d5eefe0192536fd005b24060b0a9201ed3f6 Mon 
Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 15 May 2024 17:00:52 +0000 Subject: [PATCH 15/56] More fixes --- python/cudf_polars/cudf_polars/callback.py | 50 +++++ .../cudf_polars/containers/column.py | 2 + .../cudf_polars/containers/dataframe.py | 14 +- python/cudf_polars/cudf_polars/dsl/expr.py | 194 +++++++++++++----- python/cudf_polars/cudf_polars/dsl/ir.py | 14 +- .../cudf_polars/cudf_polars/dsl/translate.py | 86 ++++---- 6 files changed, 258 insertions(+), 102 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/callback.py diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py new file mode 100644 index 00000000000..4d7b63cd705 --- /dev/null +++ b/python/cudf_polars/cudf_polars/callback.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Callback for the polars collect function to execute on device.""" + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING + +from cudf_polars.dsl.translate import translate_ir + +if TYPE_CHECKING: + import polars as pl + + from cudf_polars.dsl.ir import IR + +__all__: list[str] = ["execute_with_cudf"] + + +def _callback( + ir: IR, + with_columns: list[str] | None, + pyarrow_predicate: str | None, + n_rows: int | None, +) -> pl.DataFrame: + assert with_columns is None + assert pyarrow_predicate is None + assert n_rows is None + return ir.evaluate(cache={}).to_polars() + + +def execute_with_cudf(nt) -> None: + """ + A post optimization callback that attempts to execute the plan with cudf. + + Parameters + ---------- + nt + NodeTraverser + + The NodeTraverser is mutated if the libcudf executor can handle the plan. 
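+
+    A sketch of intended use (assuming polars exposes this hook as a
+    ``post_opt_callback`` keyword to ``collect``; ``q`` is a LazyFrame):
+
+    >>> q.collect(post_opt_callback=execute_with_cudf)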
+ """ + try: + callback = partial(_callback, translate_ir(nt)) + except NotImplementedError: + return + + nt.set_udf(callback) + return diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 9c3b2114602..a853680b18b 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -30,6 +30,8 @@ def __init__(self, column: plc.Column, name: str): self.obj = column self.name = name self.is_sorted = plc.types.Sorted.NO + self.order = plc.types.Order.ASCENDING + self.null_order = plc.types.NullOrder.BEFORE def rename(self, name: str) -> Column: """Return a new column sharing data with a new name.""" diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 2c05cee9dea..e5dd757690a 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -8,6 +8,8 @@ from functools import cached_property from typing import TYPE_CHECKING +import polars as pl + import cudf._lib.pylibcudf as plc from cudf_polars.containers.column import Column @@ -43,6 +45,16 @@ def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None __iter__ = None + def to_polars(self) -> pl.DataFrame: + """Convert to a polars DataFrame.""" + assert len(self.scalars) == 0 + return pl.from_arrow( + plc.interop.to_arrow( + self.table, + [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + ) + ) + @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" @@ -77,7 +89,7 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: """Create from a pylibcudf table.""" # TODO: strict=True when we drop py39 - if table.num_columns != len(names): + if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 7d9beb202aa..86510ee4894 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -18,6 +18,7 @@ import enum from dataclasses import dataclass from enum import IntEnum +from functools import partial from typing import TYPE_CHECKING, Any, ClassVar import pyarrow as pa @@ -59,6 +60,8 @@ class ExecutionContext(IntEnum): @dataclass(slots=True) class Expr: + dtype: plc.DataType + # TODO: return type is a lie for Literal def evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME @@ -76,12 +79,11 @@ def evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME ) -> Column: """Evaluate this expression given a dataframe for context.""" - return Column(self.value.evaluate(df, context=context), self.name) + return Column(self.value.evaluate(df, context=context).obj, self.name) @dataclass(slots=True) class Literal(Expr): - dtype: plc.Datatype value: Any def evaluate( @@ -135,7 +137,7 @@ def evaluate( [descending], nulls_last=nulls_last, num_keys=1 ) do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort - table = do_sort(plc.Table([column], order, null_order)) + table = do_sort(plc.Table([column.obj]), order, null_order) return Column(table.columns()[0], column.name).set_sorted( is_sorted=plc.types.Sorted.YES, 
order=order[0], null_order=null_order[0] ) @@ -228,7 +230,7 @@ def evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" column = self.column.evaluate(df, context=context) - return Column(plc.unary.cast(column, self.dtype), column.name).with_sorted( + return Column(plc.unary.cast(column.obj, self.dtype), column.name).with_sorted( like=column ) @@ -236,86 +238,171 @@ def evaluate( @dataclass(slots=True) class Agg(Expr): column: Expr + op: Callable[..., plc.Column] name: str - options: Any - _MAPPING: ClassVar[dict[str, Callable[..., plc.aggregation.Aggregation]]] = { - "min": plc.aggregation.min, - "max": plc.aggregation.max, - "median": plc.aggregation.median, - "nunique": plc.aggregation.nunique, - "first": lambda: plc.aggregation.nth_element(0), - "last": lambda: plc.aggregation.nth_element(-1), # TODO: check - "mean": plc.aggregation.mean, - "sum": plc.aggregation.sum, - "count": lambda include_null: plc.aggregation.count( - plc.types.NullPolicy.INCLUDE - if include_null - else plc.types.NullPolicy.EXCLUDE - ), - "std": plc.aggregation.std, - "var": plc.aggregation.variance, - "agg_groups": lambda: None, - } + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "nunique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + "agg_groups", + ] + ) + + def __init__( + self, dtype: plc.DataType, column: Expr, name: str, options: Any + ) -> None: + if name not in Agg._SUPPORTED: + raise NotImplementedError(f"Unsupported aggregation {name}") + self.dtype = dtype + self.column = column + self.name = name + op = getattr(self, f"_{name}") + if name in {"min", "max"}: + op = partial(op, propagate_nans=options) + elif name in {"std", "var"}: + op = partial(op, ddof=options) + self.op = op + + def _std(self, column: Column, *, ddof: int) -> Column: + # TODO: handle nans + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, plc.aggregation.std(ddof=ddof), self.dtype + ), + 1, + ), + column.name, + ) - def _min(self, column: Column, *, propagate_nans: bool) -> plc.Column: - if propagate_nans and column.nan_count > 0: - return plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=column.obj.type() + def _var(self, column: Column, *, ddof: int) -> Column: + # TODO: handle nans + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, plc.aggregation.variance(ddof=ddof), self.dtype ), 1, + ), + column.name, + ) + + def _sum(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.sum(), self.dtype), 1 + ), + column.name, + ) + + def _count(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, + plc.aggregation.count(plc.types.NullPolicy.EXCLUDE), + self.dtype, + ), + 1, + ), + column.name, + ) + + def _min(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=self.dtype + ), + 1, + ), + column.name, ) if column.nan_count > 0: column = column.mask_nans() - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.min(), column.obj.type()), 1 + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.min(), self.dtype), 1 + ), + column.name, ) - def _max(self, column: Column, *, propagate_nans: bool) -> plc.Column: + def 
_max(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: - return plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan")), data_type=column.obj.type() + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan")), data_type=self.dtype + ), + 1, ), - 1, + column.name, ) if column.nan_count > 0: column = column.mask_nans() - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.max(), column.obj.type()), 1 + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.max(), self.dtype), 1 + ), + column.name, ) - def _median(self, column: Column) -> plc.Column: - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.median(), column.obj.type()), - 1, + def _median(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.median(), self.dtype), + 1, + ), + column.name, ) - def _first(self, column: Column) -> plc.Column: - return plc.copying.slice(column.obj, [0, 1])[0] + def _first(self, column: Column) -> Column: + return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name) - def _last(self, column: Column) -> plc.Column: + def _last(self, column: Column) -> Column: n = column.obj.size() - return plc.copying.slice(column.obj, [n - 1, n])[0] + return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) - def _mean(self, column: Column) -> plc.Column: - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.mean(), column.obj.type()), - 1, + def _mean(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.mean(), self.dtype), + 1, + ), + column.name, ) def _nunique(self, column: Column) -> Column: - return plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE), ), - 1, + return Column( + plc.Column.from_scalar( + plc.reduce.reduce( + column.obj, + plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE), + self.dtype, + ), + 1, + ), + column.name, ) def evaluate( self, df, *, context: ExecutionContext = ExecutionContext.FRAME ) -> Column: """Evaluate this expression given a dataframe for context.""" - raise NotImplementedError("Agg") + if context is not ExecutionContext.FRAME: + raise NotImplementedError(f"Agg in context {context}") + return self.op(self.column.evaluate(df, context=context)) @dataclass(slots=True) @@ -323,7 +410,6 @@ class BinOp(Expr): left: Expr right: Expr op: plc.binaryop.BinaryOperator - dtype: plc.DataType _MAPPING: ClassVar[dict[pl_expr.PyOperator, plc.binaryop.BinaryOperator]] = { pl_expr.PyOperator.Eq: plc.binaryop.BinaryOperator.EQUAL, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 70e7d20bd22..ab49fecff25 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -120,10 +120,13 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: null_order=plc.types.null_order.AFTER, ) df = df.with_columns([index]) - assert all( - c.obj.data_type() == dtype - for c, dtype in zip(df.columns, self.schema.values()) - ) + # TODO: should be true, but not the case until we get + # cudf-classic out of the loop for IO since it converts date32 + # to datetime. 
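+        # (Re-enable the commented-out check below once that is fixed.)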
+        # assert all(
+        #     c.obj.type() == dtype
+        #     for c, dtype in zip(df.columns, self.schema.values())
+        # )
         if self.predicate is None:
             return df
         else:
@@ -167,8 +170,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             plc.interop.from_arrow(table), list(self.schema.keys())
         )
         assert all(
-            c.obj.data_type() == dtype
-            for c, dtype in zip(df.columns, self.schema.values())
+            c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values())
         )
         if self.predicate is not None:
             mask = self.predicate.evaluate(df)
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 95f705199d4..fe7902fdcc0 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -94,35 +94,36 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
             else None,
         )
     elif isinstance(node, pl_ir.Select):
-        return ir.Select(
-            schema,
-            translate_ir(visitor, n=node.input),
-            [translate_expr(visitor, n=e) for e in node.cse_expr],
-            [translate_expr(visitor, n=e) for e in node.expr],
-        )
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr]
+            exprs = [translate_expr(visitor, n=e) for e in node.expr]
+        return ir.Select(schema, inp, cse_exprs, exprs)
     elif isinstance(node, pl_ir.GroupBy):
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            aggs = [translate_expr(visitor, n=e) for e in node.aggs]
+            keys = [translate_expr(visitor, n=e) for e in node.keys]
         return ir.GroupBy(
             schema,
-            translate_ir(visitor, n=node.input),
-            [translate_expr(visitor, n=e) for e in node.aggs],
-            [translate_expr(visitor, n=e) for e in node.keys],
+            inp,
+            aggs,
+            keys,
             node.options,
         )
     elif isinstance(node, pl_ir.Join):
-        return ir.Join(
-            schema,
-            translate_ir(visitor, n=node.input_left),
-            translate_ir(visitor, n=node.input_right),
-            [translate_expr(visitor, n=e) for e in node.left_on],
-            [translate_expr(visitor, n=e) for e in node.right_on],
-            node.options,
-        )
+        with set_node(visitor, node.input_left):
+            inp_left = translate_ir(visitor, n=None)
+            left_on = [translate_expr(visitor, n=e) for e in node.left_on]
+        with set_node(visitor, node.input_right):
+            inp_right = translate_ir(visitor, n=None)
+            right_on = [translate_expr(visitor, n=e) for e in node.right_on]
+        return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options)
     elif isinstance(node, pl_ir.HStack):
-        return ir.HStack(
-            schema,
-            translate_ir(visitor, n=node.input),
-            [translate_expr(visitor, n=e) for e in node.exprs],
-        )
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            exprs = [translate_expr(visitor, n=e) for e in node.exprs]
+        return ir.HStack(schema, inp, exprs)
     elif isinstance(node, pl_ir.Distinct):
         return ir.Distinct(
             schema,
             translate_ir(visitor, n=node.input),
             node.options,
         )
     elif isinstance(node, pl_ir.Sort):
-        return ir.Sort(
-            schema,
-            translate_ir(visitor, n=node.input),
-            [translate_expr(visitor, n=e) for e in node.by_column],
-            node.sort_options,
-        )
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            by = [translate_expr(visitor, n=e) for e in node.by_column]
+        return ir.Sort(schema, inp, by, node.sort_options)
     elif isinstance(node, pl_ir.Slice):
         return ir.Slice(
             schema, translate_ir(visitor, n=node.input), node.offset, node.len
         )
     elif isinstance(node, pl_ir.Filter):
-        return ir.Filter(
-            schema,
-            translate_ir(visitor, n=node.input),
-            translate_expr(visitor, n=node.predicate),
-        )
+        with set_node(visitor, node.input):
+            inp = translate_ir(visitor, n=None)
+            mask = translate_expr(visitor, n=node.predicate)
+        return ir.Filter(schema, inp, mask)
     elif isinstance(node, pl_ir.SimpleProjection):
         return ir.Projection(schema, translate_ir(visitor, n=node.input))
     elif isinstance(node, pl_ir.MapFunction):
@@ -201,12 +199,15 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr:
     if isinstance(n, pl_expr.PyExprIR):
         # TODO: type narrowing didn't work because PyExprIR is Unknown
         assert not isinstance(n, int)
-        return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node))
+        e = translate_expr(visitor, n=n.node)
+        return expr.NamedExpr(e.dtype, n.output_name, e)
     node = visitor.view_expression(n)
+    dtype = dtypes.from_polars(visitor.get_dtype(n))
     if isinstance(node, pl_expr.Function):
         name, *options = node.function_data
         if name in BOOLEAN_FUNCTIONS:
             return expr.BooleanFunction(
+                dtype,
                 name,
                 options,
                 [translate_expr(visitor, n=n) for n in node.input],
             )
     elif isinstance(node, pl_expr.Window):
         # TODO: raise in groupby?
         return expr.Window(
+            dtype,
             translate_expr(visitor, n=node.function),
             [translate_expr(visitor, n=n) for n in node.partition_by]
             if node.partition_by is not None
             else None,
             node.options,
         )
     elif isinstance(node, pl_expr.Literal):
-        return expr.Literal(dtypes.from_polars(node.dtype), node.value)
+        return expr.Literal(dtype, node.value)
     elif isinstance(node, pl_expr.Sort):
         # TODO: raise in groupby
-        return expr.Sort(translate_expr(visitor, n=node.expr), node.options)
+        return expr.Sort(dtype, translate_expr(visitor, n=node.expr), node.options)
     elif isinstance(node, pl_expr.SortBy):
         # TODO: raise in groupby
         return expr.SortBy(
+            dtype,
             translate_expr(visitor, n=node.expr),
             [translate_expr(visitor, n=n) for n in node.by],
             node.descending,
         )
     elif isinstance(node, pl_expr.Gather):
         return expr.Gather(
+            dtype,
             translate_expr(visitor, n=node.expr),
             translate_expr(visitor, n=node.idx),
         )
     elif isinstance(node, pl_expr.Filter):
         return expr.Filter(
+            dtype,
             translate_expr(visitor, n=node.input),
             translate_expr(visitor, n=node.by),
         )
     elif isinstance(node, pl_expr.Cast):
         inner = translate_expr(visitor, n=node.expr)
         # Push casts into literals so we can handle Cast(Literal(Null))
-        dtype = dtypes.from_polars(node.dtype)
         if isinstance(inner, expr.Literal):
             return expr.Literal(dtype, inner.value)
         else:
             return expr.Cast(dtype, inner)
     elif isinstance(node, pl_expr.Column):
-        return expr.Col(node.name)
+        return expr.Col(dtype, node.name)
     elif isinstance(node, pl_expr.Agg):
         return expr.Agg(
+            dtype,
             translate_expr(visitor, n=node.arguments),
             node.name,
             node.options,
         )
     elif isinstance(node, pl_expr.BinaryExpr):
         return expr.BinOp(
+            dtype,
             translate_expr(visitor, n=node.left),
             translate_expr(visitor, n=node.right),
             expr.BinOp._MAPPING[node.op],
-            # TODO: Should lay dtype onto every node, but visitor.get_dtype is O(n) not O(1)
-            dtypes.from_polars(visitor.get_dtype(n)),
         )
     elif isinstance(node, pl_expr.Len):
-        return expr.Len()
+        return expr.Len(dtype)
     else:
         raise NotImplementedError(f"No handler for expression node with {type(node)=}")

From cda34e05d0623017e4fac885f6e38fca9bcb1d71 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Thu, 16 May 2024 17:40:47 +0000
Subject: [PATCH
16/56] WIP: More working --- python/cudf_polars/cudf_polars/callback.py | 9 +- .../cudf_polars/containers/column.py | 1 - python/cudf_polars/cudf_polars/dsl/expr.py | 381 ++++++++++++------ python/cudf_polars/cudf_polars/dsl/ir.py | 112 ++++- .../cudf_polars/cudf_polars/dsl/translate.py | 28 +- .../cudf_polars/cudf_polars/utils/dtypes.py | 3 +- .../cudf_polars/cudf_polars/utils/sorting.py | 7 +- 7 files changed, 391 insertions(+), 150 deletions(-) diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 4d7b63cd705..b598e1442ce 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -27,7 +27,11 @@ def _callback( assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - return ir.evaluate(cache={}).to_polars() + try: + return ir.evaluate(cache={}).to_polars() + except Exception as e: + print("Unable to evaluate", e) + raise def execute_with_cudf(nt) -> None: @@ -43,7 +47,8 @@ def execute_with_cudf(nt) -> None: """ try: callback = partial(_callback, translate_ir(nt)) - except NotImplementedError: + except NotImplementedError as e: + print("Unable to translate", e) return nt.set_udf(callback) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index a853680b18b..73db1c34b48 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -19,7 +19,6 @@ class Column: """A column, a name, and sortedness.""" - __slots__ = ("obj", "name", "is_sorted", "order", "null_order") obj: plc.Column name: str is_sorted: plc.types.Sorted diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 86510ee4894..41df85dcb73 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -19,7 +19,7 @@ from dataclasses import dataclass from enum import IntEnum from functools import partial -from typing import TYPE_CHECKING, Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa @@ -58,80 +58,161 @@ class ExecutionContext(IntEnum): ROLLING = enum.auto() -@dataclass(slots=True) +class AggInfo(NamedTuple): + requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] + + +@dataclass(slots=True, unsafe_hash=True) class Expr: dtype: plc.DataType # TODO: return type is a lie for Literal def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" raise NotImplementedError + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + raise NotImplementedError -@dataclass(slots=True) + +def with_mapping(fn): + """Decorate a callback that takes an expression mapping to use it.""" + + def look( + self, + df: DataFrame, + *, + context=ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ): + """Look up the self in the mapping before evaluating it.""" + if mapping is None: + return fn(self, df, context=context, mapping=mapping) + else: + try: + return mapping[self] + except KeyError: + return fn(self, df, context=context, mapping=mapping) + + return look + + +@dataclass(slots=True, unsafe_hash=True) class NamedExpr(Expr): name: str 
value: Expr + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - return Column(self.value.evaluate(df, context=context).obj, self.name) + return Column( + self.value.evaluate(df, context=context, mapping=mapping).obj, self.name + ) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return self.value.collect_agg(depth=depth) -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) # TODO: won't work for list literals class Literal(Expr): value: Any + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - obj = plc.interop.from_arrow(pa.scalar(self.value), data_type=self.dtype) + # TODO: obey dtype + obj = plc.interop.from_arrow(pa.scalar(self.value)) return Scalar(obj) # type: ignore + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + raise NotImplementedError("Literal in groupby") -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) class Col(Expr): name: str + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" return df._column_map[self.name] + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([(self, plc.aggregation.collect_list(), self)]) -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) class Len(Expr): + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - # TODO: type is wrong + # TODO: type is wrong, and dtype return df.num_rows + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: polars returns a uint, not an int for count + return AggInfo( + [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] + ) -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) class BooleanFunction(Expr): name: str options: Any - arguments: list[Expr] + arguments: tuple[Expr, ...] 
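# A standalone illustration (names are mine) of why the argument
# containers above switch from list to tuple: `unsafe_hash=True` makes
# the dataclass hash a tuple of its fields, so every field must itself
# be hashable -- a list-valued field only fails at hash time.
from dataclasses import dataclass


@dataclass(slots=True, unsafe_hash=True)
class Node:
    args: tuple[int, ...]


hash(Node((1, 2)))  # fine: tuples hash
# hash(Node([1, 2])) raises TypeError: unhashable type: 'list'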
-@dataclass(slots=True) +@dataclass(slots=True, unsafe_hash=True) class Sort(Expr): column: Expr - options: Any + options: tuple[bool, bool, bool] + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context) + column = self.column.evaluate(df, context=context, mapping=mapping) (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( [descending], nulls_last=nulls_last, num_keys=1 @@ -142,19 +223,29 @@ def evaluate( is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0] ) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented sort post-hoc + raise NotImplementedError("Sort in groupby") -@dataclass(slots=True) + +@dataclass(slots=True, unsafe_hash=True) class SortBy(Expr): column: Expr - by: list[Expr] - options: Any + by: tuple[Expr, ...] + options: tuple[bool, bool, tuple[bool]] + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context) - by = [b.evaluate(df, context=context) for b in self.by] + column = self.column.evaluate(df, context=context, mapping=mapping) + by = [b.evaluate(df, context=context, mapping=mapping) for b in self.by] (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( descending, nulls_last=nulls_last, num_keys=len(self.by) @@ -165,18 +256,28 @@ def evaluate( ) return Column(table.columns()[0], column.name) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented sort post-hoc + raise NotImplementedError("SortBy in groupby") + @dataclass(slots=True) class Gather(Expr): values: Expr indices: Expr + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - values = self.values.evaluate(df, context=context) - indices = self.indices.evaluate(df, context=context) + values = self.values.evaluate(df, context=context, mapping=mapping) + indices = self.indices.evaluate(df, context=context, mapping=mapping) lo, hi = plc.reduce.minmax(indices.obj) lo = plc.interop.to_arrow(lo).as_py() hi = plc.interop.to_arrow(hi).as_py() @@ -195,28 +296,43 @@ def evaluate( table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) return Column(table.columns()[0], values.name) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented gather. 
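+        # (i.e. sort rows so each group is a contiguous segment, then
+        # gather with indices interpreted relative to that segment)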
+ raise NotImplementedError("Gather in groupby") + @dataclass(slots=True) class Filter(Expr): values: Expr mask: Expr + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - values = self.values.evaluate(df, context=context) - mask = self.mask.evaluate(df, context=context) + values = self.values.evaluate(df, context=context, mapping=mapping) + mask = self.mask.evaluate(df, context=context, mapping=mapping) table = plc.stream_compaction.apply_boolean_mask( plc.Table([values.obj]), mask.obj ) return Column(table.columns()[0], values.name).with_sorted(like=values) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented filter + raise NotImplementedError("Filter in groupby") + @dataclass(slots=True) class Window(Expr): agg: Expr - by: None | list[Expr] + by: None | tuple[Expr, ...] options: Any @@ -225,21 +341,32 @@ class Cast(Expr): dtype: plc.DataType column: Expr + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context) + column = self.column.evaluate(df, context=context, mapping=mapping) return Column(plc.unary.cast(column.obj, self.dtype), column.name).with_sorted( like=column ) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + # TODO: Could do with sort-based groupby and segmented filter + return self.column.collect_agg(depth=depth) + @dataclass(slots=True) class Agg(Expr): column: Expr op: Callable[..., plc.Column] name: str + request: plc.aggregation.Aggregation _SUPPORTED: ClassVar[frozenset[str]] = frozenset( [ @@ -254,10 +381,29 @@ class Agg(Expr): "count", "std", "var", - "agg_groups", ] ) + def __eq__(self, other): + """Return whether this Agg is equal to another.""" + return type(self) == type(other) and (self.column, self.name) == ( + other.column, + other.name, + ) + + def __hash__(self): + """Return a hash.""" + return hash((self.column, self.name)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError("Nested aggregations in groupby") + ((expr, _, _),) = self.column.collect_agg(depth=depth + 1).requests + if self.request is None: + raise NotImplementedError(f"Aggregation {self.name} in groupby") + return AggInfo([(expr, self.request, self)]) + def __init__( self, dtype: plc.DataType, column: Expr, name: str, options: Any ) -> None: @@ -266,53 +412,47 @@ def __init__( self.dtype = dtype self.column = column self.name = name - op = getattr(self, f"_{name}") - if name in {"min", "max"}: + # TODO: nan handling in groupby case + if name == "min": + req = plc.aggregation.min() + elif name == "max": + req = plc.aggregation.max() + elif name == "median": + req = plc.aggregation.median() + elif name == "nunique": + req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) + elif name == "first" or name == "last": + req = None + elif name == 
"mean": + req = plc.aggregation.mean() + elif name == "sum": + req = plc.aggregation.sum() + elif name == "std": + # TODO: handle nans + req = plc.aggregation.std(ddof=options) + elif name == "var": + # TODO: handle nans + req = plc.aggregation.variance(ddof=options) + elif name == "count": + req = plc.aggregation.count(null_policy=plc.types.NullPolicy.EXCLUDE) + else: + raise NotImplementedError + self.request = req + op = getattr(self, f"_{name}", None) + if op is None: + op = partial(self._reduce, request=req) + elif name in {"min", "max"}: op = partial(op, propagate_nans=options) - elif name in {"std", "var"}: - op = partial(op, ddof=options) + else: + raise AssertionError self.op = op - def _std(self, column: Column, *, ddof: int) -> Column: - # TODO: handle nans - return Column( - plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, plc.aggregation.std(ddof=ddof), self.dtype - ), - 1, - ), - column.name, - ) - - def _var(self, column: Column, *, ddof: int) -> Column: - # TODO: handle nans - return Column( - plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, plc.aggregation.variance(ddof=ddof), self.dtype - ), - 1, - ), - column.name, - ) - - def _sum(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.sum(), self.dtype), 1 - ), - column.name, - ) - - def _count(self, column: Column) -> Column: + def _reduce( + self, column: Column, *, request: plc.aggregation.Aggregation + ) -> Column: return Column( plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, - plc.aggregation.count(plc.types.NullPolicy.EXCLUDE), - self.dtype, - ), + plc.reduce.reduce(column.obj, request, self.dtype), 1, ), column.name, @@ -331,12 +471,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column: ) if column.nan_count > 0: column = column.mask_nans() - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.min(), self.dtype), 1 - ), - column.name, - ) + return self._reduce(column, request=plc.aggregation.min()) def _max(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: @@ -351,21 +486,7 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column: ) if column.nan_count > 0: column = column.mask_nans() - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.max(), self.dtype), 1 - ), - column.name, - ) - - def _median(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.median(), self.dtype), - 1, - ), - column.name, - ) + return self._reduce(column, request=plc.aggregation.max()) def _first(self, column: Column) -> Column: return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name) @@ -374,38 +495,21 @@ def _last(self, column: Column) -> Column: n = column.obj.size() return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) - def _mean(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.mean(), self.dtype), - 1, - ), - column.name, - ) - - def _nunique(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce( - column.obj, - plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE), - self.dtype, - ), - 1, - ), - column.name, - ) - + @with_mapping def evaluate( - self, df, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df, + *, + context: 
ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" if context is not ExecutionContext.FRAME: raise NotImplementedError(f"Agg in context {context}") - return self.op(self.column.evaluate(df, context=context)) + return self.op(self.column.evaluate(df, context=context, mapping=mapping)) -@dataclass(slots=True) +@dataclass(slots=True, unsafe_hash=True) class BinOp(Expr): left: Expr right: Expr @@ -434,13 +538,34 @@ class BinOp(Expr): pl_expr.PyOperator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, } + @with_mapping def evaluate( - self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - left = self.left.evaluate(df, context=context) - right = self.right.evaluate(df, context=context) + left = self.left.evaluate(df, context=context, mapping=mapping) + right = self.right.evaluate(df, context=context, mapping=mapping) return Column( plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), - left.name, + "what", ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, + # This recurses to check if we have nested aggs + # groupby construction has checked that we don't have + # nested aggs, so stop the recursion and return ourselves + # for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + left_info = self.left.collect_agg(depth=depth) + right_info = self.right.collect_agg(depth=depth) + return AggInfo( + [*left_info.requests, *right_info.requests], + ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index ab49fecff25..f9f6369426d 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,10 +15,12 @@ from __future__ import annotations +import types from dataclasses import dataclass from functools import cache from typing import TYPE_CHECKING, Any, Callable, ClassVar +import nvtx import pyarrow as pa from typing_extensions import assert_never @@ -88,6 +90,7 @@ def __post_init__(self): if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") + @nvtx.annotate(message="Scan", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options @@ -153,6 +156,7 @@ class DataFrameScan(IR): projection: list[str] predicate: Expr | None + @nvtx.annotate(message="from_dataframe", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) @@ -185,6 +189,7 @@ class Select(IR): cse: list[Expr] expr: list[Expr] + @nvtx.annotate(message="Select", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -193,11 +198,42 @@ def evaluate(self, *, cache: dict[int, DataFrame]): return DataFrame([e.evaluate(df) for e in self.expr], []) -@dataclass(slots=True) +def placeholder_column(n: int): + """ + Produce a placeholder pylibcudf column with NO BACKING DATA. 
+ + Parameters + ---------- + n + Number of rows the column will advertise + + Returns + ------- + pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. + + Notes + ----- + This is used to avoid allocating data for count aggregations. + """ + return plc.Column( + plc.DataType(plc.TypeId.INT8), + n, + plc.gpumemoryview( + types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) + ), + None, + 0, + 0, + [], + ) + + +@dataclass(slots=False) class GroupBy(IR): df: IR agg_requests: list[Expr] keys: list[Expr] + maintain_order: bool options: Any @staticmethod @@ -218,11 +254,13 @@ def check_agg(agg: Expr) -> int: ------ NotImplementedError for unsupported expression nodes. """ - if isinstance(agg, expr.Agg): + if isinstance(agg, expr.NamedExpr): + return GroupBy.check_agg(agg.value) + elif isinstance(agg, expr.Agg): if agg.name == "implode": raise NotImplementedError("implode in groupby") return 1 + GroupBy.check_agg(agg.column) - elif isinstance(agg, (expr.Len, expr.Column, expr.Literal)): + elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): return 0 elif isinstance(agg, expr.BinOp): return max(GroupBy.check_agg(agg.left), GroupBy.check_agg(agg.right)) @@ -233,8 +271,51 @@ def check_agg(agg: Expr) -> int: def __post_init__(self): """Check whether all the aggregations are implemented.""" + if self.maintain_order: + raise NotImplementedError("Maintaining order in groupby") if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") + self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + + @nvtx.annotate(message="GroupBy", domain="cudf_polars") + def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) + keys = [k.evaluate(df) for k in self.keys] + # TODO: use sorted information, need to expose column_order + # and null_precedence in pylibcudf groupby constructor + # sorted = ( + # plc.types.Sorted.YES + # if all(k.is_sorted for k in keys) + # else plc.types.Sorted.NO + # ) + grouper = plc.groupby.GroupBy( + plc.Table([k.obj for k in keys]), + null_handling=plc.types.NullPolicy.INCLUDE, + ) + # TODO: uniquify + requests = [] + replacements = [] + for info in self.agg_infos: + for pre_eval, req, rep in info.requests: + if pre_eval is None: + col = placeholder_column(df.num_rows) + else: + col = pre_eval.evaluate(df).obj + requests.append(plc.groupby.GroupByRequest(col, [req])) + replacements.append(rep) + group_keys, raw_tables = grouper.aggregate(requests) + raw_columns = [] + for i, table in enumerate(raw_tables): + (column,) = table.columns() + raw_columns.append(Column(column, f"column{i}")) + mapping = dict(zip(replacements, raw_columns)) + result_keys = [Column(gk, k.name) for gk, k in zip(group_keys.columns(), keys)] + result_subs = DataFrame(raw_columns, []) + results = [ + req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests + ] + return DataFrame([*result_keys, *results], []) @dataclass(slots=True) @@ -290,6 +371,7 @@ def _joiners( else: assert_never(how) + @nvtx.annotate(message="Join", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -311,7 +393,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: - lg, rg = 
join_fn(left_on, right_on, null_equality) + lg, rg = join_fn(left_on.table, right_on.table, null_equality) left = left.replace_columns(*left_on.columns) right = right.replace_columns(*right_on.columns) if coalesce and how != "outer": @@ -352,6 +434,7 @@ class HStack(IR): df: IR columns: list[Expr] + @nvtx.annotate(message="HStack", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -382,6 +465,7 @@ def __init__(self, schema: dict, df: IR, options: Any): self.stable = maintain_order self.zlice = zlice + @nvtx.annotate(message="Distinct", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -427,10 +511,18 @@ class Sort(IR): order: list[plc.types.Order] null_order: list[plc.types.NullOrder] - def __init__(self, schema: dict, df: IR, by: list[Expr], options: Any): + def __init__( + self, + schema: dict, + df: IR, + by: list[Expr], + options: Any, + zlice: tuple[int, int] | None, + ): self.schema = schema self.df = df self.by = by + self.zlice = zlice stable, nulls_last, descending = options self.order, self.null_order = sorting.sort_order( descending, nulls_last=nulls_last, num_keys=len(by) @@ -439,6 +531,7 @@ def __init__(self, schema: dict, df: IR, by: list[Expr], options: Any): plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key ) + @nvtx.annotate(message="Sort", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -463,7 +556,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: order=self.order[i], null_order=self.null_order[i], ) - return DataFrame(columns, []) + return DataFrame(columns, []).slice(self.zlice) @dataclass(slots=True) @@ -472,6 +565,7 @@ class Slice(IR): offset: int length: int + @nvtx.annotate(message="Slice", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -483,6 +577,7 @@ class Filter(IR): df: IR mask: Expr + @nvtx.annotate(message="Filter", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -493,6 +588,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: class Projection(IR): df: IR + @nvtx.annotate(message="Projection", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -533,6 +629,7 @@ def __post_init__(self): if key_column not in self.df.dfs[0].schema: raise ValueError(f"Key column {key_column} not found") + @nvtx.annotate(message="MapFunction", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" if self.name == "merge_sorted": @@ -596,6 +693,7 @@ def __post_init__(self): if not all(s == schema for s in self.dfs[1:]): raise ValueError("Schema mismatch") + @nvtx.annotate(message="Union", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -608,6 +706,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: class HConcat(IR): dfs: list[IR] + 
@nvtx.annotate(message="HConcat", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -620,6 +719,7 @@ class ExtContext(IR): df: IR extra: list[IR] + @nvtx.annotate(message="ExtContext", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" # TODO: polars optimizer doesn't do projection pushdown diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index fe7902fdcc0..62ccc09b2ff 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -8,6 +8,8 @@ from contextlib import AbstractContextManager, nullcontext from typing import Any +import nvtx + from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir @@ -37,6 +39,7 @@ def __exit__(self, *args): noop_context: nullcontext = nullcontext() +@nvtx.annotate(domain="cudf_polars") def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: """ Translate a polars-internal IR node to our representation. @@ -109,6 +112,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: inp, aggs, keys, + node.maintain_order, node.options, ) elif isinstance(node, pl_ir.Join): @@ -120,8 +124,8 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: right_on = [translate_expr(visitor, n=e) for e in node.right_on] return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) elif isinstance(node, pl_ir.HStack): - with set_node(visitor, n=None): - inp = translate_ir(visitor, n=node.input) + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) exprs = [translate_expr(visitor, n=e) for e in node.exprs] return ir.HStack(schema, inp, exprs) elif isinstance(node, pl_ir.Distinct): @@ -131,17 +135,17 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: node.options, ) elif isinstance(node, pl_ir.Sort): - with set_node(visitor, n=None): - inp = translate_ir(visitor, n=node.input) + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) by = [translate_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options) + return ir.Sort(schema, inp, by, node.sort_options, node.slice) elif isinstance(node, pl_ir.Slice): return ir.Slice( schema, translate_ir(visitor, n=node.input), node.offset, node.len ) elif isinstance(node, pl_ir.Filter): - with set_node(visitor, n=None): - inp = translate_ir(visitor, n=node.input) + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) mask = translate_expr(visitor, n=node.predicate) return ir.Filter(schema, inp, mask) elif isinstance(node, pl_ir.SimpleProjection): @@ -176,6 +180,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: BOOLEAN_FUNCTIONS: frozenset[str] = frozenset() +@nvtx.annotate(domain="cudf_polars") def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. 
@@ -210,7 +215,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: dtype, name, options, - [translate_expr(visitor, n=n) for n in node.input], + tuple(translate_expr(visitor, n=n) for n in node.input), ) else: raise NotImplementedError(f"No handler for Expr function node with {name=}") @@ -219,7 +224,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: return expr.Window( dtype, translate_expr(visitor, n=node.function), - [translate_expr(visitor, n=n) for n in node.partition_by] + tuple(translate_expr(visitor, n=n) for n in node.partition_by) if node.partition_by is not None else None, node.options, @@ -231,11 +236,12 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: return expr.Sort(dtype, translate_expr(visitor, n=node.expr), node.options) elif isinstance(node, pl_expr.SortBy): # TODO: raise in groupby + stable, nulls_last, descending = node.sort_options return expr.SortBy( dtype, translate_expr(visitor, n=node.expr), - [translate_expr(visitor, n=n) for n in node.by], - node.descending, + tuple(translate_expr(visitor, n=n) for n in node.by), + (stable, nulls_last, tuple(descending)), ) elif isinstance(node, pl_expr.Gather): return expr.Gather( diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 1ac8719b839..45adbdc842c 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -32,7 +32,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: ------ NotImplementedError for unsupported conversions. """ - if isinstance(dtype, pl.Int8): + if isinstance(dtype, pl.Boolean): return plc.DataType(plc.TypeId.BOOL8) elif isinstance(dtype, pl.Int8): return plc.DataType(plc.TypeId.INT8) @@ -86,4 +86,5 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) else: + breakpoint() raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py index fed1cd35416..b3ecfdd3dd4 100644 --- a/python/cudf_polars/cudf_polars/utils/sorting.py +++ b/python/cudf_polars/cudf_polars/utils/sorting.py @@ -5,11 +5,16 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import cudf._lib.pylibcudf as plc +if TYPE_CHECKING: + from collections.abc import Sequence + def sort_order( - descending: list[bool], *, nulls_last: bool, num_keys: int + descending: Sequence[bool], *, nulls_last: bool, num_keys: int ) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: """ Produce sort order arguments. From 235575d8ce9c9d65a68e42f3493361c82c087e9e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 08:40:12 +0000 Subject: [PATCH 17/56] Expr objects are no longer dataclasses This is easier for handling implementing hash, etc... 
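In outline, the pattern the diff below adopts looks like this (a minimal
sketch; the real classes also cache a repr and carry more slots):

class Node:
    __slots__ = ("dtype", "hash_value")
    children: tuple["Node", ...] = ()
    # Non-child constructor data; subclasses extend this tuple.
    _non_child = ("dtype",)

    def __init__(self, dtype) -> None:
        self.dtype = dtype

    def _ctor_arguments(self):
        # (*non-child data, *children) is enough to rebuild the node.
        return (*(getattr(self, a) for a in self._non_child), *self.children)

    def __hash__(self):
        # hash_value is an unfilled slot until first use, so an
        # AttributeError means "not computed yet": compute once, cache.
        try:
            return self.hash_value
        except AttributeError:
            self.hash_value = hash((type(self), self._ctor_arguments()))
            return self.hash_value

The caching matters because expression nodes are hashed repeatedly once
they are used as dictionary keys, e.g. in the groupby substitution mapping.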
---
 python/cudf_polars/cudf_polars/dsl/expr.py | 351 ++++++++++++------
 python/cudf_polars/cudf_polars/dsl/ir.py | 10 +-
 .../cudf_polars/cudf_polars/dsl/translate.py | 32 +-
 .../cudf_polars/cudf_polars/utils/dtypes.py | 2 +
 4 files changed, 269 insertions(+), 126 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 41df85dcb73..fbc5404d129 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -16,7 +16,6 @@
 from __future__ import annotations

 import enum
-from dataclasses import dataclass
 from enum import IntEnum
 from functools import partial
 from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple
@@ -31,7 +30,7 @@
 from cudf_polars.utils import sorting

 if TYPE_CHECKING:
-    from typing import Callable
+    from collections.abc import Sequence

     from cudf_polars.containers import DataFrame

@@ -45,7 +44,8 @@
     "SortBy",
     "Gather",
     "Filter",
-    "Window",
+    "RollingWindow",
+    "GroupedRollingWindow",
     "Cast",
     "Agg",
     "BinOp",
@@ -62,9 +62,86 @@ class AggInfo(NamedTuple):
     requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]]


-@dataclass(slots=True, unsafe_hash=True)
 class Expr:
+    __slots__ = ("dtype", "hash_value", "repr_value")
+    #: Data type of the expression
     dtype: plc.DataType
+    #: caching slot for the hash of the expression
+    hash_value: int
+    #: caching slot for repr of the expression
+    repr_value: str
+    #: Children of the expression
+    children: tuple[Expr, ...] = ()
+    #: Names of non-child data (not Exprs) for reconstruction
+    _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
+
+    # Constructor must take arguments in order (*_non_child, *children)
+    def __init__(self, dtype: plc.DataType) -> None:
+        self.dtype = dtype
+
+    def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def get_hash(self) -> int:
+        """
+        Return the hash of this expr.
+
+        Override this in subclasses, rather than __hash__.
+
+        Returns
+        -------
+        The integer hash value.
+        """
+        return hash((type(self), self._ctor_arguments(self.children)))
+
+    def __hash__(self):
+        """Hash of an expression with caching."""
+        try:
+            return self.hash_value
+        except AttributeError:
+            self.hash_value = self.get_hash()
+            return self.hash_value
+
+    def is_equal(self, other: Any) -> bool:
+        """
+        Equality of two expressions.
+
+        Override this in subclasses, rather than __eq__.
+
+        Parameters
+        ----------
+        other
+            object to compare to
+
+        Returns
+        -------
+        True if the two expressions are equal, false otherwise.
+ """ + if type(self) is not type(other): + return False + return self._ctor_arguments(self.children) == other._ctor_arguments( + other.children + ) + + def __eq__(self, other): + """Equality of expressions.""" + if type(self) != type(other) or hash(self) != hash(other): + return False + else: + return self.is_equal(other) + + def __ne__(self, other): + """Inequality of expressions.""" + return not self.__eq__(other) + + def __repr__(self): + """String representation of an expression with caching.""" + try: + return self.repr_value + except AttributeError: + args = ", ".join(f"{arg}" for arg in self._ctor_arguments(self.children)) + self.repr_value = f"{type(self)}({args})" + return self.repr_value # TODO: return type is a lie for Literal def evaluate( @@ -104,10 +181,14 @@ def look( return look -@dataclass(slots=True, unsafe_hash=True) class NamedExpr(Expr): - name: str - value: Expr + __slots__ = ("name", "children") + _non_child = ("dtype", "name") + + def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None: + super().__init__(dtype) + self.name = name + self.children = (value,) @with_mapping def evaluate( @@ -118,18 +199,25 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + (child,) = self.children return Column( - self.value.evaluate(df, context=context, mapping=mapping).obj, self.name + child.evaluate(df, context=context, mapping=mapping).obj, self.name ) def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" - return self.value.collect_agg(depth=depth) + (value,) = self.children + return value.collect_agg(depth=depth) -@dataclass(slots=True, unsafe_hash=True) # TODO: won't work for list literals class Literal(Expr): - value: Any + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Scalar + + def __init__(self, dtype: plc.DataType, value: Any) -> None: + super().__init__(dtype) + self.value = pa.scalar(value) @with_mapping def evaluate( @@ -141,7 +229,7 @@ def evaluate( ) -> Column: """Evaluate this expression given a dataframe for context.""" # TODO: obey dtype - obj = plc.interop.from_arrow(pa.scalar(self.value)) + obj = plc.interop.from_arrow(self.value) return Scalar(obj) # type: ignore def collect_agg(self, *, depth: int) -> AggInfo: @@ -149,10 +237,15 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Literal in groupby") -@dataclass(slots=True, unsafe_hash=True) class Col(Expr): + __slots__ = ("name",) + _non_child = ("dtype", "name") name: str + def __init__(self, dtype: plc.DataType, name: str) -> None: + self.dtype = dtype + self.name = name + @with_mapping def evaluate( self, @@ -169,7 +262,6 @@ def collect_agg(self, *, depth: int) -> AggInfo: return AggInfo([(self, plc.aggregation.collect_list(), self)]) -@dataclass(slots=True, unsafe_hash=True) class Len(Expr): @with_mapping def evaluate( @@ -191,17 +283,27 @@ def collect_agg(self, *, depth: int) -> AggInfo: ) -@dataclass(slots=True, unsafe_hash=True) class BooleanFunction(Expr): - name: str - options: Any - arguments: tuple[Expr, ...] 
+ __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + + def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr): + super().__init__(dtype) + self.options = options + self.name = name + self.children = tuple(children) -@dataclass(slots=True, unsafe_hash=True) class Sort(Expr): - column: Expr - options: tuple[bool, bool, bool] + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + + def __init__( + self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr + ): + super().__init__(dtype) + self.options = options + self.children = (column,) @with_mapping def evaluate( @@ -212,7 +314,8 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context, mapping=mapping) + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( [descending], nulls_last=nulls_last, num_keys=1 @@ -229,11 +332,20 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Sort in groupby") -@dataclass(slots=True, unsafe_hash=True) class SortBy(Expr): - column: Expr - by: tuple[Expr, ...] - options: tuple[bool, bool, tuple[bool]] + __slots__ = ("options", "children") + _non_child = ("dtype", "options") + + def __init__( + self, + dtype: plc.DataType, + options: tuple[bool, bool, tuple[bool]], + column: Expr, + *by: Expr, + ): + super().__init__(dtype) + self.options = options + self.children = (column, *by) @with_mapping def evaluate( @@ -244,11 +356,13 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - column = self.column.evaluate(df, context=context, mapping=mapping) - by = [b.evaluate(df, context=context, mapping=mapping) for b in self.by] + column, *by = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) (stable, nulls_last, descending) = self.options order, null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(self.by) + descending, nulls_last=nulls_last, num_keys=len(by) ) do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key table = do_sort( @@ -262,10 +376,13 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("SortBy in groupby") -@dataclass(slots=True) class Gather(Expr): - values: Expr - indices: Expr + __slots__ = ("children",) + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + super().__init__(dtype) + self.children = (values, indices) @with_mapping def evaluate( @@ -276,8 +393,10 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - values = self.values.evaluate(df, context=context, mapping=mapping) - indices = self.indices.evaluate(df, context=context, mapping=mapping) + values, indices = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) lo, hi = plc.reduce.minmax(indices.obj) lo = plc.interop.to_arrow(lo).as_py() hi = plc.interop.to_arrow(hi).as_py() @@ -302,10 +421,13 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Gather in groupby") -@dataclass(slots=True) class Filter(Expr): - values: Expr - mask: Expr + __slots__ = ("children",) + 
_non_child = ("dtype",)
+
+    def __init__(self, dtype: plc.DataType, values: Expr, mask: Expr):
+        super().__init__(dtype)
+        self.children = (values, mask)

     @with_mapping
     def evaluate(
@@ -316,8 +438,10 @@ def evaluate(
         mapping: dict[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        values = self.values.evaluate(df, context=context, mapping=mapping)
-        mask = self.mask.evaluate(df, context=context, mapping=mapping)
+        values, mask = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
         table = plc.stream_compaction.apply_boolean_mask(
             plc.Table([values.obj]), mask.obj
         )
@@ -329,17 +453,33 @@ def collect_agg(self, *, depth: int) -> AggInfo:
         raise NotImplementedError("Filter in groupby")


-@dataclass(slots=True)
-class Window(Expr):
-    agg: Expr
-    by: None | tuple[Expr, ...]
-    options: Any
+class RollingWindow(Expr):
+    __slots__ = ("options", "children")
+    _non_child = ("dtype", "options")
+
+    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr):
+        super().__init__(dtype)
+        self.options = options
+        self.children = (agg,)
+
+
+class GroupedRollingWindow(Expr):
+    __slots__ = ("options", "children")
+    _non_child = ("dtype", "options")
+
+    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr):
+        super().__init__(dtype)
+        self.options = options
+        self.children = (agg, *by)


-@dataclass(slots=True)
 class Cast(Expr):
-    dtype: plc.DataType
-    column: Expr
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+
+    def __init__(self, dtype: plc.DataType, value: Expr):
+        super().__init__(dtype)
+        self.children = (value,)

     @with_mapping
     def evaluate(
@@ -350,7 +490,8 @@ def evaluate(
         mapping: dict[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        column = self.column.evaluate(df, context=context, mapping=mapping)
+        (child,) = self.children
+        column = child.evaluate(df, context=context, mapping=mapping)
         return Column(plc.unary.cast(column.obj, self.dtype), column.name).with_sorted(
             like=column
         )
@@ -358,60 +499,23 @@ def evaluate(

     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
         # TODO: Could do with sort-based groupby and segmented filter
-        return self.column.collect_agg(depth=depth)
+        (child,) = self.children
+        return child.collect_agg(depth=depth)


-@dataclass(slots=True)
 class Agg(Expr):
-    column: Expr
-    op: Callable[..., plc.Column]
-    name: str
-    request: plc.aggregation.Aggregation
-
-    _SUPPORTED: ClassVar[frozenset[str]] = frozenset(
-        [
-            "min",
-            "max",
-            "median",
-            "nunique",
-            "first",
-            "last",
-            "mean",
-            "sum",
-            "count",
-            "std",
-            "var",
-        ]
-    )
-
-    def __eq__(self, other):
-        """Return whether this Agg is equal to another."""
-        return type(self) == type(other) and (self.column, self.name) == (
-            other.column,
-            other.name,
-        )
-
-    def __hash__(self):
-        """Return a hash."""
-        return hash((self.column, self.name))
-
-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        if depth >= 1:
-            raise NotImplementedError("Nested aggregations in groupby")
-        ((expr, _, _),) = self.column.collect_agg(depth=depth + 1).requests
-        if self.request is None:
-            raise NotImplementedError(f"Aggregation {self.name} in groupby")
-        return AggInfo([(expr, self.request, self)])
+    __slots__ = ("name", "options", "op", "request", "children")
+    _non_child = ("dtype", "name", "options")

     def __init__(
-        self, dtype: plc.DataType,
column: Expr, name: str, options: Any + self, dtype: plc.DataType, name: str, options: Any, value: Expr ) -> None: - if name not in Agg._SUPPORTED: - raise NotImplementedError(f"Unsupported aggregation {name}") - self.dtype = dtype - self.column = column + super().__init__(dtype) self.name = name + self.options = options + self.children = (value,) + if name not in Agg._SUPPORTED: + raise NotImplementedError(f"Unsupported aggregation {name=}") # TODO: nan handling in groupby case if name == "min": req = plc.aggregation.min() @@ -447,6 +551,32 @@ def __init__( raise AssertionError self.op = op + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "nunique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + ] + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError("Nested aggregations in groupby") + (child,) = self.children + ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests + if self.request is None: + raise NotImplementedError(f"Aggregation {self.name} in groupby") + return AggInfo([(expr, self.request, self)]) + def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation ) -> Column: @@ -506,14 +636,24 @@ def evaluate( """Evaluate this expression given a dataframe for context.""" if context is not ExecutionContext.FRAME: raise NotImplementedError(f"Agg in context {context}") - return self.op(self.column.evaluate(df, context=context, mapping=mapping)) + (child,) = self.children + return self.op(child.evaluate(df, context=context, mapping=mapping)) -@dataclass(slots=True, unsafe_hash=True) class BinOp(Expr): - left: Expr - right: Expr - op: plc.binaryop.BinaryOperator + __slots__ = ("op", "children") + _non_child = ("dtype", "op") + + def __init__( + self, + dtype: plc.DataType, + op: plc.binaryop.BinaryOperator, + left: Expr, + right: Expr, + ) -> None: + super().__init__(dtype) + self.op = op + self.children = (left, right) _MAPPING: ClassVar[dict[pl_expr.PyOperator, plc.binaryop.BinaryOperator]] = { pl_expr.PyOperator.Eq: plc.binaryop.BinaryOperator.EQUAL, @@ -547,8 +687,10 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - left = self.left.evaluate(df, context=context, mapping=mapping) - right = self.right.evaluate(df, context=context, mapping=mapping) + left, right = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) return Column( plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype), "what", @@ -564,8 +706,9 @@ def collect_agg(self, *, depth: int) -> AggInfo: # for pre-eval return AggInfo([(self, plc.aggregation.collect_list(), self)]) else: - left_info = self.left.collect_agg(depth=depth) - right_info = self.right.collect_agg(depth=depth) + left_info, right_info = ( + child.collect_agg(depth=depth) for child in self.children + ) return AggInfo( [*left_info.requests, *right_info.requests], ) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index f9f6369426d..29336e7cdba 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -254,18 +254,14 @@ def check_agg(agg: Expr) -> int: ------ NotImplementedError for unsupported expression nodes. 
""" - if isinstance(agg, expr.NamedExpr): - return GroupBy.check_agg(agg.value) + if isinstance(agg, (expr.NamedExpr, expr.BinOp, expr.Cast)): + return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): if agg.name == "implode": raise NotImplementedError("implode in groupby") - return 1 + GroupBy.check_agg(agg.column) + return 1 + max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): return 0 - elif isinstance(agg, expr.BinOp): - return max(GroupBy.check_agg(agg.left), GroupBy.check_agg(agg.right)) - elif isinstance(agg, expr.Cast): - return GroupBy.check_agg(agg.column) else: raise NotImplementedError(f"No handler for {agg=}") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 62ccc09b2ff..c51f548b111 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -215,33 +215,35 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: dtype, name, options, - tuple(translate_expr(visitor, n=n) for n in node.input), + *(translate_expr(visitor, n=n) for n in node.input), ) else: raise NotImplementedError(f"No handler for Expr function node with {name=}") elif isinstance(node, pl_expr.Window): # TODO: raise in groupby? - return expr.Window( - dtype, - translate_expr(visitor, n=node.function), - tuple(translate_expr(visitor, n=n) for n in node.partition_by) - if node.partition_by is not None - else None, - node.options, - ) + if node.partition_by is None: + return expr.RollingWindow( + dtype, node.options, translate_expr(visitor, n=node.function) + ) + else: + return expr.GroupedRollingWindow( + dtype, + node.options, + translate_expr(visitor, n=node.function), + *(translate_expr(visitor, n=n) for n in node.partition_by), + ) elif isinstance(node, pl_expr.Literal): return expr.Literal(dtype, node.value) elif isinstance(node, pl_expr.Sort): # TODO: raise in groupby - return expr.Sort(dtype, translate_expr(visitor, n=node.expr), node.options) + return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) elif isinstance(node, pl_expr.SortBy): # TODO: raise in groupby - stable, nulls_last, descending = node.sort_options return expr.SortBy( dtype, + node.sort_options, translate_expr(visitor, n=node.expr), - tuple(translate_expr(visitor, n=n) for n in node.by), - (stable, nulls_last, tuple(descending)), + *(translate_expr(visitor, n=n) for n in node.by), ) elif isinstance(node, pl_expr.Gather): return expr.Gather( @@ -267,16 +269,16 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: elif isinstance(node, pl_expr.Agg): return expr.Agg( dtype, - translate_expr(visitor, n=node.arguments), node.name, node.options, + translate_expr(visitor, n=node.arguments), ) elif isinstance(node, pl_expr.BinaryExpr): return expr.BinOp( dtype, + expr.BinOp._MAPPING[node.op], translate_expr(visitor, n=node.left), translate_expr(visitor, n=node.right), - expr.BinOp._MAPPING[node.op], ) elif isinstance(node, pl_expr.Len): return expr.Len(dtype) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 45adbdc842c..f3303fbbce2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -85,6 +85,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: elif isinstance(dtype, pl.Null): # TODO: Hopefully return 
plc.DataType(plc.TypeId.EMPTY) + elif isinstance(dtype, pl.List): + return plc.DataType(plc.TypeId.LIST) else: breakpoint() raise NotImplementedError(f"{dtype=} conversion not supported") From e158de6c8ff4a7b95a9ba83c127398d64c7be416 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 08:50:12 +0000 Subject: [PATCH 18/56] No recursive nvtx annotations --- python/cudf_polars/cudf_polars/callback.py | 8 ++++++-- python/cudf_polars/cudf_polars/dsl/expr.py | 8 ++++---- python/cudf_polars/cudf_polars/dsl/ir.py | 16 ---------------- python/cudf_polars/cudf_polars/dsl/translate.py | 4 ---- 4 files changed, 10 insertions(+), 26 deletions(-) diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index b598e1442ce..38d80bb417e 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -8,6 +8,8 @@ from functools import partial from typing import TYPE_CHECKING +import nvtx + from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -28,7 +30,8 @@ def _callback( assert pyarrow_predicate is None assert n_rows is None try: - return ir.evaluate(cache={}).to_polars() + with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + return ir.evaluate(cache={}).to_polars() except Exception as e: print("Unable to evaluate", e) raise @@ -46,7 +49,8 @@ def execute_with_cudf(nt) -> None: The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: - callback = partial(_callback, translate_ir(nt)) + with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): + callback = partial(_callback, translate_ir(nt)) except NotImplementedError as e: print("Unable to translate", e) return diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fbc5404d129..33266c6634a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -139,8 +139,8 @@ def __repr__(self): try: return self.repr_value except AttributeError: - args = ", ".join(f"{arg}" for arg in self._ctor_arguments(self.children)) - self.repr_value = f"{type(self)}({args})" + args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) + self.repr_value = f"{type(self).__name__}({args})" return self.repr_value # TODO: return type is a lie for Literal @@ -168,8 +168,8 @@ def look( *, context=ExecutionContext.FRAME, mapping: dict[Expr, Column] | None = None, - ): - """Look up the self in the mapping before evaluating it.""" + ) -> Column: + """Look up self in the mapping before evaluating it.""" if mapping is None: return fn(self, df, context=context, mapping=mapping) else: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 29336e7cdba..e2bc3b7bf44 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -20,7 +20,6 @@ from functools import cache from typing import TYPE_CHECKING, Any, Callable, ClassVar -import nvtx import pyarrow as pa from typing_extensions import assert_never @@ -90,7 +89,6 @@ def __post_init__(self): if self.typ not in ("csv", "parquet"): raise NotImplementedError(f"Unhandled scan type: {self.typ}") - @nvtx.annotate(message="Scan", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options @@ -156,7 +154,6 @@ class DataFrameScan(IR): projection: list[str] predicate: Expr | None - 
@nvtx.annotate(message="from_dataframe", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) @@ -189,7 +186,6 @@ class Select(IR): cse: list[Expr] expr: list[Expr] - @nvtx.annotate(message="Select", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -273,7 +269,6 @@ def __post_init__(self): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - @nvtx.annotate(message="GroupBy", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -367,7 +362,6 @@ def _joiners( else: assert_never(how) - @nvtx.annotate(message="Join", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -430,7 +424,6 @@ class HStack(IR): df: IR columns: list[Expr] - @nvtx.annotate(message="HStack", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -461,7 +454,6 @@ def __init__(self, schema: dict, df: IR, options: Any): self.stable = maintain_order self.zlice = zlice - @nvtx.annotate(message="Distinct", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -527,7 +519,6 @@ def __init__( plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key ) - @nvtx.annotate(message="Sort", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -561,7 +552,6 @@ class Slice(IR): offset: int length: int - @nvtx.annotate(message="Slice", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -573,7 +563,6 @@ class Filter(IR): df: IR mask: Expr - @nvtx.annotate(message="Filter", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -584,7 +573,6 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: class Projection(IR): df: IR - @nvtx.annotate(message="Projection", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) @@ -625,7 +613,6 @@ def __post_init__(self): if key_column not in self.df.dfs[0].schema: raise ValueError(f"Key column {key_column} not found") - @nvtx.annotate(message="MapFunction", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" if self.name == "merge_sorted": @@ -689,7 +676,6 @@ def __post_init__(self): if not all(s == schema for s in self.dfs[1:]): raise ValueError("Schema mismatch") - @nvtx.annotate(message="Union", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -702,7 +688,6 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> 
DataFrame: class HConcat(IR): dfs: list[IR] - @nvtx.annotate(message="HConcat", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] @@ -715,7 +700,6 @@ class ExtContext(IR): df: IR extra: list[IR] - @nvtx.annotate(message="ExtContext", domain="cudf_polars") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" # TODO: polars optimizer doesn't do projection pushdown diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index c51f548b111..37fb599c35d 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -8,8 +8,6 @@ from contextlib import AbstractContextManager, nullcontext from typing import Any -import nvtx - from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir @@ -39,7 +37,6 @@ def __exit__(self, *args): noop_context: nullcontext = nullcontext() -@nvtx.annotate(domain="cudf_polars") def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: """ Translate a polars-internal IR node to our representation. @@ -180,7 +177,6 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: BOOLEAN_FUNCTIONS: frozenset[str] = frozenset() -@nvtx.annotate(domain="cudf_polars") def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. From b4003910ca5c19ab8fdb51a48dabd31a5b452cf5 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 09:08:17 +0000 Subject: [PATCH 19/56] Testing infrastructure --- python/cudf_polars/cudf_polars/callback.py | 11 ++- .../cudf_polars/testing/__init__.py | 8 ++ .../cudf_polars/testing/asserts.py | 76 +++++++++++++++++++ 3 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/testing/__init__.py create mode 100644 python/cudf_polars/cudf_polars/testing/asserts.py diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 38d80bb417e..ed473e0ad0e 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -37,7 +37,7 @@ def _callback( raise -def execute_with_cudf(nt) -> None: +def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -46,13 +46,18 @@ def execute_with_cudf(nt) -> None: nt NodeTraverser + raise_on_fail + Should conversion raise an exception rather than continuing + without setting a callback. + The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): callback = partial(_callback, translate_ir(nt)) - except NotImplementedError as e: - print("Unable to translate", e) + except NotImplementedError: + if raise_on_fail: + raise return nt.set_udf(callback) diff --git a/python/cudf_polars/cudf_polars/testing/__init__.py b/python/cudf_polars/cudf_polars/testing/__init__.py new file mode 100644 index 00000000000..d0147e713f9 --- /dev/null +++ b/python/cudf_polars/cudf_polars/testing/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+
+"""Testing utilities for cudf_polars."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
new file mode 100644
index 00000000000..a6e26a6425c
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Device-aware assertions."""
+
+from __future__ import annotations
+
+from functools import partial
+from typing import TYPE_CHECKING
+
+from polars.testing.asserts import assert_frame_equal
+
+from cudf_polars.callback import execute_with_cudf
+
+if TYPE_CHECKING:
+    import polars as pl
+
+__all__: list[str] = ["assert_gpu_result_equal"]
+
+
+def assert_gpu_result_equal(
+    lazydf: pl.LazyFrame,
+    *,
+    check_row_order: bool = True,
+    check_column_order: bool = True,
+    check_dtype: bool = True,
+    check_exact: bool = True,
+    rtol: float = 1e-05,
+    atol: float = 1e-08,
+    categorical_as_str: bool = False,
+):
+    """
+    Assert that collection of a lazyframe on GPU produces correct results.
+
+    Parameters
+    ----------
+    lazydf
+        Frame to collect.
+    check_row_order
+        Expect rows to be in the same order.
+    check_column_order
+        Expect columns to be in the same order.
+    check_dtype
+        Expect dtypes to match.
+    check_exact
+        Require exact equality for floats; if `False`, compare
+        using rtol and atol.
+    rtol
+        Relative tolerance for float comparisons.
+    atol
+        Absolute tolerance for float comparisons.
+    categorical_as_str
+        Cast categoricals to strings before comparing.
+
+    Raises
+    ------
+    AssertionError
+        If the GPU and CPU collection do not match.
+    NotImplementedError
+        If GPU collection failed in some way.
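+
+    Examples
+    --------
+    A minimal usage sketch (the frame here is purely illustrative):
+
+    >>> import polars as pl
+    >>> q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)
+    >>> assert_gpu_result_equal(q)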
+ """ + expect = lazydf.collect() + got = lazydf.collect( + post_opt_callback=partial(execute_with_cudf, raise_on_fail=True) + ) + assert_frame_equal( + expect, + got, + check_row_order=check_row_order, + check_column_order=check_column_order, + check_dtype=check_dtype, + check_exact=check_exact, + rtol=rtol, + atol=atol, + categorical_as_str=categorical_as_str, + ) From 7f04985483621fcbaf849dadf4518751f8628068 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 09:11:52 +0000 Subject: [PATCH 20/56] Add basic tests --- python/cudf_polars/pyproject.toml | 3 + python/cudf_polars/tests/test_basic.py | 239 ++++++++++++++++++++ python/cudf_polars/tests/test_distinct.py | 25 ++ python/cudf_polars/tests/test_extcontext.py | 19 ++ python/cudf_polars/tests/test_filter.py | 20 ++ python/cudf_polars/tests/test_hconcat.py | 19 ++ python/cudf_polars/tests/test_hstack.py | 19 ++ python/cudf_polars/tests/test_join.py | 64 ++++++ python/cudf_polars/tests/test_slice.py | 34 +++ python/cudf_polars/tests/test_sort.py | 42 ++++ python/cudf_polars/tests/test_union.py | 24 ++ 11 files changed, 508 insertions(+) create mode 100644 python/cudf_polars/tests/test_basic.py create mode 100644 python/cudf_polars/tests/test_distinct.py create mode 100644 python/cudf_polars/tests/test_extcontext.py create mode 100644 python/cudf_polars/tests/test_filter.py create mode 100644 python/cudf_polars/tests/test_hconcat.py create mode 100644 python/cudf_polars/tests/test_hstack.py create mode 100644 python/cudf_polars/tests/test_join.py create mode 100644 python/cudf_polars/tests/test_slice.py create mode 100644 python/cudf_polars/tests/test_sort.py create mode 100644 python/cudf_polars/tests/test_union.py diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 00fde6c0e05..f5d29202961 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -129,6 +129,9 @@ ignore = [ ] fixable = ["ALL"] +[tool.ruff.lint.per-file-ignores] +"**/tests/test_*.py" = ["D", "INP"] + [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style fixture-parentheses = false diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py new file mode 100644 index 00000000000..094f1bc3490 --- /dev/null +++ b/python/cudf_polars/tests/test_basic.py @@ -0,0 +1,239 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import operator +from datetime import datetime + +import numpy as np +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture +def ldf_datetime(): + dates = [ + "2020-01-01 13:45:48", + "2020-01-01 16:42:13", + "2020-01-01 16:45:09", + "2020-01-02 18:12:48", + "2020-01-03 19:45:32", + "2020-01-08 23:16:43", + ] + return ( + pl.DataFrame({"dt": dates, "a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 1, 2, 2, 2]}) + .with_columns(pl.col("dt").str.strptime(pl.Datetime).set_sorted()) + .lazy() + ) + + +@pytest.fixture +def df(): + return pl.DataFrame( + { + "int_key1": np.repeat(np.arange(10), 10), + "int_key2": np.tile(np.arange(10), 10), + "str_key1": np.repeat(list("ABCDEFGHIJ"), 10), + "int_val": np.random.randint(100, size=100), + "float_val": np.random.rand(100), + } + ) + + +@pytest.fixture +def ldf(df): + return df.lazy() + + +@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) +@pytest.mark.parametrize( + "op", [operator.add, operator.sub, operator.mul, operator.truediv] +) +def test_binaryops(op, dtype): + df = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": [1, 2, 3, 4, 5], + } + ).lazy() + + dtype = pl.datatypes.numpy_char_code_to_dtype(dtype) + df = df.with_columns(pl.col("a").cast(dtype)).with_columns(pl.col("b").cast(dtype)) + result = df.with_columns(op(pl.col("a"), pl.col("b"))) + assert_gpu_result_equal(result) + + +def test_scan_parquet(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + df.write_parquet(tmp_path / "example.parquet") + ldf = pl.scan_parquet(tmp_path / "example.parquet") + assert_gpu_result_equal(ldf) + + +def test_rolling(ldf_datetime): + out = ldf_datetime.rolling(index_column="dt", period="2d").agg( + [ + pl.sum("a").alias("sum_a"), + pl.min("a").alias("min_a"), + pl.max("a").alias("max_a"), + ] + ) + assert_gpu_result_equal(out) + + +def test_groupby_rolling(ldf_datetime): + out = ldf_datetime.rolling(index_column="dt", period="2d", group_by="b").agg( + [ + pl.sum("a").alias("sum_a"), + pl.min("a").alias("min_a"), + pl.max("a").alias("max_a"), + ] + ) + assert_gpu_result_equal(out) + + +def test_rolling_expression(ldf_datetime): + out = ldf_datetime.with_columns( + sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), + min_a=pl.min("a").rolling(index_column="dt", period="2d"), + max_a=pl.max("a").rolling(index_column="dt", period="2d"), + ) + assert_gpu_result_equal(out) + + +def test_datetime_comparison(ldf_datetime): + out = ldf_datetime.filter( + pl.col("dt") > datetime.fromisoformat("2020-01-01 16:45:09") + ) + assert_gpu_result_equal(out) + + +@pytest.fixture +def null_data(): + return pl.DataFrame( + { + "a": [1, 2, None, 4, None], + } + ).lazy() + + +def test_drop_nulls(null_data): + result = null_data.drop_nulls() + assert_gpu_result_equal(result) + + +@pytest.mark.parametrize("how", ["inner", "left", "semi", "outer_coalesce"]) +def test_join(df: pl.DataFrame, how): + pl.set_random_seed(42) + # Sample eagerly since we haven't implemented it yet. 
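+    # (pl.set_random_seed above keeps the two draws reproducible, so
+    # sampling eagerly does not make this test flaky.)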
+ ldf1 = df.sample(n=50).lazy() + ldf2 = df.sample(n=50).lazy() + + out = ldf1.join(ldf2, on=["int_key1", "int_key2"], how=how) + assert_gpu_result_equal(out, check_row_order=False) + + +def test_sort(ldf): + for col in ldf.columns: + out = ldf.sort(by=col) + assert_gpu_result_equal(out) + + +def test_filter(ldf): + out = ldf.filter(pl.col("int_key1") > pl.col("int_key2")) + assert_gpu_result_equal(out) + + +@pytest.mark.parametrize( + "agg", + [ + "sum", + "min", + "max", + "mean", + # TODO: first/last get turned into slice of the Scan + "first", + "last", + "count", + "median", + ], +) +def test_agg(df, agg): + ldf = ( + df.cast( + {key: pl.Float64 for key in df.columns if ("int" in key or "float" in key)} + ) + .select(list(filter(lambda c: "str" not in c, df.columns))) + .lazy() + ) + out = getattr(ldf, agg)() + assert_gpu_result_equal(out, check_dtype=agg != "count", check_exact=False) + + +@pytest.mark.parametrize("keep", ["first", "last", "none"]) +@pytest.mark.parametrize("subset", [None, "keys"]) +@pytest.mark.parametrize("sort", [False, True]) +@pytest.mark.parametrize("maintain_order", [False, True]) +def test_unique(ldf: pl.LazyFrame, keep, subset, sort, maintain_order): + if subset is not None: + subset = list(filter(lambda c: "key" in c, ldf.columns)) + sort_by = subset + else: + sort_by = ldf.columns + if sort: + ldf = ldf.sort(*sort_by) + out = ldf.unique( + subset, + keep=keep, + maintain_order=maintain_order, + ) + assert_gpu_result_equal(out, check_row_order=maintain_order) + + +def test_selection(ldf: pl.LazyFrame): + k = pl.col("int_key1") + v = pl.col("int_val") + # groupby stops predicate pushdown + out = ldf.group_by(k).agg(v.sum()).filter(k * 2 > v) + assert_gpu_result_equal(out) + + +def test_concat_vertical(ldf): + out = pl.concat([ldf, ldf]) + assert_gpu_result_equal(out) + + +def test_concat_horizontal(ldf): + # Have to split the columns in two to avoid the same column names + left_columns = ldf.columns[: len(ldf.columns) // 2] + right_columns = ldf.columns[len(ldf.columns) // 2 :] + out = pl.concat( + [ldf.select(left_columns), ldf.select(right_columns)], how="horizontal" + ) + assert_gpu_result_equal(out) + + +def test_groupby(ldf): + out = ldf.group_by("int_key1").agg(pl.col("float_val").sum()) + assert_gpu_result_equal(out, check_row_order=False, check_exact=False) + + +def test_expr_function(ldf): + out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted( + pl.col("int_key1") + ) + # TODO: Fix the underlying dtype + assert_gpu_result_equal(out, check_dtype=False) + + +def test_filter_expr(ldf): + out = ldf.select(pl.col("int_key1").filter(pl.col("int_key2") > 4)) + assert_gpu_result_equal(out) + + +def test_gather_expr(ldf): + out = ldf.select(pl.col("int_key1").gather(pl.col("int_key2"))) + assert_gpu_result_equal(out) diff --git a/python/cudf_polars/tests/test_distinct.py b/python/cudf_polars/tests/test_distinct.py new file mode 100644 index 00000000000..e0fa089cee2 --- /dev/null +++ b/python/cudf_polars/tests/test_distinct.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("subset", [None, ["a"], ["a", "b"], ["b", "c"]]) +@pytest.mark.parametrize("keep", ["any", "none", "first", "last"]) +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +def test_distinct(subset, keep, maintain_order): + ldf = pl.DataFrame( + { + "a": [1, 2, 1, 3, 5, None, None], + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + "c": [True, True, True, True, False, False, True], + } + ).lazy() + + query = ldf.unique(subset=subset, keep=keep, maintain_order=maintain_order) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py new file mode 100644 index 00000000000..c5481d0ccbd --- /dev/null +++ b/python/cudf_polars/tests/test_extcontext.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_extcontext(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c")) + query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c")) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_filter.py b/python/cudf_polars/tests/test_filter.py new file mode 100644 index 00000000000..783403d764c --- /dev/null +++ b/python/cudf_polars/tests/test_filter.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_filter(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + # group-by is just to avoid the filter being pushed into the scan. + query = ldf.group_by(pl.col("a")).agg(pl.col("b").sum()).filter(pl.col("b") < 1) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_hconcat.py b/python/cudf_polars/tests/test_hconcat.py new file mode 100644 index 00000000000..46cbb21b25a --- /dev/null +++ b/python/cudf_polars/tests/test_hconcat.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_hconcat(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c")) + query = pl.concat([ldf, ldf2], how="horizontal") + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_hstack.py b/python/cudf_polars/tests/test_hstack.py new file mode 100644 index 00000000000..731c036bc88 --- /dev/null +++ b/python/cudf_polars/tests/test_hstack.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_hstack(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ldf.with_columns(pl.col("a") + pl.col("b")) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py new file mode 100644 index 00000000000..9432824a34c --- /dev/null +++ b/python/cudf_polars/tests/test_join.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "how", + [ + "inner", + "left", + pytest.param( + "outer", + marks=pytest.mark.xfail(reason="non-coalescing join not implemented"), + ), + "semi", + "anti", + pytest.param( + "cross", + marks=pytest.mark.xfail(reason="cross join not implemented"), + ), + "outer_coalesce", + ], +) +@pytest.mark.parametrize( + "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"] +) +@pytest.mark.parametrize( + "join_expr", + [ + pl.col("a"), + pytest.param( + pl.col("a") * 2, + marks=pytest.mark.xfail(reason="Taking key columns from wrong table"), + ), + pytest.param( + [pl.col("a"), pl.col("a") + 1], + marks=pytest.mark.xfail(reason="Taking key columns from wrong table"), + ), + ["c", "a"], + ], +) +def test_join(how, join_nulls, join_expr): + left = pl.DataFrame( + { + "a": [1, 2, 3, 1, None], + "b": [1, 2, 3, 4, 5], + "c": [2, 3, 4, 5, 6], + } + ).lazy() + right = pl.DataFrame( + { + "a": [1, 4, 3, 7, None, None], + "c": [2, 3, 4, 5, 6, 7], + } + ).lazy() + + query = left.join(right, on=join_expr, how=how, join_nulls=join_nulls) + assert_gpu_result_equal(query, check_row_order=False) diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py new file mode 100644 index 00000000000..6c918a89e33 --- /dev/null +++ b/python/cudf_polars/tests/test_slice.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "offset", + [0, 1, 2], +) +@pytest.mark.parametrize( + "len", + [0, 2, 12], +) +def test_slice(offset, len): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ( + ldf.group_by(pl.col("a")) + .agg(pl.col("b").sum()) + .sort(by=pl.col("a")) + .slice(offset, len) + ) + assert_gpu_result_equal(query, check_row_order=False) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py new file mode 100644 index 00000000000..ecc02efd967 --- /dev/null +++ b/python/cudf_polars/tests/test_sort.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize( + "sort_keys", + [ + (pl.col("a"),), + pytest.param( + (pl.col("d").abs(),), + marks=pytest.mark.xfail(reason="abs not yet implemented"), + ), + (pl.col("a"), pl.col("d")), + (pl.col("b"),), + ], +) +@pytest.mark.parametrize("nulls_last", [False, True]) +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +def test_sort(sort_keys, nulls_last, maintain_order): + ldf = pl.DataFrame( + { + "a": [1, 2, 1, 3, 5, None, None], + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + "c": [True, True, True, True, False, False, True], + "d": [1, 2, -1, 10, 6, -1, -7], + } + ).lazy() + + query = ldf.sort( + *sort_keys, + descending=True, + nulls_last=nulls_last, + maintain_order=maintain_order, + ) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py new file mode 100644 index 00000000000..8a6e015e4db --- /dev/null +++ b/python/cudf_polars/tests/test_union.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.xfail(reason="Need handling of null scalars that are cast") +def test_union(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a")) + query = pl.concat([ldf, ldf2], how="diagonal") + # Plan for this produces a `None`.astype(Int64) which we don't + # handle correctly right now + assert_gpu_result_equal(query) From 233c1be771bdee9688fca30349b3debebd9cce5f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 May 2024 10:02:54 +0000 Subject: [PATCH 21/56] All tests passing (or at least xfailing appropriately) --- .../cudf_polars/containers/column.py | 14 ++++++----- python/cudf_polars/cudf_polars/dsl/expr.py | 16 +++++++++++- python/cudf_polars/cudf_polars/dsl/ir.py | 16 ++++++++---- python/cudf_polars/tests/test_basic.py | 8 ++++-- python/cudf_polars/tests/test_join.py | 25 +++++++++---------- 5 files changed, 52 insertions(+), 27 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 73db1c34b48..7784febf2e8 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -87,9 +87,11 @@ def nan_count(self) -> int: if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): return 0 else: - return plc.reduce.reduce( - plc.unary.is_nan(self.obj), - plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), - ) + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) + ).as_py() diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 33266c6634a..cefe9922f64 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -538,7 
+538,7 @@ def __init__( # TODO: handle nans req = plc.aggregation.variance(ddof=options) elif name == "count": - req = plc.aggregation.count(null_policy=plc.types.NullPolicy.EXCLUDE) + req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) else: raise NotImplementedError self.request = req @@ -547,6 +547,8 @@ def __init__( op = partial(self._reduce, request=req) elif name in {"min", "max"}: op = partial(op, propagate_nans=options) + elif name == "count": + pass else: raise AssertionError self.op = op @@ -588,6 +590,18 @@ def _reduce( column.name, ) + def _count(self, column: Column) -> Column: + # TODO: dtype handling + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(column.obj.size() - column.obj.null_count()), + ), + 1, + ), + column.name, + ) + def _min(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: return Column( diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e2bc3b7bf44..9ac6cd6d51b 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,6 +15,7 @@ from __future__ import annotations +import itertools import types from dataclasses import dataclass from functools import cache @@ -263,8 +264,10 @@ def check_agg(agg: Expr) -> int: def __post_init__(self): """Check whether all the aggregations are implemented.""" - if self.maintain_order: + if self.options.rolling is None and self.maintain_order: raise NotImplementedError("Maintaining order in groupby") + if self.options.rolling: + raise NotImplementedError("rolling window/groupby") if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] @@ -395,7 +398,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how == "outer": - left.replace_columns( + left = left.replace_columns( *( Column( plc.replace.replace_nulls(left_col.obj, right_col.obj), @@ -407,7 +410,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) ) ) - right.discard_columns(right_on.column_names_set) + right = right.discard_columns(right_on.column_names_set) right = right.rename_columns( { name: f"{name}{suffix}" @@ -673,7 +676,7 @@ class Union(IR): def __post_init__(self): """Validated preconditions.""" schema = self.dfs[0].schema - if not all(s == schema for s in self.dfs[1:]): + if not all(s.schema == schema for s in self.dfs[1:]): raise ValueError("Schema mismatch") def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @@ -692,7 +695,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] columns, scalars = zip(*((df.columns, df.scalars) for df in dfs)) - return DataFrame(columns, scalars) + return DataFrame( + list(itertools.chain.from_iterable(columns)), + list(itertools.chain.from_iterable(scalars)), + ) @dataclass(slots=True) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 094f1bc3490..c7123513cd2 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -73,6 +73,7 @@ def test_scan_parquet(tmp_path): assert_gpu_result_equal(ldf) +@pytest.mark.xfail(reason="Rolling window not yet implemented") def 
test_rolling(ldf_datetime): out = ldf_datetime.rolling(index_column="dt", period="2d").agg( [ @@ -84,6 +85,7 @@ def test_rolling(ldf_datetime): assert_gpu_result_equal(out) +@pytest.mark.xfail(reason="Grouped rolling window not yet implemented") def test_groupby_rolling(ldf_datetime): out = ldf_datetime.rolling(index_column="dt", period="2d", group_by="b").agg( [ @@ -95,6 +97,7 @@ def test_groupby_rolling(ldf_datetime): assert_gpu_result_equal(out) +@pytest.mark.xfail(reason="Rolling expression not yet implemented") def test_rolling_expression(ldf_datetime): out = ldf_datetime.with_columns( sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), @@ -120,6 +123,7 @@ def null_data(): ).lazy() +@pytest.mark.xfail(reason="Boolean function not yet implemented") def test_drop_nulls(null_data): result = null_data.drop_nulls() assert_gpu_result_equal(result) @@ -221,12 +225,12 @@ def test_groupby(ldf): assert_gpu_result_equal(out, check_row_order=False, check_exact=False) +@pytest.mark.xfail(reason="arg_where not yet implemented") def test_expr_function(ldf): out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted( pl.col("int_key1") ) - # TODO: Fix the underlying dtype - assert_gpu_result_equal(out, check_dtype=False) + assert_gpu_result_equal(out) def test_filter_expr(ldf): diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 9432824a34c..9ba513023da 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -14,10 +14,7 @@ [ "inner", "left", - pytest.param( - "outer", - marks=pytest.mark.xfail(reason="non-coalescing join not implemented"), - ), + "outer", "semi", "anti", pytest.param( @@ -34,18 +31,20 @@ "join_expr", [ pl.col("a"), - pytest.param( - pl.col("a") * 2, - marks=pytest.mark.xfail(reason="Taking key columns from wrong table"), - ), - pytest.param( - [pl.col("a"), pl.col("a") + 1], - marks=pytest.mark.xfail(reason="Taking key columns from wrong table"), - ), + pl.col("a") * 2, + [pl.col("a"), pl.col("a") + 1], ["c", "a"], ], ) -def test_join(how, join_nulls, join_expr): +def test_join(request, how, join_nulls, join_expr): + request.applymarker( + pytest.mark.xfail( + how == "outer_coalesce" + and isinstance(join_expr, list) + and not isinstance(join_expr[0], str), + reason="https://github.com/pola-rs/polars/issues/16289", + ) + ) left = pl.DataFrame( { "a": [1, 2, 3, 1, None], From 3a3ad2db76c0d8f21f84e6f462b54e0a86d9d630 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 09:54:00 +0000 Subject: [PATCH 22/56] Handle string functions and boolean functions and add some docs --- python/cudf_polars/cudf_polars/dsl/expr.py | 172 +++++++++++++++--- python/cudf_polars/cudf_polars/dsl/ir.py | 2 +- .../cudf_polars/cudf_polars/dsl/translate.py | 12 +- 3 files changed, 151 insertions(+), 35 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index cefe9922f64..3e7fc4bffc8 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -40,6 +40,7 @@ "Literal", "Col", "BooleanFunction", + "StringFunction", "Sort", "SortBy", "Gather", @@ -63,17 +64,30 @@ class AggInfo(NamedTuple): class Expr: + """ + An abstract expression object. + + This contains a (potentially empty) tuple of child expressions, + along with non-child data. 
For uniform reconstruction and + implementation of hashing and equality schemes, child classes need + to provide a certain amount of metadata when they are defined. + Specifically, the ``_non_child`` attribute must list, in-order, + the names of the slots that are passed to the constructor. The + constructor must take arguments in the order ``(*_non_child, + *children).`` + """ + __slots__ = ("dtype", "hash_value", "repr_value") - #: Data type of the expression dtype: plc.DataType - #: caching slot for the hash of the expression + """Data type of the expression.""" hash_value: int - #: caching slot for repr of the expression + """Caching slot for the hash of the expression.""" repr_value: str - #: Children of the expression + """Caching slot for repr of the expression.""" children: tuple[Expr, ...] = () - #: Names of non-child data (not Exprs) for reconstruction + """Children of the expression.""" _non_child: ClassVar[tuple[str, ...]] = ("dtype",) + """Names of non-child data (not Exprs) for reconstruction.""" # Constructor must take arguments in order (*_non_child, *children) def __init__(self, dtype: plc.DataType) -> None: @@ -151,18 +165,61 @@ def evaluate( context: ExecutionContext = ExecutionContext.FRAME, mapping: dict[Expr, Column] | None = None, ) -> Column: - """Evaluate this expression given a dataframe for context.""" + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Returns + ------- + Column representing the evaluation of the expression (or maybe + a scalar, annoying!). + + Raises + ------ + NotImplementedError if we couldn't evaluate the expression. + Ideally all these are returned during translation to the IR, + but for now we are not perfect. + """ raise NotImplementedError def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" + """ + Collect information about aggregations in groupbys. + + Parameters + ---------- + depth + The depth of aggregating (reduction or sampling) + expressions we are currently at. + + Returns + ------- + Aggregation info describing the expression to aggregate in the + groupby. + + Raises + ------ + NotImplementedError if we can't currently perform the + aggregation request (for example nested aggregations like + ``a.max().min()``). 
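+
+        Notes
+        -----
+        ``GroupBy.__post_init__`` calls this method with ``depth=0`` for
+        each aggregation request, so unsupported aggregations are
+        rejected while translating the plan rather than mid-evaluation.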
+ """ raise NotImplementedError def with_mapping(fn): """Decorate a callback that takes an expression mapping to use it.""" - def look( + def _( self, df: DataFrame, *, @@ -178,7 +235,7 @@ def look( except KeyError: return fn(self, df, context=context, mapping=mapping) - return look + return _ class NamedExpr(Expr): @@ -293,6 +350,61 @@ def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr self.name = name self.children = tuple(children) + @with_mapping + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + if self.name == pl_expr.BooleanFunction.IsNull: + return Column(plc.unary.is_null(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNotNull: + return Column(plc.unary.is_valid(column.obj), column.name) + else: + raise NotImplementedError(f"BooleanFunction {self.name}") + + +class StringFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.StringFunction, + options: Any, + *children: Expr, + ): + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + + @with_mapping + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + if self.name == pl_expr.StringFunction.Lowercase: + return Column(plc.strings.case.to_lower(column.obj), column.name) + elif self.name == pl_expr.StringFunction.Uppercase: + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + return Column(plc.strings.case.to_upper(column.obj), column.name) + else: + raise NotImplementedError(f"StringFunction {self.name}") + class Sort(Expr): __slots__ = ("options", "children") @@ -669,27 +781,27 @@ def __init__( self.op = op self.children = (left, right) - _MAPPING: ClassVar[dict[pl_expr.PyOperator, plc.binaryop.BinaryOperator]] = { - pl_expr.PyOperator.Eq: plc.binaryop.BinaryOperator.EQUAL, - pl_expr.PyOperator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, - pl_expr.PyOperator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, - pl_expr.PyOperator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, - pl_expr.PyOperator.Lt: plc.binaryop.BinaryOperator.LESS, - pl_expr.PyOperator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, - pl_expr.PyOperator.Gt: plc.binaryop.BinaryOperator.GREATER, - pl_expr.PyOperator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, - pl_expr.PyOperator.Plus: plc.binaryop.BinaryOperator.ADD, - pl_expr.PyOperator.Minus: plc.binaryop.BinaryOperator.SUB, - pl_expr.PyOperator.Multiply: plc.binaryop.BinaryOperator.MUL, - pl_expr.PyOperator.Divide: plc.binaryop.BinaryOperator.DIV, - pl_expr.PyOperator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, - pl_expr.PyOperator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, - pl_expr.PyOperator.Modulus: plc.binaryop.BinaryOperator.PYMOD, - pl_expr.PyOperator.And: plc.binaryop.BinaryOperator.BITWISE_AND, - pl_expr.PyOperator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, - pl_expr.PyOperator.Xor: 
plc.binaryop.BinaryOperator.BITWISE_XOR, - pl_expr.PyOperator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, - pl_expr.PyOperator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, } @with_mapping diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9ac6cd6d51b..37eddb9b408 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -309,7 +309,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame([*result_keys, *results], []) + return DataFrame([*result_keys, *results], []).slice(self.options.slice) @dataclass(slots=True) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 37fb599c35d..430534bf6bd 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -174,9 +174,6 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: ) -BOOLEAN_FUNCTIONS: frozenset[str] = frozenset() - - def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: """ Translate a polars-internal expression IR into our representation. 
@@ -206,7 +203,14 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: dtype = dtypes.from_polars(visitor.get_dtype(n)) if isinstance(node, pl_expr.Function): name, *options = node.function_data - if name in BOOLEAN_FUNCTIONS: + if isinstance(name, pl_expr.StringFunction): + return expr.StringFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, pl_expr.BooleanFunction): return expr.BooleanFunction( dtype, name, From dd6efaafe9d42efdb04125b434ea0ee636bbfe88 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 12:05:22 +0000 Subject: [PATCH 23/56] Flesh out more boolean functions --- python/cudf_polars/cudf_polars/dsl/expr.py | 131 ++++++++++++++++++++- 1 file changed, 128 insertions(+), 3 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 3e7fc4bffc8..f82ca78fb4f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -17,7 +17,7 @@ import enum from enum import IntEnum -from functools import partial +from functools import partial, reduce from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa @@ -350,6 +350,47 @@ def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr self.name = name self.children = tuple(children) + def __post_init__(self): + """Validate preconditions.""" + if ( + self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) + and not self.options[0] + ): + # With ignore_nulls == False, polars uses Kleene logic + raise NotImplementedError(f"Kleene logic for {self.name}") + if self.name in ( + pl_expr.BooleanFunction.IsFinite, + pl_expr.BooleanFunction.IsInfinite, + pl_expr.BooleanFunction.IsBetween, + pl_expr.BooleanFunction.IsIn, + ): + raise NotImplementedError(f"{self.name}") + + @staticmethod + def _distinct( + column: Column, + *, + keep: plc.stream_compaction.DuplicateKeepOption, + source_value: plc.Scalar, + target_value: plc.Scalar, + ) -> Column: + table = plc.Table([column.obj]) + indices = plc.stream_compaction.distinct_indices( + table, + keep, + # TODO: polars doesn't expose options for these + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + return Column( + plc.copying.scatter( + [source_value], + indices, + plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), + ).columns()[0], + column.name, + ) + @with_mapping def evaluate( self, @@ -359,12 +400,96 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.BooleanFunction.Any: + (column,) = columns + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 + ) + elif self.name == pl_expr.BooleanFunction.All: + (column,) = columns + return plc.Column.from_scalar( + plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 + ) if self.name == pl_expr.BooleanFunction.IsNull: + (column,) = columns return Column(plc.unary.is_null(column.obj), column.name) elif self.name == pl_expr.BooleanFunction.IsNotNull: + (column,) = columns return Column(plc.unary.is_valid(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNan: + # 
TODO: copy over null mask since is_nan(null) => null in polars + (column,) = columns + return Column(plc.unary.is_nan(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsNotNan: + # TODO: copy over null mask since is_not_nan(null) => null in polars + (column,) = columns + return Column(plc.unary.is_not_nan(column.obj), column.name) + elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsUnique: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + ) + elif self.name == pl_expr.BooleanFunction.IsDuplicated: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 + target_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 + ) + elif self.name == pl_expr.AllHorizontal: + name = columns[0].name + if any(c.obj.null_count() > 0 for c in columns): + raise NotImplementedError("Kleene logic for all_horizontal") + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.BITWISE_AND, + output_type=self.dtype, + ), + (c.obj for c in columns), + ), + name, + ) + elif self.name == pl_expr.AnyHorizontal: + name = columns[0].name + if any(c.obj.null_count() > 0 for c in columns): + raise NotImplementedError("Kleene logic for any_horizontal") + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.BITWISE_OR, + output_type=self.dtype, + ), + (c.obj for c in columns), + ), + name, + ) else: raise NotImplementedError(f"BooleanFunction {self.name}") From e279a2f2cb5ce64b1eeb7d81217d14443f70eb89 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 12:39:01 +0000 Subject: [PATCH 24/56] More fixes --- python/cudf_polars/cudf_polars/dsl/expr.py | 53 +++++++++++++++++----- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index f82ca78fb4f..3773bba8632 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -348,10 +348,7 @@ def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr super().__init__(dtype) self.options = options self.name = name - self.children = tuple(children) - - def __post_init__(self): - """Validate preconditions.""" + self.children = children if ( self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) and not self.options[0] @@ -361,7 +358,6 @@ def __post_init__(self): if self.name in ( pl_expr.BooleanFunction.IsFinite, pl_expr.BooleanFunction.IsInfinite, - pl_expr.BooleanFunction.IsBetween, 
pl_expr.BooleanFunction.IsIn, ): raise NotImplementedError(f"{self.name}") @@ -460,7 +456,7 @@ def evaluate( source_value=plc.interop.from_arrow(pa.scalar(False)), # noqa: FBT003 target_value=plc.interop.from_arrow(pa.scalar(True)), # noqa: FBT003 ) - elif self.name == pl_expr.AllHorizontal: + elif self.name == pl_expr.BooleanFunction.AllHorizontal: name = columns[0].name if any(c.obj.null_count() > 0 for c in columns): raise NotImplementedError("Kleene logic for all_horizontal") @@ -475,7 +471,7 @@ def evaluate( ), name, ) - elif self.name == pl_expr.AnyHorizontal: + elif self.name == pl_expr.BooleanFunction.AnyHorizontal: name = columns[0].name if any(c.obj.null_count() > 0 for c in columns): raise NotImplementedError("Kleene logic for any_horizontal") @@ -490,6 +486,34 @@ def evaluate( ), name, ) + elif self.name == pl_expr.BooleanFunction.IsBetween: + column, lo, hi = columns + closed = self.options + if closed == pl_expr.ClosedInterval.None_: + left = plc.binaryop.BinaryOperator.GREATER + right = plc.binaryop.BinaryOperator.LESS + elif closed == pl_expr.ClosedInterval.Left: + left = plc.binaryop.BinaryOperator.GREATER_EQUAL + right = plc.binaryop.BinaryOperator.LESS + elif closed == pl_expr.ClosedInterval.Right: + left = plc.binaryop.BinaryOperator.GREATER + right = plc.binaryop.BinaryOperator.LESS_EQUAL + else: + left = plc.binaryop.BinaryOperator.GREATER_EQUAL + right = plc.binaryop.BinaryOperator.LESS_EQUAL + return Column( + plc.binaryop.binary_operation( + plc.binaryop.binary_operation( + column.obj, lo.obj, left, output_type=self.dtype + ), + plc.binaryop.binary_operation( + column.obj, hi.obj, right, output_type=self.dtype + ), + plc.binaryop.BinaryOperator.LOGICAL_AND, + self.dtype, + ), + column.name, + ) else: raise NotImplementedError(f"BooleanFunction {self.name}") @@ -519,14 +543,21 @@ def evaluate( mapping: dict[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns return Column(plc.strings.case.to_lower(column.obj), column.name) elif self.name == pl_expr.StringFunction.Uppercase: - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) + (column,) = columns return Column(plc.strings.case.to_upper(column.obj), column.name) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column( + plc.strings.find.ends_with(column.obj, suffix.obj), column.name + ) else: raise NotImplementedError(f"StringFunction {self.name}") From bdd6ee38727b86b0a9fd821954f985037b9f40b4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 12:44:40 +0000 Subject: [PATCH 25/56] Simplify --- python/cudf_polars/cudf_polars/dsl/expr.py | 42 ++++++++++++++-------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 3773bba8632..819582b98f9 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -387,6 +387,30 @@ def _distinct( column.name, ) + _BETWEEN_OPS: ClassVar[ + dict[ + pl_expr.ClosedInterval, + tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], + ] + ] = { + pl_expr.ClosedInterval.None_: ( + plc.binaryop.BinaryOperator.GREATER, + 
plc.binaryop.BinaryOperator.LESS, + ), + pl_expr.ClosedInterval.Left: ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS, + ), + pl_expr.ClosedInterval.Right: ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + pl_expr.ClosedInterval.Both: ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + } + @with_mapping def evaluate( self, @@ -488,26 +512,14 @@ def evaluate( ) elif self.name == pl_expr.BooleanFunction.IsBetween: column, lo, hi = columns - closed = self.options - if closed == pl_expr.ClosedInterval.None_: - left = plc.binaryop.BinaryOperator.GREATER - right = plc.binaryop.BinaryOperator.LESS - elif closed == pl_expr.ClosedInterval.Left: - left = plc.binaryop.BinaryOperator.GREATER_EQUAL - right = plc.binaryop.BinaryOperator.LESS - elif closed == pl_expr.ClosedInterval.Right: - left = plc.binaryop.BinaryOperator.GREATER - right = plc.binaryop.BinaryOperator.LESS_EQUAL - else: - left = plc.binaryop.BinaryOperator.GREATER_EQUAL - right = plc.binaryop.BinaryOperator.LESS_EQUAL + lop, rop = self._BETWEEN_OPS[self.options] return Column( plc.binaryop.binary_operation( plc.binaryop.binary_operation( - column.obj, lo.obj, left, output_type=self.dtype + column.obj, lo.obj, lop, output_type=self.dtype ), plc.binaryop.binary_operation( - column.obj, hi.obj, right, output_type=self.dtype + column.obj, hi.obj, rop, output_type=self.dtype ), plc.binaryop.BinaryOperator.LOGICAL_AND, self.dtype, From c06b980fe0c906b5b1cc94e189d6f9f6533c87a0 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 14:47:23 +0000 Subject: [PATCH 26/56] More fixes --- .../cudf_polars/containers/dataframe.py | 4 +++- python/cudf_polars/cudf_polars/dsl/expr.py | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index e5dd757690a..dba4c9f6c2c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -121,7 +121,9 @@ def discard_columns(self, names: Set[str]) -> Self: def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" want = set(names) - return type(self)([c for c in self.columns if c.name in want], self.scalars) + if not want.issubset(self.column_names_set): + raise ValueError("Can't select missing names") + return type(self)([self._column_map[name] for name in names], self.scalars) def replace_columns(self, *columns: Column) -> Self: """Return a new dataframe with columns replaced by name.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 819582b98f9..7ff4a359940 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -512,7 +512,8 @@ def evaluate( ) elif self.name == pl_expr.BooleanFunction.IsBetween: column, lo, hi = columns - lop, rop = self._BETWEEN_OPS[self.options] + (closed,) = self.options + lop, rop = self._BETWEEN_OPS[closed] return Column( plc.binaryop.binary_operation( plc.binaryop.binary_operation( @@ -545,6 +546,13 @@ def __init__( self.options = options self.name = name self.children = children + if self.name not in ( + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Uppercase, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.StartsWith, + ): + raise NotImplementedError(f"String function 
{self.name}") @with_mapping def evaluate( @@ -570,6 +578,11 @@ def evaluate( return Column( plc.strings.find.ends_with(column.obj, suffix.obj), column.name ) + elif self.name == pl_expr.StringFunction.StartsWith: + column, suffix = columns + return Column( + plc.strings.find.starts_with(column.obj, suffix.obj), column.name + ) else: raise NotImplementedError(f"StringFunction {self.name}") From 3b17c719f05b9479c6536c3bd9c64fb7a7d3914a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 15:12:18 +0000 Subject: [PATCH 27/56] xfail strict in cudf_polars tests --- python/cudf_polars/pyproject.toml | 3 +++ python/cudf_polars/tests/test_basic.py | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f5d29202961..3619e32e140 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -51,6 +51,9 @@ version = {file = "cudf_polars/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] +[tool.pytest.ini_options] +xfail_strict = true + [tool.ruff] line-length = 88 indent-width = 4 diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index c7123513cd2..2b16dac8d84 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -123,7 +123,6 @@ def null_data(): ).lazy() -@pytest.mark.xfail(reason="Boolean function not yet implemented") def test_drop_nulls(null_data): result = null_data.drop_nulls() assert_gpu_result_equal(result) From 19db751fe491fe58d3b0a7e03162a73923808158 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 15:37:30 +0000 Subject: [PATCH 28/56] Overview doc, simplify callback --- python/cudf_polars/cudf_polars/callback.py | 14 +- .../cudf_polars/cudf_polars/dsl/translate.py | 5 +- python/cudf_polars/docs/overview.md | 174 ++++++++++++++++++ 3 files changed, 181 insertions(+), 12 deletions(-) create mode 100644 python/cudf_polars/docs/overview.md diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index ed473e0ad0e..aabb8498ce2 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -29,12 +29,8 @@ def _callback( assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - try: - with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): - return ir.evaluate(cache={}).to_polars() - except Exception as e: - print("Unable to evaluate", e) - raise + with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + return ir.evaluate(cache={}).to_polars() def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: @@ -54,11 +50,7 @@ def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None: """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - callback = partial(_callback, translate_ir(nt)) + nt.set_udf(partial(_callback, translate_ir(nt))) except NotImplementedError: if raise_on_fail: raise - return - - nt.set_udf(callback) - return diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 430534bf6bd..17518f62806 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -21,7 +21,7 @@ class set_node(AbstractContextManager): __slots__ = ("n", "visitor") - def __init__(self, visitor, n): + def __init__(self, visitor, n: int): self.visitor = visitor self.n = n @@ -94,6 +94,9 
@@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: else None, ) elif isinstance(node, pl_ir.Select): + # We translate the expressions (which are executed with + # reference to the input node) with the input node active + # so that dtype resolution works correctly. with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md new file mode 100644 index 00000000000..c07b1592130 --- /dev/null +++ b/python/cudf_polars/docs/overview.md @@ -0,0 +1,174 @@ +# Getting started + +You will need: + +1. Rust development environment. If you use the rapids [combined + devcontainer](https://github.com/rapidsai/devcontainers/), add + `"./features/src/rust": {"version": "latest", "profile": "default"},` to your + preferred configuration. Or else, use + [rustup](https://www.rust-lang.org/tools/install) +2. A [cudf development + environment](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment). + The combined devcontainer works, or whatever your favourite approach is. + +> ![NOTE] These instructions will get simpler as we merge code in. + +## Installing polars + +We will need to build polars from source. Until things settle down, +live at `HEAD`. + +```sh +git clone https://github.com/pola-rs/polars +cd polars +``` + +We will install build dependencies in the same environment that we created for +building cudf. Note that polars offers a `make build` command that sets up a +separate virtual environment, but we don't want to do that right now. So in the +polars clone: + +```sh +# cudf environment (conda or pip) is active +pip install --upgrade uv +uv pip install --upgrade -r py-polars/requirements-dev.txt +``` + +Now we have the necessary machinery to build polars +```sh +cd py-polars +# build in debug mode, best option for development/debugging +maturin develop -m Cargo.toml +``` + +For benchmarking purposes we should build in release mode +```sh +RUSTFLAGS='-C target-cpu=native' maturin develop -m Cargo.toml --release +``` + +After any update of the polars code, we need to rerun the `maturin` build +command. + +## Installing the cudf polars executor + +The executor for the polars logical plan lives in the cudf repo, in +`python/cudf_polars`. Build cudf as normal and then install the +`cudf_polars` package in editable mode: + +```sh +cd cudf/python/cudf_polars +pip install --no-deps -e . +``` + +You should now be able to run the tests in the `cudf_polars` package: +```sh +pytest -v tests +``` + +# Executor design + +The polars `LazyFrame.collect` functionality offers a +"post-optimization" callback that may be used by a third party library +to replace a (or more, though we only replace a single node) in the +optimized logical plan with a Python callback that is to deliver the +result of evaluating the plan. This splits the execution of the plan +into two phases. First, a symbolic phase which translates to our +internal representation (IR). Second, an execution phase which executes +using our IR. + +The translation phase receives the a low-level Rust `NodeTraverse` +object which delivers Python representations of the plan nodes (and +expressions) one at a time. During translation, we endeavour to raise +`NotImplementedError` for any unsupported functionality. 
This way, if +we can't execute something, we just don't modify the logical plan at +all: if we can translate the IR, it is assumed that evaluation will +later succeed. + +The usage of the cudf-based executor is therefore, at present: + +```python +from cudf_polars.callback import execute_with_cudf + +result = q.collect(post_opt_callback=execute_with_cudf) +``` + +This should either transparently run on the GPU and deliver a polars +dataframe, or else fail (but be handled) and just run the normal CPU +execution. + +## Adding a handler for a new plan node + +Plan node definitions live in `cudf_polars/dsl/ir.py`, these are +`dataclasses` that inherit from the base `IR` node. The evaluation of +a plan node is done by implementing the `evaluate` method. + +To translate the plan node, add a case handler in `translate_ir` which +lives in `cudf_polars/dsl/translate.py`. + +As well as child nodes that are plans, most plan nodes contain child +expressions, which should be transformed using the input to the plan as a +context. The translation of expressions is handled via +`translate_expr` in `cudf_poalrs/dsl/translate.py`. So that data-type +resolution is performed correctly any expression should be translated +with the correct plan node "active" in the visitor. For example, when +translating a `Join` node, the left keys (expressions) should be +translated with the left input active (and right keys with right +input). To facilitate this, use the `set_node` context manager. + +## Adding a handler for a new expression node + +Adding a handle for an expression node is very similar to a plan node. +Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit +from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it +is simpler for us to implement efficient hashing, repr, and equality if we +can write that ourselves. + +Every expression consists of two types of data: +1. child data (other `Expr`s) +2. non-child data (anything other than an `Expr`) +The generic implementations of special methods in the base `Expr` base +class require that the subclasses advertise which arguments to the +constructor are non-child in a `_non_child` class slot. The +constructor should then take arguments: +```python +def __init__(self, *non_child_data: Any, *children: Expr): +``` +Read the docstrings in the `Expr` class for more details. In +particular, one needs to be careful to ensure that an `Expr` hashes +correctly. + +Expressions are evaluated by implementing an `evaluate` method, this +takes a `DataFrame` as context (this provides columns), along with an +`ExecutionContext` parameter (indicating what context we're evaluating +this expression in, currently unused), and a `mapping` from +expressions to evaluated `Column`s: this enables a simple form of +expression rewriting during evaluation of expressions that is used in +evaluation of groupby-aggregations. To reduce boilerplate for lookup +in the mappings dictionary use the `@with_mapping` decorator. + +To simplify state tracking, all columns should be considered immutable +on construction. This matches the "functional" description coming from +the logical plan in any case, so is reasonably natural. + +# Containers + +Containers should be constructed as relatively lightweight objects +around their pylibcudf counterparts. We have three (in +`cudf_polars/containers/`): + +1. Scalar (a wrapper around a pylibcudf Scalar) +2. Column (a wrapper around a pylibcudf Column) +3. 
DataFrame (a wrapper around a pylibcudf Table) + +The interfaces offered by these are somewhat in flux, but broadly +speaking, a `DataFrame` is just a list of `Column`s which each hold +data plus a string `name`, along with a collection of `Scalar`s (this +might go away). + +The columns keep track of metadata (for example, whether or not they +are sorted). + +We offer some utility methods for transferring metadata when +constructing new dataframes and columns, both `DataFrame` and `Column` +offer a `with_metadata(*, like: Self)` call which copies metadata from +the template. From 146327cc0f51a51ac949d5dcdcef059b336de1a1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 16:14:35 +0000 Subject: [PATCH 29/56] Docstrings for plan nodes. --- python/cudf_polars/cudf_polars/dsl/ir.py | 141 ++++++++++++++++++++++- 1 file changed, 140 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 37eddb9b408..71e2ab7941c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -63,25 +63,64 @@ @dataclass(slots=True) class IR: + """Abstract plan node, representing an unevaluated dataframe.""" + schema: dict[str, plc.DataType] + """Mapping from column names to their data types.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: - """Evaluate and return a dataframe.""" + """ + Evaluate the node and return a dataframe. + + Parameters + ---------- + cache + Mapping from cached node ids to constructed DataFrames. + Used to implement evaluation of the `Cache` node. + + Returns + ------- + DataFrame (on device) representing the evaluation of this plan + node. + + Raises + ------ + NotImplementedError if we couldn't evaluate things. Ideally + this should not occur, since the translation phase should pick + up things that we cannot handle. + """ raise NotImplementedError @dataclass(slots=True) class PythonScan(IR): + """Representation of input from a python function.""" + options: Any + """Arbitrary options.""" predicate: Expr | None + """Filter to apply to the constructed dataframe before returning it.""" @dataclass(slots=True) class Scan(IR): + """Input from files.""" + typ: Any + """What type of file are we reading? Parquet, CSV, etc...""" paths: list[str] + """List of paths to read from.""" file_options: Any + """Options for reading the file. + + Attributes are: + - ``with_columns: list[str]`` of projected columns to return. + - ``n_rows: int``: Number of rows to read. + - ``row_index: tuple[name, offset] | None``: Add an integer index + column with given name. + """ predicate: Expr | None + """Mask to apply to the read dataframe.""" def __post_init__(self): """Validate preconditions.""" @@ -138,8 +177,16 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Cache(IR): + """ + Return a cached plan node. + + Used for CSE at the plan level. + """ + key: int + """The cache key.""" value: IR + """The unevaluated node to cache.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -151,9 +198,18 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class DataFrameScan(IR): + """ + Input from an existing polars DataFrame. 
+ + This typically arises from ``q.collect().lazy()`` + """ + df: Any + """Polars LazyFrame object.""" projection: list[str] + """List of columns to project out.""" predicate: Expr | None + """Mask to apply.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -183,9 +239,18 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Select(IR): + """Produce a new dataframe selecting given expressions from an input.""" + df: IR + """Input dataframe.""" cse: list[Expr] + """ + List of common subexpressions that will appear in the selected expressions. + + These must be evaluated before the returned expressions. + """ expr: list[Expr] + """List of expressions to evaluate to form the new dataframe.""" def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" @@ -227,11 +292,18 @@ def placeholder_column(n: int): @dataclass(slots=False) class GroupBy(IR): + """Perform a groupby.""" + df: IR + """Input dataframe.""" agg_requests: list[Expr] + """List of expressions to evaluate groupwise.""" keys: list[Expr] + """List of expressions forming the keys.""" maintain_order: bool + """Should the order of the input dataframe be maintained?""" options: Any + """Options controlling style of groupby.""" @staticmethod def check_agg(agg: Expr) -> int: @@ -314,11 +386,25 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Join(IR): + """A join of two dataframes.""" + left: IR + """Left frame.""" right: IR + """Right frame.""" left_on: list[Expr] + """List of expressions used as keys in the left frame.""" right_on: list[Expr] + """List of expressions used as keys in the right frame.""" options: Any + """ + tuple of options: + - how: join type + - join_nulls: do nulls compare equal? + - slice: optional slice to perform after joining. 
+ - suffix: string suffix for right columns if names match + - coalesce: should key columns be coalesced (only makes sense for outer joins) + """ def __post_init__(self): """Validate preconditions.""" @@ -424,8 +510,12 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class HStack(IR): + """Add new columns to a dataframe.""" + df: IR + """Input dataframe.""" columns: list[Expr] + """List of expressions to produce new columns.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -435,11 +525,18 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Distinct(IR): + """Produce a new dataframe with distinct rows.""" + df: IR + """Input dataframe.""" keep: plc.stream_compaction.DuplicateKeepOption + """Which rows to keep.""" subset: set[str] | None + """Which columns to inspect when computing distinct rows.""" zlice: tuple[int, int] | None + """Optional slice to perform after compaction.""" stable: bool + """Should order be preserved?""" _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, @@ -495,12 +592,20 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Sort(IR): + """Sort a dataframe.""" + df: IR + """Input.""" by: list[Expr] + """List of expressions to produce sort keys.""" do_sort: Callable[..., plc.Table] + """pylibcudf sorting function.""" zlice: tuple[int, int] | None + """Optional slice to apply after sorting.""" order: list[plc.types.Order] + """Order keys should be sorted in.""" null_order: list[plc.types.NullOrder] + """Where nulls sort to.""" def __init__( self, @@ -551,9 +656,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Slice(IR): + """Slice a dataframe.""" + df: IR + """Input.""" offset: int + """Start of the slice.""" length: int + """Length of the slice.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -563,8 +673,12 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Filter(IR): + """Filter a dataframe with a boolean mask.""" + df: IR + """Input.""" mask: Expr + """Expression evaluating to a mask.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -574,7 +688,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Projection(IR): + """Select a subset of columns from a dataframe.""" + df: IR + """Input.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -585,9 +702,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class MapFunction(IR): + """Apply some function to a dataframe.""" + df: IR + """Input.""" name: str + """Function name.""" options: Any + """Arbitrary options, interpreted per function.""" _NAMES: ClassVar[frozenset[str]] = frozenset( [ @@ -670,8 +792,12 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class Union(IR): + """Concatenate dataframes vertically.""" + dfs: list[IR] + """List of inputs.""" zlice: tuple[int, int] | None + """Optional slice to apply after concatenation.""" def __post_init__(self): """Validated preconditions.""" @@ -681,6 +807,7 @@ def __post_init__(self): def 
evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" + # TODO: only evaluate what we need if we have a slice dfs = [df.evaluate(cache=cache) for df in self.dfs] return DataFrame.from_table( plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names @@ -689,7 +816,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class HConcat(IR): + """Concatenate dataframes horizontally.""" + dfs: list[IR] + """List of inputs.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -703,8 +833,17 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @dataclass(slots=True) class ExtContext(IR): + """ + Concatenate dataframes horizontally. + + This is similar to HConcat, but is used only to temporarily + introduce new dataframes into an expression context. + """ + df: IR + """Input.""" extra: list[IR] + """List of extra inputs.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" From e81a1e125b0771cf4324732561d6c9df074140de Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 May 2024 16:25:00 +0000 Subject: [PATCH 30/56] ClosedInterval will be a string --- python/cudf_polars/cudf_polars/dsl/expr.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 7ff4a359940..2fc16be8f6b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -32,6 +32,8 @@ if TYPE_CHECKING: from collections.abc import Sequence + import polars.type_aliases as pl_types + from cudf_polars.containers import DataFrame __all__ = [ @@ -389,23 +391,23 @@ def _distinct( _BETWEEN_OPS: ClassVar[ dict[ - pl_expr.ClosedInterval, + pl_types.ClosedInterval, tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], ] ] = { - pl_expr.ClosedInterval.None_: ( + "none": ( plc.binaryop.BinaryOperator.GREATER, plc.binaryop.BinaryOperator.LESS, ), - pl_expr.ClosedInterval.Left: ( + "left": ( plc.binaryop.BinaryOperator.GREATER_EQUAL, plc.binaryop.BinaryOperator.LESS, ), - pl_expr.ClosedInterval.Right: ( + "right": ( plc.binaryop.BinaryOperator.GREATER, plc.binaryop.BinaryOperator.LESS_EQUAL, ), - pl_expr.ClosedInterval.Both: ( + "both": ( plc.binaryop.BinaryOperator.GREATER_EQUAL, plc.binaryop.BinaryOperator.LESS_EQUAL, ), From 98281e8f5958ec55bd56cd8d8af87016dbbf19d7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 May 2024 12:06:46 +0100 Subject: [PATCH 31/56] Small fixes from code review Co-authored-by: Vyas Ramasubramani --- python/cudf_polars/cudf_polars/utils/dtypes.py | 1 - python/cudf_polars/docs/overview.md | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index f3303fbbce2..911c391c063 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -88,5 +88,4 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: elif isinstance(dtype, pl.List): return plc.DataType(plc.TypeId.LIST) else: - breakpoint() raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index c07b1592130..e2562959141 100644 --- a/python/cudf_polars/docs/overview.md +++ 
b/python/cudf_polars/docs/overview.md @@ -69,7 +69,7 @@ pytest -v tests The polars `LazyFrame.collect` functionality offers a "post-optimization" callback that may be used by a third party library -to replace a (or more, though we only replace a single node) in the +to replace a node (or more, though we only replace a single node) in the optimized logical plan with a Python callback that is to deliver the result of evaluating the plan. This splits the execution of the plan into two phases. First, a symbolic phase which translates to our @@ -108,7 +108,7 @@ lives in `cudf_polars/dsl/translate.py`. As well as child nodes that are plans, most plan nodes contain child expressions, which should be transformed using the input to the plan as a context. The translation of expressions is handled via -`translate_expr` in `cudf_poalrs/dsl/translate.py`. So that data-type +`translate_expr` in `cudf_polars/dsl/translate.py`. So that data-type resolution is performed correctly any expression should be translated with the correct plan node "active" in the visitor. For example, when translating a `Join` node, the left keys (expressions) should be @@ -137,13 +137,13 @@ Read the docstrings in the `Expr` class for more details. In particular, one needs to be careful to ensure that an `Expr` hashes correctly. -Expressions are evaluated by implementing an `evaluate` method, this -takes a `DataFrame` as context (this provides columns), along with an +Expressions are evaluated by implementing an `evaluate` method that +takes a `DataFrame` as context (this provides columns) along with an `ExecutionContext` parameter (indicating what context we're evaluating -this expression in, currently unused), and a `mapping` from -expressions to evaluated `Column`s: this enables a simple form of +this expression in, currently unused) and a `mapping` from +expressions to evaluated `Column`s. This approach enables a simple form of expression rewriting during evaluation of expressions that is used in -evaluation of groupby-aggregations. To reduce boilerplate for lookup +evaluation of, for example, groupby-aggregations. To reduce boilerplate for lookup in the mappings dictionary use the `@with_mapping` decorator. 
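+
+For example (an illustrative sketch only: `parent`, `child`, and `pre`
+are placeholders for an expression, one of its sub-expressions, and a
+pre-computed groupwise `Column` for that sub-expression):
+
+```python
+# Wherever `child` occurs inside `parent`, the already-evaluated `pre`
+# is picked up from the mapping instead of being recomputed.
+parent.evaluate(df, mapping={child: pre})
+```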
To simplify state tracking, all columns should be considered immutable From 3a1ac86131275aa98e803993dc5fb7bc56888675 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 22 May 2024 11:52:12 +0000 Subject: [PATCH 32/56] Dedent some assertions --- python/cudf_polars/cudf_polars/utils/dtypes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 911c391c063..51379433c03 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -67,9 +67,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) elif dtype.time_unit == "ns": return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS) - else: - assert dtype.time_unit is not None - assert_never(dtype.time_unit) + assert dtype.time_unit is not None + assert_never(dtype.time_unit) elif isinstance(dtype, pl.Duration): if dtype.time_unit == "ms": return plc.DataType(plc.TypeId.DURATION_MILLISECONDS) @@ -77,9 +76,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: return plc.DataType(plc.TypeId.DURATION_MICROSECONDS) elif dtype.time_unit == "ns": return plc.DataType(plc.TypeId.DURATION_NANOSECONDS) - else: - assert dtype.time_unit is not None - assert_never(dtype.time_unit) + assert dtype.time_unit is not None + assert_never(dtype.time_unit) elif isinstance(dtype, pl.String): return plc.DataType(plc.TypeId.STRING) elif isinstance(dtype, pl.Null): From f0686a29e8e404fe780dfb772f7a871f76402f76 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 14:59:55 +0000 Subject: [PATCH 33/56] More fixes in review --- .../cudf_polars/containers/column.py | 20 +-- .../cudf_polars/containers/dataframe.py | 6 +- python/cudf_polars/cudf_polars/dsl/expr.py | 164 +++++++++--------- python/cudf_polars/cudf_polars/dsl/ir.py | 24 +-- 4 files changed, 96 insertions(+), 118 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 7784febf2e8..9ca5b7f0310 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -78,20 +78,18 @@ def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" if self.nan_count > 0: raise NotImplementedError - else: - return self.copy() + return self.copy() @functools.cached_property def nan_count(self) -> int: """Return the number of NaN values in the column.""" if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): return 0 - else: - return plc.interop.to_arrow( - plc.reduce.reduce( - plc.unary.is_nan(self.obj), - plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), - ) - ).as_py() + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + # TODO: pylibcudf needs to have a SizeType DataType singleton + plc.DataType(plc.TypeId.INT32), + ) + ).as_py() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index dba4c9f6c2c..aa2f412f694 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -43,8 +43,6 @@ def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None else: self.table = None - __iter__ = None - def 
to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" assert len(self.scalars) == 0 @@ -66,12 +64,12 @@ def column_names(self) -> list[str]: return [c.name for c in self.columns] @cached_property - def num_columns(self): + def num_columns(self) -> int: """Number of columns.""" return len(self.columns) @cached_property - def num_rows(self): + def num_rows(self) -> int: """Number of rows.""" if self.table is None: raise ValueError("Number of rows of frame with scalars makes no sense") diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 2fc16be8f6b..fc91bef726a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -79,12 +79,12 @@ class Expr: *children).`` """ - __slots__ = ("dtype", "hash_value", "repr_value") + __slots__ = ("dtype", "_hash_value", "_repr_value") dtype: plc.DataType """Data type of the expression.""" - hash_value: int + _hash_value: int """Caching slot for the hash of the expression.""" - repr_value: str + _repr_value: str """Caching slot for repr of the expression.""" children: tuple[Expr, ...] = () """Children of the expression.""" @@ -113,10 +113,10 @@ def get_hash(self) -> int: def __hash__(self): """Hash of an expression with caching.""" try: - return self.hash_value + return self._hash_value except AttributeError: - self.hash_value = self.get_hash() - return self.hash_value + self._hash_value = self.get_hash() + return self._hash_value def is_equal(self, other: Any) -> bool: """ @@ -153,20 +153,58 @@ def __ne__(self, other): def __repr__(self): """String representation of an expression with caching.""" try: - return self.repr_value + return self._repr_value except AttributeError: args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self.repr_value = f"{type(self).__name__}({args})" - return self.repr_value + self._repr_value = f"{type(self).__name__}({args})" + return self._repr_value + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: dict[Expr, Column] | None = None, + ) -> Column: # TODO: return type is a lie for Literal + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Do not call this function directly, but rather + :func:`evaluate` which handles the mapping lookups. + + Returns + ------- + Column representing the evaluation of the expression (or maybe + a scalar, annoying!). + + Raises + ------ + NotImplementedError if we couldn't evaluate the expression. + Ideally all these are returned during translation to the IR, + but for now we are not perfect. + """ + raise NotImplementedError(f"Evaluation of {type(self).__name__}") - # TODO: return type is a lie for Literal def evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME, mapping: dict[Expr, Column] | None = None, - ) -> Column: + ) -> Column: # TODO: return type is a lie for Literal """ Evaluate this expression given a dataframe for context. @@ -181,6 +219,12 @@ def evaluate( override the evaluation of a given expression if we're performing a simple rewritten evaluation. 
+        Notes
+        -----
+        Individual subclasses should implement :meth:`do_evaluate`,
+        this method provides logic to handle lookups in the
+        substitution mapping.
+
         Returns
         -------
         Column representing the evaluation of the expression (or maybe
@@ -192,7 +236,12 @@ def evaluate(
             Ideally all these are returned during translation to the IR,
             but for now we are not perfect.
         """
-        raise NotImplementedError
+        if mapping is None:
+            return self.do_evaluate(df, context=context, mapping=mapping)
+        try:
+            return mapping[self]
+        except KeyError:
+            return self.do_evaluate(df, context=context, mapping=mapping)

     def collect_agg(self, *, depth: int) -> AggInfo:
         """
@@ -215,29 +264,9 @@ def collect_agg(self, *, depth: int) -> AggInfo:
             aggregation request (for example nested aggregations like
             ``a.max().min()``).
         """
-        raise NotImplementedError
-
-
-def with_mapping(fn):
-    """Decorate a callback that takes an expression mapping to use it."""
-
-    def _(
-        self,
-        df: DataFrame,
-        *,
-        context=ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
-    ) -> Column:
-        """Look up self in the mapping before evaluating it."""
-        if mapping is None:
-            return fn(self, df, context=context, mapping=mapping)
-        else:
-            try:
-                return mapping[self]
-            except KeyError:
-                return fn(self, df, context=context, mapping=mapping)
-
-    return _
+        raise NotImplementedError(
+            f"Collecting aggregation info for {type(self).__name__}"
+        )


 class NamedExpr(Expr):
@@ -249,8 +278,7 @@ def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None:
         self.name = name
         self.children = (value,)

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -278,8 +306,7 @@ def __init__(self, dtype: plc.DataType, value: Any) -> None:
         super().__init__(dtype)
         self.value = pa.scalar(value)

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -291,10 +318,6 @@ def evaluate(
         obj = plc.interop.from_arrow(self.value)
         return Scalar(obj)  # type: ignore

-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        raise NotImplementedError("Literal in groupby")
-

 class Col(Expr):
     __slots__ = ("name",)
@@ -305,8 +328,7 @@ def __init__(self, dtype: plc.DataType, name: str) -> None:
         self.dtype = dtype
         self.name = name

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -322,8 +344,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:


 class Len(Expr):
-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -332,7 +353,7 @@ def evaluate(
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # TODO: type is wrong, and dtype
-        return df.num_rows
+        return df.num_rows  # type: ignore

     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
@@ -413,8 +434,7 @@ def _distinct(
         ),
     }

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -556,8 +576,7 @@ def __init__(
         ):
             raise NotImplementedError(f"String function {self.name}")

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -600,8 +619,7 @@ def __init__(
         self.options = options
         self.children = (column,)

-    @with_mapping
-    def evaluate(
+    def do_evaluate(
         self,
         df: DataFrame,
         *,
@@ -621,11 +639,6 @@ def evaluate(
             is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0]
         )

-    def collect_agg(self, *, depth: int) -> AggInfo:
-        """Collect information about aggregations in groupbys."""
-        # TODO: Could do with sort-based groupby and segmented
sort post-hoc - raise NotImplementedError("Sort in groupby") - class SortBy(Expr): __slots__ = ("options", "children") @@ -642,8 +655,7 @@ def __init__( self.options = options self.children = (column, *by) - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, @@ -665,11 +677,6 @@ def evaluate( ) return Column(table.columns()[0], column.name) - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented sort post-hoc - raise NotImplementedError("SortBy in groupby") - class Gather(Expr): __slots__ = ("children",) @@ -679,8 +686,7 @@ def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): super().__init__(dtype) self.children = (values, indices) - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, @@ -710,11 +716,6 @@ def evaluate( table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) return Column(table.columns()[0], values.name) - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented gather. - raise NotImplementedError("Gather in groupby") - class Filter(Expr): __slots__ = ("children",) @@ -724,8 +725,7 @@ def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): super().__init__(dtype) self.children = (values, indices) - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, @@ -742,11 +742,6 @@ def evaluate( ) return Column(table.columns()[0], values.name).with_sorted(like=values) - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented filter - raise NotImplementedError("Filter in groupby") - class RollingWindow(Expr): __slots__ = ("options", "children") @@ -776,8 +771,7 @@ def __init__(self, dtype: plc.DataType, value: Expr): super().__init__(dtype) self.children = (value,) - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, @@ -934,8 +928,7 @@ def _last(self, column: Column) -> Column: n = column.obj.size() return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name) - @with_mapping - def evaluate( + def do_evaluate( self, df, *, @@ -987,8 +980,7 @@ def __init__( pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, } - @with_mapping - def evaluate( + def do_evaluate( self, df: DataFrame, *, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 71e2ab7941c..61a3fb87ee6 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -154,7 +154,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: step = plc.interop.from_arrow(pa.scalar(1), data_type=dtype) init = plc.interop.from_arrow(pa.scalar(offset), data_type=dtype) index = Column( - plc.filling.sequence(df.num_rows(), init, step), name + plc.filling.sequence(df.num_rows, init, step), name ).set_sorted( is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, @@ -836,8 +836,7 @@ class ExtContext(IR): """ Concatenate dataframes horizontally. - This is similar to HConcat, but is used only to temporarily - introduce new dataframes into an expression context. + Prefer HConcat, since this is going to be deprecated on the polars side. 
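+
+    Constructing this node therefore raises ``NotImplementedError``
+    (see ``__post_init__`` below).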
""" df: IR @@ -845,17 +844,8 @@ class ExtContext(IR): extra: list[IR] """List of extra inputs.""" - def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: - """Evaluate and return a dataframe.""" - # TODO: polars optimizer doesn't do projection pushdown - # through extcontext AFAICT. - df = self.df.evaluate(cache=cache) - # extra contexts are added in order, if they have any - # overlapping column names, those are ignored. - names = df.column_names_set.copy() - # TODO: scalars - for ir in self.extra: - extra = ir.evaluate(cache=cache).discard_columns(names) - names |= extra.column_names_set - df = df.with_columns(extra.columns) - return df + def __post_init__(self): + """Validate preconditions.""" + raise NotImplementedError( + "ExtContext will be deprecated, use horizontal concat instead." + ) From 8d25f3a61a45bdcbda44d13d520fc094f0d55a72 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 15:40:33 +0000 Subject: [PATCH 34/56] Singledispatch for translation --- .../cudf_polars/cudf_polars/dsl/translate.py | 500 +++++++++++------- python/cudf_polars/tests/test_extcontext.py | 6 +- 2 files changed, 307 insertions(+), 199 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 17518f62806..f90a08e3b53 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -6,10 +6,13 @@ from __future__ import annotations from contextlib import AbstractContextManager, nullcontext +from functools import singledispatch from typing import Any from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir +import cudf._lib.pylibcudf as plc # noqa: TCH002, singledispatch register needs this name defined. + from cudf_polars.dsl import expr, ir from cudf_polars.utils import dtypes @@ -37,6 +40,171 @@ def __exit__(self, *args): noop_context: nullcontext = nullcontext() +@singledispatch +def _translate_ir(node: Any, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + raise NotImplementedError(f"Translation for {type(node).__name__}") + + +@_translate_ir.register +def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.PythonScan( + schema, + node.options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Scan( + schema, + node.scan_type, + node.paths, + node.file_options, + translate_expr(visitor, n=node.predicate) + if node.predicate is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Cache, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) + + +@_translate_ir.register +def _( + node: pl_ir.DataFrameScan, visitor: Any, schema: dict[str, plc.DataType] +) -> ir.IR: + return ir.DataFrameScan( + schema, + node.df, + node.projection, + translate_expr(visitor, n=node.selection) + if node.selection is not None + else None, + ) + + +@_translate_ir.register +def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + # We translate the expressions (which are executed with + # reference to the input node) with the input node active + # so that dtype resolution works correctly. 
+ with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] + exprs = [translate_expr(visitor, n=e) for e in node.expr] + return ir.Select(schema, inp, cse_exprs, exprs) + + +@_translate_ir.register +def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + aggs = [translate_expr(visitor, n=e) for e in node.aggs] + keys = [translate_expr(visitor, n=e) for e in node.keys] + return ir.GroupBy( + schema, + inp, + aggs, + keys, + node.maintain_order, + node.options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input_left): + inp_left = translate_ir(visitor, n=None) + left_on = [translate_expr(visitor, n=e) for e in node.left_on] + with set_node(visitor, node.input_right): + inp_right = translate_ir(visitor, n=None) + right_on = [translate_expr(visitor, n=e) for e in node.right_on] + return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) + + +@_translate_ir.register +def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + exprs = [translate_expr(visitor, n=e) for e in node.exprs] + return ir.HStack(schema, inp, exprs) + + +@_translate_ir.register +def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Distinct( + schema, + translate_ir(visitor, n=node.input), + node.options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + by = [translate_expr(visitor, n=e) for e in node.by_column] + return ir.Sort(schema, inp, by, node.sort_options, node.slice) + + +@_translate_ir.register +def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) + + +@_translate_ir.register +def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + mask = translate_expr(visitor, n=node.predicate) + return ir.Filter(schema, inp, mask) + + +@_translate_ir.register +def _( + node: pl_ir.SimpleProjection, visitor: Any, schema: dict[str, plc.DataType] +) -> ir.IR: + return ir.Projection(schema, translate_ir(visitor, n=node.input)) + + +@_translate_ir.register +def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + name, *options = node.function + return ir.MapFunction( + schema, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), + name, + options, + ) + + +@_translate_ir.register +def _(node: pl_ir.Union, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.Union( + schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + ) + + +@_translate_ir.register +def _(node: pl_ir.HConcat, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + + +@_translate_ir.register +def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + return ir.ExtContext( + schema, + translate_ir(visitor, n=node.input), + [translate_ir(visitor, n=n) for n 
in node.contexts], + ) + + def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: """ Translate a polars-internal IR node to our representation. @@ -64,117 +232,134 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR: with ctx: node = visitor.view_current_node() schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} - if isinstance(node, pl_ir.PythonScan): - return ir.PythonScan( - schema, - node.options, - translate_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, - ) - elif isinstance(node, pl_ir.Scan): - return ir.Scan( - schema, - node.scan_type, - node.paths, - node.file_options, - translate_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, - ) - elif isinstance(node, pl_ir.Cache): - return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input)) - elif isinstance(node, pl_ir.DataFrameScan): - return ir.DataFrameScan( - schema, - node.df, - node.projection, - translate_expr(visitor, n=node.selection) - if node.selection is not None - else None, - ) - elif isinstance(node, pl_ir.Select): - # We translate the expressions (which are executed with - # reference to the input node) with the input node active - # so that dtype resolution works correctly. - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] - exprs = [translate_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, cse_exprs, exprs) - elif isinstance(node, pl_ir.GroupBy): - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - aggs = [translate_expr(visitor, n=e) for e in node.aggs] - keys = [translate_expr(visitor, n=e) for e in node.keys] - return ir.GroupBy( - schema, - inp, - aggs, - keys, - node.maintain_order, - node.options, - ) - elif isinstance(node, pl_ir.Join): - with set_node(visitor, node.input_left): - inp_left = translate_ir(visitor, n=None) - left_on = [translate_expr(visitor, n=e) for e in node.left_on] - with set_node(visitor, node.input_right): - inp_right = translate_ir(visitor, n=None) - right_on = [translate_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) - elif isinstance(node, pl_ir.HStack): - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - exprs = [translate_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs) - elif isinstance(node, pl_ir.Distinct): - return ir.Distinct( - schema, - translate_ir(visitor, n=node.input), - node.options, - ) - elif isinstance(node, pl_ir.Sort): - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - by = [translate_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options, node.slice) - elif isinstance(node, pl_ir.Slice): - return ir.Slice( - schema, translate_ir(visitor, n=node.input), node.offset, node.len - ) - elif isinstance(node, pl_ir.Filter): - with set_node(visitor, node.input): - inp = translate_ir(visitor, n=None) - mask = translate_expr(visitor, n=node.predicate) - return ir.Filter(schema, inp, mask) - elif isinstance(node, pl_ir.SimpleProjection): - return ir.Projection(schema, translate_ir(visitor, n=node.input)) - elif isinstance(node, pl_ir.MapFunction): - name, *options = node.function - return ir.MapFunction( - schema, - # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), - name, - options, - ) - elif 
isinstance(node, pl_ir.Union): - return ir.Union( - schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options - ) - elif isinstance(node, pl_ir.HConcat): - return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) - elif isinstance(node, pl_ir.ExtContext): - return ir.ExtContext( - schema, - translate_ir(visitor, n=node.input), - [translate_ir(visitor, n=n) for n in node.contexts], - ) - else: - raise NotImplementedError( - f"No handler for LogicalPlan node with {type(node)=}" - ) + return _translate_ir(node, visitor, schema) + + +@singledispatch +def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr: + raise NotImplementedError(f"Translation for {type(node).__name__}") + + +@_translate_expr.register +def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr: + e = translate_expr(visitor, n=node.node) + return expr.NamedExpr(dtype, node.output_name, e) + + +@_translate_expr.register +def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: + name, *options = node.function_data + if isinstance(name, pl_expr.StringFunction): + return expr.StringFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, pl_expr.BooleanFunction): + return expr.BooleanFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + else: + raise NotImplementedError(f"No handler for Expr function node with {name=}") + + +@_translate_expr.register +def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr: + # TODO: raise in groupby? + if node.partition_by is None: + return expr.RollingWindow( + dtype, node.options, translate_expr(visitor, n=node.function) + ) + else: + return expr.GroupedRollingWindow( + dtype, + node.options, + translate_expr(visitor, n=node.function), + *(translate_expr(visitor, n=n) for n in node.partition_by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Literal(dtype, node.value) + + +@_translate_expr.register +def _(node: pl_expr.Sort, visitor: Any, dtype: plc.DataType) -> expr.Expr: + # TODO: raise in groupby + return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) + + +@_translate_expr.register +def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.SortBy( + dtype, + node.sort_options, + translate_expr(visitor, n=node.expr), + *(translate_expr(visitor, n=n) for n in node.by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Gather( + dtype, + translate_expr(visitor, n=node.expr), + translate_expr(visitor, n=node.idx), + ) + + +@_translate_expr.register +def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Filter( + dtype, + translate_expr(visitor, n=node.input), + translate_expr(visitor, n=node.by), + ) + + +@_translate_expr.register +def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr: + inner = translate_expr(visitor, n=node.expr) + # Push casts into literals so we can handle Cast(Literal(Null)) + if isinstance(inner, expr.Literal): + return expr.Literal(dtype, inner.value) + else: + return expr.Cast(dtype, inner) + + +@_translate_expr.register +def _(node: pl_expr.Column, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Col(dtype, node.name) + + +@_translate_expr.register +def _(node: 
pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Agg( + dtype, + node.name, + node.options, + translate_expr(visitor, n=node.arguments), + ) + + +@_translate_expr.register +def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.BinOp( + dtype, + expr.BinOp._MAPPING[node.op], + translate_expr(visitor, n=node.left), + translate_expr(visitor, n=node.right), + ) + + +@_translate_expr.register +def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr: + return expr.Len(dtype) def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: @@ -198,92 +383,11 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr: NotImplementedError if any translation fails due to unsupported functionality. """ if isinstance(n, pl_expr.PyExprIR): - # TODO: type narrowing didn't work because PyExprIR is Unknown + # TODO: type narrowing doesn't rule out int since PyExprIR is Unknown assert not isinstance(n, int) - e = translate_expr(visitor, n=n.node) - return expr.NamedExpr(e.dtype, n.output_name, e) - node = visitor.view_expression(n) - dtype = dtypes.from_polars(visitor.get_dtype(n)) - if isinstance(node, pl_expr.Function): - name, *options = node.function_data - if isinstance(name, pl_expr.StringFunction): - return expr.StringFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) - elif isinstance(name, pl_expr.BooleanFunction): - return expr.BooleanFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) - else: - raise NotImplementedError(f"No handler for Expr function node with {name=}") - elif isinstance(node, pl_expr.Window): - # TODO: raise in groupby? - if node.partition_by is None: - return expr.RollingWindow( - dtype, node.options, translate_expr(visitor, n=node.function) - ) - else: - return expr.GroupedRollingWindow( - dtype, - node.options, - translate_expr(visitor, n=node.function), - *(translate_expr(visitor, n=n) for n in node.partition_by), - ) - elif isinstance(node, pl_expr.Literal): - return expr.Literal(dtype, node.value) - elif isinstance(node, pl_expr.Sort): - # TODO: raise in groupby - return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr)) - elif isinstance(node, pl_expr.SortBy): - # TODO: raise in groupby - return expr.SortBy( - dtype, - node.sort_options, - translate_expr(visitor, n=node.expr), - *(translate_expr(visitor, n=n) for n in node.by), - ) - elif isinstance(node, pl_expr.Gather): - return expr.Gather( - dtype, - translate_expr(visitor, n=node.expr), - translate_expr(visitor, n=node.idx), - ) - elif isinstance(node, pl_expr.Filter): - return expr.Filter( - dtype, - translate_expr(visitor, n=node.input), - translate_expr(visitor, n=node.by), - ) - elif isinstance(node, pl_expr.Cast): - inner = translate_expr(visitor, n=node.expr) - # Push casts into literals so we can handle Cast(Literal(Null)) - if isinstance(inner, expr.Literal): - return expr.Literal(dtype, inner.value) - else: - return expr.Cast(dtype, inner) - elif isinstance(node, pl_expr.Column): - return expr.Col(dtype, node.name) - elif isinstance(node, pl_expr.Agg): - return expr.Agg( - dtype, - node.name, - node.options, - translate_expr(visitor, n=node.arguments), - ) - elif isinstance(node, pl_expr.BinaryExpr): - return expr.BinOp( - dtype, - expr.BinOp._MAPPING[node.op], - translate_expr(visitor, n=node.left), - translate_expr(visitor, n=node.right), - ) - elif isinstance(node, pl_expr.Len): - 
return expr.Len(dtype) + node = n + dtype = dtypes.from_polars(visitor.get_dtype(node.node)) else: - raise NotImplementedError(f"No handler for expression node with {type(node)=}") + node = visitor.view_expression(n) + dtype = dtypes.from_polars(visitor.get_dtype(n)) + return _translate_expr(node, visitor, dtype) diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py index c5481d0ccbd..9daf88b4338 100644 --- a/python/cudf_polars/tests/test_extcontext.py +++ b/python/cudf_polars/tests/test_extcontext.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import pytest + import polars as pl from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -16,4 +18,6 @@ def test_extcontext(): ).lazy() ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c")) query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c")) - assert_gpu_result_equal(query) + with pytest.raises(pl.exceptions.ComputeError): + # ExtContext to be deprecated so we're not implementing it. + assert_gpu_result_equal(query) From 90fca6d78a172e8321ffe8c22778e9efe039daea Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 15:46:08 +0000 Subject: [PATCH 35/56] Spell out DSL --- python/cudf_polars/cudf_polars/dsl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/dsl/__init__.py b/python/cudf_polars/cudf_polars/dsl/__init__.py index cdc37f9e437..804c5ada566 100644 --- a/python/cudf_polars/cudf_polars/dsl/__init__.py +++ b/python/cudf_polars/cudf_polars/dsl/__init__.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 -"""The DSL for the polars executor.""" +"""The domain-specific language (DSL) for the polars executor.""" from __future__ import annotations From 0f82d0f55fd448486d42f6e26d01acf7767b7c90 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 15:48:31 +0000 Subject: [PATCH 36/56] Avoid double import --- python/cudf_polars/cudf_polars/dsl/ir.py | 30 +++++++++++------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 61a3fb87ee6..bc8e7d1a764 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -36,8 +36,6 @@ if TYPE_CHECKING: from typing import Literal - from cudf_polars.dsl.expr import Expr - __all__ = [ "IR", @@ -98,7 +96,7 @@ class PythonScan(IR): options: Any """Arbitrary options.""" - predicate: Expr | None + predicate: expr.Expr | None """Filter to apply to the constructed dataframe before returning it.""" @@ -119,7 +117,7 @@ class Scan(IR): - ``row_index: tuple[name, offset] | None``: Add an integer index column with given name. """ - predicate: Expr | None + predicate: expr.Expr | None """Mask to apply to the read dataframe.""" def __post_init__(self): @@ -208,7 +206,7 @@ class DataFrameScan(IR): """Polars LazyFrame object.""" projection: list[str] """List of columns to project out.""" - predicate: Expr | None + predicate: expr.Expr | None """Mask to apply.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @@ -243,13 +241,13 @@ class Select(IR): df: IR """Input dataframe.""" - cse: list[Expr] + cse: list[expr.Expr] """ List of common subexpressions that will appear in the selected expressions. These must be evaluated before the returned expressions. 
""" - expr: list[Expr] + expr: list[expr.Expr] """List of expressions to evaluate to form the new dataframe.""" def evaluate(self, *, cache: dict[int, DataFrame]): @@ -296,9 +294,9 @@ class GroupBy(IR): df: IR """Input dataframe.""" - agg_requests: list[Expr] + agg_requests: list[expr.Expr] """List of expressions to evaluate groupwise.""" - keys: list[Expr] + keys: list[expr.Expr] """List of expressions forming the keys.""" maintain_order: bool """Should the order of the input dataframe be maintained?""" @@ -306,7 +304,7 @@ class GroupBy(IR): """Options controlling style of groupby.""" @staticmethod - def check_agg(agg: Expr) -> int: + def check_agg(agg: expr.Expr) -> int: """ Determine if we can handle an aggregation expression. @@ -392,9 +390,9 @@ class Join(IR): """Left frame.""" right: IR """Right frame.""" - left_on: list[Expr] + left_on: list[expr.Expr] """List of expressions used as keys in the left frame.""" - right_on: list[Expr] + right_on: list[expr.Expr] """List of expressions used as keys in the right frame.""" options: Any """ @@ -514,7 +512,7 @@ class HStack(IR): df: IR """Input dataframe.""" - columns: list[Expr] + columns: list[expr.Expr] """List of expressions to produce new columns.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: @@ -596,7 +594,7 @@ class Sort(IR): df: IR """Input.""" - by: list[Expr] + by: list[expr.Expr] """List of expressions to produce sort keys.""" do_sort: Callable[..., plc.Table] """pylibcudf sorting function.""" @@ -611,7 +609,7 @@ def __init__( self, schema: dict, df: IR, - by: list[Expr], + by: list[expr.Expr], options: Any, zlice: tuple[int, int] | None, ): @@ -677,7 +675,7 @@ class Filter(IR): df: IR """Input.""" - mask: Expr + mask: expr.Expr """Expression evaluating to a mask.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: From f5683e70f9f7a6aa5afe71cc56a495a6292bfcc8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 May 2024 16:00:29 +0000 Subject: [PATCH 37/56] Docs fixes --- python/cudf_polars/docs/overview.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index e2562959141..cbf012f5881 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -133,18 +133,18 @@ constructor should then take arguments: ```python def __init__(self, *non_child_data: Any, *children: Expr): ``` -Read the docstrings in the `Expr` class for more details. In -particular, one needs to be careful to ensure that an `Expr` hashes -correctly. +Read the docstrings in the `Expr` class for more details. -Expressions are evaluated by implementing an `evaluate` method that +Expressions are evaluated by implementing a `do_evaluate` method that takes a `DataFrame` as context (this provides columns) along with an `ExecutionContext` parameter (indicating what context we're evaluating this expression in, currently unused) and a `mapping` from expressions to evaluated `Column`s. This approach enables a simple form of expression rewriting during evaluation of expressions that is used in -evaluation of, for example, groupby-aggregations. To reduce boilerplate for lookup -in the mappings dictionary use the `@with_mapping` decorator. +evaluation of, for example, groupby-aggregations. To perform the +evaluation, one should use the base class (generic) `evaluate` method +which handles the boilerplate for looking up in the substitution +`mapping`. 
To simplify state tracking, all columns should be considered immutable on construction. This matches the "functional" description coming from From 34aac9a9c9b8db373a0ad086ea32acaede4dc857 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 10:21:33 +0000 Subject: [PATCH 38/56] Split scan tests out into separate file --- python/cudf_polars/cudf_polars/dsl/ir.py | 12 ++- python/cudf_polars/tests/test_basic.py | 7 -- python/cudf_polars/tests/test_scan.py | 98 ++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 14 deletions(-) create mode 100644 python/cudf_polars/tests/test_scan.py diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index bc8e7d1a764..a7c5d48064c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -130,10 +130,8 @@ def __post_init__(self): def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options - n_rows = options.n_rows with_columns = options.with_columns row_index = options.row_index - assert n_rows is None if self.typ == "csv": df = DataFrame.from_cudf( cudf.concat( @@ -148,17 +146,17 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: assert_never(self.typ) if row_index is not None: name, offset = row_index - dtype = self.schema[name] - step = plc.interop.from_arrow(pa.scalar(1), data_type=dtype) - init = plc.interop.from_arrow(pa.scalar(offset), data_type=dtype) + # TODO: dtype + step = plc.interop.from_arrow(pa.scalar(1)) + init = plc.interop.from_arrow(pa.scalar(offset)) index = Column( plc.filling.sequence(df.num_rows, init, step), name ).set_sorted( is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, - null_order=plc.types.null_order.AFTER, + null_order=plc.types.NullOrder.AFTER, ) - df = df.with_columns([index]) + df = DataFrame([index, *df.columns], []) # TODO: should be true, but not the case until we get # cudf-classic out of the loop for IO since it converts date32 # to datetime. diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 2b16dac8d84..ccf107d68db 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -66,13 +66,6 @@ def test_binaryops(op, dtype): assert_gpu_result_equal(result) -def test_scan_parquet(tmp_path): - df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - df.write_parquet(tmp_path / "example.parquet") - ldf = pl.scan_parquet(tmp_path / "example.parquet") - assert_gpu_result_equal(ldf) - - @pytest.mark.xfail(reason="Rolling window not yet implemented") def test_rolling(ldf_datetime): out = ldf_datetime.rolling(index_column="dt", period="2d").agg( diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py new file mode 100644 index 00000000000..b75e1bdef10 --- /dev/null +++ b/python/cudf_polars/tests/test_scan.py @@ -0,0 +1,98 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(
+    params=[
+        (None, None),
+        pytest.param(
+            ("row-index", 0),
+            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
+        ),
+        pytest.param(
+            ("index", 10),
+            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
+        ),
+    ],
+    ids=["no-row-index", "zero-offset-row-index", "offset-row-index"],
+)
+def row_index(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        (None, 0),
+        pytest.param(
+            (2, 1), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+        ),
+        pytest.param(
+            (3, 0), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+        ),
+    ],
+    ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"],
+)
+def n_rows_skip_rows(request):
+    return request.param
+
+
+@pytest.fixture(params=["csv", "parquet"])
+def df(request, tmp_path, row_index, n_rows_skip_rows):
+    df = pl.DataFrame(
+        {
+            "a": [1, 2, 3, None],
+            "b": ["ẅ", "x", "y", "z"],
+            "c": [None, None, 4, 5],
+        }
+    )
+    name, offset = row_index
+    n_rows, skip_rows = n_rows_skip_rows
+    if request.param == "csv":
+        df.write_csv(tmp_path / "file.csv")
+        return pl.scan_csv(
+            tmp_path / "file.csv",
+            row_index_name=name,
+            row_index_offset=offset,
+            skip_rows_after_header=skip_rows,
+            n_rows=n_rows,
+        )
+    else:
+        df.write_parquet(tmp_path / "file.pq")
+        # parquet doesn't have a skip_rows argument
+        return pl.scan_parquet(
+            tmp_path / "file.pq",
+            row_index_name=name,
+            row_index_offset=offset,
+            n_rows=n_rows,
+        )
+
+
+@pytest.fixture(params=[None, ["a"], ["b", "a"]], ids=["all", "subset", "reordered"])
+def columns(request, row_index):
+    name, _ = row_index
+    if name is not None and request.param is not None:
+        return [*request.param, name]
+    return request.param
+
+
+@pytest.fixture(
+    params=[None, pl.col("c").is_not_null()], ids=["no-mask", "c-is-not-null"]
+)
+def mask(request):
+    return request.param
+
+
+def test_scan(df, columns, mask):
+    q = df
+    if mask is not None:
+        q = q.filter(mask)
+    if columns is not None:
+        # Select from the (possibly filtered) query, not the input frame.
+        q = q.select(*columns)
+    assert_gpu_result_equal(q)

From 74e382403be60368c8a2902472469d0905599800 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 24 May 2024 12:05:48 +0000
Subject: [PATCH 39/56] Build out groupby test and fix one bug

---
 python/cudf_polars/cudf_polars/dsl/expr.py | 12 +++-
 python/cudf_polars/tests/test_groupby.py   | 78 ++++++++++++++++++++++
 2 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf_polars/tests/test_groupby.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index fc91bef726a..4f128122f82 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -1001,7 +1001,6 @@ def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
         if depth == 1:
             # inside aggregation, need to pre-evaluate,
-            # This recurses to check if we have nested aggs
             # groupby construction has checked that we don't have
             # nested aggs, so stop the recursion and return ourselves
             # for pre-eval
@@ -1010,6 +1009,17 @@ def collect_agg(self, *, depth: int) -> AggInfo:
             left_info, right_info = (
                 child.collect_agg(depth=depth) for child in self.children
             )
+            requests = [*left_info.requests, *right_info.requests]
+            # TODO: Hack, if there were no reductions inside this
+            # binary expression then
we want to pre-evaluate and + # collect ourselves. Otherwise we want to collect the + # aggregations inside and post-evaluate. This is a bad way + # of checking that we are in case 1. + if all( + agg.kind() == plc.aggregation.Kind.COLLECT_LIST + for _, agg, _ in requests + ): + return AggInfo([(self, plc.aggregation.collect_list(), self)]) return AggInfo( [*left_info.requests, *right_info.requests], ) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py new file mode 100644 index 00000000000..d06a7ecf105 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture +def df(): + return pl.LazyFrame( + { + "key1": [1, 1, 1, 2, 3, 1, 4, 6, 7], + "key2": [2, 2, 2, 2, 6, 1, 4, 6, 8], + "int": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "float": [7.0, 1, 2, 3, 4, 5, 6, 7, 8], + } + ) + + +@pytest.fixture( + params=[ + ["key1"], + ["key2"], + [pl.col("key1") * pl.col("key2")], + ["key1", "key2"], + [pl.col("key1") == pl.col("key2")], + ["key2", pl.col("key1") == pl.lit(1, dtype=pl.Int64)], + ], + ids=lambda keys: "-".join(map(str, keys)), +) +def keys(request): + return request.param + + +@pytest.fixture( + params=[ + ["int"], + ["float", "int"], + [pl.col("float") + pl.col("int")], + [pl.col("float").max() - pl.col("int").min()], + [pl.col("float").mean(), pl.col("int").std()], + ], + ids=lambda aggs: "-".join(map(str, aggs)), +) +def exprs(request): + return request.param + + +@pytest.fixture( + params=[ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + reason="Maintaining order in groupby not implemented" + ), + ), + ], + ids=["no_maintain_order", "maintain_order"], +) +def maintain_order(request): + return request.param + + +def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): + q = df.group_by(*keys, maintain_order=maintain_order).agg(*exprs) + + if not maintain_order: + sort_keys = list(q.schema.keys())[: len(keys)] + q = q.sort(*sort_keys) + # from cudf_polars.dsl.translate import translate_ir + # ir = translate_ir(q._ldf.visit()) + # from IPython import embed; embed() + assert_gpu_result_equal(q, check_exact=False) From b77c573eaac4531dd9c0d2ce9e818a687441444d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 12:06:14 +0000 Subject: [PATCH 40/56] Split out a few more tests --- python/cudf_polars/tests/test_basic.py | 20 -------------------- python/cudf_polars/tests/test_slice.py | 2 +- python/cudf_polars/tests/test_union.py | 13 +++++++++++++ 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index ccf107d68db..db813226281 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -197,26 +197,6 @@ def test_selection(ldf: pl.LazyFrame): assert_gpu_result_equal(out) -def test_concat_vertical(ldf): - out = pl.concat([ldf, ldf]) - assert_gpu_result_equal(out) - - -def test_concat_horizontal(ldf): - # Have to split the columns in two to avoid the same column names - left_columns = ldf.columns[: len(ldf.columns) // 2] - right_columns = ldf.columns[len(ldf.columns) // 2 :] - out = pl.concat( - [ldf.select(left_columns), ldf.select(right_columns)], how="horizontal" - ) - 
assert_gpu_result_equal(out) - - -def test_groupby(ldf): - out = ldf.group_by("int_key1").agg(pl.col("float_val").sum()) - assert_gpu_result_equal(out, check_row_order=False, check_exact=False) - - @pytest.mark.xfail(reason="arg_where not yet implemented") def test_expr_function(ldf): out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted( diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py index 6c918a89e33..d27e91302ba 100644 --- a/python/cudf_polars/tests/test_slice.py +++ b/python/cudf_polars/tests/test_slice.py @@ -31,4 +31,4 @@ def test_slice(offset, len): .sort(by=pl.col("a")) .slice(offset, len) ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 8a6e015e4db..2c85bb15a55 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -22,3 +22,16 @@ def test_union(): # Plan for this produces a `None`.astype(Int64) which we don't # handle correctly right now assert_gpu_result_equal(query) + + +def test_concat_vertical(): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ) + ldf2 = ldf.select(pl.col("a"), pl.col("b") * 2 + pl.col("a")) + q = pl.concat([ldf, ldf2], how="vertical") + + assert_gpu_result_equal(q) From 4b7dd6e2f3c0dcee7df5ec8e5cd89a54e7d528db Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 13:47:56 +0000 Subject: [PATCH 41/56] Move expression tests to subdirectory --- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/{ => expressions}/test_filter.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename python/cudf_polars/tests/{ => expressions}/test_filter.py (100%) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 3619e32e140..baaf46f6a2b 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -133,7 +133,7 @@ ignore = [ fixable = ["ALL"] [tool.ruff.lint.per-file-ignores] -"**/tests/test_*.py" = ["D", "INP"] +"**/tests/**/test_*.py" = ["D", "INP"] [tool.ruff.lint.flake8-pytest-style] # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style diff --git a/python/cudf_polars/tests/test_filter.py b/python/cudf_polars/tests/expressions/test_filter.py similarity index 100% rename from python/cudf_polars/tests/test_filter.py rename to python/cudf_polars/tests/expressions/test_filter.py From 3aefc569524879f04d13f6bff57aa43704da1048 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 13:51:11 +0000 Subject: [PATCH 42/56] Migrate agg tests --- python/cudf_polars/cudf_polars/dsl/expr.py | 10 ++- .../cudf_polars/tests/expressions/test_agg.py | 63 +++++++++++++++++++ python/cudf_polars/tests/test_basic.py | 44 ------------- 3 files changed, 70 insertions(+), 47 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_agg.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 4f128122f82..df8260e4627 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -800,6 +800,9 @@ def __init__( self, dtype: plc.DataType, name: str, options: Any, value: Expr ) -> None: super().__init__(dtype) + # TODO: fix polars name + if name == "nunique": + name = "n_unique" self.name = name self.options = options self.children = (value,) @@ -812,7 +815,8 @@ def __init__( req = 
plc.aggregation.max() elif name == "median": req = plc.aggregation.median() - elif name == "nunique": + elif name == "n_unique": + # TODO: datatype of result req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) elif name == "first" or name == "last": req = None @@ -836,7 +840,7 @@ def __init__( op = partial(self._reduce, request=req) elif name in {"min", "max"}: op = partial(op, propagate_nans=options) - elif name == "count": + elif name in {"count", "first", "last"}: pass else: raise AssertionError @@ -847,7 +851,7 @@ def __init__( "min", "max", "median", - "nunique", + "n_unique", "first", "last", "mean", diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py new file mode 100644 index 00000000000..c792ae64f74 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.dsl import expr +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +def agg(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +def dtype(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no-nulls", "with-nulls"]) +def with_nulls(request): + return request.param + + +@pytest.fixture( + params=[ + False, + pytest.param(True, marks=pytest.mark.xfail(reason="No handler for set_sorted")), + ], + ids=["unsorted", "sorted"], +) +def is_sorted(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls, is_sorted): + values = [-10, 4, 5, 2, 3, 6, 8, 9, 4, 4, 5, 2, 3, 7, 3, 6, -10, -11] + if with_nulls: + values = [None if v % 5 == 0 else v for v in values] + + if is_sorted: + values = sorted(values, key=lambda x: -1000 if x is None else x) + + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) + if is_sorted: + return df.set_sorted("a") + return df + + +def test_agg(df, agg): + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtype = agg not in {"count", "n_unique", "median"} + if not check_dtype and q.schema["a"] != pl.Float64: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtype=check_dtype, check_exact=False) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index db813226281..c877a14ff57 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -138,37 +138,6 @@ def test_sort(ldf): assert_gpu_result_equal(out) -def test_filter(ldf): - out = ldf.filter(pl.col("int_key1") > pl.col("int_key2")) - assert_gpu_result_equal(out) - - -@pytest.mark.parametrize( - "agg", - [ - "sum", - "min", - "max", - "mean", - # TODO: first/last get turned into slice of the Scan - "first", - "last", - "count", - "median", - ], -) -def test_agg(df, agg): - ldf = ( - df.cast( - {key: pl.Float64 for key in df.columns if ("int" in key or "float" in key)} - ) - .select(list(filter(lambda c: "str" not in c, df.columns))) - .lazy() - ) - out = getattr(ldf, agg)() - assert_gpu_result_equal(out, check_dtype=agg != "count", check_exact=False) - - @pytest.mark.parametrize("keep", ["first", "last", "none"]) @pytest.mark.parametrize("subset", [None, "keys"]) 
@pytest.mark.parametrize("sort", [False, True])
@@ -189,14 +158,6 @@ def test_unique(ldf: pl.LazyFrame, keep, subset, sort, maintain_order):
     assert_gpu_result_equal(out, check_row_order=maintain_order)
 
 
-def test_selection(ldf: pl.LazyFrame):
-    k = pl.col("int_key1")
-    v = pl.col("int_val")
-    # groupby stops predicate pushdown
-    out = ldf.group_by(k).agg(v.sum()).filter(k * 2 > v)
-    assert_gpu_result_equal(out)
-
-
 @pytest.mark.xfail(reason="arg_where not yet implemented")
 def test_expr_function(ldf):
     out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted(
@@ -205,11 +166,6 @@ def test_expr_function(ldf):
     assert_gpu_result_equal(out)
 
 
-def test_filter_expr(ldf):
-    out = ldf.select(pl.col("int_key1").filter(pl.col("int_key2") > 4))
-    assert_gpu_result_equal(out)
-
-
 def test_gather_expr(ldf):
     out = ldf.select(pl.col("int_key1").gather(pl.col("int_key2")))
     assert_gpu_result_equal(out)

From 22805a632757b0e18365180cb4816caf02b5e751 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 24 May 2024 13:53:42 +0000
Subject: [PATCH 43/56] Joins and sorts already tested elsewhere

---
 python/cudf_polars/tests/test_basic.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py
index c877a14ff57..2853c5b8c33 100644
--- a/python/cudf_polars/tests/test_basic.py
+++ b/python/cudf_polars/tests/test_basic.py
@@ -121,23 +121,6 @@ def test_drop_nulls(null_data):
     assert_gpu_result_equal(result)
 
 
-@pytest.mark.parametrize("how", ["inner", "left", "semi", "outer_coalesce"])
-def test_join(df: pl.DataFrame, how):
-    pl.set_random_seed(42)
-    # Sample eagerly since we haven't implemented it yet.
-    ldf1 = df.sample(n=50).lazy()
-    ldf2 = df.sample(n=50).lazy()
-
-    out = ldf1.join(ldf2, on=["int_key1", "int_key2"], how=how)
-    assert_gpu_result_equal(out, check_row_order=False)
-
-
-def test_sort(ldf):
-    for col in ldf.columns:
-        out = ldf.sort(by=col)
-        assert_gpu_result_equal(out)

From d8745f6e36ed36f7311b301471be6f00c5fd9e7f Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Fri, 24 May 2024 14:13:30 +0000
Subject: [PATCH 44/56] Better distinct test and fix bug

---
 python/cudf_polars/cudf_polars/dsl/ir.py   | 11 +++++----
 python/cudf_polars/tests/test_basic.py     | 28 -----------------------
 python/cudf_polars/tests/test_distinct.py  |  9 ++++++--
 3 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index a7c5d48064c..7f26bc892ec 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -557,7 +557,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             indices = list(range(df.num_columns))
         else:
             indices = [i for i, k in enumerate(df.column_names) if k in self.subset]
-        keys_sorted = all(c.is_sorted for c in df.columns)
+        keys_sorted = all(df.columns[i].is_sorted for i in indices)
         if keys_sorted:
             table = plc.stream_compaction.unique(
                 df.table,
@@ -628,10 +628,11 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         df = self.df.evaluate(cache=cache)
         sort_keys = [k.evaluate(df) for k in self.by]
         names = {c.name: i for i, c in enumerate(df.columns)}
+        # TODO: More robust identification here.
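        # The identity comparison below means only sort keys that simply
        # re-reference an existing column (e.g. pl.col("a")) are recognised;
        # a computed key such as pl.col("a") * 2 produces a fresh column
        # object, so it is not identified and no sortedness metadata is
        # attached to the corresponding result column.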
keys_in_result = [ i for k in sort_keys - if (i := names.get(k.name)) is not None and k is df.columns[i] + if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj ] table = self.do_sort( df.table, @@ -641,11 +642,11 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: ) columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)] # If a sort key is in the result table, set the sortedness property - for i in keys_in_result: + for k, i in enumerate(keys_in_result): columns[i] = columns[i].set_sorted( is_sorted=plc.types.Sorted.YES, - order=self.order[i], - null_order=self.null_order[i], + order=self.order[k], + null_order=self.null_order[k], ) return DataFrame(columns, []).slice(self.zlice) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 2853c5b8c33..0c7cbc6a6be 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -121,34 +121,6 @@ def test_drop_nulls(null_data): assert_gpu_result_equal(result) -@pytest.mark.parametrize("keep", ["first", "last", "none"]) -@pytest.mark.parametrize("subset", [None, "keys"]) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("maintain_order", [False, True]) -def test_unique(ldf: pl.LazyFrame, keep, subset, sort, maintain_order): - if subset is not None: - subset = list(filter(lambda c: "key" in c, ldf.columns)) - sort_by = subset - else: - sort_by = ldf.columns - if sort: - ldf = ldf.sort(*sort_by) - out = ldf.unique( - subset, - keep=keep, - maintain_order=maintain_order, - ) - assert_gpu_result_equal(out, check_row_order=maintain_order) - - -@pytest.mark.xfail(reason="arg_where not yet implemented") -def test_expr_function(ldf): - out = ldf.select(pl.arg_where(pl.col("int_key1") == 5)).set_sorted( - pl.col("int_key1") - ) - assert_gpu_result_equal(out) - - def test_gather_expr(ldf): out = ldf.select(pl.col("int_key1").gather(pl.col("int_key2"))) assert_gpu_result_equal(out) diff --git a/python/cudf_polars/tests/test_distinct.py b/python/cudf_polars/tests/test_distinct.py index e0fa089cee2..d42c4a96f5a 100644 --- a/python/cudf_polars/tests/test_distinct.py +++ b/python/cudf_polars/tests/test_distinct.py @@ -9,10 +9,11 @@ from cudf_polars.testing.asserts import assert_gpu_result_equal -@pytest.mark.parametrize("subset", [None, ["a"], ["a", "b"], ["b", "c"]]) +@pytest.mark.parametrize("subset", [None, ["a"], ["a", "b"], ["b", "c"], ["c", "a"]]) @pytest.mark.parametrize("keep", ["any", "none", "first", "last"]) @pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) -def test_distinct(subset, keep, maintain_order): +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_distinct(subset, keep, maintain_order, pre_sorted): ldf = pl.DataFrame( { "a": [1, 2, 1, 3, 5, None, None], @@ -20,6 +21,10 @@ def test_distinct(subset, keep, maintain_order): "c": [True, True, True, True, False, False, True], } ).lazy() + if pre_sorted: + keys = ["a", "b", "c"] if subset is None else subset + descending = False if len(keys) == 1 else [False, True, True][: len(keys)] + ldf = ldf.sort(*keys, descending=descending) query = ldf.unique(subset=subset, keep=keep, maintain_order=maintain_order) assert_gpu_result_equal(query, check_row_order=maintain_order) From eb6626e575ee485a761ebda3a483349f54dcd623 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 15:03:38 +0000 Subject: [PATCH 45/56] More exhaustive binop tests --- 
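Note: the xfail marker added below guards integer division by zero, where
polars (assumed CPU behaviour) yields null for integral dtypes but inf
once the result has been promoted to a float:

    >>> import polars as pl
    >>> df = pl.DataFrame({"a": [1], "b": [0]})
    >>> df.select(pl.col("a") // pl.col("b")).item() is None
    True
    >>> df.select(pl.col("a") / pl.col("b")).item()
    inf

The GPU result is not expected to reproduce that null, hence the xfail.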
.../tests/expressions/test_numeric_binops.py | 106 ++++++++++++++++++ python/cudf_polars/tests/test_basic.py | 19 ---- 2 files changed, 106 insertions(+), 19 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_numeric_binops.py diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py new file mode 100644 index 00000000000..548aebf0875 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + +dtypes = [ + pl.Int8, + pl.Int16, + pl.Int64, + pl.UInt8, + pl.UInt64, + pl.Float32, + pl.Float64, +] + + +@pytest.fixture(params=dtypes) +def ltype(request): + return request.param + + +@pytest.fixture(params=dtypes) +def rtype(request): + return request.param + + +@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) +def with_nulls(request): + return request.param + + +@pytest.fixture( + params=[ + pl.Expr.eq, + pl.Expr.eq_missing, + pl.Expr.ne, + pl.Expr.ne_missing, + pl.Expr.lt, + pl.Expr.le, + pl.Expr.gt, + pl.Expr.ge, + pl.Expr.add, + pl.Expr.sub, + pl.Expr.mul, + pl.Expr.truediv, + pl.Expr.floordiv, + pl.Expr.mod, + ], + ids=lambda fn: fn.__name__, +) +def binop(request): + return request.param + + +@pytest.fixture +def df(request, ltype, rtype, with_nulls, binop): + a = [1, 2, 3, 5, 8] + if with_nulls: + a[2] = None + a[-1] = None + b = [10, 20, 30, 50, 0] + if with_nulls: + b[1] = None + b[3] = None + b[-1] = None + + lkind = ( + "i" + if ltype.is_signed_integer() + else ("u" if ltype.is_unsigned_integer() else "f") + ) + rkind = ( + "i" + if rtype.is_signed_integer() + else ("u" if rtype.is_unsigned_integer() else "f") + ) + if ( + not with_nulls + and binop.__name__ in {"floordiv", "mod"} + # This catches the case where the result is not promoted to float. 
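+        # i.e. both operands are integral with the same signedness, or
+        # signed and unsigned are mixed without UInt64 involved (mixing
+        # in UInt64 promotes the result to a float dtype, where division
+        # by zero behaves the same on CPU and GPU).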
+ and ( + (lkind == rkind and lkind in {"i", "u"}) + or ({lkind, rkind} == {"i", "u"} and pl.UInt64 not in {ltype, rtype}) + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="Polars nullifies division by zero for integral types" + ) + ) + + return pl.LazyFrame({"a": a, "b": b}, schema={"a": ltype, "b": rtype}) + + +def test_numeric_binop(df, binop): + left = pl.col("a") + right = pl.col("b") + + q = df.select(binop(left, right)) + + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 0c7cbc6a6be..606fd9a1c90 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import operator from datetime import datetime import numpy as np @@ -48,24 +47,6 @@ def ldf(df): return df.lazy() -@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) -@pytest.mark.parametrize( - "op", [operator.add, operator.sub, operator.mul, operator.truediv] -) -def test_binaryops(op, dtype): - df = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [1, 2, 3, 4, 5], - } - ).lazy() - - dtype = pl.datatypes.numpy_char_code_to_dtype(dtype) - df = df.with_columns(pl.col("a").cast(dtype)).with_columns(pl.col("b").cast(dtype)) - result = df.with_columns(op(pl.col("a"), pl.col("b"))) - assert_gpu_result_equal(result) - - @pytest.mark.xfail(reason="Rolling window not yet implemented") def test_rolling(ldf_datetime): out = ldf_datetime.rolling(index_column="dt", period="2d").agg( From 246ff6af7447333da9243fe2d5da59419bfe9bcf Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 15:06:20 +0000 Subject: [PATCH 46/56] Migrate basic gather test --- .../tests/expressions/test_gather.py | 19 +++++++++++++++++++ python/cudf_polars/tests/test_basic.py | 5 ----- 2 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_gather.py diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py new file mode 100644 index 00000000000..df33e19a0b6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_gather(): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [0, 3, 1, 5, 6, 1, 0], + } + ) + + query = ldf.select(pl.col("a").gather(pl.col("b"))) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py index 606fd9a1c90..5484e9a5277 100644 --- a/python/cudf_polars/tests/test_basic.py +++ b/python/cudf_polars/tests/test_basic.py @@ -100,8 +100,3 @@ def null_data(): def test_drop_nulls(null_data): result = null_data.drop_nulls() assert_gpu_result_equal(result) - - -def test_gather_expr(ldf): - out = ldf.select(pl.col("int_key1").gather(pl.col("int_key2"))) - assert_gpu_result_equal(out) From 26c5994c847a1f2f0a5253f41002fea10926bc33 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 24 May 2024 15:06:41 +0000 Subject: [PATCH 47/56] Basic tests now covered elsewhere, or unimplemented functionality --- python/cudf_polars/tests/test_basic.py | 102 ------------------------- 1 file changed, 102 deletions(-) delete mode 100644 python/cudf_polars/tests/test_basic.py diff --git a/python/cudf_polars/tests/test_basic.py b/python/cudf_polars/tests/test_basic.py deleted file mode 100644 index 5484e9a5277..00000000000 --- a/python/cudf_polars/tests/test_basic.py +++ /dev/null @@ -1,102 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -from datetime import datetime - -import numpy as np -import pytest - -import polars as pl - -from cudf_polars.testing.asserts import assert_gpu_result_equal - - -@pytest.fixture -def ldf_datetime(): - dates = [ - "2020-01-01 13:45:48", - "2020-01-01 16:42:13", - "2020-01-01 16:45:09", - "2020-01-02 18:12:48", - "2020-01-03 19:45:32", - "2020-01-08 23:16:43", - ] - return ( - pl.DataFrame({"dt": dates, "a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 1, 2, 2, 2]}) - .with_columns(pl.col("dt").str.strptime(pl.Datetime).set_sorted()) - .lazy() - ) - - -@pytest.fixture -def df(): - return pl.DataFrame( - { - "int_key1": np.repeat(np.arange(10), 10), - "int_key2": np.tile(np.arange(10), 10), - "str_key1": np.repeat(list("ABCDEFGHIJ"), 10), - "int_val": np.random.randint(100, size=100), - "float_val": np.random.rand(100), - } - ) - - -@pytest.fixture -def ldf(df): - return df.lazy() - - -@pytest.mark.xfail(reason="Rolling window not yet implemented") -def test_rolling(ldf_datetime): - out = ldf_datetime.rolling(index_column="dt", period="2d").agg( - [ - pl.sum("a").alias("sum_a"), - pl.min("a").alias("min_a"), - pl.max("a").alias("max_a"), - ] - ) - assert_gpu_result_equal(out) - - -@pytest.mark.xfail(reason="Grouped rolling window not yet implemented") -def test_groupby_rolling(ldf_datetime): - out = ldf_datetime.rolling(index_column="dt", period="2d", group_by="b").agg( - [ - pl.sum("a").alias("sum_a"), - pl.min("a").alias("min_a"), - pl.max("a").alias("max_a"), - ] - ) - assert_gpu_result_equal(out) - - -@pytest.mark.xfail(reason="Rolling expression not yet implemented") -def test_rolling_expression(ldf_datetime): - out = ldf_datetime.with_columns( - sum_a=pl.sum("a").rolling(index_column="dt", period="2d"), - min_a=pl.min("a").rolling(index_column="dt", period="2d"), - max_a=pl.max("a").rolling(index_column="dt", period="2d"), - ) - assert_gpu_result_equal(out) - - -def test_datetime_comparison(ldf_datetime): 
- out = ldf_datetime.filter( - pl.col("dt") > datetime.fromisoformat("2020-01-01 16:45:09") - ) - assert_gpu_result_equal(out) - - -@pytest.fixture -def null_data(): - return pl.DataFrame( - { - "a": [1, 2, None, 4, None], - } - ).lazy() - - -def test_drop_nulls(null_data): - result = null_data.drop_nulls() - assert_gpu_result_equal(result) From 00628b02026292200bca82e630e5c2d24f917804 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 28 May 2024 15:12:33 +0000 Subject: [PATCH 48/56] Update join for new names --- python/cudf_polars/cudf_polars/dsl/expr.py | 4 ++-- python/cudf_polars/cudf_polars/dsl/ir.py | 16 ++++++++++----- .../cudf_polars/cudf_polars/dsl/translate.py | 1 + python/cudf_polars/tests/test_join.py | 20 +++++++------------ 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index df8260e4627..92b26518f5b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -367,7 +367,7 @@ class BooleanFunction(Expr): __slots__ = ("name", "options", "children") _non_child = ("dtype", "name", "options") - def __init__(self, dtype: plc.DataType, name: str, options: Any, *children: Expr): + def __init__(self, dtype: plc.DataType, name: str, options: tuple, *children: Expr): super().__init__(dtype) self.options = options self.name = name @@ -561,7 +561,7 @@ def __init__( self, dtype: plc.DataType, name: pl_expr.StringFunction, - options: Any, + options: tuple, *children: Expr, ): super().__init__(dtype) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 7f26bc892ec..c4dd2efac71 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -392,7 +392,13 @@ class Join(IR): """List of expressions used as keys in the left frame.""" right_on: list[expr.Expr] """List of expressions used as keys in the right frame.""" - options: Any + options: tuple[ + Literal["inner", "left", "full", "leftsemi", "leftanti"], + bool, + tuple[int, int] | None, + str | None, + bool, + ] """ tuple of options: - how: join type @@ -410,7 +416,7 @@ def __post_init__(self): @cache @staticmethod def _joiners( - how: Literal["inner", "left", "outer", "leftsemi", "leftanti"], + how: Literal["inner", "left", "full", "leftsemi", "leftanti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -426,7 +432,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "outer": + elif how == "full": return ( plc.join.full_join, plc.copying.OutOfBoundsPolicy.NULLIFY, @@ -471,7 +477,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: lg, rg = join_fn(left_on.table, right_on.table, null_equality) left = left.replace_columns(*left_on.columns) right = right.replace_columns(*right_on.columns) - if coalesce and how != "outer": + if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( plc.copying.gather(left.table, lg, left_policy), left.column_names @@ -479,7 +485,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: right = DataFrame.from_table( plc.copying.gather(right.table, rg, right_policy), right.column_names ) - if coalesce and how == "outer": + if coalesce and how != "inner": left = left.replace_columns( *( Column( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py 
b/python/cudf_polars/cudf_polars/dsl/translate.py index f90a08e3b53..187fbce20dd 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -249,6 +249,7 @@ def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr: @_translate_expr.register def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr: name, *options = node.function_data + options = tuple(options) if isinstance(name, pl_expr.StringFunction): return expr.StringFunction( dtype, diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 9ba513023da..f4a4704f3cc 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -14,16 +14,16 @@ [ "inner", "left", - "outer", "semi", "anti", pytest.param( "cross", marks=pytest.mark.xfail(reason="cross join not implemented"), ), - "outer_coalesce", + "full", ], ) +@pytest.mark.parametrize("coalesce", [False, True]) @pytest.mark.parametrize( "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"] ) @@ -32,19 +32,11 @@ [ pl.col("a"), pl.col("a") * 2, - [pl.col("a"), pl.col("a") + 1], + [pl.col("a"), pl.col("c") + 1], ["c", "a"], ], ) -def test_join(request, how, join_nulls, join_expr): - request.applymarker( - pytest.mark.xfail( - how == "outer_coalesce" - and isinstance(join_expr, list) - and not isinstance(join_expr[0], str), - reason="https://github.com/pola-rs/polars/issues/16289", - ) - ) +def test_join(how, coalesce, join_nulls, join_expr): left = pl.DataFrame( { "a": [1, 2, 3, 1, None], @@ -59,5 +51,7 @@ def test_join(request, how, join_nulls, join_expr): } ).lazy() - query = left.join(right, on=join_expr, how=how, join_nulls=join_nulls) + query = left.join( + right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce + ) assert_gpu_result_equal(query, check_row_order=False) From 47df8e27138a93a92e292a8c20bf3be73adb4b6f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 29 May 2024 15:52:43 +0000 Subject: [PATCH 49/56] Dataframe copy --- python/cudf_polars/cudf_polars/containers/dataframe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index aa2f412f694..c595ea93673 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -43,6 +43,10 @@ def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None else: self.table = None + def copy(self) -> Self: + """Return a shallow copy of self.""" + return type(self)(self.columns, self.scalars) + def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" assert len(self.scalars) == 0 From 215732372cb5ca968427880362dafb05cb7a9fb4 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 29 May 2024 15:54:13 +0000 Subject: [PATCH 50/56] Fix handling of CSE in Select and HStack --- python/cudf_polars/cudf_polars/dsl/ir.py | 9 ++++++- .../cudf_polars/cudf_polars/dsl/translate.py | 27 ++++++++++++++----- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index c4dd2efac71..d6c8d15a0d2 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -516,13 +516,20 @@ class HStack(IR): df: IR """Input dataframe.""" + cse: list[expr.Expr] + """ + List of common subexpressions that will 
appear in the selected expressions. + + These must be evaluated before the returned expressions. + """ columns: list[expr.Expr] """List of expressions to produce new columns.""" def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - return df.with_columns([c.evaluate(df) for c in self.columns]) + ctx = df.copy().with_columns([e.evaluate(df) for e in self.cse]) + return df.with_columns([c.evaluate(ctx) for c in self.columns]) @dataclass(slots=True) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 187fbce20dd..2d4f76fccc2 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -90,13 +90,13 @@ def _( @_translate_ir.register def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: - # We translate the expressions (which are executed with - # reference to the input node) with the input node active - # so that dtype resolution works correctly. with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] - exprs = [translate_expr(visitor, n=e) for e in node.expr] + # Special-case carveout in get_dtype for Select means we should + # translate these expressions with the Select node active (even + # though they refer to the input node). + cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] + exprs = [translate_expr(visitor, n=e) for e in node.expr] return ir.Select(schema, inp, cse_exprs, exprs) @@ -131,8 +131,21 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - exprs = [translate_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs) + # Like Select, there is a special-case carveout in get_dtype for + # HStack, so we translate these expressions with HStack Select + # node active. + cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs] + exprs = [translate_expr(visitor, n=e) for e in node.exprs] + return ir.HStack(schema, inp, cse_exprs, exprs) + + +@_translate_ir.register +def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + with set_node(visitor, node.input): + inp = translate_ir(visitor, n=None) + exprs = [translate_expr(visitor, n=e) for e in node.expr] + # Reduce is just a Select where all outputs are a single row. + return ir.Select(schema, inp, [], exprs) @_translate_ir.register From 6d324cbbcb64d7d7270266537ef129fa2c0fdf45 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 10:17:39 +0000 Subject: [PATCH 51/56] Adapt to polars-side changes dtype-determination is now simpler. 
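For reference, the set_node helper that these translation hooks lean on is
a small context manager that makes a node temporarily "active" in the
visitor, so that get_dtype resolves against that node's schema. A sketch
(assuming the visitor exposes get_node/set_node; the real API may differ):

    from contextlib import contextmanager

    @contextmanager
    def set_node(visitor, n):
        """Run a block with node `n` active, restoring the old node after."""
        old = visitor.get_node()
        visitor.set_node(n)
        try:
            yield visitor
        finally:
            visitor.set_node(old)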
--- python/cudf_polars/cudf_polars/dsl/ir.py | 22 +++++++++++++++++-- .../cudf_polars/cudf_polars/dsl/translate.py | 22 ++++++++----------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index d6c8d15a0d2..6da5d937b0c 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -251,8 +251,26 @@ class Select(IR): def evaluate(self, *, cache: dict[int, DataFrame]): """Evaluate and return a dataframe.""" df = self.df.evaluate(cache=cache) - for e in self.cse: - df = df.with_columns([e.evaluate(df)]) + df = df.with_columns([e.evaluate(df) for e in self.cse]) + return DataFrame([e.evaluate(df) for e in self.expr], []) + + +@dataclass(slots=True) +class Reduce(IR): + """ + Produce a new dataframe selecting given expressions from an input. + + This is a special case of :class:`Select` where all outputs are a single row. + """ + + df: IR + """Input dataframe.""" + expr: list[expr.Expr] + """List of expressions to evaluate to form the new dataframe.""" + + def evaluate(self, *, cache: dict[int, DataFrame]): + """Evaluate and return a dataframe.""" + df = self.df.evaluate(cache=cache) return DataFrame([e.evaluate(df) for e in self.expr], []) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 2d4f76fccc2..b3d0edf183f 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -92,9 +92,6 @@ def _( def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - # Special-case carveout in get_dtype for Select means we should - # translate these expressions with the Select node active (even - # though they refer to the input node). cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr] exprs = [translate_expr(visitor, n=e) for e in node.expr] return ir.Select(schema, inp, cse_exprs, exprs) @@ -104,8 +101,8 @@ def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - aggs = [translate_expr(visitor, n=e) for e in node.aggs] - keys = [translate_expr(visitor, n=e) for e in node.keys] + aggs = [translate_expr(visitor, n=e) for e in node.aggs] + keys = [translate_expr(visitor, n=e) for e in node.keys] return ir.GroupBy( schema, inp, @@ -118,6 +115,9 @@ def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir. @_translate_ir.register def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: + # Join key dtypes are dependent on the schema of the left and + # right inputs, so these must be translated with the relevant + # input active. with set_node(visitor, node.input_left): inp_left = translate_ir(visitor, n=None) left_on = [translate_expr(visitor, n=e) for e in node.left_on] @@ -131,9 +131,6 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - # Like Select, there is a special-case carveout in get_dtype for - # HStack, so we translate these expressions with HStack Select - # node active. 
cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs] exprs = [translate_expr(visitor, n=e) for e in node.exprs] return ir.HStack(schema, inp, cse_exprs, exprs) @@ -143,9 +140,8 @@ def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - exprs = [translate_expr(visitor, n=e) for e in node.expr] - # Reduce is just a Select where all outputs are a single row. - return ir.Select(schema, inp, [], exprs) + exprs = [translate_expr(visitor, n=e) for e in node.expr] + return ir.Reduce(schema, inp, exprs) @_translate_ir.register @@ -161,7 +157,7 @@ def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - by = [translate_expr(visitor, n=e) for e in node.by_column] + by = [translate_expr(visitor, n=e) for e in node.by_column] return ir.Sort(schema, inp, by, node.sort_options, node.slice) @@ -174,7 +170,7 @@ def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR: with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) - mask = translate_expr(visitor, n=node.predicate) + mask = translate_expr(visitor, n=node.predicate) return ir.Filter(schema, inp, mask) From 786730a3754bbfd8d5e4d581d63f33e24ec181bd Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 10:18:04 +0000 Subject: [PATCH 52/56] A few more tests --- python/cudf_polars/tests/test_hstack.py | 13 +++++++++ python/cudf_polars/tests/test_select.py | 38 +++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 python/cudf_polars/tests/test_select.py diff --git a/python/cudf_polars/tests/test_hstack.py b/python/cudf_polars/tests/test_hstack.py index 731c036bc88..b8c97f4607f 100644 --- a/python/cudf_polars/tests/test_hstack.py +++ b/python/cudf_polars/tests/test_hstack.py @@ -17,3 +17,16 @@ def test_hstack(): query = ldf.with_columns(pl.col("a") + pl.col("b")) assert_gpu_result_equal(query) + + +def test_hstack_with_cse(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + expr = pl.col("a") + pl.col("b") + query = ldf.with_columns(expr.alias("c"), expr.alias("d") * 2) + assert_gpu_result_equal(query) diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py new file mode 100644 index 00000000000..503edef152e --- /dev/null +++ b/python/cudf_polars/tests/test_select.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_select(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ldf.select( + pl.col("a") + pl.col("b"), (pl.col("a") * 2 + pl.col("b")).alias("d") + ) + + assert_gpu_result_equal(query) + + +def test_select_reduce(): + ldf = pl.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ).lazy() + + query = ldf.select( + (pl.col("a") + pl.col("b")).max(), + (pl.col("a") * 2 + pl.col("b")).alias("d").mean(), + ) + + assert_gpu_result_equal(query) From 2773b0bc1dd32e947ffd9858668223b1f9e56bc7 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 10:46:05 +0000 Subject: [PATCH 53/56] Update for rapids-build-backend --- dependencies.yaml | 2 +- python/cudf_polars/pyproject.toml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 8bfa3190b3d..38ec30a8033 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -603,7 +603,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=0.20.24 + - polars>=0.20.30 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 4b64ec62830..49ecd7080b9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cudf==24.8.*,>=0.0.0a0", - "polars>=0.20.24", + "polars>=0.20.30", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -181,3 +181,5 @@ docstring-code-format = true build-backend = "setuptools.build_meta" commit-file = "cudf_polars/GIT_COMMIT" dependencies-file = "../../dependencies.yaml" +# Pure python +disable-cuda = true From 62f6455651739ada52faf82b3f52fff9a7f6e307 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 10:48:04 +0000 Subject: [PATCH 54/56] Rename with_sorted to sorted_like --- python/cudf_polars/cudf_polars/containers/column.py | 6 +++--- python/cudf_polars/cudf_polars/containers/dataframe.py | 10 ++++++---- python/cudf_polars/cudf_polars/dsl/expr.py | 6 +++--- python/cudf_polars/cudf_polars/dsl/ir.py | 8 ++++---- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 9ca5b7f0310..a139927acab 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -34,9 +34,9 @@ def __init__(self, column: plc.Column, name: str): def rename(self, name: str) -> Column: """Return a new column sharing data with a new name.""" - return type(self)(self.obj, name).with_sorted(like=self) + return type(self)(self.obj, name).sorted_like(self) - def with_sorted(self, *, like: Column) -> Self: + def sorted_like(self, like: Column, /) -> Self: """Copy sortedness properties from a column onto self.""" return self.set_sorted( is_sorted=like.is_sorted, order=like.order, null_order=like.null_order @@ -72,7 +72,7 @@ def set_sorted( def copy(self) -> Self: """Return a shallow copy of the column.""" - return type(self)(self.obj, self.name).with_sorted(like=self) + return type(self)(self.obj, self.name).sorted_like(self) def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index c595ea93673..0762724d555 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -95,13 +95,15 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: raise ValueError("Mismatching name and table length.") return cls([Column(c, name) for c, name in zip(table.columns(), names)], []) - def with_sorted(self, *, like: DataFrame, subset: Set[str] | None = None) -> Self: + def sorted_like( + self, like: DataFrame, /, *, subset: Set[str] | None = None + ) -> Self: """Copy sortedness from a dataframe onto self.""" if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset self.columns = [ - c.with_sorted(like=other) if c.name in subset else c + c.sorted_like(other) if c.name in subset else c for c, other in zip(self.columns, like.columns) ] return self @@ -147,7 +149,7 @@ def select_columns(self, names: Set[str]) -> list[Column]: def filter(self, mask: Column) -> Self: """Return a filtered table given a mask.""" table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj) - return type(self).from_table(table, self.column_names).with_sorted(like=self) + return type(self).from_table(table, self.column_names).sorted_like(self) def slice(self, zlice: tuple[int, int] | None) -> Self: """ @@ -172,4 +174,4 @@ def slice(self, zlice: tuple[int, int] | None) -> Self: # to the end of the frame if it is larger. 
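        # Worked example: with num_rows == 5, zlice == (-2, 5) gives
        # start == 5 - 2 == 3 and end == min(3 + 5, 5) == 5, i.e. just
        # the final two rows of the frame.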
end = min(start + length, self.num_rows) (table,) = plc.copying.slice(self.table, [start, end]) - return type(self).from_table(table, self.column_names).with_sorted(like=self) + return type(self).from_table(table, self.column_names).sorted_like(self) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 92b26518f5b..d96a6464404 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -740,7 +740,7 @@ def do_evaluate( table = plc.stream_compaction.apply_boolean_mask( plc.Table([values.obj]), mask.obj ) - return Column(table.columns()[0], values.name).with_sorted(like=values) + return Column(table.columns()[0], values.name).sorted_like(values) class RollingWindow(Expr): @@ -781,8 +781,8 @@ def do_evaluate( """Evaluate this expression given a dataframe for context.""" (child,) = self.children column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype), column.name).with_sorted( - like=column + return Column(plc.unary.cast(column.obj, self.dtype), column.name).sorted_like( + column ) def collect_agg(self, *, depth: int) -> AggInfo: diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 6da5d937b0c..d630b40f600 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -613,7 +613,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], [] ) if keys_sorted or self.stable: - result = result.with_sorted(like=df) + result = result.sorted_like(df) return result.slice(self.zlice) @@ -787,7 +787,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: [plc.types.NullOrder.BEFORE], ), first.column_names, - ).with_sorted(like=first, subset={key_column}) + ).sorted_like(first, subset={key_column}) elif self.name == "rechunk": # No-op in our data model return self.df.evaluate(cache=cache) @@ -799,7 +799,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: return DataFrame.from_table( plc.stream_compaction.drop_nulls(df.table, indices, len(indices)), df.column_names, - ).with_sorted(like=df) + ).sorted_like(df) elif self.name == "rename": df = self.df.evaluate(cache=cache) # final tag is "swapping" which is useful for the @@ -813,7 +813,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame: subset = df.column_names_set - {to_explode} return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names - ).with_sorted(like=df, subset=subset) + ).sorted_like(df, subset=subset) else: raise AssertionError("Should never be reached") From a1f579f3a14f7b31c6984d101c9b694e4f93d077 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 11:20:01 +0000 Subject: [PATCH 55/56] Column.copy takes an optional new_name argument This removes the need for rename. 
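A minimal usage sketch of the new spelling (illustrative only, not part
of the diff below; `col` stands for any existing Column instance):

    renamed = col.copy(new_name="b")  # shallow copy sharing data, renamed
    copied = col.copy()               # shallow copy keeping the old name

Both forms delegate to sorted_like, so sortedness metadata is carried
over to the new column, just as rename previously did.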
--- .../cudf_polars/containers/column.py | 40 +++++++++++++++---- .../cudf_polars/containers/dataframe.py | 2 +- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index a139927acab..49034b5f5c8 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -32,12 +32,23 @@ def __init__(self, column: plc.Column, name: str): self.order = plc.types.Order.ASCENDING self.null_order = plc.types.NullOrder.BEFORE - def rename(self, name: str) -> Column: - """Return a new column sharing data with a new name.""" - return type(self)(self.obj, name).sorted_like(self) - def sorted_like(self, like: Column, /) -> Self: - """Copy sortedness properties from a column onto self.""" + """ + Copy sortedness properties from a column onto self. + + Parameters + ---------- + like + The column to copy sortedness metadata from. + + Returns + ------- + Self with metadata set. + + See Also + -------- + set_sorted + """ return self.set_sorted( is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) @@ -70,9 +81,22 @@ def set_sorted( self.null_order = null_order return self - def copy(self) -> Self: - """Return a shallow copy of the column.""" - return type(self)(self.obj, self.name).sorted_like(self) + def copy(self, *, new_name: str | None = None) -> Self: + """ + Return a shallow copy of the column. + + Parameters + ---------- + new_name + Optional new name for the copied column. + + Returns + ------- + New column sharing data with self. + """ + return type(self)( + self.obj, self.name if new_name is None else new_name + ).sorted_like(self) def mask_nans(self) -> Self: """Return a copy of self with nans masked out.""" diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 0762724d555..2ed4298e993 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -139,7 +139,7 @@ def replace_columns(self, *columns: Column) -> Self: def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" return type(self)( - [c.rename(mapping.get(c.name, c.name)) for c in self.columns], self.scalars + [c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars ) def select_columns(self, names: Set[str]) -> list[Column]: From 1240b629a70b32ba06d8bf86ac71a0806bee99fe Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 May 2024 11:20:25 +0000 Subject: [PATCH 56/56] Expand docstrings --- .../cudf_polars/containers/dataframe.py | 52 +++++++++++++++++-- python/cudf_polars/cudf_polars/dsl/expr.py | 13 ++++- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 2ed4298e993..de21a280020 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -89,7 +89,25 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: @classmethod def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: - """Create from a pylibcudf table.""" + """ + Create from a pylibcudf table. + + Parameters + ---------- + table + Pylibcudf table to obtain columns from + names + Names for the columns + + Returns + ------- + New dataframe sharing data with the input table. 
+ + Raises + ------ + ValueError if the number of provided names does not match the + number of columns in the table. + """ # TODO: strict=True when we drop py39 if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") @@ -98,7 +116,24 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: - """Copy sortedness from a dataframe onto self.""" + """ + Copy sortedness from a dataframe onto self. + + Parameters + ---------- + like + The dataframe to copy from + subset + Optional subset of columns from which to copy data. + + Returns + ------- + Self with metadata set. + + Raises + ------ + ValueError if there is a name mismatch between self and like. + """ if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset @@ -112,7 +147,18 @@ def with_columns(self, columns: Sequence[Column]) -> Self: """ Return a new dataframe with extra columns. - Data is shared. + Parameters + ---------- + columns + Columns to add + + Returns + ------- + New dataframe + + Notes + ----- + If column names overlap, newer names replace older ones. """ return type(self)([*self.columns, *columns], self.scalars) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index d96a6464404..249cc3775f7 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -183,12 +183,21 @@ def do_evaluate( Notes ----- Do not call this function directly, but rather - :func:`evaluate` which handles the mapping lookups. + :meth:`evaluate` which handles the mapping lookups. + + The typed return value of :class:`Column` is not true when + evaluating :class:`Literal` nodes (which instead produce + :class:`Scalar` objects). However, these duck-type to having a + pylibcudf container object inside them, and usually they end + up appearing in binary expressions which pylibcudf handles + appropriately since there are overloads for (column, scalar) + pairs. We don't have to handle (scalar, scalar) in binops + since the polars optimizer has a constant-folding pass. Returns ------- Column representing the evaluation of the expression (or maybe - a scalar, annoying!). + a scalar). Raises ------