Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: parse strings as columns names at narwhals-level #1856

Merged
merged 28 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from itertools import chain
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Iterator
from typing import Literal
from typing import Sequence
Expand All @@ -21,7 +20,6 @@
from narwhals.dependencies import is_numpy_array
from narwhals.utils import Implementation
from narwhals.utils import check_column_exists
from narwhals.utils import flatten
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
Expand Down Expand Up @@ -292,6 +290,9 @@ def estimated_size(self: Self, unit: SizeUnit) -> int | float:
def columns(self: Self) -> list[str]:
return self._native_frame.schema.names # type: ignore[no-any-return]

def simple_select(self, *column_names: str) -> Self:
return self._from_native_frame(self._native_frame.select(list(column_names)))

def select(self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr) -> Self:
new_series = evaluate_into_exprs(self, *exprs, **named_exprs)
if not new_series:
Expand Down Expand Up @@ -405,21 +406,19 @@ def drop_nulls(self: Self, subset: list[str] | None) -> Self:

def sort(
self: Self,
by: str | Iterable[str],
*more_by: str,
*by: str,
descending: bool | Sequence[bool],
nulls_last: bool,
) -> Self:
flat_keys = flatten([*flatten([by]), *more_by])
df = self._native_frame

if isinstance(descending, bool):
order = "descending" if descending else "ascending"
sorting = [(key, order) for key in flat_keys]
sorting = [(key, order) for key in by]
else:
sorting = [
(key, "descending" if is_descending else "ascending")
for key, is_descending in zip(flat_keys, descending)
for key, is_descending in zip(by, descending)
]

null_placement = "at_end" if nulls_last else "at_start"
Expand Down Expand Up @@ -663,7 +662,7 @@ def unique(

return self._from_native_frame(pc.take(df, keep_idx))

keep_idx = self.select(*subset).is_unique()
keep_idx = self.simple_select(*subset).is_unique()
return self.filter(keep_idx)

def gather_every(self: Self, n: int, offset: int) -> Self:
Expand Down
2 changes: 1 addition & 1 deletion narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
raise AnonymousExprError.from_expr_name(msg)

tmp = df.group_by(*keys, drop_null_keys=False).agg(self)
tmp = df.select(*keys).join(
tmp = df.simple_select(*keys).join(
tmp, how="left", left_on=keys, right_on=keys, suffix="_right"
)
return [tmp[name] for name in self._output_names]
Expand Down
16 changes: 3 additions & 13 deletions narwhals/_arrow/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import pyarrow.compute as pc

from narwhals._expression_parsing import is_simple_aggregation
from narwhals._expression_parsing import parse_into_exprs
from narwhals.exceptions import AnonymousExprError
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import remove_prefix
Expand All @@ -20,8 +19,8 @@
from typing_extensions import Self

from narwhals._arrow.dataframe import ArrowDataFrame
from narwhals._arrow.expr import ArrowExpr
from narwhals._arrow.series import ArrowSeries
from narwhals._arrow.typing import IntoArrowExpr
from narwhals.typing import CompliantExpr

POLARS_TO_ARROW_AGGREGATIONS = {
Expand Down Expand Up @@ -49,16 +48,7 @@ def __init__(
self._keys = list(keys)
self._grouped = pa.TableGroupBy(self._df._native_frame, list(self._keys))

def agg(
self: Self,
*aggs: IntoArrowExpr,
**named_aggs: IntoArrowExpr,
) -> ArrowDataFrame:
exprs = parse_into_exprs(
*aggs,
namespace=self._df.__narwhals_namespace__(),
**named_aggs,
)
def agg(self: Self, *exprs: ArrowExpr) -> ArrowDataFrame:
for expr in exprs:
if expr._output_names is None:
msg = "group_by.agg"
Expand Down Expand Up @@ -93,7 +83,7 @@ def __iter__(self: Self) -> Iterator[tuple[Any, ArrowDataFrame]]:
table.filter(pc.equal(table[col_token], v)).drop([col_token])
)
)
.select(*self._keys)
.simple_select(*self._keys)
.head(1)
.iter_rows(named=False, buffer_size=512)
),
Expand Down
9 changes: 2 additions & 7 deletions narwhals/_arrow/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,15 +361,11 @@ def when(

def concat_str(
self: Self,
exprs: Iterable[IntoArrowExpr],
*more_exprs: IntoArrowExpr,
*exprs: IntoArrowExpr,
separator: str,
ignore_nulls: bool,
) -> ArrowExpr:
parsed_exprs = [
*parse_into_exprs(*exprs, namespace=self),
*parse_into_exprs(*more_exprs, namespace=self),
]
parsed_exprs = parse_into_exprs(*exprs, namespace=self)
dtypes = import_dtypes_module(self._version)

def func(df: ArrowDataFrame) -> list[ArrowSeries]:
Expand Down Expand Up @@ -399,7 +395,6 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
output_names=reduce_output_names(parsed_exprs),
kwargs={
"exprs": exprs,
"more_exprs": more_exprs,
"separator": separator,
"ignore_nulls": ignore_nulls,
},
Expand Down
2 changes: 1 addition & 1 deletion narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ def to_dummies(self: Self, *, separator: str, drop_first: bool) -> ArrowDataFram
pa.Table.from_arrays(columns, names=cols),
backend_version=self._backend_version,
version=self._version,
).select(*output_order)
).simple_select(*output_order)

def quantile(
self: Self,
Expand Down
2 changes: 1 addition & 1 deletion narwhals/_arrow/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@
from narwhals._arrow.expr import ArrowExpr
from narwhals._arrow.series import ArrowSeries

IntoArrowExpr: TypeAlias = Union[ArrowExpr, str, ArrowSeries]
IntoArrowExpr: TypeAlias = Union[ArrowExpr, ArrowSeries]
32 changes: 11 additions & 21 deletions narwhals/_dask/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from itertools import chain
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Literal
from typing import Sequence

Expand All @@ -17,7 +16,6 @@
from narwhals.typing import CompliantLazyFrame
from narwhals.utils import Implementation
from narwhals.utils import check_column_exists
from narwhals.utils import flatten
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import parse_version
Expand All @@ -31,7 +29,6 @@
from narwhals._dask.expr import DaskExpr
from narwhals._dask.group_by import DaskLazyGroupBy
from narwhals._dask.namespace import DaskNamespace
from narwhals._dask.typing import IntoDaskExpr
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
from narwhals.dtypes import DType
from narwhals.utils import Version
Expand Down Expand Up @@ -107,22 +104,17 @@ def filter(self: Self, *predicates: DaskExpr, **constraints: Any) -> Self:

return self._from_native_frame(self._native_frame.loc[mask])

def select(
self: Self,
*exprs: IntoDaskExpr,
**named_exprs: IntoDaskExpr,
) -> Self:
if exprs and all(isinstance(x, str) for x in exprs) and not named_exprs:
# This is a simple slice => fastpath!
return self._from_native_frame(
select_columns_by_name(
self._native_frame,
list(exprs), # type: ignore[arg-type]
self._backend_version,
self._implementation,
)
def simple_select(self: Self, *column_names: str) -> Self:
return self._from_native_frame(
select_columns_by_name(
self._native_frame,
list(column_names),
self._backend_version,
self._implementation,
)
)

def select(self: Self, *exprs: DaskExpr, **named_exprs: DaskExpr) -> Self:
new_series = parse_exprs_and_named_exprs(self, *exprs, **named_exprs)

if not new_series:
Expand Down Expand Up @@ -211,20 +203,18 @@ def unique(

def sort(
self: Self,
by: str | Iterable[str],
*more_by: str,
*by: str,
descending: bool | Sequence[bool],
nulls_last: bool,
) -> Self:
flat_keys = flatten([*flatten([by]), *more_by])
df = self._native_frame
if isinstance(descending, bool):
ascending: bool | list[bool] = not descending
else:
ascending = [not d for d in descending]
na_position = "last" if nulls_last else "first"
return self._from_native_frame(
df.sort_values(flat_keys, ascending=ascending, na_position=na_position)
df.sort_values(list(by), ascending=ascending, na_position=na_position)
)

def join(
Expand Down
2 changes: 1 addition & 1 deletion narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ def func(df: DaskLazyFrame) -> list[Any]:
if df._native_frame.npartitions == 1: # pragma: no cover
tmp = df.group_by(*keys, drop_null_keys=False).agg(self)
tmp_native = (
df.select(*keys)
df.simple_select(*keys)
.join(tmp, how="left", left_on=keys, right_on=keys, suffix="_right")
._native_frame
)
Expand Down
13 changes: 3 additions & 10 deletions narwhals/_dask/group_by.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import dask.dataframe as dd

from narwhals._expression_parsing import is_simple_aggregation
from narwhals._expression_parsing import parse_into_exprs
from narwhals.exceptions import AnonymousExprError
from narwhals.utils import remove_prefix

Expand All @@ -23,7 +22,7 @@
from typing_extensions import Self

from narwhals._dask.dataframe import DaskLazyFrame
from narwhals._dask.typing import IntoDaskExpr
from narwhals._dask.expr import DaskExpr
from narwhals.typing import CompliantExpr


Expand Down Expand Up @@ -89,14 +88,8 @@ def __init__(

def agg(
self: Self,
*aggs: IntoDaskExpr,
**named_aggs: IntoDaskExpr,
*exprs: DaskExpr,
) -> DaskLazyFrame:
exprs = parse_into_exprs(
*aggs,
namespace=self._df.__narwhals_namespace__(),
**named_aggs,
)
output_names: list[str] = copy(self._keys)
for expr in exprs:
if expr._output_names is None:
Expand Down Expand Up @@ -135,7 +128,7 @@ def agg_dask(
"""
if not exprs:
# No aggregation provided
return df.select(*keys).unique(subset=keys)
return df.simple_select(*keys).unique(subset=keys)

all_simple_aggs = True
for expr in exprs:
Expand Down
Loading
Loading