diff --git a/README.md b/README.md index bdd3a3cfb..82c4d1da5 100644 --- a/README.md +++ b/README.md @@ -44,41 +44,40 @@ There are three steps to writing dataframe-agnostic code using Narwhals: Here's an example of a dataframe agnostic function: ```python -from typing import TypeVar +from typing import Any import pandas as pd import polars as pl -from narwhals import translate_frame, get_namespace, to_native - -AnyDataFrame = TypeVar("AnyDataFrame") +import narwhals as nw def my_agnostic_function( - suppliers_native: AnyDataFrame, - parts_native: AnyDataFrame, -) -> AnyDataFrame: - suppliers = translate_frame(suppliers_native) - parts = translate_frame(parts_native) - pl = get_namespace(suppliers) + suppliers_native, + parts_native, +): + suppliers = nw.DataFrame(suppliers_native) + parts = nw.DataFrame(parts_native) result = ( suppliers.join(parts, left_on="city", right_on="city") .filter( - pl.col("color").is_in(["Red", "Green"]), - pl.col("weight") > 14, + nw.col("color").is_in(["Red", "Green"]), + nw.col("weight") > 14, ) .group_by("s", "p") .agg( - weight_mean=pl.col("weight").mean(), - weight_max=pl.col("weight").max(), + weight_mean=nw.col("weight").mean(), + weight_max=nw.col("weight").max(), ) - ) - return to_native(result) + ).with_columns(nw.col("weight_max").cast(nw.Int64)) + return nw.to_native(result) + ``` You can pass in a pandas or Polars dataframe, the output will be the same! 
def func(df_raw: Any) -> Any:
    """Run the demo pipeline on a native dataframe and return a native result.

    The input is wrapped in a Narwhals ``DataFrame``, two derived columns are
    added (``d = a + 1`` and ``e = a + b``), the frame is grouped by ``a`` and
    the sums of ``b`` and ``c`` are computed per group (the latter is named
    ``d``). The aggregated frame is converted back with ``nw.to_native``.
    """
    frame = nw.DataFrame(df_raw)
    enriched = frame.with_columns(
        d=nw.col("a") + 1,
        e=nw.col("a") + nw.col("b"),
    )
    grouped = enriched.group_by(["a"])
    # NOTE: an `e=nw.len()` aggregation is present but commented out in the demo.
    totals = grouped.agg(
        nw.col("b").sum(),
        d=nw.col("c").sum(),
    )
    return nw.to_native(totals)
def my_agnostic_function(
    suppliers_native,
    parts_native,
):
    """Summarise part weights per supplier/part pair for any supported backend.

    Both inputs are wrapped in Narwhals ``DataFrame`` objects and joined on
    ``city``. Rows whose ``color`` is ``"Red"`` or ``"Green"`` and whose
    ``weight`` exceeds 14 are kept, then grouped by ``s`` and ``p`` to compute
    the mean and max of ``weight``. ``weight_max`` is cast to ``nw.Int64`` and
    the result is handed back through ``nw.to_native``.
    """
    suppliers = nw.DataFrame(suppliers_native)
    parts = nw.DataFrame(parts_native)

    joined = suppliers.join(parts, left_on="city", right_on="city")
    selected = joined.filter(
        nw.col("color").is_in(["Red", "Green"]),
        nw.col("weight") > 14,
    )
    per_pair = selected.group_by("s", "p").agg(
        weight_mean=nw.col("weight").mean(),
        weight_max=nw.col("weight").max(),
    )
    result = per_pair.with_columns(nw.col("weight_max").cast(nw.Int64))
    return nw.to_native(result)
narwhals.dtypes import * # noqa: F403 +from narwhals.expression import all +from narwhals.expression import col +from narwhals.expression import len +from narwhals.expression import max +from narwhals.expression import mean +from narwhals.expression import min +from narwhals.expression import sum +from narwhals.expression import sum_horizontal +from narwhals.series import Series from narwhals.translate import to_native -from narwhals.translate import translate_any -from narwhals.translate import translate_frame -from narwhals.translate import translate_series __version__ = "0.3.0" __all__ = [ - "translate_frame", - "translate_series", - "translate_any", "is_dataframe", "is_series", "is_polars", "is_pandas", "get_implementation", - "get_namespace", "to_native", + "all", + "col", + "len", + "min", + "max", + "mean", + "sum", + "sum_horizontal", + "DataFrame", + "Series", ] diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py new file mode 100644 index 000000000..001b96c79 --- /dev/null +++ b/narwhals/dataframe.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import Iterable +from typing import Literal +from typing import Sequence + +from narwhals.dtypes import to_narwhals_dtype +from narwhals.pandas_like.dataframe import PandasDataFrame +from narwhals.translate import get_pandas +from narwhals.translate import get_polars + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals.dtypes import DType + from narwhals.group_by import GroupBy + from narwhals.series import Series + from narwhals.typing import IntoExpr + + +class DataFrame: + def __init__( + self, + df: Any, + *, + is_eager: bool = False, + is_lazy: bool = False, + implementation: str | None = None, + ) -> None: + self._is_eager = is_eager + self._is_lazy = is_lazy + if implementation is not None: + self._dataframe = df + self._implementation = implementation + return + if (pl := get_polars()) is not None 
def _extract_native(self, arg: Any) -> Any:
    """Unwrap a Narwhals object into its backend counterpart (Polars only).

    When this frame's implementation is ``"polars"``:

    - a Narwhals ``DataFrame`` yields its ``_dataframe``,
    - a Narwhals ``Series`` yields its ``_series``,
    - a Narwhals ``Expr`` is evaluated against the ``polars`` module.

    Anything else, and every argument when the implementation is not
    ``"polars"``, is returned unchanged.
    """
    # Imported lazily, as in the rest of this class.
    from narwhals.expression import Expr
    from narwhals.series import Series

    if self._implementation == "polars":
        if isinstance(arg, DataFrame):
            return arg._dataframe
        if isinstance(arg, Series):
            return arg._series
        if isinstance(arg, Expr):
            import polars as pl

            return arg._call(pl)
    return arg
@property
def shape(self) -> tuple[int, int]:
    """Return the ``shape`` of the wrapped dataframe.

    Raises:
        RuntimeError: If this frame was instantiated with ``is_lazy=True``.
    """
    if self._is_lazy:
        # Name the attribute that was actually accessed so the failure can be
        # traced back to `shape` rather than to Series extraction.
        msg = (
            "Can't access `shape` of Narwhals DataFrame if it was "
            "instantiated with `is_lazy=True`"
        )
        raise RuntimeError(msg)
    return self._dataframe.shape  # type: ignore[no-any-return]
class DType:
    """Base class of the Narwhals dtype hierarchy."""

    def __repr__(self) -> str:
        return self.__class__.__qualname__

    @classmethod
    def is_numeric(cls: type[Self]) -> bool:
        """Return True when this dtype derives from ``NumericType``."""
        return issubclass(cls, NumericType)

    def __eq__(self, other: DType | type[DType]) -> bool:  # type: ignore[override]
        # Comparison is delegated to the project helper so that a dtype
        # instance can be compared against either an instance or a class.
        return isinstance_or_issubclass(other, type(self))


class NumericType(DType): ...


class TemporalType(DType): ...


class Int64(NumericType): ...


class Int32(NumericType): ...


class Int16(NumericType): ...


class Int8(NumericType): ...


class UInt64(NumericType): ...


class UInt32(NumericType): ...


class UInt16(NumericType): ...


class UInt8(NumericType): ...


class Float64(NumericType): ...


class Float32(NumericType): ...


class String(DType): ...


class Boolean(DType): ...


class Object(DType):  # todo: do we really want this one?
    ...


class Datetime(TemporalType): ...


class Date(TemporalType): ...


# Dtypes that map one-to-one onto a same-named attribute of a backend
# namespace. The first match wins in both translators below, so the order of
# the original checks is kept. `Int8` sits where a duplicated `UInt8` check
# used to be, which is why `Int8` could never be translated before.
# `Object` is deliberately absent: it has no backend mapping here.
_CONCRETE_DTYPES: tuple[type[DType], ...] = (
    Float64,
    Float32,
    Int64,
    Int32,
    Int16,
    Int8,
    UInt64,
    UInt32,
    UInt16,
    UInt8,
    String,
    Boolean,
    Datetime,
    Date,
)


def translate_dtype(plx: Any, dtype: DType) -> Any:
    """Map a Narwhals dtype onto the same-named attribute of ``plx``.

    Args:
        plx: Backend namespace exposing one attribute per dtype name
            (``plx.Int64``, ``plx.String``, ...).
        dtype: Narwhals dtype (class or instance) to translate.

    Returns:
        The attribute of ``plx`` named after the first matching dtype.

    Raises:
        TypeError: If ``dtype`` matches none of the translatable dtypes.
    """
    for candidate in _CONCRETE_DTYPES:
        if dtype == candidate:
            return getattr(plx, candidate.__name__)
    msg = f"Unknown dtype: {dtype}"
    raise TypeError(msg)


def to_narwhals_dtype(dtype: Any, implementation: str) -> DType:
    """Convert a backend dtype into a Narwhals dtype instance.

    Args:
        dtype: Native dtype object.
        implementation: Backend name. Anything other than ``"polars"`` makes
            this function return ``dtype`` untouched.

    Returns:
        An instance of the matching Narwhals dtype, or ``dtype`` itself for
        non-Polars implementations.

    Raises:
        TypeError: If a Polars dtype has no Narwhals counterpart.
    """
    if implementation != "polars":
        return dtype  # type: ignore[no-any-return]
    import polars as pl

    for candidate in _CONCRETE_DTYPES:
        if dtype == getattr(pl, candidate.__name__):
            return candidate()
    msg = f"Unexpected dtype, got: {type(dtype)}"
    raise TypeError(msg)
__future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import Callable +from typing import Iterable + +from narwhals.dtypes import translate_dtype + +if TYPE_CHECKING: + from narwhals.typing import IntoExpr + + +def extract_native(expr: Expr, other: Any) -> Any: + if isinstance(other, Expr): + return other._call(expr) + return other + + +class Expr: + def __init__(self, call: Callable[[Any], Any]) -> None: + # callable from namespace to expr + self._call = call + + # --- convert --- + def alias(self, name: str) -> Expr: + return self.__class__(lambda plx: self._call(plx).alias(name)) + + def cast( + self, + dtype: Any, + ) -> Expr: + return self.__class__( + lambda plx: self._call(plx).cast(translate_dtype(plx, dtype)), + ) + + # --- binary --- + def __eq__(self, other: object) -> Expr: # type: ignore[override] + return self.__class__( + lambda plx: self._call(plx).__eq__(extract_native(plx, other)) + ) + + def __and__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__and__(extract_native(plx, other)) + ) + + def __or__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__or__(extract_native(plx, other)) + ) + + def __add__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__add__(extract_native(plx, other)) + ) + + def __radd__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__radd__(extract_native(plx, other)) + ) + + def __sub__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__sub__(extract_native(plx, other)) + ) + + def __rsub__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__rsub__(extract_native(plx, other)) + ) + + def __mul__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__mul__(extract_native(plx, other)) + ) + + def __rmul__(self, other: Any) -> Expr: + return 
self.__class__( + lambda plx: self._call(plx).__rmul__(extract_native(plx, other)) + ) + + def __le__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__le__(extract_native(plx, other)) + ) + + def __lt__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__lt__(extract_native(plx, other)) + ) + + def __gt__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__gt__(extract_native(plx, other)) + ) + + def __ge__(self, other: Any) -> Expr: + return self.__class__( + lambda plx: self._call(plx).__ge__(extract_native(plx, other)) + ) + + # --- unary --- + def mean(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).mean()) + + def sum(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).sum()) + + def min(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).min()) + + def max(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).max()) + + def n_unique(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).n_unique()) + + def unique(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).unique()) + + # --- transform --- + def is_between( + self, lower_bound: Any, upper_bound: Any, closed: str = "both" + ) -> Expr: + return self.__class__( + lambda plx: self._call(plx).is_between(lower_bound, upper_bound, closed) + ) + + def is_in(self, other: Any) -> Expr: + return self.__class__(lambda plx: self._call(plx).is_in(other)) + + def is_null(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).is_null()) + + # --- partial reduction --- + def drop_nulls(self) -> Expr: + return self.__class__(lambda plx: self._call(plx).drop_nulls()) + + def sample(self, n: int, fraction: float, *, with_replacement: bool) -> Expr: + return self.__class__( + lambda plx: self._call(plx).sample( + n, fraction=fraction, with_replacement=with_replacement + ) + ) + + +def col(*names: str | Iterable[str]) -> Expr: 
+ return Expr(lambda plx: plx.col(*names)) + + +def all() -> Expr: + return Expr(lambda plx: plx.all()) + + +def len() -> Expr: + return Expr(lambda plx: plx.len()) + + +def sum(*columns: str) -> Expr: + return Expr(lambda plx: plx.sum(*columns)) + + +def mean(*columns: str) -> Expr: + return Expr(lambda plx: plx.mean(*columns)) + + +def min(*columns: str) -> Expr: + return Expr(lambda plx: plx.min(*columns)) + + +def max(*columns: str) -> Expr: + return Expr(lambda plx: plx.max(*columns)) + + +def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + return Expr(lambda plx: plx.sum_horizontal(*exprs)) + + +__all__ = [ + "Expr", +] diff --git a/narwhals/group_by.py b/narwhals/group_by.py new file mode 100644 index 000000000..1c4b70683 --- /dev/null +++ b/narwhals/group_by.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import Iterable + +if TYPE_CHECKING: + from narwhals.dataframe import DataFrame + from narwhals.typing import IntoExpr + + +class GroupBy: + def __init__(self, df: Any, *keys: str | Iterable[str]) -> None: + self._df = df + self._keys = keys + + def agg( + self, *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr + ) -> DataFrame: + aggs, named_aggs = self._df._flatten_and_extract(*aggs, **named_aggs) + return self._df.__class__( # type: ignore[no-any-return] + self._df._dataframe.group_by(*self._keys).agg(*aggs, **named_aggs), + implementation=self._df._implementation, + is_eager=self._df._is_eager, + is_lazy=self._df._is_lazy, + ) diff --git a/narwhals/pandas_like/dataframe.py b/narwhals/pandas_like/dataframe.py index c7deb8c59..0c57bd0e5 100644 --- a/narwhals/pandas_like/dataframe.py +++ b/narwhals/pandas_like/dataframe.py @@ -11,7 +11,6 @@ from narwhals.pandas_like.utils import reset_index from narwhals.pandas_like.utils import translate_dtype from narwhals.pandas_like.utils import validate_dataframe_comparand -from narwhals.spec import DataFrame 
as DataFrameProtocol from narwhals.utils import flatten_str if TYPE_CHECKING: @@ -19,13 +18,13 @@ from typing_extensions import Self - from narwhals.pandas_like.group_by import PdxGroupBy + from narwhals.dtypes import DType + from narwhals.pandas_like.group_by import PandasGroupBy from narwhals.pandas_like.series import PandasSeries - from narwhals.spec import DType - from narwhals.spec import IntoExpr + from narwhals.pandas_like.typing import IntoExpr -class PandasDataFrame(DataFrameProtocol): +class PandasDataFrame: # --- not in the spec --- def __init__( self, @@ -41,20 +40,6 @@ def __init__( self._is_eager = is_eager self._is_lazy = is_lazy - def __repr__(self) -> str: # pragma: no cover - header = " Narwhals LazyFrame " - length = len(header) - return ( - "┌" - + "─" * length - + "┐\n" - + f"|{header}|\n" - + "| Add `.to_native()` to see native output |\n" - + "└" - + "─" * length - + "┘\n" - ) - def _validate_columns(self, columns: Sequence[str]) -> None: counter = collections.Counter(columns) for col, count in counter.items(): @@ -112,7 +97,7 @@ def select( ) -> Self: new_series = evaluate_into_exprs(self, *exprs, **named_exprs) df = horizontal_concat( - [series.series for series in new_series], + [series._series for series in new_series], implementation=self._implementation, ) return self._from_dataframe(df) @@ -137,7 +122,7 @@ def with_columns( ) -> Self: new_series = evaluate_into_exprs(self, *exprs, **named_exprs) df = self._dataframe.assign( - **{series.name: series.series for series in new_series} + **{series.name: validate_dataframe_comparand(series) for series in new_series} ) return self._from_dataframe(df) @@ -177,10 +162,10 @@ def collect(self) -> PandasDataFrame: ) # --- actions --- - def group_by(self, *keys: str | Iterable[str]) -> PdxGroupBy: - from narwhals.pandas_like.group_by import PdxGroupBy + def group_by(self, *keys: str | Iterable[str]) -> PandasGroupBy: + from narwhals.pandas_like.group_by import PandasGroupBy - return PdxGroupBy( + 
return PandasGroupBy( self, flatten_str(*keys), is_eager=self._is_eager, diff --git a/narwhals/pandas_like/dtypes.py b/narwhals/pandas_like/dtypes.py deleted file mode 100644 index fac53db5c..000000000 --- a/narwhals/pandas_like/dtypes.py +++ /dev/null @@ -1,75 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -from narwhals.pandas_like.utils import isinstance_or_issubclass -from narwhals.spec import DType as DTypeProtocol - -if TYPE_CHECKING: - from typing_extensions import Self - - -class DType(DTypeProtocol): - def __repr__(self) -> str: - return self.__class__.__qualname__ - - @classmethod - def is_numeric(cls: type[Self]) -> bool: - return issubclass(cls, NumericType) - - def __eq__(self, other: DType | type[DType]) -> bool: # type: ignore[override] - return isinstance_or_issubclass(other, type(self)) - - -class NumericType(DType): ... - - -class TemporalType(DType): ... - - -class Int64(NumericType): ... - - -class Int32(NumericType): ... - - -class Int16(NumericType): ... - - -class Int8(NumericType): ... - - -class UInt64(NumericType): ... - - -class UInt32(NumericType): ... - - -class UInt16(NumericType): ... - - -class UInt8(NumericType): ... - - -class Float64(NumericType): ... - - -class Float32(NumericType): ... - - -class String(DType): ... - - -class Boolean(DType): ... - - -class Object(DType): ... - - -class Datetime(DType): ... - - -class Date(DType): ... - - -class Duration(DType): ... 
diff --git a/narwhals/pandas_like/expr.py b/narwhals/pandas_like/expr.py index 88f0ab300..efbb13232 100644 --- a/narwhals/pandas_like/expr.py +++ b/narwhals/pandas_like/expr.py @@ -6,17 +6,14 @@ from narwhals.pandas_like.series import PandasSeries from narwhals.pandas_like.utils import register_expression_call -from narwhals.spec import Expr as ExprProtocol -from narwhals.spec import ExprStringNamespace as ExprStringNamespaceProtocol if TYPE_CHECKING: from typing_extensions import Self from narwhals.pandas_like.dataframe import PandasDataFrame - from narwhals.pandas_like.dtypes import DType -class Expr(ExprProtocol): +class PandasExpr: def __init__( # noqa: PLR0913 self, call: Callable[[PandasDataFrame], list[PandasSeries]], @@ -37,7 +34,7 @@ def __init__( # noqa: PLR0913 def __repr__(self) -> str: return ( - f"Narwhals Expr(" + f"PandasExpr(" f"depth={self._depth}, " f"function_name={self._function_name}, " f"root_names={self._root_names}, " @@ -65,77 +62,77 @@ def from_column_names( def cast( self, - dtype: DType, # type: ignore[override] + dtype: Any, ) -> Self: return register_expression_call(self, "cast", dtype) - def __eq__(self, other: Expr | Any) -> Self: # type: ignore[override] + def __eq__(self, other: PandasExpr | Any) -> Self: # type: ignore[override] return register_expression_call(self, "__eq__", other) - def __ne__(self, other: Expr | Any) -> Self: # type: ignore[override] + def __ne__(self, other: PandasExpr | Any) -> Self: # type: ignore[override] return register_expression_call(self, "__ne__", other) - def __ge__(self, other: Expr | Any) -> Self: + def __ge__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__ge__", other) - def __gt__(self, other: Expr | Any) -> Self: + def __gt__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__gt__", other) - def __le__(self, other: Expr | Any) -> Self: + def __le__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, 
"__le__", other) - def __lt__(self, other: Expr | Any) -> Self: + def __lt__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__lt__", other) - def __and__(self, other: Expr | bool | Any) -> Self: + def __and__(self, other: PandasExpr | bool | Any) -> Self: return register_expression_call(self, "__and__", other) def __rand__(self, other: Any) -> Self: return register_expression_call(self, "__rand__", other) - def __or__(self, other: Expr | bool | Any) -> Self: + def __or__(self, other: PandasExpr | bool | Any) -> Self: return register_expression_call(self, "__or__", other) def __ror__(self, other: Any) -> Self: return register_expression_call(self, "__ror__", other) - def __add__(self, other: Expr | Any) -> Self: + def __add__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__add__", other) def __radd__(self, other: Any) -> Self: return register_expression_call(self, "__radd__", other) - def __sub__(self, other: Expr | Any) -> Self: + def __sub__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__sub__", other) def __rsub__(self, other: Any) -> Self: return register_expression_call(self, "__rsub__", other) - def __mul__(self, other: Expr | Any) -> Self: + def __mul__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__mul__", other) def __rmul__(self, other: Any) -> Self: return self.__mul__(other) - def __truediv__(self, other: Expr | Any) -> Self: + def __truediv__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__truediv__", other) def __rtruediv__(self, other: Any) -> Self: raise NotImplementedError - def __floordiv__(self, other: Expr | Any) -> Self: + def __floordiv__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__floordiv__", other) def __rfloordiv__(self, other: Any) -> Self: raise NotImplementedError - def __pow__(self, other: Expr | Any) -> Self: + def __pow__(self, other: 
PandasExpr | Any) -> Self: return register_expression_call(self, "__pow__", other) def __rpow__(self, other: Any) -> Self: # pragma: no cover raise NotImplementedError - def __mod__(self, other: Expr | Any) -> Self: + def __mod__(self, other: PandasExpr | Any) -> Self: return register_expression_call(self, "__mod__", other) def __rmod__(self, other: Any) -> Self: # pragma: no cover @@ -203,17 +200,17 @@ def str(self) -> ExprStringNamespace: return ExprStringNamespace(self) -class ExprStringNamespace(ExprStringNamespaceProtocol): - def __init__(self, expr: Expr) -> None: +class ExprStringNamespace: + def __init__(self, expr: PandasExpr) -> None: self._expr = expr - def ends_with(self, suffix: str) -> Expr: + def ends_with(self, suffix: str) -> PandasExpr: # TODO make a register_expression_call for namespaces - return Expr( + return PandasExpr( lambda df: [ PandasSeries( - series.series.str.endswith(suffix), + series._series.str.endswith(suffix), implementation=df._implementation, ) for series in self._expr._call(df) @@ -225,11 +222,11 @@ def ends_with(self, suffix: str) -> Expr: implementation=self._expr._implementation, ) - def strip_chars(self, characters: str = " ") -> Expr: - return Expr( + def strip_chars(self, characters: str = " ") -> PandasExpr: + return PandasExpr( lambda df: [ PandasSeries( - series.series.str.strip(characters), + series._series.str.strip(characters), implementation=df._implementation, ) for series in self._expr._call(df) diff --git a/narwhals/pandas_like/group_by.py b/narwhals/pandas_like/group_by.py index 12ee1cd47..ad2a26e11 100644 --- a/narwhals/pandas_like/group_by.py +++ b/narwhals/pandas_like/group_by.py @@ -14,16 +14,15 @@ from narwhals.pandas_like.utils import is_simple_aggregation from narwhals.pandas_like.utils import item from narwhals.pandas_like.utils import parse_into_exprs -from narwhals.spec import GroupBy as GroupByProtocol -from narwhals.spec import IntoExpr from narwhals.utils import remove_prefix if TYPE_CHECKING: from 
narwhals.pandas_like.dataframe import PandasDataFrame - from narwhals.pandas_like.expr import Expr + from narwhals.pandas_like.expr import PandasExpr + from narwhals.pandas_like.typing import IntoExpr -class PdxGroupBy(GroupByProtocol): +class PandasGroupBy: def __init__( self, df: PandasDataFrame, keys: list[str], *, is_eager: bool, is_lazy: bool ) -> None: @@ -90,7 +89,7 @@ def _from_dataframe(self, df: PandasDataFrame) -> PandasDataFrame: def agg_pandas( grouped: Any, - exprs: list[Expr], + exprs: list[PandasExpr], keys: list[str], output_names: list[str], from_dataframe: Callable[[Any], PandasDataFrame], @@ -142,7 +141,7 @@ def func(df: Any) -> Any: def agg_generic( # noqa: PLR0913 grouped: Any, - exprs: list[Expr], + exprs: list[PandasExpr], group_by_keys: list[str], output_names: list[str], implementation: str, diff --git a/narwhals/pandas_like/namespace.py b/narwhals/pandas_like/namespace.py index acb29f47f..5c5312e74 100644 --- a/narwhals/pandas_like/namespace.py +++ b/narwhals/pandas_like/namespace.py @@ -1,23 +1,25 @@ from __future__ import annotations from functools import reduce +from typing import TYPE_CHECKING from typing import Any from typing import Callable from typing import Iterable -from narwhals.pandas_like import dtypes +from narwhals import dtypes from narwhals.pandas_like.dataframe import PandasDataFrame -from narwhals.pandas_like.expr import Expr +from narwhals.pandas_like.expr import PandasExpr from narwhals.pandas_like.series import PandasSeries from narwhals.pandas_like.utils import horizontal_concat from narwhals.pandas_like.utils import parse_into_exprs from narwhals.pandas_like.utils import series_from_iterable -from narwhals.spec import IntoExpr -from narwhals.spec import Namespace as NamespaceProtocol from narwhals.utils import flatten_str +if TYPE_CHECKING: + from narwhals.pandas_like.typing import IntoExpr -class Namespace(NamespaceProtocol): + +class Namespace: Int64 = dtypes.Int64 Int32 = dtypes.Int32 Int16 = dtypes.Int16 @@ 
-54,8 +56,8 @@ def _create_expr_from_callable( # noqa: PLR0913 function_name: str, root_names: list[str] | None, output_names: list[str] | None, - ) -> Expr: - return Expr( + ) -> PandasExpr: + return PandasExpr( func, depth=depth, function_name=function_name, @@ -70,15 +72,15 @@ def _create_series_from_scalar( return PandasSeries( series_from_iterable( [value], - name=series.series.name, - index=series.series.index[0:1], + name=series._series.name, + index=series._series.index[0:1], implementation=self._implementation, ), implementation=self._implementation, ) - def _create_expr_from_series(self, series: PandasSeries) -> Expr: - return Expr( + def _create_expr_from_series(self, series: PandasSeries) -> PandasExpr: + return PandasExpr( lambda _df: [series], depth=0, function_name="series", @@ -88,13 +90,13 @@ def _create_expr_from_series(self, series: PandasSeries) -> Expr: ) # --- selection --- - def col(self, *column_names: str | Iterable[str]) -> Expr: - return Expr.from_column_names( + def col(self, *column_names: str | Iterable[str]) -> PandasExpr: + return PandasExpr.from_column_names( *flatten_str(*column_names), implementation=self._implementation ) - def all(self) -> Expr: - return Expr( + def all(self) -> PandasExpr: + return PandasExpr( lambda df: [ PandasSeries( df._dataframe.loc[:, column_name], @@ -110,28 +112,28 @@ def all(self) -> Expr: ) # --- reduction --- - def sum(self, *column_names: str) -> Expr: - return Expr.from_column_names( + def sum(self, *column_names: str) -> PandasExpr: + return PandasExpr.from_column_names( *column_names, implementation=self._implementation ).sum() - def mean(self, *column_names: str) -> Expr: - return Expr.from_column_names( + def mean(self, *column_names: str) -> PandasExpr: + return PandasExpr.from_column_names( *column_names, implementation=self._implementation ).mean() - def max(self, *column_names: str) -> Expr: - return Expr.from_column_names( + def max(self, *column_names: str) -> PandasExpr: + return 
PandasExpr.from_column_names( *column_names, implementation=self._implementation ).max() - def min(self, *column_names: str) -> Expr: - return Expr.from_column_names( + def min(self, *column_names: str) -> PandasExpr: + return PandasExpr.from_column_names( *column_names, implementation=self._implementation ).min() - def len(self) -> Expr: - return Expr( + def len(self) -> PandasExpr: + return PandasExpr( lambda df: [ PandasSeries( series_from_iterable( @@ -151,18 +153,18 @@ def len(self) -> Expr: ) # --- horizontal --- - def sum_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + def sum_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> PandasExpr: return reduce(lambda x, y: x + y, parse_into_exprs(self._implementation, *exprs)) - def all_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + def all_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> PandasExpr: return reduce(lambda x, y: x & y, parse_into_exprs(self._implementation, *exprs)) - def any_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: + def any_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> PandasExpr: return reduce(lambda x, y: x | y, parse_into_exprs(self._implementation, *exprs)) def concat( self, - items: Iterable[PandasDataFrame], # type: ignore[override] + items: Iterable[PandasDataFrame], *, how: str = "vertical", ) -> PandasDataFrame: diff --git a/narwhals/pandas_like/series.py b/narwhals/pandas_like/series.py index 1931b6d6a..b291b7654 100644 --- a/narwhals/pandas_like/series.py +++ b/narwhals/pandas_like/series.py @@ -11,16 +11,14 @@ from narwhals.pandas_like.utils import reverse_translate_dtype from narwhals.pandas_like.utils import translate_dtype from narwhals.pandas_like.utils import validate_column_comparand -from narwhals.spec import Series as SeriesProtocol -from narwhals.translate import get_namespace if TYPE_CHECKING: from typing_extensions import Self - from narwhals.pandas_like.dtypes import DType + from 
narwhals.dtypes import DType -class PandasSeries(SeriesProtocol): +class PandasSeries: def __init__( self, series: Any, @@ -37,26 +35,15 @@ def __init__( self._series = reset_index(series) self._implementation = implementation - def __repr__(self) -> str: # pragma: no cover - header = " Narwhals Series " - length = len(header) - return ( - "┌" - + "─" * length - + "┐\n" - + f"|{header}|\n" - + "| Add `.to_native()` to see native output |\n" - + "└" - + "─" * length - + "┘\n" - ) - def _from_series(self, series: Any) -> Self: return self.__class__( series.rename(series.name, copy=False), implementation=self._implementation, ) + def __len__(self) -> int: + return self.shape[0] + @property def name(self) -> str: return self._name @@ -66,42 +53,38 @@ def shape(self) -> tuple[int]: return self._series.shape # type: ignore[no-any-return] def rename(self, name: str) -> PandasSeries: - ser = self.series + ser = self._series return self._from_series(ser.rename(name, copy=False)) - @property - def series(self) -> Any: - return self._series - @property def dtype(self) -> DType: return translate_dtype(self._series.dtype) def cast( self, - dtype: DType, # type: ignore[override] + dtype: Any, ) -> Self: - ser = self.series + ser = self._series dtype = reverse_translate_dtype(dtype) return self._from_series(ser.astype(dtype)) def filter(self, mask: Self) -> Self: - ser = self.series + ser = self._series return self._from_series(ser.loc[validate_column_comparand(mask)]) def item(self) -> Any: - return item(self.series) + return item(self._series) def is_between( self, lower_bound: Any, upper_bound: Any, closed: str = "both" ) -> PandasSeries: - ser = self.series + ser = self._series return self._from_series(ser.between(lower_bound, upper_bound, inclusive=closed)) def is_in(self, other: Any) -> PandasSeries: import pandas as pd - ser = self.series + ser = self._series res = ser.isin(other).convert_dtypes() res[ser.isna()] = pd.NA return self._from_series(res) @@ -110,36 +93,36 @@ 
def is_in(self, other: Any) -> PandasSeries: def __eq__(self, other: object) -> PandasSeries: # type: ignore[override] other = validate_column_comparand(other) - ser = self.series + ser = self._series return self._from_series((ser == other).rename(ser.name, copy=False)) def __ne__(self, other: object) -> PandasSeries: # type: ignore[override] other = validate_column_comparand(other) - ser = self.series + ser = self._series return self._from_series((ser != other).rename(ser.name, copy=False)) def __ge__(self, other: Any) -> PandasSeries: other = validate_column_comparand(other) - ser = self.series + ser = self._series return self._from_series((ser >= other).rename(ser.name, copy=False)) def __gt__(self, other: Any) -> PandasSeries: other = validate_column_comparand(other) - ser = self.series + ser = self._series return self._from_series((ser > other).rename(ser.name, copy=False)) def __le__(self, other: Any) -> PandasSeries: other = validate_column_comparand(other) - ser = self.series + ser = self._series return self._from_series((ser <= other).rename(ser.name, copy=False)) def __lt__(self, other: Any) -> PandasSeries: other = validate_column_comparand(other) - ser = self.series + ser = self._series return self._from_series((ser < other).rename(ser.name, copy=False)) def __and__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) return self._from_series((ser & other).rename(ser.name, copy=False)) @@ -147,7 +130,7 @@ def __rand__(self, other: Any) -> PandasSeries: return self.__and__(other) def __or__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) return self._from_series((ser | other).rename(ser.name, copy=False)) @@ -155,7 +138,7 @@ def __ror__(self, other: Any) -> PandasSeries: return self.__or__(other) def __add__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) 
return self._from_series((ser + other).rename(ser.name, copy=False)) @@ -163,7 +146,7 @@ def __radd__(self, other: Any) -> PandasSeries: return self.__add__(other) def __sub__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) return self._from_series((ser - other).rename(ser.name, copy=False)) @@ -171,7 +154,7 @@ def __rsub__(self, other: Any) -> PandasSeries: return -1 * self.__sub__(other) def __mul__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) return self._from_series((ser * other).rename(ser.name, copy=False)) @@ -179,7 +162,7 @@ def __rmul__(self, other: Any) -> PandasSeries: return self.__mul__(other) def __truediv__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) return self._from_series((ser / other).rename(ser.name, copy=False)) @@ -187,7 +170,7 @@ def __rtruediv__(self, other: Any) -> PandasSeries: raise NotImplementedError def __floordiv__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) return self._from_series((ser // other).rename(ser.name, copy=False)) @@ -195,7 +178,7 @@ def __rfloordiv__(self, other: Any) -> PandasSeries: raise NotImplementedError def __pow__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) return self._from_series((ser**other).rename(ser.name, copy=False)) @@ -203,7 +186,7 @@ def __rpow__(self, other: Any) -> PandasSeries: # pragma: no cover raise NotImplementedError def __mod__(self, other: Any) -> PandasSeries: - ser = self.series + ser = self._series other = validate_column_comparand(other) return self._from_series((ser % other).rename(ser.name, copy=False)) @@ -213,41 +196,41 @@ def __rmod__(self, other: Any) -> PandasSeries: # pragma: no cover # Unary def __invert__(self: PandasSeries) 
-> PandasSeries: - ser = self.series + ser = self._series return self._from_series(~ser) # Reductions def any(self) -> Any: - ser = self.series + ser = self._series return ser.any() def all(self) -> Any: - ser = self.series + ser = self._series return ser.all() def min(self) -> Any: - ser = self.series + ser = self._series return ser.min() def max(self) -> Any: - ser = self.series + ser = self._series return ser.max() def sum(self) -> Any: - ser = self.series + ser = self._series return ser.sum() def prod(self) -> Any: - ser = self.series + ser = self._series return ser.prod() def median(self) -> Any: - ser = self.series + ser = self._series return ser.median() def mean(self) -> Any: - ser = self.series + ser = self._series return ser.mean() def std( @@ -255,7 +238,7 @@ def std( *, correction: float = 1.0, ) -> Any: - ser = self.series + ser = self._series return ser.std(ddof=correction) def var( @@ -263,7 +246,7 @@ def var( *, correction: float = 1.0, ) -> Any: - ser = self.series + ser = self._series return ser.var(ddof=correction) def len(self) -> Any: @@ -272,36 +255,42 @@ def len(self) -> Any: # Transformations def is_null(self) -> PandasSeries: - ser = self.series + ser = self._series return self._from_series(ser.isna()) def drop_nulls(self) -> PandasSeries: - ser = self.series + ser = self._series return self._from_series(ser.dropna()) def n_unique(self) -> int: - ser = self.series + ser = self._series return ser.nunique() # type: ignore[no-any-return] - def zip_with(self, mask: SeriesProtocol, other: SeriesProtocol) -> PandasSeries: + def zip_with(self, mask: PandasSeries, other: PandasSeries) -> PandasSeries: mask = validate_column_comparand(mask) other = validate_column_comparand(other) - ser = self.series + ser = self._series return self._from_series(ser.where(mask, other)) def sample(self, n: int, fraction: float, *, with_replacement: bool) -> PandasSeries: - ser = self.series + ser = self._series return self._from_series( ser.sample(n=n, frac=fraction, 
with_replacement=with_replacement) ) def unique(self) -> PandasSeries: - ser = self.series - plx = get_namespace(self._implementation) - return plx.Series(self.name, ser.unique()) # type: ignore[no-any-return, attr-defined] + if self._implementation != "pandas": + raise NotImplementedError + import pandas as pd + + return self._from_series( + pd.Series( + self._series.unique(), dtype=self._series.dtype, name=self._series.name + ) + ) def is_nan(self) -> PandasSeries: - ser = self.series + ser = self._series if is_extension_array_dtype(ser.dtype): return self._from_series((ser != ser).fillna(False)) # noqa: PLR0124 return self._from_series(ser.isna()) @@ -311,24 +300,24 @@ def sort( *, descending: bool | Sequence[bool] = True, ) -> PandasSeries: - ser = self.series + ser = self._series return self._from_series( ser.sort_values(ascending=not descending).rename(self.name) ) def alias(self, name: str) -> Self: - ser = self.series + ser = self._series return self._from_series(ser.rename(name, copy=False)) def to_numpy(self) -> Any: - return self.series.to_numpy() + return self._series.to_numpy() def to_pandas(self) -> Any: if self._implementation == "pandas": - return self.series + return self._series elif self._implementation == "cudf": - return self.series.to_pandas() + return self._series.to_pandas() elif self._implementation == "modin": - return self.series._to_pandas() + return self._series._to_pandas() msg = f"Unknown implementation: {self._implementation}" raise TypeError(msg) diff --git a/narwhals/pandas_like/typing.py b/narwhals/pandas_like/typing.py new file mode 100644 index 000000000..ca1fd0f37 --- /dev/null +++ b/narwhals/pandas_like/typing.py @@ -0,0 +1,10 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import TypeAlias + +if TYPE_CHECKING: + from narwhals.pandas_like.expr import PandasExpr + from narwhals.pandas_like.series import PandasSeries + + IntoExpr: TypeAlias = PandasExpr | str | int | float | PandasSeries 
diff --git a/narwhals/pandas_like/utils.py b/narwhals/pandas_like/utils.py index b8bc6e1bb..cd324b03e 100644 --- a/narwhals/pandas_like/utils.py +++ b/narwhals/pandas_like/utils.py @@ -12,12 +12,12 @@ T = TypeVar("T") if TYPE_CHECKING: + from narwhals.dtypes import DType from narwhals.pandas_like.dataframe import PandasDataFrame - from narwhals.pandas_like.dtypes import DType - from narwhals.pandas_like.expr import Expr + from narwhals.pandas_like.expr import PandasExpr from narwhals.pandas_like.series import PandasSeries - ExprT = TypeVar("ExprT", bound=Expr) + ExprT = TypeVar("ExprT", bound=PandasExpr) from narwhals.spec import IntoExpr @@ -46,7 +46,7 @@ def validate_column_comparand(other: Any) -> Any: if other.len() == 1: # broadcast return other.item() - return other.series + return other._series return other @@ -70,23 +70,23 @@ def validate_dataframe_comparand(other: Any) -> Any: if isinstance(other, PandasSeries): if other.len() == 1: # broadcast - return item(other) - return other.series + return item(other._series) + return other._series return other def maybe_evaluate_expr(df: PandasDataFrame, arg: Any) -> Any: """Evaluate expression if it's an expression, otherwise return it as is.""" - from narwhals.pandas_like.expr import Expr + from narwhals.pandas_like.expr import PandasExpr - if isinstance(arg, Expr): + if isinstance(arg, PandasExpr): return arg._call(df) return arg def parse_into_exprs( implementation: str, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr -) -> list[Expr]: +) -> list[PandasExpr]: out = [ parse_into_expr(implementation, into_expr) for into_expr in flatten_into_expr(*exprs) @@ -96,19 +96,16 @@ def parse_into_exprs( return out -def parse_into_expr(implementation: str, into_expr: IntoExpr) -> Expr: - from narwhals.pandas_like.expr import Expr +def parse_into_expr(implementation: str, into_expr: IntoExpr) -> PandasExpr: + from narwhals.expression import Expr from narwhals.pandas_like.namespace import Namespace - from 
narwhals.pandas_like.series import PandasSeries plx = Namespace(implementation=implementation) + if isinstance(into_expr, Expr): + return into_expr._call(plx) # type: ignore[no-any-return] if isinstance(into_expr, str): return plx.col(into_expr) - if isinstance(into_expr, Expr): - return into_expr - if isinstance(into_expr, PandasSeries): - return plx._create_expr_from_series(into_expr) msg = f"Expected IntoExpr, got {type(into_expr)}" raise TypeError(msg) @@ -117,7 +114,6 @@ def evaluate_into_expr(df: PandasDataFrame, into_expr: IntoExpr) -> list[PandasS """ Return list of raw columns. """ - expr = parse_into_expr(df._implementation, into_expr) return expr._call(df) @@ -145,7 +141,7 @@ def evaluate_into_exprs( def register_expression_call(expr: ExprT, attr: str, *args: Any, **kwargs: Any) -> ExprT: - from narwhals.pandas_like.expr import Expr + from narwhals.pandas_like.expr import PandasExpr from narwhals.pandas_like.namespace import Namespace from narwhals.pandas_like.series import PandasSeries @@ -171,7 +167,7 @@ def func(df: PandasDataFrame) -> list[PandasSeries]: root_names = copy(expr._root_names) for arg in list(args) + list(kwargs.values()): - if root_names is not None and isinstance(arg, Expr): + if root_names is not None and isinstance(arg, PandasExpr): if arg._root_names is not None: root_names.extend(arg._root_names) else: @@ -197,7 +193,7 @@ def item(s: Any) -> Any: return s.iloc[0] -def is_simple_aggregation(expr: Expr) -> bool: +def is_simple_aggregation(expr: PandasExpr) -> bool: return ( expr._function_name is not None and expr._depth is not None @@ -207,7 +203,7 @@ def is_simple_aggregation(expr: Expr) -> bool: ) -def evaluate_simple_aggregation(expr: Expr, grouped: Any) -> Any: +def evaluate_simple_aggregation(expr: PandasExpr, grouped: Any) -> Any: """ Use fastpath for simple aggregations if possible. 
@@ -295,7 +291,7 @@ def series_from_iterable( def translate_dtype(dtype: Any) -> DType: - from narwhals.pandas_like import dtypes + from narwhals import dtypes if dtype in ("int64", "Int64"): return dtypes.Int64() @@ -334,7 +330,7 @@ def isinstance_or_issubclass(obj: Any, cls: Any) -> bool: def reverse_translate_dtype(dtype: DType | type[DType]) -> Any: - from narwhals.pandas_like import dtypes + from narwhals import dtypes if isinstance_or_issubclass(dtype, dtypes.Float64): return "float64" diff --git a/narwhals/polars.py b/narwhals/polars.py deleted file mode 100644 index 9407d3a1a..000000000 --- a/narwhals/polars.py +++ /dev/null @@ -1,598 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING -from typing import Any -from typing import Iterable -from typing import Literal -from typing import Sequence - -import polars as pl - -from narwhals.pandas_like.utils import isinstance_or_issubclass -from narwhals.spec import DataFrame as DataFrameProtocol -from narwhals.spec import DType as DTypeProtocol -from narwhals.spec import Expr as ExprProtocol -from narwhals.spec import ExprStringNamespace as ExprStringNamespaceProtocol -from narwhals.spec import GroupBy as GroupByProtocol -from narwhals.spec import Namespace as NamespaceProtocol -from narwhals.spec import Series as SeriesProtocol -from narwhals.utils import flatten_into_expr - -if TYPE_CHECKING: - from polars.type_aliases import PolarsDataType - from typing_extensions import Self - - from narwhals.spec import IntoExpr - - -def extract_native(obj: Any) -> Any: - if isinstance(obj, Expr): - return obj._expr - if isinstance(obj, DType): - return obj._dtype - if isinstance(obj, PolarsDataFrame): - return obj._dataframe - if isinstance(obj, PolarsSeries): - return obj._series - return obj - - -class Expr(ExprProtocol): - def __init__(self, expr: pl.Expr) -> None: - self._expr = expr - - # --- convert --- - def alias(self, name: str) -> Self: - return self.__class__(self._expr.alias(name)) - - 
def cast( - self, - dtype: DType, # type: ignore[override] - ) -> Self: - return self.__class__(self._expr.cast(reverse_translate_dtype(dtype))) - - # --- binary --- - def __eq__(self, other: object) -> Expr: # type: ignore[override] - return self.__class__(self._expr.__eq__(extract_native(other))) - - def __and__(self, other: Any) -> Expr: - return self.__class__(self._expr.__and__(extract_native(other))) - - def __or__(self, other: Any) -> Expr: - return self.__class__(self._expr.__or__(extract_native(other))) - - def __add__(self, other: Any) -> Expr: - return self.__class__(self._expr.__add__(extract_native(other))) - - def __radd__(self, other: Any) -> Expr: - return self.__class__(self._expr.__radd__(extract_native(other))) - - def __sub__(self, other: Any) -> Expr: - return self.__class__(self._expr.__sub__(extract_native(other))) - - def __rsub__(self, other: Any) -> Expr: - return self.__class__(self._expr.__rsub__(extract_native(other))) - - def __mul__(self, other: Any) -> Expr: - return self.__class__(self._expr.__mul__(extract_native(other))) - - def __rmul__(self, other: Any) -> Expr: - return self.__class__(self._expr.__rmul__(extract_native(other))) - - def __le__(self, other: Any) -> Expr: - return self.__class__(self._expr.__le__(extract_native(other))) - - def __lt__(self, other: Any) -> Expr: - return self.__class__(self._expr.__lt__(extract_native(other))) - - def __gt__(self, other: Any) -> Expr: - return self.__class__(self._expr.__gt__(extract_native(other))) - - def __ge__(self, other: Any) -> Expr: - return self.__class__(self._expr.__ge__(extract_native(other))) - - # --- unary --- - def mean(self) -> Expr: - return self.__class__(self._expr.mean()) - - def sum(self) -> Expr: - return self.__class__(self._expr.sum()) - - def min(self) -> Expr: - return self.__class__(self._expr.min()) - - def max(self) -> Expr: - return self.__class__(self._expr.max()) - - def n_unique(self) -> Expr: - return self.__class__(self._expr.n_unique()) - - def 
unique(self) -> Expr: - return self.__class__(self._expr.unique()) - - # --- transform --- - def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" - ) -> Expr: - return self.__class__(self._expr.is_between(lower_bound, upper_bound, closed)) # type: ignore[arg-type] - - def is_in(self, other: Any) -> Expr: - return self.__class__(self._expr.is_in(other)) - - def is_null(self) -> Expr: - return self.__class__(self._expr.is_null()) - - # --- partial reduction --- - def drop_nulls(self) -> Expr: - return self.__class__(self._expr.drop_nulls()) - - def sample(self, n: int, fraction: float, *, with_replacement: bool) -> Expr: - return self.__class__( - self._expr.sample(n, fraction=fraction, with_replacement=with_replacement) - ) - - # --- namespaces --- - @property - def str(self) -> ExprStringNamespace: - return ExprStringNamespace(self._expr.str) - - -class ExprStringNamespace(ExprStringNamespaceProtocol): - def __init__(self, expr: Any) -> None: - self._expr = expr - - def ends_with(self, suffix: str) -> Expr: - return Expr(self._expr.str.ends_with(suffix)) - - -class DType(DTypeProtocol): - def __init__(self, dtype: Any) -> None: - self._dtype = dtype - - @classmethod - def is_numeric(cls: type[Self]) -> bool: - return reverse_translate_dtype(cls).is_numeric() # type: ignore[no-any-return] - - -class NumericType(DType): ... - - -class TemporalType(DType): ... - - -class Int64(NumericType): ... - - -class Int32(NumericType): ... - - -class Int16(NumericType): ... - - -class Int8(NumericType): ... - - -class UInt64(NumericType): ... - - -class UInt32(NumericType): ... - - -class UInt16(NumericType): ... - - -class UInt8(NumericType): ... - - -class Float64(NumericType): ... - - -class Float32(NumericType): ... - - -class String(DType): ... - - -class Boolean(DType): ... - - -class Datetime(TemporalType): ... - - -class Date(TemporalType): ... 
- - -class Namespace(NamespaceProtocol): - Float64 = Float64 - Float32 = Float32 - Int64 = Int64 - Int32 = Int32 - Int16 = Int16 - Int8 = Int8 - UInt64 = UInt64 - UInt32 = UInt32 - UInt16 = UInt16 - UInt8 = UInt8 - Boolean = Boolean - String = String - - def Series(self, name: str, data: list[Any]) -> PolarsSeries: # noqa: N802 - import polars as pl - - from narwhals.polars import PolarsSeries - - return PolarsSeries(pl.Series(name=name, values=data)) - - # --- selection --- - def col(self, *names: str | Iterable[str]) -> Expr: - return Expr(pl.col(*names)) # type: ignore[arg-type] - - def all(self) -> Expr: - return Expr(pl.all()) - - # --- reduction --- - def sum(self, *columns: str) -> Expr: - return Expr(pl.sum(*columns)) - - def mean(self, *columns: str) -> Expr: - return Expr(pl.mean(*columns)) - - def max(self, *columns: str) -> Expr: - return Expr(pl.max(*columns)) - - def min(self, *columns: str) -> Expr: - return Expr(pl.min(*columns)) - - def len(self) -> Expr: - return Expr(pl.len()) - - # --- horizontal --- - def all_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - return Expr(pl.all_horizontal(*[extract_native(v) for v in exprs])) - - def any_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - return Expr(pl.any_horizontal(*[extract_native(v) for v in exprs])) - - def sum_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: - return Expr(pl.sum_horizontal(*[extract_native(v) for v in exprs])) - - def concat( - self, - items: Iterable[PolarsDataFrame], # type: ignore[override] - *, - how: str = "vertical", - ) -> PolarsDataFrame: - if how == "horizontal": - # TODO: is_eager / is_lazy not really correct here - return PolarsDataFrame( - pl.concat([extract_native(v) for v in items], how="horizontal"), - is_eager=True, - is_lazy=False, - ) - raise NotImplementedError - - -class PolarsSeries(SeriesProtocol): - def __init__(self, series: pl.Series) -> None: - self._series = series - - def alias(self, name: str) -> 
Self: - return self.__class__(self._series.alias(name)) - - @property - def name(self) -> str: - return self._series.name - - @property - def dtype(self) -> DType: - return translate_dtype(self._series.dtype) # type: ignore[no-any-return] - - @property - def shape(self) -> tuple[int]: - return self._series.shape - - def rename(self, name: str) -> Self: - return self.__class__(self._series.rename(name)) - - def cast( - self, - dtype: DType, # type: ignore[override] - ) -> Self: - return self.__class__(self._series.cast(reverse_translate_dtype(dtype))) - - def item(self) -> Any: - return self._series.item() - - def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" - ) -> PolarsSeries: - return self.__class__(self._series.is_between(lower_bound, upper_bound, closed)) # type: ignore[arg-type] - - def is_in(self, other: Any) -> PolarsSeries: - return self.__class__(self._series.is_in(other)) - - def is_null(self) -> PolarsSeries: - return self.__class__(self._series.is_null()) - - def drop_nulls(self) -> PolarsSeries: - return self.__class__(self._series.drop_nulls()) - - def n_unique(self) -> int: - return self._series.n_unique() - - def unique(self) -> PolarsSeries: - return self.__class__(self._series.unique()) - - def zip_with(self, mask: Self, other: Self) -> Self: - return self.__class__( - self._series.zip_with(extract_native(mask), extract_native(other)) - ) - - def sample(self, n: int, fraction: float, *, with_replacement: bool) -> PolarsSeries: - return self.__class__( - self._series.sample(n, fraction=fraction, with_replacement=with_replacement) - ) - - def to_numpy(self) -> Any: - return self._series.to_numpy() - - def to_pandas(self) -> Any: - return self._series.to_pandas() - - -class PolarsDataFrame(DataFrameProtocol): - def __init__( - self, df: pl.DataFrame | pl.LazyFrame, *, is_eager: bool, is_lazy: bool - ) -> None: - self._dataframe = df - self._is_eager = is_eager - self._is_lazy = is_lazy - - def _from_dataframe(self, df: 
pl.DataFrame | pl.LazyFrame) -> Self: - # construct, preserving properties - return self.__class__(df, is_eager=self._is_eager, is_lazy=self._is_lazy) - - def __getitem__(self, column_name: str) -> PolarsSeries: - if not self._is_eager: - raise RuntimeError( - "DataFrame.shape can only be called if frame was instantiated with `is_eager=True`" - ) - assert isinstance(self._dataframe, pl.DataFrame) - return PolarsSeries( - self._dataframe[column_name], - ) - - # --- properties --- - @property - def columns(self) -> list[str]: - return self._dataframe.columns - - @property - def schema(self) -> dict[str, DTypeProtocol]: - return { - col: translate_dtype(dtype) for col, dtype in self._dataframe.schema.items() - } - - @property - def shape(self) -> tuple[int, int]: - if not self._is_eager: - raise RuntimeError( - "DataFrame.shape can only be called if frame was instantiated with `is_eager=True`" - ) - assert isinstance(self._dataframe, pl.DataFrame) - return self._dataframe.shape - - def iter_columns(self) -> Iterable[PolarsSeries]: - if not self._is_eager: - raise RuntimeError( - "DataFrame.iter_columns can only be called if frame was instantiated with `is_eager=True`" - ) - assert isinstance(self._dataframe, pl.DataFrame) - return (PolarsSeries(self._dataframe[col]) for col in self.columns) - - # --- reshape --- - def with_columns( - self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr - ) -> Self: - return self._from_dataframe( - self._dataframe.with_columns( - *[extract_native(v) for v in exprs], - **{key: extract_native(value) for key, value in named_exprs.items()}, - ) - ) - - def filter(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> Self: - return self._from_dataframe( - self._dataframe.filter(*[extract_native(v) for v in predicates]) - ) - - def select( - self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr - ) -> Self: - return self._from_dataframe( - self._dataframe.select( - *[extract_native(v) for v in exprs], - **{key: 
extract_native(value) for key, value in named_exprs.items()}, - ) - ) - - def rename(self, mapping: dict[str, str]) -> Self: - return self._from_dataframe(self._dataframe.rename(mapping)) - - # --- transform --- - def sort( - self, - by: str | Iterable[str], - *more_by: str, - descending: bool | Sequence[bool] = False, - ) -> Self: - return self._from_dataframe( - self._dataframe.sort(by, *more_by, descending=descending) - ) - - # --- convert --- - def lazy(self) -> Self: - return self.__class__(self._dataframe.lazy(), is_eager=False, is_lazy=True) - - def collect(self) -> Self: - if not self._is_lazy: - raise RuntimeError( - "DataFrame.collect can only be called if frame was instantiated with `is_lazy=True`" - ) - assert isinstance(self._dataframe, pl.LazyFrame) - return self.__class__(self._dataframe.collect(), is_eager=True, is_lazy=False) - - def cache(self) -> Self: - if not self._is_lazy: - raise RuntimeError( - "DataFrame.cache can only be called if frame was instantiated with `is_lazy=True`" - ) - assert isinstance(self._dataframe, pl.LazyFrame) - return self.__class__(self._dataframe.cache(), is_eager=False, is_lazy=True) - - def to_numpy(self) -> Any: - if not self._is_eager: - raise RuntimeError( - "DataFrame.to_numpy can only be called if frame was instantiated with `is_eager=True`" - ) - assert isinstance(self._dataframe, pl.DataFrame) - return self._dataframe.to_numpy() - - def to_pandas(self) -> Any: - if not self._is_eager: - raise RuntimeError( - "DataFrame.to_pandas can only be called if frame was instantiated with `is_eager=True`" - ) - assert isinstance(self._dataframe, pl.DataFrame) - return self._dataframe.to_pandas() - - def to_dict(self, *, as_series: bool = True) -> dict[str, Any]: - if not self._is_eager: - raise RuntimeError( - "DataFrame.to_dict can only be called if frame was instantiated with `is_eager=True`" - ) - assert isinstance(self._dataframe, pl.DataFrame) - return self._dataframe.to_dict(as_series=as_series) - - # --- actions 
--- - def join( - self, - other: Self, - *, - how: Literal["inner"] = "inner", - left_on: str | list[str], - right_on: str | list[str], - ) -> Self: - # todo validate eager/lazy only - return self._from_dataframe( - self._dataframe.join( - extract_native(other), how=how, left_on=left_on, right_on=right_on - ) - ) - - def group_by(self, *keys: str | Iterable[str]) -> GroupBy: - return GroupBy( - self._dataframe.group_by(*keys), - is_eager=self._is_eager, - is_lazy=self._is_lazy, - ) - - # --- partial reduction --- - def head(self, n: int) -> Self: - return self._from_dataframe(self._dataframe.head(n)) - - def unique(self, subset: list[str]) -> Self: - return self._from_dataframe(self._dataframe.unique(subset)) - - @property - def is_eager(self) -> bool: - return self._is_eager - - @property - def is_lazy(self) -> bool: - return self._is_lazy - - -class GroupBy(GroupByProtocol): - def __init__(self, groupby: Any, *, is_eager: bool, is_lazy: bool) -> None: - self._groupby = groupby - self._is_eager = is_eager - self._is_lazy = is_lazy - - def agg( - self, *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr - ) -> PolarsDataFrame: - return PolarsDataFrame( - self._groupby.agg( - *[extract_native(v) for v in flatten_into_expr(*aggs)], - **{key: extract_native(value) for key, value in named_aggs.items()}, - ), - is_eager=self._is_eager, - is_lazy=self._is_lazy, - ) - - -def reverse_translate_dtype(dtype: DType | type[DType]) -> Any: - if isinstance_or_issubclass(dtype, Float64): - return pl.Float64 - if isinstance_or_issubclass(dtype, Float32): - return pl.Float32 - if isinstance_or_issubclass(dtype, Int64): - return pl.Int64 - if isinstance_or_issubclass(dtype, Int32): - return pl.Int32 - if isinstance_or_issubclass(dtype, Int16): - return pl.Int16 - if isinstance_or_issubclass(dtype, UInt8): - return pl.UInt8 - if isinstance_or_issubclass(dtype, UInt64): - return pl.UInt64 - if isinstance_or_issubclass(dtype, UInt32): - return pl.UInt32 - if 
class Series:
    """Backend-agnostic wrapper around a native Series.

    A Polars series is stored as-is; a pandas series is wrapped in
    ``PandasSeries``. Every public method delegates to the wrapped object
    and, where the result is itself a series, re-wraps it with the same
    ``implementation`` tag.
    """

    def __init__(
        self,
        series: Any,
        *,
        implementation: str | None = None,
    ) -> None:
        """Wrap ``series``.

        Args:
            series: A pandas or Polars Series. When ``implementation`` is
                given, any object is accepted and stored untouched.
            implementation: Backend tag. When provided, type detection is
                skipped entirely (this is the path ``_from_series`` uses).

        Raises:
            TypeError: If ``implementation`` is omitted and ``series`` is
                neither a Polars nor a pandas Series.
        """
        if implementation is not None:
            self._series = series
            self._implementation = implementation
            return
        if (pl := get_polars()) is not None and isinstance(series, pl.Series):
            self._series = series
            self._implementation = "polars"
            return
        if (pd := get_pandas()) is not None and isinstance(series, pd.Series):
            # Imported only on the pandas path so that the Polars and
            # explicit-implementation paths never load the pandas-like layer.
            from narwhals.pandas_like.series import PandasSeries

            self._series = PandasSeries(series, implementation="pandas")
            self._implementation = "pandas"
            return
        msg = f"Expected pandas or Polars Series, got: {type(series)}"
        raise TypeError(msg)

    def _extract_native(self, arg: Any) -> Any:
        """Unwrap ``arg`` so it can be handed to the wrapped series.

        For non-Polars implementations ``arg`` is returned unchanged. For
        Polars, a ``Series`` yields its wrapped series and an ``Expr`` is
        evaluated against the ``polars`` module; anything else passes through.
        """
        if self._implementation != "polars":
            return arg
        if isinstance(arg, Series):
            return arg._series
        # Deferred until actually needed: the early returns above do not
        # depend on the expression module.
        from narwhals.expression import Expr

        if isinstance(arg, Expr):
            import polars as pl

            return arg._call(pl)
        return arg

    def _from_series(self, series: Any) -> Self:
        """Wrap ``series`` in a new instance carrying this implementation tag."""
        return self.__class__(series, implementation=self._implementation)

    def __repr__(self) -> str:  # pragma: no cover
        hint = " Use `narwhals.to_native()` to see native output "
        # Pad the title to the hint's width so every row of the box lines up.
        header = " Narwhals Series".ljust(len(hint))
        rule = "─" * len(hint)
        return f"┌{rule}┐\n|{header}|\n|{hint}|\n└{rule}┘\n"

    def alias(self, name: str) -> Self:
        """Return the result of the wrapped series' ``alias(name)``, re-wrapped."""
        return self._from_series(self._series.alias(name))

    @property
    def name(self) -> str:
        """The ``name`` attribute of the wrapped series."""
        return self._series.name  # type: ignore[no-any-return]

    @property
    def dtype(self) -> Any:
        """The ``dtype`` attribute of the wrapped series, returned as-is."""
        return self._series.dtype

    @property
    def shape(self) -> tuple[int]:
        """The ``shape`` attribute of the wrapped series."""
        return self._series.shape  # type: ignore[no-any-return]

    def rename(self, name: str) -> Self:
        """Return the result of the wrapped series' ``rename(name)``, re-wrapped."""
        return self._from_series(self._series.rename(name))

    def cast(
        self,
        dtype: Any,
    ) -> Self:
        """Return the result of the wrapped series' ``cast(dtype)``, re-wrapped."""
        return self._from_series(self._series.cast(dtype))

    def item(self) -> Any:
        """Return whatever the wrapped series' ``item()`` returns."""
        return self._series.item()

    def is_between(
        self, lower_bound: Any, upper_bound: Any, closed: str = "both"
    ) -> Self:
        """Delegate to the wrapped series' ``is_between`` and re-wrap the result.

        The bounds and ``closed`` are forwarded positionally and unmodified.
        """
        return self._from_series(
            self._series.is_between(lower_bound, upper_bound, closed)
        )

    def is_in(self, other: Any) -> Self:
        """Delegate to ``is_in`` after unwrapping ``other`` via ``_extract_native``."""
        return self._from_series(self._series.is_in(self._extract_native(other)))

    def is_null(self) -> Self:
        """Return the result of the wrapped series' ``is_null()``, re-wrapped."""
        return self._from_series(self._series.is_null())

    def drop_nulls(self) -> Self:
        """Return the result of the wrapped series' ``drop_nulls()``, re-wrapped."""
        return self._from_series(self._series.drop_nulls())

    def n_unique(self) -> int:
        """Return whatever the wrapped series' ``n_unique()`` returns."""
        return self._series.n_unique()  # type: ignore[no-any-return]

    def unique(self) -> Self:
        """Return the result of the wrapped series' ``unique()``, re-wrapped."""
        return self._from_series(self._series.unique())

    def zip_with(self, mask: Self, other: Self) -> Self:
        """Delegate to ``zip_with``; both arguments go through ``_extract_native``."""
        return self._from_series(
            self._series.zip_with(
                self._extract_native(mask), self._extract_native(other)
            )
        )

    def sample(self, n: int, fraction: float, *, with_replacement: bool) -> Self:
        """Delegate to the wrapped series' ``sample`` and re-wrap the result."""
        return self._from_series(
            self._series.sample(
                n, fraction=fraction, with_replacement=with_replacement
            )
        )

    def to_numpy(self) -> Any:
        """Return whatever the wrapped series' ``to_numpy()`` returns."""
        return self._series.to_numpy()

    def to_pandas(self) -> Any:
        """Return whatever the wrapped series' ``to_pandas()`` returns."""
        return self._series.to_pandas()
- - def min(self) -> Expr: ... - - def max(self) -> Expr: ... - - def n_unique(self) -> Expr: ... - - # --- transform --- - def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" - ) -> Expr: ... - - def is_in(self, other: Any) -> Expr: ... - - def is_null(self) -> Expr: ... - - # --- partial reduction --- - def drop_nulls(self) -> Expr: ... - - def sample(self, n: int, fraction: float, *, with_replacement: bool) -> Expr: ... - - # --- namespaces --- - @property - def str(self) -> ExprStringNamespace: ... - - -class ExprStringNamespace(Protocol): - def ends_with(self, suffix: str) -> Expr: ... - - -class DType(Protocol): - @classmethod - def is_numeric(cls: type[Self]) -> bool: ... - - -class Namespace(Protocol): - Float64: DType - Float32: DType - Int64: DType - Int32: DType - Int16: DType - Int8: DType - UInt64: DType - UInt32: DType - UInt16: DType - UInt8: DType - Boolean: DType - String: DType - - # --- selection --- - def col(self, *names: str | Iterable[str]) -> Expr: ... - - def all(self) -> Expr: ... - - # --- reduction --- - def sum(self, *columns: str) -> Expr: ... - - def mean(self, *columns: str) -> Expr: ... - - def max(self, *columns: str) -> Expr: ... - - def min(self, *columns: str) -> Expr: ... - - def len(self) -> Expr: ... - - # --- horizontal --- - def all_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: ... - - def any_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: ... - - def sum_horizontal(self, *exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: ... - - def concat( - self, items: Iterable[DataFrame], *, how: str = "vertical" - ) -> DataFrame: ... - - -class Series(Protocol): - def alias(self, name: str) -> Self: ... - - @property - def name(self) -> str: ... - - @property - def shape(self) -> tuple[int]: ... - - def rename(self, name: str) -> Self: ... - - def cast(self, dtype: DType) -> Self: ... - - def item(self) -> Any: ... 
- - def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" - ) -> Series: ... - - def is_in(self, other: Any) -> Series: ... - - def is_null(self) -> Series: ... - - def drop_nulls(self) -> Series: ... - - def n_unique(self) -> int: ... - - def zip_with(self, mask: Self, other: Self) -> Self: ... - - def sample(self, n: int, fraction: float, *, with_replacement: bool) -> Series: ... - - def to_numpy(self) -> Any: ... - - def to_pandas(self) -> Any: ... - - -class DataFrame(Protocol): - def __getitem__(self, column_name: str) -> Series: ... - - # --- properties --- - @property - def columns(self) -> list[str]: ... - - @property - def schema(self) -> dict[str, DType]: ... - - @property - def shape(self) -> tuple[int, int]: ... - - def iter_columns(self) -> Iterable[Series]: ... - - # --- reshape --- - def with_columns( - self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr - ) -> Self: ... - - def filter(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> Self: ... - - def select( - self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr - ) -> Self: ... - - def rename(self, mapping: dict[str, str]) -> Self: ... - - # --- transform --- - def sort( - self, - by: str | Iterable[str], - *more_by: str, - descending: bool | Sequence[bool] = False, - ) -> Self: ... - - # --- convert --- - def lazy(self) -> Self: ... - - def collect(self) -> Self: ... - - def cache(self) -> Self: ... - - def to_numpy(self) -> Any: ... - - def to_pandas(self) -> Any: ... - - def to_dict(self, *, as_series: bool = True) -> dict[str, Any]: ... - - # --- actions --- - def join( - self, - other: Self, - *, - how: Literal["inner"] = "inner", - left_on: str | list[str], - right_on: str | list[str], - ) -> Self: ... - - def group_by(self, *keys: str | Iterable[str]) -> GroupBy: ... - - # --- partial reduction --- - def head(self, n: int) -> Self: ... - - def unique(self, subset: list[str]) -> Self: ... 
- - @property - def is_eager(self) -> bool: ... - - @property - def is_lazy(self) -> bool: ... - - -class GroupBy(Protocol): - def agg( - self, *aggs: IntoExpr | Iterable[IntoExpr], **named_aggs: IntoExpr - ) -> DataFrame: ... - - -IntoExpr = Expr | str | int | float | Series diff --git a/narwhals/translate.py b/narwhals/translate.py index 0f4c7b716..968783717 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -1,204 +1,30 @@ from __future__ import annotations -from typing import TYPE_CHECKING from typing import Any -from typing import NamedTuple -from narwhals.dependencies import get_cudf -from narwhals.dependencies import get_modin from narwhals.dependencies import get_pandas from narwhals.dependencies import get_polars -if TYPE_CHECKING: - from narwhals.spec import DataFrame - from narwhals.spec import Namespace - from narwhals.spec import Series - - -class FrameTranslation(NamedTuple): - frame: DataFrame - namespace: Namespace - - -class SeriesTranslation(NamedTuple): - series: Series - namespace: Namespace - - -def translate_frame( - df: Any, - *, - is_eager: bool = False, - is_lazy: bool = False, -) -> DataFrame: - if is_eager and is_lazy: - msg = "Only one of `is_eager` and `is_lazy` can be True." - raise ValueError(msg) - - if hasattr(df, "__narwhals_frame__"): - return df.__narwhals_frame__(is_eager=is_eager, is_lazy=is_lazy) # type: ignore[no-any-return] - - if (pl := get_polars()) is not None: - if isinstance(df, pl.LazyFrame) and is_eager: - msg = ( - "Expected DataFrame, got LazyFrame. Set `is_eager=False` if you " - "function doesn't require eager execution, or collect your frame " - "before passing it to this function." - ) - raise TypeError(msg) - if isinstance(df, pl.DataFrame) and is_lazy: - msg = ( - "Expected LazyFrame, got DataFrame. Set `is_lazy=False` if you " - "function doesn't doesn't need to use `.collect`, or make your frame " - "before passing it to this function." 
- ) - raise TypeError(msg) - if isinstance(df, (pl.DataFrame, pl.LazyFrame)): - from narwhals.polars import PolarsDataFrame - - return PolarsDataFrame(df, is_eager=is_eager, is_lazy=is_lazy) - - if (pd := get_pandas()) is not None and isinstance(df, pd.DataFrame): - from narwhals.pandas_like.translate import translate_frame - - return translate_frame( - df, - implementation="pandas", - is_eager=is_eager, - is_lazy=is_lazy, - ) - - if (cudf := get_cudf()) is not None and isinstance(df, cudf.DataFrame): - from narwhals.pandas_like.translate import translate_frame - - return translate_frame( - df, implementation="cudf", is_eager=is_eager, is_lazy=is_lazy - ) - - if (mpd := get_modin()) is not None and isinstance(df, mpd.DataFrame): - from narwhals.pandas_like.translate import translate_frame - - return translate_frame( - df, implementation="modin", is_eager=is_eager, is_lazy=is_lazy - ) - - msg = f"Could not translate DataFrame {type(df)}, please open a feature request." - raise NotImplementedError(msg) - - -def translate_series( - series: Any, -) -> Series: - if hasattr(series, "__narwhals_series__"): - return series.__narwhals_series__() # type: ignore[no-any-return] - - if (pl := get_polars()) is not None and isinstance(series, pl.Series): - from narwhals.polars import PolarsSeries - - return PolarsSeries(series) - - if (pd := get_pandas()) is not None and isinstance(series, pd.Series): - from narwhals.pandas_like.translate import translate_series - - return translate_series( - series, - implementation="pandas", - ) - - if (cudf := get_cudf()) is not None and isinstance(series, cudf.Series): - from narwhals.pandas_like.translate import translate_series - - return translate_series(series, implementation="cudf") - - if (mpd := get_modin()) is not None and isinstance(series, mpd.Series): - from narwhals.pandas_like.translate import translate_series - - return translate_series(series, implementation="modin") - - msg = f"Could not translate {type(series)}, please open a 
def to_native(obj: Any) -> Any:
    """Return the native object held by a Narwhals ``DataFrame`` or ``Series``.

    For the ``"polars"`` implementation the wrapped attribute is returned
    directly; for any other implementation the wrapped attribute is itself
    a wrapper, so one more level is peeled off.

    Raises:
        TypeError: If ``obj`` is neither a Narwhals ``DataFrame`` nor ``Series``.
    """
    # Local imports, as in the rest of this module's helpers.
    from narwhals.dataframe import DataFrame
    from narwhals.series import Series

    if isinstance(obj, DataFrame):
        if obj._implementation == "polars":
            return obj._dataframe
        return obj._dataframe._dataframe

    if isinstance(obj, Series):
        if obj._implementation == "polars":
            return obj._series
        return obj._series._series

    msg = f"Expected Narwhals object, got {type(obj)}."
    raise TypeError(msg)
+1,125 @@ +# ruff: noqa +# type: ignore +import polars +import pandas as pd + +import narwhals as nw + +# df_raw = pd.DataFrame({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}) +# df = nw.DataFrame(df_raw, is_lazy=True) +# df_raw_2 = pd.DataFrame({"a": [1, 3], "c": [7, 9]}) +# df2 = nw.DataFrame(df_raw_2, is_lazy=True) + +# result = df.sort("a", "b") +# print(nw.to_native(result)) + +# result = df.filter(nw.col("a") > 1) +# print(nw.to_native(result)) + +# result = df.with_columns( +# c=nw.col("a") + nw.col("b"), +# d=nw.col("a") - nw.col("a").mean(), +# ) +# print(nw.to_native(result)) +# result = df.with_columns(nw.all() * 2) +# print(nw.to_native(result)) + +# result = df.with_columns(horizonal_sum=nw.sum_horizontal(nw.col("a"), nw.col("b"))) +# print(nw.to_native(result)) +# result = df.with_columns(horizonal_sum=nw.sum_horizontal("a", nw.col("b"))) +# print(nw.to_native(result)) + + +# result = df.select(nw.all().sum()) +# print(nw.to_native(result)) +# result = df.select(nw.col("a", "b") * 2) +# print(nw.to_native(result)) + +# # # TODO! 
+# # # result = ( +# # # df.collect() +# # # .group_by("b") +# # # .agg( +# # # nw.all().sum(), +# # # ) +# # # ) +# # # print(nw.to_native(result)) + +# result = ( +# df.collect() +# .group_by("b") +# .agg( +# nw.col("a").sum(), +# simple=nw.col("a").sum(), +# complex=(nw.col("a") + 1).sum(), +# other=nw.sum("a"), +# ) +# ) +# print(nw.to_native(result)) +# print("multiple simple") +# result = ( +# df.collect() +# .group_by("b") +# .agg( +# nw.col("a", "z").sum(), +# ) +# ) +# print(nw.to_native(result)) + +# result = df.join(df2, left_on="a", right_on="a") +# print(nw.to_native(result)) + + +# result = df.rename({"a": "a_new", "b": "b_new"}) +# print(nw.to_native(result)) + +# result = df.collect().to_dict() +# print(result) +# print(polars.from_pandas(nw.to_native(df)).to_dict()) + +# result = df.collect().to_dict(as_series=False) +# print("this") +# print(result) +# print("that") +# print(polars.from_pandas(nw.to_native(df)).to_dict(as_series=False)) + +# agg = (nw.col("b") - nw.col("z").mean()).mean() +# print(nw.to_native(df.with_columns(d=agg))) +# result = df.group_by("a").agg(agg) +# print(nw.to_native(result)) + +# print(nw.col("a") + nw.col("b")) +# print(nw.col("a", "b").sum()) + +# result = df.select(nw.col("a", "b").sum()) +# print(nw.to_native(result)) + +# print(df.schema) +# print(df.schema['a'].is_numeric()) + +df_raw = pd.DataFrame( + { + "a": [1, 3, 2], + "b": [4.0, 4, 6], + "c": ["a", "b", "c"], + "d": [True, False, True], + } +) +df = nw.DataFrame(df_raw) +# print(df.schema) +# print(df.schema['a'].is_numeric()) +# print(df.schema['b'].is_numeric()) +# print(df.schema['c'].is_numeric()) +# print(df.schema['d'].is_numeric()) + +# result = df.with_columns(nw.col('a').cast(nw.Float32)) +# print(nw.to_native(result)) +# print(result._dataframe._dataframe.dtypes) + +# print(df.schema) +# result = df.select([col for (col, dtype) in df.schema.items() if dtype == nw.Float64]) +# print(nw.to_native(result)) +# print(result._dataframe._dataframe.dtypes) 
+ +result = df.select("a", "b").select(nw.all() + nw.col("a")) +print(nw.to_native(result)) diff --git a/tests/tpch_q1_test.py b/tests/tpch_q1_test.py index 4d6e372de..8793cc1ce 100644 --- a/tests/tpch_q1_test.py +++ b/tests/tpch_q1_test.py @@ -8,8 +8,7 @@ import polars import pytest -from narwhals import get_namespace -from narwhals import translate_frame +import narwhals as nw from tests.utils import compare_dicts @@ -22,29 +21,28 @@ ) def test_q1(df_raw: Any) -> None: var_1 = datetime(1998, 9, 2) - df = translate_frame(df_raw, is_lazy=True) - pl = get_namespace(df) + df = nw.DataFrame(df_raw, is_lazy=True) query_result = ( - df.filter(pl.col("l_shipdate") <= var_1) + df.filter(nw.col("l_shipdate") <= var_1) .group_by(["l_returnflag", "l_linestatus"]) .agg( [ - pl.sum("l_quantity").alias("sum_qty"), - pl.sum("l_extendedprice").alias("sum_base_price"), - (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))) + nw.col("l_quantity").sum().alias("sum_qty"), + nw.col("l_extendedprice").sum().alias("sum_base_price"), + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))) .sum() .alias("sum_disc_price"), ( - pl.col("l_extendedprice") - * (1.0 - pl.col("l_discount")) - * (1.0 + pl.col("l_tax")) + nw.col("l_extendedprice") + * (1.0 - nw.col("l_discount")) + * (1.0 + nw.col("l_tax")) ) .sum() .alias("sum_charge"), - pl.mean("l_quantity").alias("avg_qty"), - pl.mean("l_extendedprice").alias("avg_price"), - pl.mean("l_discount").alias("avg_disc"), - pl.len().alias("count_order"), + nw.col("l_quantity").mean().alias("avg_qty"), + nw.col("l_extendedprice").mean().alias("avg_price"), + nw.col("l_discount").mean().alias("avg_disc"), + nw.len().alias("count_order"), ], ) .sort(["l_returnflag", "l_linestatus"]) @@ -84,29 +82,28 @@ def test_q1(df_raw: Any) -> None: @mock.patch.dict(os.environ, {"NARWHALS_FORCE_GENERIC": "1"}) def test_q1_w_pandas_agg_generic_path(df_raw: Any) -> None: var_1 = datetime(1998, 9, 2) - df = translate_frame(df_raw, is_lazy=True) - pl = 
get_namespace(df) + df = nw.DataFrame(df_raw, is_lazy=True) query_result = ( - df.filter(pl.col("l_shipdate") <= var_1) + df.filter(nw.col("l_shipdate") <= var_1) .group_by(["l_returnflag", "l_linestatus"]) .agg( [ - pl.sum("l_quantity").alias("sum_qty"), - pl.sum("l_extendedprice").alias("sum_base_price"), - (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))) + nw.sum("l_quantity").alias("sum_qty"), + nw.sum("l_extendedprice").alias("sum_base_price"), + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))) .sum() .alias("sum_disc_price"), ( - pl.col("l_extendedprice") - * (1.0 - pl.col("l_discount")) - * (1.0 + pl.col("l_tax")) + nw.col("l_extendedprice") + * (1.0 - nw.col("l_discount")) + * (1.0 + nw.col("l_tax")) ) .sum() .alias("sum_charge"), - pl.mean("l_quantity").alias("avg_qty"), - pl.mean("l_extendedprice").alias("avg_price"), - pl.mean("l_discount").alias("avg_disc"), - pl.len().alias("count_order"), + nw.mean("l_quantity").alias("avg_qty"), + nw.mean("l_extendedprice").alias("avg_price"), + nw.mean("l_discount").alias("avg_disc"), + nw.len().alias("count_order"), ], ) .sort(["l_returnflag", "l_linestatus"]) diff --git a/tests/utils.py b/tests/utils.py index 63b6a031f..c00006b86 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,7 @@ def compare_dicts(result: dict[str, Any], expected: dict[str, Any]) -> None: - for key in result: + for key in expected: for lhs, rhs in zip(result[key], expected[key]): if isinstance(lhs, float): assert abs(lhs - rhs) < 1e-6