From 11521bfa87762f524800e62a732d267276b951a5 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sat, 10 Aug 2024 22:53:08 +0200 Subject: [PATCH 01/21] feat: series `to_arrow` method (#769) * feat: series to_arrow method * switch raise condition * cudf no cover * use pyarrow compute --- docs/api-reference/series.md | 1 + narwhals/_arrow/series.py | 3 +++ narwhals/_pandas_like/series.py | 9 +++++++ narwhals/series.py | 39 ++++++++++++++++++++++++++++++ tests/series_only/to_arrow_test.py | 36 +++++++++++++++++++++++++++ utils/check_api_reference.py | 1 + 6 files changed, 89 insertions(+) create mode 100644 tests/series_only/to_arrow_test.py diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 2479196a2..7b7f62b8a 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -47,6 +47,7 @@ - std - sum - tail + - to_arrow - to_dummies - to_frame - to_list diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index fadc72b87..bce0f4715 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -615,6 +615,9 @@ def clip( return self._from_native_series(arr) + def to_arrow(self: Self) -> Any: + return self._native_series.combine_chunks() + @property def shape(self) -> tuple[int]: return (len(self._native_series),) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index e52a034e2..9a80e26f5 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -17,6 +17,7 @@ from narwhals.dependencies import get_modin from narwhals.dependencies import get_numpy from narwhals.dependencies import get_pandas +from narwhals.dependencies import get_pyarrow from narwhals.dependencies import get_pyarrow_compute from narwhals.utils import Implementation @@ -638,6 +639,14 @@ def clip( self._native_series.clip(lower_bound, upper_bound) ) + def to_arrow(self: Self) -> Any: + if self._implementation is Implementation.CUDF: # pragma: no cover + msg = "`to_arrow` is not implemented for CuDF backend." + raise NotImplementedError(msg) + + pa = get_pyarrow() + return pa.Array.from_pandas(self._native_series) + @property def str(self) -> PandasLikeSeriesStringNamespace: return PandasLikeSeriesStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index 76dd71e62..a6351f200 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2227,6 +2227,45 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: self._compliant_series.gather_every(n=n, offset=offset) ) + def to_arrow(self: Self) -> Any: + r""" + Convert to arrow. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = [1, 2, 3, 4] + >>> s_pd = pd.Series(name="a", data=data) + >>> s_pl = pl.Series(name="a", values=data) + + Let's define a dataframe-agnostic function that converts to arrow: + + >>> @nw.narwhalify + ... def func(s): + ... 
return s.to_arrow()
+
+            >>> func(s_pd)  # doctest:+NORMALIZE_WHITESPACE
+            <pyarrow.lib.Int64Array object at ...>
+            [
+              1,
+              2,
+              3,
+              4
+            ]
+
+            >>> func(s_pl)  # doctest:+NORMALIZE_WHITESPACE
+            <pyarrow.lib.Int64Array object at ...>
+            [
+              1,
+              2,
+              3,
+              4
+            ]
+        """
+
+        return self._compliant_series.to_arrow()
+
     @property
     def str(self) -> SeriesStringNamespace:
         return SeriesStringNamespace(self)
diff --git a/tests/series_only/to_arrow_test.py b/tests/series_only/to_arrow_test.py
new file mode 100644
index 000000000..ebd90b7c2
--- /dev/null
+++ b/tests/series_only/to_arrow_test.py
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from typing import Any
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+
+import narwhals.stable.v1 as nw
+
+
+def test_to_arrow(constructor_eager: Any) -> None:
+    data = [1, 2, 3]
+    result = nw.from_native(constructor_eager({"a": data}), eager_only=True)[
+        "a"
+    ].to_arrow()
+
+    assert pa.types.is_int64(result.type)
+    assert pc.all(pc.equal(result, pa.array(data, type=pa.int64())))
+
+
+def test_to_arrow_with_nulls(constructor_eager: Any, request: Any) -> None:
+    if "pandas_constructor" in str(constructor_eager) or "modin_constructor" in str(
+        constructor_eager
+    ):
+        request.applymarker(pytest.mark.xfail)
+
+    data = [1, 2, None]
+    result = (
+        nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"]
+        .cast(nw.Int64)
+        .to_arrow()
+    )
+
+    assert pa.types.is_int64(result.type)
+    assert pc.all(pc.equal(result, pa.array(data, type=pa.int64())))
diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py
index 80ee5d7aa..68c980086 100644
--- a/utils/check_api_reference.py
+++ b/utils/check_api_reference.py
@@ -148,6 +148,7 @@
     .difference(expr)
     .difference(
         {
+            "to_arrow",
             "to_dummies",
             "to_pandas",
             "to_list",

From 3526af836aea05e335e5c9519a28acb4e686ee07 Mon Sep 17 00:00:00 2001
From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>
Date: Sat, 10 Aug 2024 22:54:16 +0200
Subject: [PATCH 02/21] feat: dataframe `to_arrow` method (#770)

* feat: dataframe to_arrow method

* doctest

* old versions

* skip pandas doctest

---
 docs/api-reference/dataframe.md    |  1 +
 narwhals/_arrow/dataframe.py       |  3 +++
 narwhals/_pandas_like/dataframe.py |  9 ++++++++
 narwhals/dataframe.py              | 36 ++++++++++++++++++++++++++++++
 tests/frame/to_arrow_test.py       | 23 +++++++++++++++++++
 5 files changed, 72 insertions(+)
 create mode 100644 tests/frame/to_arrow_test.py

diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md
index 7231b01d1..676f64076 100644
--- a/docs/api-reference/dataframe.md
+++ b/docs/api-reference/dataframe.md
@@ -31,6 +31,7 @@
     - shape
     - sort
     - tail
+    - to_arrow
     - to_dict
     - to_numpy
    - to_pandas
diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
index 738732e4d..ceadfdac7 100644
--- a/narwhals/_arrow/dataframe.py
+++ b/narwhals/_arrow/dataframe.py
@@ -495,3 +495,6 @@

     def gather_every(self: Self, n: int, offset: int = 0) -> Self:
         return self._from_native_dataframe(self._native_dataframe[offset::n])
+
+    def to_arrow(self: Self) -> Any:
+        return self._native_dataframe
diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
index 39af28211..d36a381b5 100644
--- a/narwhals/_pandas_like/dataframe.py
+++ b/narwhals/_pandas_like/dataframe.py
@@ -20,6 +20,7 @@
 from narwhals.dependencies import get_modin
 from narwhals.dependencies import get_numpy
 from narwhals.dependencies import get_pandas
+from narwhals.dependencies import get_pyarrow
 from narwhals.utils import Implementation
 from narwhals.utils import flatten
 from 
narwhals.utils import generate_unique_token @@ -592,3 +593,11 @@ def clone(self: Self) -> Self: def gather_every(self: Self, n: int, offset: int = 0) -> Self: return self._from_native_dataframe(self._native_dataframe.iloc[offset::n]) + + def to_arrow(self: Self) -> Any: + if self._implementation is Implementation.CUDF: # pragma: no cover + msg = "`to_arrow` is not implemented for CuDF backend." + raise NotImplementedError(msg) + + pa = get_pyarrow() + return pa.Table.from_pandas(self._native_dataframe) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index f8a12a399..b40aa160d 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -2020,6 +2020,42 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: """ return super().gather_every(n=n, offset=offset) + def to_arrow(self: Self) -> Any: + r""" + Convert to arrow table. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> data = {"foo": [1, 2, 3], "bar": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + Let's define a dataframe-agnostic function that converts to arrow table: + + >>> @nw.narwhalify + ... def func(df): + ... return df.to_arrow() + + >>> func(df_pd) # doctest:+SKIP + pyarrow.Table + foo: int64 + bar: string + ---- + foo: [[1,2,3]] + bar: [["a","b","c"]] + + >>> func(df_pl) # doctest:+NORMALIZE_WHITESPACE + pyarrow.Table + foo: int64 + bar: large_string + ---- + foo: [[1,2,3]] + bar: [["a","b","c"]] + """ + return self._compliant_frame.to_arrow() + class LazyFrame(BaseFrame[FrameT]): """ diff --git a/tests/frame/to_arrow_test.py b/tests/frame/to_arrow_test.py new file mode 100644 index 000000000..c1f395e59 --- /dev/null +++ b/tests/frame/to_arrow_test.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from typing import Any + +import pandas as pd +import pyarrow as pa +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version + + +def test_to_arrow(request: Any, constructor_eager: Any) -> None: + if "pandas" in str(constructor_eager) and parse_version(pd.__version__) < (1, 0, 0): + # pyarrow requires pandas>=1.0.0 + request.applymarker(pytest.mark.xfail) + + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} + df_raw = constructor_eager(data) + result = nw.from_native(df_raw, eager_only=True).to_arrow() + + expected = pa.table(data) + assert result == expected From 2e9b3efaad7030b0bdc8ed9f8431af0282ca877e Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sat, 10 Aug 2024 22:55:29 +0200 Subject: [PATCH 03/21] feat: `DaskSelectorNamespace` (#771) * feat: dask selectors * skip if old for dask * I cannot read commands --- narwhals/_dask/namespace.py | 5 ++ narwhals/_dask/selectors.py | 172 ++++++++++++++++++++++++++++++++++++ tests/test_selectors.py | 32 +++---- 3 files changed, 194 insertions(+), 15 deletions(-) create mode 100644 narwhals/_dask/selectors.py diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index f94e4779d..8b66a8aec 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -8,6 +8,7 @@ from narwhals import dtypes from narwhals._dask.expr import DaskExpr +from narwhals._dask.selectors import DaskSelectorNamespace from narwhals._expression_parsing import parse_into_exprs from narwhals.dependencies import get_dask_dataframe from narwhals.dependencies import get_pandas @@ -38,6 +39,10 @@ class DaskNamespace: Duration = dtypes.Duration Date = dtypes.Date + @property 
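+    # Entry point for dtype-based column selection (numeric, string, boolean,
+    # categorical, all) on the Dask backend; the namespace itself is defined
+    # in narwhals/_dask/selectors.py below.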
+ def selectors(self) -> DaskSelectorNamespace: + return DaskSelectorNamespace(backend_version=self._backend_version) + def __init__(self, *, backend_version: tuple[int, ...]) -> None: self._backend_version = backend_version diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py new file mode 100644 index 000000000..11d0f0c79 --- /dev/null +++ b/narwhals/_dask/selectors.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING +from typing import Any +from typing import NoReturn + +from narwhals import dtypes +from narwhals._dask.expr import DaskExpr + +if TYPE_CHECKING: + from typing_extensions import Self + + from narwhals._dask.dataframe import DaskLazyFrame + from narwhals.dtypes import DType + + +class DaskSelectorNamespace: + def __init__(self: Self, *, backend_version: tuple[int, ...]) -> None: + self._backend_version = backend_version + + def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> DaskSelector: + def func(df: DaskLazyFrame) -> list[Any]: + return [ + df._native_dataframe[col] + for col in df.columns + if df.schema[col] in dtypes + ] + + return DaskSelector( + func, + depth=0, + function_name="type_selector", + root_names=None, + output_names=None, + backend_version=self._backend_version, + returns_scalar=False, + ) + + def numeric(self: Self) -> DaskSelector: + return self.by_dtype( + [ + dtypes.Int64, + dtypes.Int32, + dtypes.Int16, + dtypes.Int8, + dtypes.UInt64, + dtypes.UInt32, + dtypes.UInt16, + dtypes.UInt8, + dtypes.Float64, + dtypes.Float32, + ], + ) + + def categorical(self: Self) -> DaskSelector: + return self.by_dtype([dtypes.Categorical]) + + def string(self: Self) -> DaskSelector: + return self.by_dtype([dtypes.String]) + + def boolean(self: Self) -> DaskSelector: + return self.by_dtype([dtypes.Boolean]) + + def all(self: Self) -> DaskSelector: + def func(df: DaskLazyFrame) -> list[Any]: + return [df._native_dataframe[col] for col in df.columns] + + return DaskSelector( + func, + depth=0, + function_name="type_selector", + root_names=None, + output_names=None, + backend_version=self._backend_version, + returns_scalar=False, + ) + + +class DaskSelector(DaskExpr): + def __repr__(self: Self) -> str: # pragma: no cover + return ( + f"DaskSelector(" + f"depth={self._depth}, " + f"function_name={self._function_name}, " + f"root_names={self._root_names}, " + f"output_names={self._output_names}" + ) + + def _to_expr(self: Self) -> DaskExpr: + return DaskExpr( + self._call, + depth=self._depth, + function_name=self._function_name, + root_names=self._root_names, + output_names=self._output_names, + backend_version=self._backend_version, + returns_scalar=self._returns_scalar, + ) + + def __sub__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: + if isinstance(other, DaskSelector): + + def call(df: DaskLazyFrame) -> list[Any]: + lhs = self._call(df) + rhs = other._call(df) + return [x for x in lhs if x.name not in [x.name for x in rhs]] + + return DaskSelector( + call, + depth=0, + function_name="type_selector", + root_names=None, + output_names=None, + backend_version=self._backend_version, + returns_scalar=self._returns_scalar, + ) + else: + return self._to_expr() - other + + def __or__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: + if isinstance(other, DaskSelector): + + def call(df: DaskLazyFrame) -> list[Any]: + lhs = self._call(df) + rhs = other._call(df) + return [ # type: ignore[no-any-return] + x for x in lhs if x.name not in [x.name for x in rhs] + ] + rhs + + return 
DaskSelector( + call, + depth=0, + function_name="type_selector", + root_names=None, + output_names=None, + backend_version=self._backend_version, + returns_scalar=self._returns_scalar, + ) + else: + return self._to_expr() | other + + def __and__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: + if isinstance(other, DaskSelector): + + def call(df: DaskLazyFrame) -> list[Any]: + lhs = self._call(df) + rhs = other._call(df) + return [x for x in lhs if x.name in [x.name for x in rhs]] + + return DaskSelector( + call, + depth=0, + function_name="type_selector", + root_names=None, + output_names=None, + backend_version=self._backend_version, + returns_scalar=self._returns_scalar, + ) + else: + return self._to_expr() & other + + def __invert__(self: Self) -> DaskSelector: + return DaskSelectorNamespace(backend_version=self._backend_version).all() - self + + def __rsub__(self: Self, other: Any) -> NoReturn: + raise NotImplementedError + + def __rand__(self: Self, other: Any) -> NoReturn: + raise NotImplementedError + + def __ror__(self: Self, other: Any) -> NoReturn: + raise NotImplementedError diff --git a/tests/test_selectors.py b/tests/test_selectors.py index 5c57a9672..dcd9e7ec1 100644 --- a/tests/test_selectors.py +++ b/tests/test_selectors.py @@ -7,6 +7,7 @@ import pytest import narwhals.stable.v1 as nw +from narwhals.dependencies import get_dask from narwhals.selectors import all from narwhals.selectors import boolean from narwhals.selectors import by_dtype @@ -24,36 +25,28 @@ } -def test_selectors(constructor: Any, request: Any) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_selectors(constructor: Any) -> None: df = nw.from_native(constructor(data)) result = df.select(by_dtype([nw.Int64, nw.Float64]) + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} compare_dicts(result, expected) -def test_numeric(constructor: Any, request: Any) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_numeric(constructor: Any) -> None: df = nw.from_native(constructor(data)) result = df.select(numeric() + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} compare_dicts(result, expected) -def test_boolean(constructor: Any, request: Any) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_boolean(constructor: Any) -> None: df = nw.from_native(constructor(data)) result = df.select(boolean()) expected = {"d": [True, False, True]} compare_dicts(result, expected) -def test_string(constructor: Any, request: Any) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_string(constructor: Any) -> None: df = nw.from_native(constructor(data)) result = df.select(string()) expected = {"b": ["a", "b", "c"]} @@ -74,6 +67,17 @@ def test_categorical(request: Any, constructor: Any) -> None: compare_dicts(result, expected) +@pytest.mark.skipif((get_dask() is None), reason="too old for dask") +def test_dask_categorical() -> None: + import dask.dataframe as dd + + expected = {"b": ["a", "b", "c"]} + df_raw = dd.from_dict(expected, npartitions=1).astype({"b": "category"}) + df = nw.from_native(df_raw) + result = df.select(categorical()) + compare_dicts(result, expected) + + @pytest.mark.parametrize( ("selector", "expected"), [ @@ -89,10 +93,8 @@ def test_categorical(request: Any, constructor: Any) -> None: ], ) def test_set_ops( - constructor: Any, selector: nw.selectors.Selector, expected: list[str], request: Any + constructor: Any, selector: 
nw.selectors.Selector, expected: list[str] ) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(selector).collect_schema().names() assert sorted(result) == expected From ce2c0228731d3a01cf12991e8c7cfd17e7df7d6b Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sat, 10 Aug 2024 22:56:48 +0200 Subject: [PATCH 04/21] feat: dask namespace lit (#772) --- narwhals/_dask/namespace.py | 13 +++++++++++++ tests/frame/lit_test.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 8b66a8aec..82ade973b 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -68,6 +68,19 @@ def col(self, *column_names: str) -> DaskExpr: backend_version=self._backend_version, ) + def lit(self, value: Any, dtype: dtypes.DType | None) -> DaskExpr: + # TODO @FBruzzesi: cast to dtype once `reverse_translate_dtype` is implemented. + # It should be enough to add `.astype(reverse_translate_dtype(dtype))` + return DaskExpr( + lambda df: [df._native_dataframe.assign(lit=value).loc[:, "lit"]], + depth=0, + function_name="lit", + root_names=None, + output_names=["lit"], + returns_scalar=False, + backend_version=self._backend_version, + ) + def min(self, *column_names: str) -> DaskExpr: return DaskExpr.from_column_names( *column_names, diff --git a/tests/frame/lit_test.py b/tests/frame/lit_test.py index 212cffd60..328e4d8e0 100644 --- a/tests/frame/lit_test.py +++ b/tests/frame/lit_test.py @@ -20,7 +20,7 @@ def test_lit( constructor: Any, dtype: DType | None, expected_lit: list[Any], request: Any ) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) and dtype == nw.String: request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor(data) From 4793087d355985b7f60122c9feaf5b494b06b6eb Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 11 Aug 2024 10:06:18 +0100 Subject: [PATCH 05/21] docs: include "downloads per month" in readme (#774) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a9954402f..b4d431e6e 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ [![PyPI version](https://badge.fury.io/py/narwhals.svg)](https://badge.fury.io/py/narwhals) +[![Downloads](https://static.pepy.tech/badge/narwhals/month)](https://pepy.tech/project/narwhals) Extremely lightweight and extensible compatibility layer between dataframe libraries! From 435a0790f00806084925b5d56931c4d63a45e339 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 11 Aug 2024 10:14:30 +0100 Subject: [PATCH 06/21] docs: add Altair to "used by" section (#776) closes --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b4d431e6e..c9637037d 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ Get started! Join the party! 
+- [Altair](https://github.com/vega/altair/) - [Hamilton](https://github.com/DAGWorks-Inc/hamilton/tree/main/examples/narwhals) - [scikit-lego](https://github.com/koaning/scikit-lego) - [scikit-playtime](https://github.com/koaning/scikit-playtime) From 35e33d18fd256f13175a21131656a36b1fc2fa09 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 11 Aug 2024 12:17:19 +0100 Subject: [PATCH 07/21] chore: simplify Expr.clip (#777) --- narwhals/expr.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index 2a00b6dad..f9b69a927 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1810,8 +1810,8 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: # TODO @aivanoved: make type alias for numeric type def clip( self, - lower_bound: IntoExpr | Any | None = None, - upper_bound: IntoExpr | Any | None = None, + lower_bound: Any | None = None, + upper_bound: Any | None = None, ) -> Self: r""" Clip values in the Series. @@ -1916,11 +1916,7 @@ def clip( │ 3 │ └─────┘ """ - return self.__class__( - lambda plx: self._call(plx).clip( - extract_compliant(plx, lower_bound), extract_compliant(plx, upper_bound) - ) - ) + return self.__class__(lambda plx: self._call(plx).clip(lower_bound, upper_bound)) @property def str(self: Self) -> ExprStringNamespace: From 75d61e41899fa24fb9f60d4b2a6b258538309135 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sun, 11 Aug 2024 16:35:02 +0200 Subject: [PATCH 08/21] feat: dask `sum_horizontal` (#775) * feat: dask sum_horizontal --- narwhals/_arrow/namespace.py | 18 +++--------------- narwhals/_dask/namespace.py | 3 +++ narwhals/_pandas_like/namespace.py | 18 +++--------------- tests/expr_and_series/sum_horizontal_test.py | 4 +--- tests/test_selectors.py | 4 ++-- 5 files changed, 12 insertions(+), 35 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index fbb285b50..57bd5a4f1 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -167,25 +167,13 @@ def _lit_arrow_series(_: ArrowDataFrame) -> ArrowSeries: ) def all_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: - return reduce( - lambda x, y: x & y, - parse_into_exprs(*exprs, namespace=self), - ) + return reduce(lambda x, y: x & y, parse_into_exprs(*exprs, namespace=self)) def any_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: - return reduce( - lambda x, y: x | y, - parse_into_exprs(*exprs, namespace=self), - ) + return reduce(lambda x, y: x | y, parse_into_exprs(*exprs, namespace=self)) def sum_horizontal(self, *exprs: IntoArrowExpr) -> ArrowExpr: - return reduce( - lambda x, y: x + y, - parse_into_exprs( - *exprs, - namespace=self, - ), - ) + return reduce(lambda x, y: x + y, parse_into_exprs(*exprs, namespace=self)) def concat( self, diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 82ade973b..82a0e1586 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -133,6 +133,9 @@ def all_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: def any_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: return reduce(lambda x, y: x | y, parse_into_exprs(*exprs, namespace=self)) + def sum_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: + return reduce(lambda x, y: x + y, parse_into_exprs(*exprs, namespace=self)) + def _create_expr_from_series(self, _: Any) -> NoReturn: msg = "`_create_expr_from_series` for DaskNamespace exists only for compatibility" raise 
NotImplementedError(msg) diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 154e333be..13e2e99d3 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -202,25 +202,13 @@ def len(self) -> PandasLikeExpr: # --- horizontal --- def sum_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: - return reduce( - lambda x, y: x + y, - parse_into_exprs( - *exprs, - namespace=self, - ), - ) + return reduce(lambda x, y: x + y, parse_into_exprs(*exprs, namespace=self)) def all_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: - return reduce( - lambda x, y: x & y, - parse_into_exprs(*exprs, namespace=self), - ) + return reduce(lambda x, y: x & y, parse_into_exprs(*exprs, namespace=self)) def any_horizontal(self, *exprs: IntoPandasLikeExpr) -> PandasLikeExpr: - return reduce( - lambda x, y: x | y, - parse_into_exprs(*exprs, namespace=self), - ) + return reduce(lambda x, y: x | y, parse_into_exprs(*exprs, namespace=self)) def concat( self, diff --git a/tests/expr_and_series/sum_horizontal_test.py b/tests/expr_and_series/sum_horizontal_test.py index dd9c5d906..9411903cb 100644 --- a/tests/expr_and_series/sum_horizontal_test.py +++ b/tests/expr_and_series/sum_horizontal_test.py @@ -7,9 +7,7 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) -def test_sumh(constructor: Any, col_expr: Any, request: Any) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_sumh(constructor: Any, col_expr: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) result = df.with_columns(horizontal_sum=nw.sum_horizontal(col_expr, nw.col("b"))) diff --git a/tests/test_selectors.py b/tests/test_selectors.py index dcd9e7ec1..6b00a4d88 100644 --- a/tests/test_selectors.py +++ b/tests/test_selectors.py @@ -7,7 +7,7 @@ import pytest import narwhals.stable.v1 as nw -from narwhals.dependencies import get_dask +from narwhals.dependencies import get_dask_dataframe from narwhals.selectors import all from narwhals.selectors import boolean from narwhals.selectors import by_dtype @@ -67,7 +67,7 @@ def test_categorical(request: Any, constructor: Any) -> None: compare_dicts(result, expected) -@pytest.mark.skipif((get_dask() is None), reason="too old for dask") +@pytest.mark.skipif((get_dask_dataframe() is None), reason="too old for dask") def test_dask_categorical() -> None: import dask.dataframe as dd From cb4a583b9b83c23687c09bbe0ad55ab43d790de9 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sun, 11 Aug 2024 22:26:42 +0200 Subject: [PATCH 09/21] feat: dask lazyframe remaining methods (#778) * feat: dask dataframe remaining methods * gather_every in Expr --- narwhals/_dask/dataframe.py | 17 +++++++++++++++++ narwhals/_dask/expr.py | 10 ++++++++++ tests/expr_and_series/tail_test.py | 28 ++++++++++++++++++++++++++++ tests/frame/gather_every_test.py | 4 +--- tests/frame/tail_test.py | 6 +----- tests/series_only/tail_test.py | 14 -------------- 6 files changed, 57 insertions(+), 22 deletions(-) create mode 100644 tests/expr_and_series/tail_test.py delete mode 100644 tests/series_only/tail_test.py diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 776bcde20..685fc7b69 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -275,3 +275,20 @@ def group_by(self, *by: str) -> Any: from narwhals._dask.group_by import DaskLazyGroupBy return 
DaskLazyGroupBy(self, list(by))
+
+    def tail(self: Self, n: int) -> Self:
+        return self._from_native_dataframe(
+            self._native_dataframe.tail(n=n, compute=False)
+        )
+
+    def gather_every(self: Self, n: int, offset: int) -> Self:
+        row_index_token = generate_unique_token(n_bytes=8, columns=self.columns)
+        pln = self.__narwhals_namespace__()
+        return (
+            self.with_row_index(name=row_index_token)
+            .filter(
+                pln.col(row_index_token) >= offset,  # type: ignore[operator]
+                (pln.col(row_index_token) - offset) % n == 0,  # type: ignore[arg-type]
+            )
+            .drop(row_index_token)
+        )
diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py
index 632966e3c..83c160565 100644
--- a/narwhals/_dask/expr.py
+++ b/narwhals/_dask/expr.py
@@ -534,6 +534,16 @@ def func(_input: Any) -> Any:
             returns_scalar=False,
         )

+    def tail(self: Self) -> NoReturn:
+        # We can't (yet?) allow methods which modify the index
+        msg = "`Expr.tail` is not supported for the Dask backend. Please use `LazyFrame.tail` instead."
+        raise NotImplementedError(msg)
+
+    def gather_every(self: Self, n: int, offset: int = 0) -> NoReturn:
+        # We can't (yet?) allow methods which modify the index
+        msg = "`Expr.gather_every` is not supported for the Dask backend. Please use `LazyFrame.gather_every` instead."
+        raise NotImplementedError(msg)
+
     @property
     def str(self: Self) -> DaskExprStringNamespace:
         return DaskExprStringNamespace(self)
diff --git a/tests/expr_and_series/tail_test.py b/tests/expr_and_series/tail_test.py
new file mode 100644
index 000000000..be17ffb4e
--- /dev/null
+++ b/tests/expr_and_series/tail_test.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+import narwhals as nw
+from tests.utils import compare_dicts
+
+
+@pytest.mark.parametrize("n", [2, -1])
+def test_tail(constructor: Any, n: int, request: Any) -> None:
+    if "dask" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+    if "polars" in str(constructor) and n < 0:
+        request.applymarker(pytest.mark.xfail)
+    df = nw.from_native(constructor({"a": [1, 2, 3]}))
+    result = df.select(nw.col("a").tail(n))
+    expected = {"a": [2, 3]}
+    compare_dicts(result, expected)
+
+
+@pytest.mark.parametrize("n", [2, -1])
+def test_tail_series(constructor_eager: Any, n: int) -> None:
+    df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)
+    result = df.select(df["a"].tail(n))
+    expected = {"a": [2, 3]}
+    compare_dicts(result, expected)
diff --git a/tests/frame/gather_every_test.py b/tests/frame/gather_every_test.py
index a75edaca8..90b06e3d6 100644
--- a/tests/frame/gather_every_test.py
+++ b/tests/frame/gather_every_test.py
@@ -10,9 +10,7 @@

 @pytest.mark.parametrize("n", [1, 2, 3])
 @pytest.mark.parametrize("offset", [1, 2, 3])
-def test_gather_every(constructor: Any, n: int, offset: int, request: Any) -> None:
-    if "dask" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_gather_every(constructor: Any, n: int, offset: int) -> None:
     df = nw.from_native(constructor(data))
     result = df.gather_every(n=n, offset=offset)
     expected = {"a": data["a"][offset::n]}
diff --git a/tests/frame/tail_test.py b/tests/frame/tail_test.py
index 6a5a6b601..e279caba9 100644
--- a/tests/frame/tail_test.py
+++ b/tests/frame/tail_test.py
@@ -2,15 +2,11 @@

 from typing import Any

-import pytest
-
 import narwhals.stable.v1 as nw
 from tests.utils import compare_dicts


-def test_tail(constructor: Any, request: Any) -> None:
-    if "dask" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
+def test_tail(constructor: 
Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9]} diff --git a/tests/series_only/tail_test.py b/tests/series_only/tail_test.py deleted file mode 100644 index 058f45831..000000000 --- a/tests/series_only/tail_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pytest - -import narwhals.stable.v1 as nw - - -@pytest.mark.parametrize("n", [2, -1]) -def test_tail(constructor_eager: Any, n: int) -> None: - s = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"] - - assert s.tail(n).to_list() == [2, 3] From 946db09ce5e291705d9715168c00ad0c9411c3d9 Mon Sep 17 00:00:00 2001 From: Luciano <66913960+lucianosrp@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:21:17 +0200 Subject: [PATCH 10/21] feat: add `str.replace` and `str.replace_all` (#750) --- docs/api-reference/expr_str.md | 2 + docs/api-reference/series_str.md | 2 + narwhals/_arrow/expr.py | 34 ++++++ narwhals/_arrow/series.py | 19 ++++ narwhals/_dask/expr.py | 38 +++++++ narwhals/_pandas_like/expr.py | 23 ++++ narwhals/_pandas_like/series.py | 14 +++ narwhals/expr.py | 81 ++++++++++++++ narwhals/series.py | 79 +++++++++++++ tests/expr_and_series/str/replace_test.py | 128 ++++++++++++++++++++++ 10 files changed, 420 insertions(+) create mode 100644 tests/expr_and_series/str/replace_test.py diff --git a/docs/api-reference/expr_str.md b/docs/api-reference/expr_str.md index 3cfd87570..8cb0dd9ed 100644 --- a/docs/api-reference/expr_str.md +++ b/docs/api-reference/expr_str.md @@ -8,6 +8,8 @@ - ends_with - head - slice + - replace + - replace_all - starts_with - strip_chars - tail diff --git a/docs/api-reference/series_str.md b/docs/api-reference/series_str.md index 96dabcb18..af657deff 100644 --- a/docs/api-reference/series_str.md +++ b/docs/api-reference/series_str.md @@ -7,6 +7,8 @@ - contains - ends_with - head + - replace + - replace_all - slice - starts_with - strip_chars diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index ac0fbbccc..d3c2db109 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -419,6 +419,40 @@ class ArrowExprStringNamespace: def __init__(self, expr: ArrowExpr) -> None: self._expr = expr + def replace( + self, + pattern: str, + value: str, + *, + literal: bool = False, + n: int = 1, + ) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._expr, + "str", + "replace", + pattern, + value, + literal=literal, + n=n, + ) + + def replace_all( + self, + pattern: str, + value: str, + *, + literal: bool = False, + ) -> ArrowExpr: + return reuse_series_namespace_implementation( + self._expr, + "str", + "replace_all", + pattern, + value, + literal=literal, + ) + def strip_chars(self, characters: str | None = None) -> ArrowExpr: return reuse_series_namespace_implementation( self._expr, diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index bce0f4715..3f5926d22 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -840,6 +840,25 @@ class ArrowSeriesStringNamespace: def __init__(self: Self, series: ArrowSeries) -> None: self._arrow_series = series + def replace( + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 + ) -> ArrowSeries: + pc = get_pyarrow_compute() + method = "replace_substring" if literal else "replace_substring_regex" + return self._arrow_series._from_native_series( + getattr(pc, method)( + self._arrow_series._native_series, + pattern=pattern, + 
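+                # `replace_substring` matches `pattern` literally; the
+                # `replace_substring_regex` kernel (chosen above when
+                # `literal=False`) treats it as a regular expression, and
+                # `max_replacements` plays the role of Polars' `n`.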
replacement=value, + max_replacements=n, + ) + ) + + def replace_all( + self, pattern: str, value: str, *, literal: bool = False + ) -> ArrowSeries: + return self.replace(pattern, value, literal=literal, n=-1) + def strip_chars(self: Self, characters: str | None = None) -> ArrowSeries: pc = get_pyarrow_compute() whitespace = " \t\n\r\v\f" diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 83c160565..cbef97578 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -561,6 +561,44 @@ class DaskExprStringNamespace: def __init__(self, expr: DaskExpr) -> None: self._expr = expr + def replace( + self, + pattern: str, + value: str, + *, + literal: bool = False, + n: int = 1, + ) -> DaskExpr: + return self._expr._from_call( + lambda _input, _pattern, _value, _literal, _n: _input.str.replace( + _pattern, _value, regex=not _literal, n=_n + ), + "replace", + pattern, + value, + literal, + n, + returns_scalar=False, + ) + + def replace_all( + self, + pattern: str, + value: str, + *, + literal: bool = False, + ) -> DaskExpr: + return self._expr._from_call( + lambda _input, _pattern, _value, _literal: _input.str.replace( + _pattern, _value, n=-1, regex=not _literal + ), + "replace", + pattern, + value, + literal, + returns_scalar=False, + ) + def strip_chars(self, characters: str | None = None) -> DaskExpr: return self._expr._from_call( lambda _input, characters: _input.str.strip(characters), diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index c978e26f0..1f62dc640 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -371,6 +371,29 @@ class PandasLikeExprStringNamespace: def __init__(self, expr: PandasLikeExpr) -> None: self._expr = expr + def replace( + self, + pattern: str, + value: str, + *, + literal: bool = False, + n: int = 1, + ) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._expr, "str", "replace", pattern, value, literal=literal, n=n + ) + + def replace_all( + self, + pattern: str, + value: str, + *, + literal: bool = False, + ) -> PandasLikeExpr: + return reuse_series_namespace_implementation( + self._expr, "str", "replace_all", pattern, value, literal=literal + ) + def strip_chars(self, characters: str | None = None) -> PandasLikeExpr: return reuse_series_namespace_implementation( self._expr, diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 9a80e26f5..81594c384 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -675,6 +675,20 @@ class PandasLikeSeriesStringNamespace: def __init__(self, series: PandasLikeSeries) -> None: self._pandas_series = series + def replace( + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 + ) -> PandasLikeSeries: + return self._pandas_series._from_native_series( + self._pandas_series._native_series.str.replace( + pat=pattern, repl=value, n=n, regex=not literal + ), + ) + + def replace_all( + self, pattern: str, value: str, *, literal: bool = False + ) -> PandasLikeSeries: + return self.replace(pattern, value, literal=literal, n=-1) + def strip_chars(self, characters: str | None) -> PandasLikeSeries: return self._pandas_series._from_native_series( self._pandas_series._native_series.str.strip(characters), diff --git a/narwhals/expr.py b/narwhals/expr.py index f9b69a927..1de0af2d5 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1986,6 +1986,87 @@ class ExprStringNamespace: def __init__(self, expr: Expr) -> None: self._expr = expr + def replace( + self, pattern: 
str, value: str, *, literal: bool = False, n: int = 1 + ) -> Expr: + r""" + Replace first matching regex/literal substring with a new string value. + + Arguments: + pattern: A valid regular expression pattern. + value: String that will replace the matched substring. + literal: Treat `pattern` as a literal string. + n: Number of matches to replace. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> data = {"foo": ["123abc", "abc abc123"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... df = df.with_columns(replaced=nw.col("foo").str.replace("abc", "")) + ... return df.to_dict(as_series=False) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + {'foo': ['123abc', 'abc abc123'], 'replaced': ['123', ' abc123']} + + >>> func(df_pl) + {'foo': ['123abc', 'abc abc123'], 'replaced': ['123', ' abc123']} + + """ + return self._expr.__class__( + lambda plx: self._expr._call(plx).str.replace( + pattern, value, literal=literal, n=n + ) + ) + + def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> Expr: + r""" + Replace all matching regex/literal substring with a new string value. + + Arguments: + pattern: A valid regular expression pattern. + value: String that will replace the matched substring. + literal: Treat `pattern` as a literal string. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> data = {"foo": ["123abc", "abc abc123"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(df): + ... df = df.with_columns(replaced=nw.col("foo").str.replace_all("abc", "")) + ... return df.to_dict(as_series=False) + + We can then pass either pandas or Polars to `func`: + + >>> func(df_pd) + {'foo': ['123abc', 'abc abc123'], 'replaced': ['123', ' 123']} + + >>> func(df_pl) + {'foo': ['123abc', 'abc abc123'], 'replaced': ['123', ' 123']} + + """ + return self._expr.__class__( + lambda plx: self._expr._call(plx).str.replace_all( + pattern, value, literal=literal + ) + ) + def strip_chars(self, characters: str | None = None) -> Expr: r""" Remove leading and trailing characters. diff --git a/narwhals/series.py b/narwhals/series.py index a6351f200..ca3fbbb09 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2327,6 +2327,85 @@ class SeriesStringNamespace: def __init__(self, series: Series) -> None: self._narwhals_series = series + def replace( + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 + ) -> Series: + r""" + Replace first matching regex/literal substring with a new string value. + + Arguments: + pattern: A valid regular expression pattern. + value: String that will replace the matched substring. + literal: Treat `pattern` as a literal string. + n: Number of matches to replace. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> data = ["123abc", "abc abc123"] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... s = s.str.replace("abc", "") + ... 
return s.to_list() + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + ['123', ' abc123'] + + >>> func(s_pl) + ['123', ' abc123'] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.str.replace( + pattern, value, literal=literal, n=n + ) + ) + + def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> Series: + r""" + Replace all matching regex/literal substring with a new string value. + + Arguments: + pattern: A valid regular expression pattern. + value: String that will replace the matched substring. + literal: Treat `pattern` as a literal string. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> data = ["123abc", "abc abc123"] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + + We define a dataframe-agnostic function: + + >>> @nw.narwhalify + ... def func(s): + ... s = s.str.replace_all("abc", "") + ... return s.to_list() + + We can then pass either pandas or Polars to `func`: + + >>> func(s_pd) + ['123', ' 123'] + + >>> func(s_pl) + ['123', ' 123'] + """ + return self._narwhals_series._from_compliant_series( + self._narwhals_series._compliant_series.str.replace_all( + pattern, value, literal=literal + ) + ) + def strip_chars(self, characters: str | None = None) -> Series: r""" Remove leading and trailing characters. diff --git a/tests/expr_and_series/str/replace_test.py b/tests/expr_and_series/str/replace_test.py new file mode 100644 index 000000000..b3cb556af --- /dev/null +++ b/tests/expr_and_series/str/replace_test.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +from typing import Any + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import compare_dicts + +replace_data = [ + ( + {"a": ["123abc", "abc456"]}, + r"abc\b", + "ABC", + 1, + False, + {"a": ["123ABC", "abc456"]}, + ), + ({"a": ["abc abc", "abc456"]}, r"abc", "", 1, False, {"a": [" abc", "456"]}), + ({"a": ["abc abc abc", "456abc"]}, r"abc", "", -1, False, {"a": [" ", "456"]}), + ( + {"a": ["Dollar $ign", "literal"]}, + r"$", + "S", + -1, + True, + {"a": ["Dollar Sign", "literal"]}, + ), +] + +replace_all_data = [ + ( + {"a": ["123abc", "abc456"]}, + r"abc\b", + "ABC", + False, + {"a": ["123ABC", "abc456"]}, + ), + ({"a": ["abc abc", "abc456"]}, r"abc", "", False, {"a": [" ", "456"]}), + ({"a": ["abc abc abc", "456abc"]}, r"abc", "", False, {"a": [" ", "456"]}), + ( + {"a": ["Dollar $ign", "literal"]}, + r"$", + "S", + True, + {"a": ["Dollar Sign", "literal"]}, + ), +] + + +@pytest.mark.parametrize( + ("data", "pattern", "value", "n", "literal", "expected"), + replace_data, +) +def test_str_replace_series( + constructor_eager: Any, + data: dict[str, list[str]], + pattern: str, + value: str, + n: int, + literal: bool, # noqa: FBT001 + expected: dict[str, list[str]], +) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + + result_series = df["a"].str.replace( + pattern=pattern, value=value, n=n, literal=literal + ) + assert result_series.to_list() == expected["a"] + + +@pytest.mark.parametrize( + ("data", "pattern", "value", "literal", "expected"), + replace_all_data, +) +def test_str_replace_all_series( + constructor_eager: Any, + data: dict[str, list[str]], + pattern: str, + value: str, + literal: bool, # noqa: FBT001 + expected: dict[str, list[str]], +) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + + result_series = df["a"].str.replace_all(pattern=pattern, value=value, literal=literal) 
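+    # `replace_all` forwards to `replace` with `n=-1`, so every occurrence of
+    # `pattern` is substituted rather than only the first match.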
+ assert result_series.to_list() == expected["a"] + + +@pytest.mark.parametrize( + ("data", "pattern", "value", "n", "literal", "expected"), + replace_data, +) +def test_str_replace_expr( + constructor: Any, + data: dict[str, list[str]], + pattern: str, + value: str, + n: int, + literal: bool, # noqa: FBT001 + expected: dict[str, list[str]], +) -> None: + df = nw.from_native(constructor(data)) + + result_df = df.select( + nw.col("a").str.replace(pattern=pattern, value=value, n=n, literal=literal) + ) + compare_dicts(result_df, expected) + + +@pytest.mark.parametrize( + ("data", "pattern", "value", "literal", "expected"), + replace_all_data, +) +def test_str_replace_all_expr( + constructor: Any, + data: dict[str, list[str]], + pattern: str, + value: str, + literal: bool, # noqa: FBT001 + expected: dict[str, list[str]], +) -> None: + df = nw.from_native(constructor(data)) + + result = df.select( + nw.col("a").str.replace_all(pattern=pattern, value=value, literal=literal) + ) + compare_dicts(result, expected) From 39be5d969a9fde33b14a227830f6ff2ceee66eb2 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 12 Aug 2024 19:33:30 +0100 Subject: [PATCH 11/21] feat: implement LazyFrame.head for Dask (plus: run tpc-h q2 query with Dask present) (#768) --- .github/workflows/extremes.yml | 29 ++- narwhals/_dask/dataframe.py | 11 +- narwhals/_dask/expr.py | 5 + narwhals/_dask/namespace.py | 7 +- pyproject.toml | 1 + tests/expr_and_series/head_test.py | 28 +++ tests/frame/head_test.py | 9 +- tests/series_only/head_test.py | 14 -- tpch/notebooks/gpu/execute.ipynb | 10 +- tpch/notebooks/q10/execute.ipynb | 8 +- tpch/notebooks/q11/execute.ipynb | 8 +- tpch/notebooks/q15/execute.ipynb | 8 +- tpch/notebooks/q17/execute.ipynb | 8 +- tpch/notebooks/q18/execute.ipynb | 8 +- tpch/notebooks/q19/execute.ipynb | 8 +- tpch/notebooks/q2/execute.ipynb | 309 ++++++++--------------------- tpch/notebooks/q20/execute.ipynb | 8 +- tpch/notebooks/q21/execute.ipynb | 8 +- tpch/notebooks/q3/execute.ipynb | 16 +- tpch/notebooks/q4/execute.ipynb | 14 +- tpch/notebooks/q5/execute.ipynb | 14 +- tpch/notebooks/q6/execute.ipynb | 16 +- tpch/notebooks/q7/execute.ipynb | 14 +- tpch/notebooks/q8/execute.ipynb | 10 +- tpch/notebooks/q9/execute.ipynb | 8 +- 25 files changed, 247 insertions(+), 332 deletions(-) create mode 100644 tests/expr_and_series/head_test.py delete mode 100644 tests/series_only/head_test.py diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 6895fcb4b..e9d0d673e 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -20,7 +20,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install uv (Unix) run: curl -LsSf https://astral.sh/uv/install.sh | sh - - name: install-minimu-versions + - name: install-minimum-versions run: uv pip install tox virtualenv setuptools pandas==0.25.3 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system - name: install-reqs run: uv pip install -r requirements-dev.txt --system @@ -43,7 +43,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install uv (Unix) run: curl -LsSf https://astral.sh/uv/install.sh | sh - - name: install-minimu-versions + - name: install-minimum-versions run: uv pip install tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system - name: install-reqs run: uv pip install -r requirements-dev.txt --system @@ -54,6 +54,31 @@ jobs: - name: Run doctests run: pytest 
narwhals --doctest-modules + not_so_old_versions: + strategy: + matrix: + python-version: ["3.9"] + os: [ubuntu-latest] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install uv (Unix) + run: curl -LsSf https://astral.sh/uv/install.sh | sh + - name: install-minimum-versions + run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==14.0.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.7 tzdata --system + - name: install-reqs + run: uv pip install -r requirements-dev.txt --system + - name: show-deps + run: uv pip freeze + - name: Run pytest + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow + - name: Run doctests + run: pytest narwhals --doctest-modules + pandas-nightly-and-dask: strategy: matrix: diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 685fc7b69..d4daaf507 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -97,7 +97,11 @@ def select( if not new_series: # return empty dataframe, like Polars does pd = get_pandas() - return self._from_native_dataframe(dd.from_pandas(pd.DataFrame())) + return self._from_native_dataframe( + dd.from_pandas( + pd.DataFrame(), npartitions=self._native_dataframe.npartitions + ) + ) if all(getattr(expr, "_returns_scalar", False) for expr in exprs) and all( getattr(val, "_returns_scalar", False) for val in named_exprs.values() @@ -136,6 +140,11 @@ def with_row_index(self: Self, name: str) -> Self: def rename(self: Self, mapping: dict[str, str]) -> Self: return self._from_native_dataframe(self._native_dataframe.rename(columns=mapping)) + def head(self: Self, n: int) -> Self: + return self._from_native_dataframe( + self._native_dataframe.head(n=n, compute=False, npartitions=-1) + ) + def unique( self: Self, subset: str | list[str] | None, diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index cbef97578..f480575c5 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -434,6 +434,11 @@ def drop_nulls(self) -> NoReturn: msg = "`Expr.drop_nulls` is not supported for the Dask backend. Please use `LazyFrame.drop_nulls` instead." raise NotImplementedError(msg) + def head(self) -> NoReturn: + # We can't (yet?) allow methods which modify the index + msg = "`Expr.head` is not supported for the Dask backend. Please use `LazyFrame.head` instead." + raise NotImplementedError(msg) + def sort(self, *, descending: bool = False, nulls_last: bool = False) -> NoReturn: # We can't (yet?) allow methods which modify the index msg = "`Expr.sort` is not supported for the Dask backend. Please use `LazyFrame.sort` instead." 
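A minimal sketch of how the new `LazyFrame.head` behaves end to end (the frame
below is illustrative, and assumes `dask[dataframe]` and pandas are installed):

    import dask.dataframe as dd
    import pandas as pd
    import narwhals as nw

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3, 4]}), npartitions=2)
    lf = nw.from_native(ddf)
    # `npartitions=-1` in the implementation above gathers rows across *all*
    # partitions, so this returns the first two rows of the whole frame,
    # not of the first partition only.
    print(lf.head(2).to_native().compute())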
diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 82a0e1586..edd2eb496 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -111,7 +111,12 @@ def len(self) -> DaskExpr: def func(df: DaskLazyFrame) -> list[Any]: if not df.columns: - return [dd.from_pandas(pd.Series([0], name="len"))] + return [ + dd.from_pandas( + pd.Series([0], name="len"), + npartitions=df._native_dataframe.npartitions, + ) + ] return [ df._native_dataframe.loc[:, df.columns[0]].size.to_series().rename("len") ] diff --git a/pyproject.toml b/pyproject.toml index 395af89f9..96f334000 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ exclude = [ pandas = ["pandas>=0.25.3"] polars = ["polars>=0.20.3"] pyarrow = ['pyarrow>=11.0.0'] +dask = ['dask[dataframe]>=2024.7'] [project.urls] "Homepage" = "https://github.com/narwhals-dev/narwhals" diff --git a/tests/expr_and_series/head_test.py b/tests/expr_and_series/head_test.py new file mode 100644 index 000000000..ef2ed1bf1 --- /dev/null +++ b/tests/expr_and_series/head_test.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import Any + +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + + +@pytest.mark.parametrize("n", [2, -1]) +def test_head(constructor: Any, n: int, request: Any) -> None: + if "dask" in str(constructor): + request.applymarker(pytest.mark.xfail) + if "polars" in str(constructor) and n < 0: + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1, 2, 3]})) + result = df.select(nw.col("a").head(n)) + expected = {"a": [1, 2]} + compare_dicts(result, expected) + + +@pytest.mark.parametrize("n", [2, -1]) +def test_head_series(constructor_eager: Any, n: int) -> None: + df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) + result = df.select(df["a"].head(n)) + expected = {"a": [1, 2]} + compare_dicts(result, expected) diff --git a/tests/frame/head_test.py b/tests/frame/head_test.py index 9b14e1f12..e4b762f48 100644 --- a/tests/frame/head_test.py +++ b/tests/frame/head_test.py @@ -2,15 +2,11 @@ from typing import Any -import pytest - import narwhals.stable.v1 as nw from tests.utils import compare_dicts -def test_head(constructor: Any, request: Any) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_head(constructor: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} expected = {"a": [1, 3], "b": [4, 4], "z": [7.0, 8.0]} @@ -26,6 +22,3 @@ def test_head(constructor: Any, request: Any) -> None: # negative indices not allowed for lazyframes result = df.lazy().collect().head(-1) compare_dicts(result, expected) - - result = df.select(nw.col("a").head(2)) - compare_dicts(result, {"a": expected["a"]}) diff --git a/tests/series_only/head_test.py b/tests/series_only/head_test.py deleted file mode 100644 index 6869306c2..000000000 --- a/tests/series_only/head_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pytest - -import narwhals.stable.v1 as nw - - -@pytest.mark.parametrize("n", [2, -1]) -def test_head(constructor_eager: Any, n: int) -> None: - s = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"] - - assert s.head(n).to_list() == [1, 2] diff --git a/tpch/notebooks/gpu/execute.ipynb b/tpch/notebooks/gpu/execute.ipynb index c776cc1cf..a117c9187 100755 --- a/tpch/notebooks/gpu/execute.ipynb +++ b/tpch/notebooks/gpu/execute.ipynb @@ -452,7 +452,7 @@ "source": [ "import 
cudf\n", "fn = cudf.read_parquet\n", - "timings = %timeit -o q1(fn(lineitem))\n", + "timings = %timeit -o -q q1(fn(lineitem))\n", "results['q1'] = timings.all_runs" ] }, @@ -474,7 +474,7 @@ "source": [ "import cudf\n", "fn = cudf.read_parquet\n", - "timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", + "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results['q2'] = timings.all_runs" ] }, @@ -496,7 +496,7 @@ "source": [ "import cudf\n", "fn = cudf.read_parquet\n", - "timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results['q3'] = timings.all_runs" ] }, @@ -518,7 +518,7 @@ "source": [ "import cudf\n", "fn = cudf.read_parquet\n", - "timings = %timeit -o q4(fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results['q4'] = timings.all_runs" ] }, @@ -540,7 +540,7 @@ "source": [ "import cudf\n", "fn = cudf.read_parquet\n", - "timings = %timeit -o q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results['q5'] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q10/execute.ipynb b/tpch/notebooks/q10/execute.ipynb index 307f69e7a..85ec0f14b 100644 --- a/tpch/notebooks/q10/execute.ipynb +++ b/tpch/notebooks/q10/execute.ipynb @@ -198,7 +198,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -235,7 +235,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -272,7 +272,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -309,7 +309,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q10(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()\n", + "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q11/execute.ipynb b/tpch/notebooks/q11/execute.ipynb index fec9ee27e..33951d922 100644 --- a/tpch/notebooks/q11/execute.ipynb +++ b/tpch/notebooks/q11/execute.ipynb @@ -186,7 +186,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\n", + "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -223,7 +223,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier))\n", + "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -260,7 +260,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o 
q11(fn(partsupp), fn(nation), fn(supplier))\n", + "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -297,7 +297,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q11(fn(partsupp), fn(nation), fn(supplier)).collect()\n", + "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q15/execute.ipynb b/tpch/notebooks/q15/execute.ipynb index b487a9bf3..0baf11956 100644 --- a/tpch/notebooks/q15/execute.ipynb +++ b/tpch/notebooks/q15/execute.ipynb @@ -177,7 +177,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q15(fn(lineitem), fn(supplier))\n", + "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -214,7 +214,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q15(fn(lineitem), fn(supplier))\n", + "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -251,7 +251,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q15(fn(lineitem), fn(supplier))\n", + "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -288,7 +288,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q15(fn(lineitem), fn(supplier)).collect()\n", + "timings = %timeit -o -q q15(fn(lineitem), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q17/execute.ipynb b/tpch/notebooks/q17/execute.ipynb index 958c7f5be..b13445d28 100644 --- a/tpch/notebooks/q17/execute.ipynb +++ b/tpch/notebooks/q17/execute.ipynb @@ -173,7 +173,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" ] }, @@ -210,7 +210,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" ] }, @@ -247,7 +247,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q17(fn(lineitem), fn(part))\n", + "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" ] }, @@ -284,7 +284,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q17(fn(lineitem), fn(part)).collect()\n", + "timings = %timeit -o -q q17(fn(lineitem), fn(part)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q18/execute.ipynb b/tpch/notebooks/q18/execute.ipynb index 21557c957..c90629e0f 100644 --- a/tpch/notebooks/q18/execute.ipynb +++ b/tpch/notebooks/q18/execute.ipynb @@ -121,7 +121,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q19(fn(lineitem), fn(part))\n", + "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" ] }, @@ -140,7 +140,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -159,7 +159,7 @@ "source": [ 
"tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -178,7 +178,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q18(fn(customer), fn(lineitem), fn(orders)).collect()\n", + "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q19/execute.ipynb b/tpch/notebooks/q19/execute.ipynb index a8cd3fea3..8483e06d5 100644 --- a/tpch/notebooks/q19/execute.ipynb +++ b/tpch/notebooks/q19/execute.ipynb @@ -194,7 +194,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q19(fn(lineitem), fn(part))\n", + "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" ] }, @@ -231,7 +231,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q19(fn(lineitem), fn(part))\n", + "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" ] }, @@ -268,7 +268,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q19(fn(lineitem), fn(part))\n", + "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" ] }, @@ -305,7 +305,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q19(fn(lineitem), fn(part)).collect()\n", + "timings = %timeit -o -q q19(fn(lineitem), fn(part)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q2/execute.ipynb b/tpch/notebooks/q2/execute.ipynb index b4e59307b..c05345336 100755 --- a/tpch/notebooks/q2/execute.ipynb +++ b/tpch/notebooks/q2/execute.ipynb @@ -16,13 +16,23 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ibis-framework " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow dask[dataframe]" ] }, { "cell_type": "code", "execution_count": null, "id": "1", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install git+https://github.com/MarcoGorelli/narwhals.git@more-dask-tpch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", "metadata": { "papermill": { "duration": 0.907754, @@ -42,64 +52,6 @@ "pd.options.future.infer_string = True" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "2", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Any\n", - "\n", - "def q2_pandas_native(\n", - " region_ds: Any,\n", - " nation_ds: Any,\n", - " supplier_ds: Any,\n", - " part_ds: Any,\n", - " part_supp_ds: Any,\n", - "):\n", - " var1 = 15\n", - " var2 = \"BRASS\"\n", - " var3 = \"EUROPE\"\n", - "\n", - " jn = (\n", - " part_ds.merge(part_supp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n", - " .merge(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", - " .merge(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", - " .merge(region_ds, left_on=\"n_regionkey\", right_on=\"r_regionkey\")\n", - " )\n", - "\n", - " jn = jn[jn[\"p_size\"] == var1]\n", - " jn = jn[jn[\"p_type\"].str.endswith(var2)]\n", - " jn = jn[jn[\"r_name\"] == var3]\n", - "\n", - " gb = jn.groupby(\"p_partkey\", as_index=False)\n", - " agg = gb[\"ps_supplycost\"].min()\n", - " jn2 = agg.merge(jn, on=[\"p_partkey\", \"ps_supplycost\"])\n", 
- "\n", - " sel = jn2.loc[\n", - " :,\n", - " [\n", - " \"s_acctbal\",\n", - " \"s_name\",\n", - " \"n_name\",\n", - " \"p_partkey\",\n", - " \"p_mfgr\",\n", - " \"s_address\",\n", - " \"s_phone\",\n", - " \"s_comment\",\n", - " ],\n", - " ]\n", - "\n", - " sort = sel.sort_values(\n", - " by=[\"s_acctbal\", \"n_name\", \"s_name\", \"p_partkey\"],\n", - " ascending=[False, True, True, True],\n", - " )\n", - " result_df = sort.head(100)\n", - "\n", - " return result_df # type: ignore[no-any-return]" - ] - }, { "cell_type": "code", "execution_count": null, @@ -117,26 +69,20 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import datetime\n", "import narwhals as nw\n", "\n", + "@nw.narwhalify\n", "def q2(\n", - " region_ds_raw: Any,\n", - " nation_ds_raw: Any,\n", - " supplier_ds_raw: Any,\n", - " part_ds_raw: Any,\n", - " part_supp_ds_raw: Any,\n", + " region_ds: Any,\n", + " nation_ds: Any,\n", + " supplier_ds: Any,\n", + " part_ds: Any,\n", + " part_supp_ds: Any,\n", ") -> Any:\n", " var_1 = 15\n", " var_2 = \"BRASS\"\n", " var_3 = \"EUROPE\"\n", "\n", - " region_ds = nw.from_native(region_ds_raw)\n", - " nation_ds = nw.from_native(nation_ds_raw)\n", - " supplier_ds = nw.from_native(supplier_ds_raw)\n", - " part_ds = nw.from_native(part_ds_raw)\n", - " part_supp_ds = nw.from_native(part_supp_ds_raw)\n", - "\n", " result_q2 = (\n", " part_ds.join(part_supp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n", " .join(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", @@ -160,9 +106,9 @@ " \"s_comment\",\n", " ]\n", "\n", - " q_final = (\n", + " return (\n", " result_q2.group_by(\"p_partkey\")\n", - " .agg(nw.min(\"ps_supplycost\").alias(\"ps_supplycost\"))\n", + " .agg(nw.col(\"ps_supplycost\").min().alias(\"ps_supplycost\"))\n", " .join(\n", " result_q2,\n", " left_on=[\"p_partkey\", \"ps_supplycost\"],\n", @@ -170,77 +116,17 @@ " )\n", " .select(final_cols)\n", " .sort(\n", - " by=[\"s_acctbal\", \"n_name\", \"s_name\", \"p_partkey\"],\n", + " [\"s_acctbal\", \"n_name\", \"s_name\", \"p_partkey\"],\n", " descending=[True, False, False, False],\n", " )\n", " .head(100)\n", - " )\n", - "\n", - " return nw.to_native(q_final)" + " )" ] }, { "cell_type": "code", "execution_count": null, "id": "4", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Any\n", - "from datetime import datetime\n", - "import ibis\n", - "\n", - "def q2_ibis(\n", - " region: Any,\n", - " nation: Any,\n", - " supplier: Any,\n", - " part: Any,\n", - " partsupp: Any,\n", - " *,\n", - " tool: str,\n", - ") -> Any:\n", - " var1 = 15\n", - " var2 = \"BRASS\"\n", - " var3 = \"EUROPE\"\n", - "\n", - " q2 = (\n", - " part.join(partsupp, part[\"p_partkey\"] == partsupp[\"ps_partkey\"])\n", - " .join(supplier, partsupp[\"ps_suppkey\"] == supplier[\"s_suppkey\"])\n", - " .join(nation, supplier[\"s_nationkey\"] == nation[\"n_nationkey\"])\n", - " .join(region, nation[\"n_regionkey\"] == region[\"r_regionkey\"])\n", - " .filter(ibis._[\"p_size\"] == var1)\n", - " .filter(ibis._[\"p_type\"].endswith(var2))\n", - " .filter(ibis._[\"r_name\"] == var3)\n", - " )\n", - "\n", - " q_final = (\n", - " q2.group_by(\"p_partkey\")\n", - " .agg(ps_supplycost=ibis._[\"ps_supplycost\"].min())\n", - " .join(q2, [\"p_partkey\"])\n", - " .select(\n", - " \"s_acctbal\",\n", - " \"s_name\",\n", - " \"n_name\",\n", - " \"p_partkey\",\n", - " \"p_mfgr\",\n", - " \"s_address\",\n", - " \"s_phone\",\n", - " \"s_comment\",\n", - " )\n", - " .order_by(ibis.desc(\"s_acctbal\"), \"n_name\", 
\"s_name\", \"p_partkey\")\n", - " .limit(100)\n", - " )\n", - " if tool == 'pandas':\n", - " return q_final.to_pandas()\n", - " if tool == 'polars':\n", - " return q_final.to_polars()\n", - " raise ValueError(\"expected pandas or polars\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5", "metadata": { "papermill": { "duration": 0.013325, @@ -267,7 +153,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6", + "id": "5", "metadata": { "papermill": { "duration": 0.014284, @@ -280,25 +166,23 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", - "con_pd = ibis.pandas.connect()\n", - "con_pl = ibis.polars.connect()\n", + "import pyarrow.parquet as pq\n", + "import dask.dataframe as dd\n", "\n", "IO_FUNCS = {\n", " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", " 'polars[eager]': lambda x: pl.read_parquet(x),\n", " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " 'pyarrow': lambda x: pq.read_table(x),\n", + " 'dask': lambda x: dd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", "}" ] }, { "cell_type": "code", "execution_count": null, - "id": "7", + "id": "6", "metadata": {}, "outputs": [], "source": [ @@ -307,70 +191,7 @@ }, { "cell_type": "markdown", - "id": "8", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9", - "metadata": {}, - "outputs": [], - "source": [ - "tool = 'pandas[pyarrow][ibis]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q2_ibis(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp), tool='pandas')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "10", - "metadata": {}, - "source": [ - "## Polars scan_parquet via ibis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "11", - "metadata": {}, - "outputs": [], - "source": [ - "tool = 'polars[lazy][ibis]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q2_ibis(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp), tool='polars')\n", - "results[tool] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "12", - "metadata": {}, - "source": [ - "## pandas, pyarrow dtypes, native" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "13", - "metadata": {}, - "outputs": [], - "source": [ - "tool = 'pandas[pyarrow]'\n", - "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q2_pandas_native(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", - "results[tool+'[native]'] = timings.all_runs" - ] - }, - { - "cell_type": "markdown", - "id": "14", + "id": "7", "metadata": { "papermill": { "duration": 0.005113, @@ -382,13 +203,13 @@ "tags": [] }, "source": [ - "## pandas via Narwhals" + "## pandas" ] }, { "cell_type": "code", "execution_count": null, - "id": "15", + "id": "8", "metadata": { "papermill": { "duration": 196.786925, @@ -403,13 +224,13 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", + "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" ] }, { "cell_type": "markdown", - "id": 
"16", + "id": "9", "metadata": { "papermill": { "duration": 0.005184, @@ -421,13 +242,13 @@ "tags": [] }, "source": [ - "## pandas, pyarrow dtypes, via Narwhals" + "## pandas, pyarrow dtypes" ] }, { "cell_type": "code", "execution_count": null, - "id": "17", + "id": "10", "metadata": { "papermill": { "duration": 158.748353, @@ -442,13 +263,13 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", + "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" ] }, { "cell_type": "markdown", - "id": "18", + "id": "11", "metadata": { "papermill": { "duration": 0.005773, @@ -466,7 +287,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19", + "id": "12", "metadata": { "papermill": { "duration": 37.821116, @@ -481,13 +302,13 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", + "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" ] }, { "cell_type": "markdown", - "id": "20", + "id": "13", "metadata": { "papermill": { "duration": 0.005515, @@ -505,7 +326,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21", + "id": "14", "metadata": { "papermill": { "duration": 4.800698, @@ -520,13 +341,55 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).collect()\n", + "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).collect()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": {}, + "source": [ + "## PyArrow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": {}, + "outputs": [], + "source": [ + "tool = 'pyarrow'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" ] }, { "cell_type": "markdown", - "id": "22", + "id": "17", + "metadata": {}, + "source": [ + "## Dask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": {}, + "outputs": [], + "source": [ + "tool = 'dask'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).compute()\n", + "results[tool] = timings.all_runs" + ] + }, + { + "cell_type": "markdown", + "id": "19", "metadata": {}, "source": [ "## Save" @@ -535,7 +398,7 @@ { "cell_type": "code", "execution_count": null, - "id": "23", + "id": "20", "metadata": {}, "outputs": [], "source": [ @@ -586,7 +449,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.9" }, "papermill": { "default_parameters": {}, diff --git a/tpch/notebooks/q20/execute.ipynb b/tpch/notebooks/q20/execute.ipynb index f0719f317..aecb3a473 100644 --- a/tpch/notebooks/q20/execute.ipynb +++ b/tpch/notebooks/q20/execute.ipynb @@ -195,7 +195,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", + "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, 
@@ -232,7 +232,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", + "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -269,7 +269,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", + "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -306,7 +306,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()\n", + "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q21/execute.ipynb b/tpch/notebooks/q21/execute.ipynb index dc5063f52..b51b15dce 100755 --- a/tpch/notebooks/q21/execute.ipynb +++ b/tpch/notebooks/q21/execute.ipynb @@ -218,7 +218,7 @@ "\n", "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", "\n", - "timings = %timeit -o q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", + "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" ] }, @@ -259,7 +259,7 @@ "fn = IO_FUNCS[tool]\n", "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", "\n", - "timings = %timeit -o q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", + "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" ] }, @@ -300,7 +300,7 @@ "fn = IO_FUNCS[tool]\n", "\n", "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", - "timings = %timeit -o q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", + "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" ] }, @@ -341,7 +341,7 @@ "fn = IO_FUNCS[tool]\n", "\n", "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", - "timings = %timeit -o q21(lineitem_raw, nation_raw, orders_raw, supplier_raw).collect()\n", + "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q3/execute.ipynb b/tpch/notebooks/q3/execute.ipynb index f289ea913..80178cae1 100755 --- a/tpch/notebooks/q3/execute.ipynb +++ b/tpch/notebooks/q3/execute.ipynb @@ -16,7 +16,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ibis-framework " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -278,7 +278,7 @@ "source": [ "tool = 'pandas[pyarrow][ibis]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='pandas')\n", + "timings = %timeit -o -q q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='pandas')\n", "results[tool] = timings.all_runs" ] }, @@ -299,7 +299,7 @@ "source": [ "tool = 'polars[lazy][ibis]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q3_ibis(fn(customer), fn(lineitem), 
fn(orders), tool='polars')\n", + "timings = %timeit -o -q q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='polars')\n", "results[tool] = timings.all_runs" ] }, @@ -320,7 +320,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q3_pandas_native(fn(customer), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q3_pandas_native(fn(customer), fn(lineitem), fn(orders))\n", "results[tool+'[native]'] = timings.all_runs" ] }, @@ -359,7 +359,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -398,7 +398,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -437,7 +437,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -476,7 +476,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q3(fn(customer), fn(lineitem), fn(orders)).collect()\n", + "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q4/execute.ipynb b/tpch/notebooks/q4/execute.ipynb index f5d1b97bd..df07c9c5f 100755 --- a/tpch/notebooks/q4/execute.ipynb +++ b/tpch/notebooks/q4/execute.ipynb @@ -16,7 +16,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ibis-framework " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -243,7 +243,7 @@ "source": [ "tool = 'polars[lazy][ibis]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q4_ibis(fn(lineitem), fn(orders), tool='polars')\n", + "timings = %timeit -o -q q4_ibis(fn(lineitem), fn(orders), tool='polars')\n", "results[tool] = timings.all_runs" ] }, @@ -264,7 +264,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q4_pandas_native(fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q4_pandas_native(fn(lineitem), fn(orders))\n", "results[tool+'[native]'] = timings.all_runs" ] }, @@ -303,7 +303,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q4(fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -342,7 +342,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q4(fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -381,7 +381,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q4(fn(lineitem), fn(orders))\n", + "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" ] }, @@ -420,7 +420,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q4(fn(lineitem), fn(orders)).collect()\n", + "timings = %timeit -o -q q4(fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" ] }, diff 
--git a/tpch/notebooks/q5/execute.ipynb b/tpch/notebooks/q5/execute.ipynb index a56ae03d1..5f6df9bbc 100755 --- a/tpch/notebooks/q5/execute.ipynb +++ b/tpch/notebooks/q5/execute.ipynb @@ -16,7 +16,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ibis-framework " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -275,7 +275,7 @@ "source": [ "tool = 'polars[lazy][ibis]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q5_ibis(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", + "timings = %timeit -o -q q5_ibis(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", "results[tool] = timings.all_runs" ] }, @@ -296,7 +296,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q5_pandas_native(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q5_pandas_native(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool+'[native]'] = timings.all_runs" ] }, @@ -335,7 +335,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -374,7 +374,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -413,7 +413,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -452,7 +452,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", + "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q6/execute.ipynb b/tpch/notebooks/q6/execute.ipynb index 0f8d6ce58..b101aa98d 100755 --- a/tpch/notebooks/q6/execute.ipynb +++ b/tpch/notebooks/q6/execute.ipynb @@ -16,7 +16,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals ibis-framework " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -231,7 +231,7 @@ "source": [ "tool = 'pandas[pyarrow][ibis]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q6_ibis(fn(lineitem), tool='pandas')\n", + "timings = %timeit -o -q q6_ibis(fn(lineitem), tool='pandas')\n", "results[tool] = timings.all_runs" ] }, @@ -252,7 +252,7 @@ "source": [ "tool = 'polars[lazy][ibis]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q6_ibis(fn(lineitem), tool='polars')\n", + "timings = %timeit -o -q q6_ibis(fn(lineitem), tool='polars')\n", "results[tool] = timings.all_runs" ] }, @@ 
-273,7 +273,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q6_pandas_native(fn(lineitem))\n", + "timings = %timeit -o -q q6_pandas_native(fn(lineitem))\n", "results[tool+'[native]'] = timings.all_runs" ] }, @@ -312,7 +312,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q6(fn(lineitem))\n", + "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" ] }, @@ -351,7 +351,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q6(fn(lineitem))\n", + "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" ] }, @@ -390,7 +390,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q6(fn(lineitem))\n", + "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" ] }, @@ -429,7 +429,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q6(fn(lineitem)).collect()\n", + "timings = %timeit -o -q q6(fn(lineitem)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q7/execute.ipynb b/tpch/notebooks/q7/execute.ipynb index 3b64df2fc..1213043b0 100755 --- a/tpch/notebooks/q7/execute.ipynb +++ b/tpch/notebooks/q7/execute.ipynb @@ -326,7 +326,7 @@ "source": [ "tool = 'pandas[pyarrow][ibis]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='pandas')\n", + "timings = %timeit -o -q q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='pandas')\n", "results[tool] = timings.all_runs" ] }, @@ -347,7 +347,7 @@ "source": [ "tool = 'polars[lazy][ibis]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", + "timings = %timeit -o -q q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", "results[tool] = timings.all_runs" ] }, @@ -368,7 +368,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool+'[native]'] = timings.all_runs" ] }, @@ -407,7 +407,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -446,7 +446,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -485,7 +485,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -524,7 +524,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), 
fn(supplier)).collect()\n", + "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q8/execute.ipynb b/tpch/notebooks/q8/execute.ipynb index 531cad195..b10b87907 100755 --- a/tpch/notebooks/q8/execute.ipynb +++ b/tpch/notebooks/q8/execute.ipynb @@ -260,7 +260,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool+'[native]'] = timings.all_runs" ] }, @@ -299,7 +299,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -338,7 +338,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -377,7 +377,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -416,7 +416,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", + "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" ] }, diff --git a/tpch/notebooks/q9/execute.ipynb b/tpch/notebooks/q9/execute.ipynb index d7412426c..86417e180 100644 --- a/tpch/notebooks/q9/execute.ipynb +++ b/tpch/notebooks/q9/execute.ipynb @@ -190,7 +190,7 @@ "source": [ "tool = 'pandas'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -227,7 +227,7 @@ "source": [ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -264,7 +264,7 @@ "source": [ "tool = 'polars[eager]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", + "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" ] }, @@ -301,7 +301,7 @@ "source": [ "tool = 'polars[lazy]'\n", "fn = IO_FUNCS[tool]\n", - "timings = %timeit -o q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)).collect()\n", + "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), 
fn(supplier)).collect()\n", "results[tool] = timings.all_runs" ] }, From 54cd27782be49c9193982f99de44fc479e26db0c Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:50:17 +0200 Subject: [PATCH 12/21] feat: extend dataframe drop_nulls method (#779) --- narwhals/_arrow/dataframe.py | 8 ++++++-- narwhals/_dask/dataframe.py | 8 ++++++-- narwhals/_pandas_like/dataframe.py | 8 ++++++-- narwhals/dataframe.py | 20 ++++++++++++++------ tests/frame/drop_nulls_test.py | 14 ++++++++++++++ 5 files changed, 46 insertions(+), 12 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index ceadfdac7..13ff0da0d 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -288,8 +288,12 @@ def join( def drop(self, *columns: str) -> Self: return self._from_native_dataframe(self._native_dataframe.drop(list(columns))) - def drop_nulls(self) -> Self: - return self._from_native_dataframe(self._native_dataframe.drop_null()) + def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: + if subset is None: + return self._from_native_dataframe(self._native_dataframe.drop_null()) + subset = [subset] if isinstance(subset, str) else subset + plx = self.__narwhals_namespace__() + return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) def sort( self, diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index d4daaf507..6774f762a 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -114,8 +114,12 @@ def select( df = self._native_dataframe.assign(**new_series).loc[:, list(new_series.keys())] return self._from_native_dataframe(df) - def drop_nulls(self) -> Self: - return self._from_native_dataframe(self._native_dataframe.dropna()) + def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: + if subset is None: + return self._from_native_dataframe(self._native_dataframe.dropna()) + subset = [subset] if isinstance(subset, str) else subset + plx = self.__narwhals_namespace__() + return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) @property def schema(self) -> dict[str, DType]: diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index d36a381b5..1f2a4500f 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -234,8 +234,12 @@ def select( ) return self._from_native_dataframe(df) - def drop_nulls(self) -> Self: - return self._from_native_dataframe(self._native_dataframe.dropna(axis=0)) + def drop_nulls(self, subset: str | list[str] | None) -> Self: + if subset is None: + return self._from_native_dataframe(self._native_dataframe.dropna(axis=0)) + subset = [subset] if isinstance(subset, str) else subset + plx = self.__narwhals_namespace__() + return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) def with_row_index(self, name: str) -> Self: row_index = create_native_series( diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index b40aa160d..487767c34 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -95,9 +95,9 @@ def with_row_index(self, name: str = "index") -> Self: self._compliant_frame.with_row_index(name), ) - def drop_nulls(self) -> Self: + def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: return self._from_compliant_dataframe( - self._compliant_frame.drop_nulls(), + self._compliant_frame.drop_nulls(subset=subset), ) @property @@ -663,10 +663,14 @@ def pipe(self, function: 
Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se """ return super().pipe(function, *args, **kwargs) - def drop_nulls(self) -> Self: + def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: """ Drop null values. + Arguments: + subset: Column name(s) for which null values are considered. If set to None + (default), use all columns. + Notes: pandas and Polars handle null values differently. Polars distinguishes between NaN and Null, whereas pandas doesn't. @@ -700,7 +704,7 @@ def drop_nulls(self) -> Self: │ 1.0 ┆ 1.0 │ └─────┴─────┘ """ - return super().drop_nulls() + return super().drop_nulls(subset=subset) def with_row_index(self, name: str = "index") -> Self: """ @@ -2179,10 +2183,14 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se """ return super().pipe(function, *args, **kwargs) - def drop_nulls(self) -> Self: + def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: """ Drop null values. + Arguments: + subset: Column name(s) for which null values are considered. If set to None + (default), use all columns. + Notes: pandas and Polars handle null values differently. Polars distinguishes between NaN and Null, whereas pandas doesn't. @@ -2216,7 +2224,7 @@ def drop_nulls(self) -> Self: │ 1.0 ┆ 1.0 │ └─────┴─────┘ """ - return super().drop_nulls() + return super().drop_nulls(subset=subset) def with_row_index(self, name: str = "index") -> Self: """ diff --git a/tests/frame/drop_nulls_test.py b/tests/frame/drop_nulls_test.py index 53cc57bc2..58c9486ed 100644 --- a/tests/frame/drop_nulls_test.py +++ b/tests/frame/drop_nulls_test.py @@ -1,5 +1,9 @@ +from __future__ import annotations + from typing import Any +import pytest + import narwhals.stable.v1 as nw from tests.utils import compare_dicts @@ -16,3 +20,13 @@ def test_drop_nulls(constructor: Any) -> None: "b": [3.0, 5.0], } compare_dicts(result, expected) + + +@pytest.mark.parametrize("subset", ["a", ["a"]]) +def test_drop_nulls_subset(constructor: Any, subset: str | list[str]) -> None: + result = nw.from_native(constructor(data)).drop_nulls(subset=subset) + expected = { + "a": [1, 2.0, 4.0], + "b": [float("nan"), 3.0, 5.0], + } + compare_dicts(result, expected) From 5b0df48700777bea6e22f4026815f747928437d9 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 12 Aug 2024 19:59:10 +0100 Subject: [PATCH 13/21] feat: allow `from_dict` to not have native_namespace arg if all inputs are already narwhals Series (#760) --- narwhals/functions.py | 20 ++++++++++++++++++-- narwhals/stable/v1.py | 5 +++-- tests/from_dict_test.py | 24 ++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/narwhals/functions.py b/narwhals/functions.py index e3d457adf..45f273df8 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -52,7 +52,7 @@ def from_dict( data: dict[str, Any], schema: dict[str, DType] | Schema | None = None, *, - native_namespace: ModuleType, + native_namespace: ModuleType | None = None, ) -> DataFrame[Any]: """ Instantiate DataFrame from dictionary. @@ -64,7 +64,8 @@ def from_dict( Arguments: data: Dictionary to create DataFrame from. schema: The DataFrame schema as Schema or dict of {name: type}. - native_namespace: The native library to use for DataFrame creation. + native_namespace: The native library to use for DataFrame creation. Only + necessary if inputs are not Narwhals Series. 
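Put concretely, the behaviour described above can be sketched as follows (this mirrors the tests added later in this patch; pandas is assumed to be installed):

```python
import narwhals as nw
import pandas as pd

df = nw.from_native(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), eager_only=True)

# Every value is already a narwhals Series, so `native_namespace` may be
# omitted; it is inferred from the first Series encountered.
result = nw.from_dict({"c": df["a"], "d": df["b"]})
print(nw.to_native(result))
```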
Examples: >>> import pandas as pd @@ -97,6 +98,21 @@ def from_dict( │ 2 ┆ 4 │ └─────┴─────┘ """ + from narwhals.series import Series + from narwhals.translate import to_native + + if not data: + msg = "from_dict cannot be called with empty dictionary" + raise ValueError(msg) + if native_namespace is None: + for val in data.values(): + if isinstance(val, Series): + native_namespace = val.__native_namespace__() + break + else: + msg = "Calling `from_dict` without `native_namespace` is only supported if all input values are already Narwhals Series" + raise TypeError(msg) + data = {key: to_native(value, strict=False) for key, value in data.items()} implementation = Implementation.from_native_namespace(native_namespace) if implementation is Implementation.POLARS: diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py index 5c067bf45..1c1c91711 100644 --- a/narwhals/stable/v1.py +++ b/narwhals/stable/v1.py @@ -1473,7 +1473,7 @@ def from_dict( data: dict[str, Any], schema: dict[str, DType] | Schema | None = None, *, - native_namespace: ModuleType, + native_namespace: ModuleType | None = None, ) -> DataFrame[Any]: """ Instantiate DataFrame from dictionary. @@ -1485,7 +1485,8 @@ def from_dict( Arguments: data: Dictionary to create DataFrame from. schema: The DataFrame schema as Schema or dict of {name: type}. - native_namespace: The native library to use for DataFrame creation. + native_namespace: The native library to use for DataFrame creation. Only + necessary if inputs are not Narwhals Series. Examples: >>> import pandas as pd diff --git a/tests/from_dict_test.py b/tests/from_dict_test.py index ad6248f7c..cfaf99a7b 100644 --- a/tests/from_dict_test.py +++ b/tests/from_dict_test.py @@ -29,3 +29,27 @@ def test_from_dict_schema(constructor: Any, request: Any) -> None: schema=schema, # type: ignore[arg-type] ) assert result.collect_schema() == schema + + +def test_from_dict_without_namespace(constructor: Any) -> None: + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() + result = nw.from_dict({"c": df["a"], "d": df["b"]}) + compare_dicts(result, {"c": [1, 2, 3], "d": [4, 5, 6]}) + + +def test_from_dict_without_namespace_invalid(constructor: Any) -> None: + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() + with pytest.raises(TypeError, match="namespace"): + nw.from_dict({"c": nw.to_native(df["a"]), "d": nw.to_native(df["b"])}) + + +def test_from_dict_one_native_one_narwhals(constructor: Any) -> None: + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() + result = nw.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) + expected = {"c": [1, 2, 3], "d": [4, 5, 6]} + compare_dicts(result, expected) + + +def test_from_dict_empty() -> None: + with pytest.raises(ValueError, match="empty"): + nw.from_dict({}) From 0cff9d43aff0106496824f966dc135e9a6e6383e Mon Sep 17 00:00:00 2001 From: Magdalena Kowalczuk <74981211+anopsy@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:03:00 +0200 Subject: [PATCH 14/21] release: Bump version to 1.4.0 (#781) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index a4ca987d2..eec9d66d8 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.3.0' +'1.4.0' ``` then installation worked correctly! 
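For reference, the `subset` argument added to `drop_nulls` in patch 12 of this series behaves as in this sketch (pandas shown; recall the docstring caveat that pandas does not distinguish NaN from null):

```python
import narwhals as nw
import pandas as pd

data = {"a": [1.0, 2.0, None, 4.0], "b": [None, 3.0, None, 5.0]}
df = nw.from_native(pd.DataFrame(data))

# Default: a row is dropped if *any* column holds a null.
print(nw.to_native(df.drop_nulls()))
# With subset: only nulls in "a" count, so the row where just "b" is null survives.
print(nw.to_native(df.drop_nulls(subset="a")))
```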
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 6571e66a2..d67a0587e 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -48,7 +48,7 @@ from narwhals.utils import maybe_convert_dtypes from narwhals.utils import maybe_set_index -__version__ = "1.3.0" +__version__ = "1.4.0" __all__ = [ "selectors", diff --git a/pyproject.toml b/pyproject.toml index 96f334000..854f8e9e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.3.0" +version = "1.4.0" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From a75b7265af3b5278a41166c36d09e7d51e46b365 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 13 Aug 2024 10:08:03 +0100 Subject: [PATCH 15/21] chore: rename internal dtype functions (#780) --- .github/workflows/publish_to_pypi.yml | 2 +- docs/extending.md | 6 ++- narwhals/_arrow/dataframe.py | 4 +- narwhals/_arrow/expr.py | 4 +- narwhals/_arrow/series.py | 10 +---- narwhals/_arrow/utils.py | 2 +- narwhals/_dask/expr.py | 1 + narwhals/_dask/namespace.py | 4 +- narwhals/_pandas_like/dataframe.py | 4 +- narwhals/_pandas_like/expr.py | 4 +- narwhals/_pandas_like/series.py | 10 +---- narwhals/_pandas_like/utils.py | 14 ++++++- narwhals/_polars/expr.py | 11 +----- narwhals/_polars/namespace.py | 6 +-- narwhals/_polars/series.py | 8 +--- narwhals/_polars/utils.py | 7 ++-- narwhals/dtypes.py | 53 +-------------------------- narwhals/expr.py | 9 ++--- narwhals/functions.py | 12 +++--- narwhals/selectors.py | 7 +--- narwhals/series.py | 11 +----- 21 files changed, 54 insertions(+), 135 deletions(-) diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish_to_pypi.yml index e2e2ed1e8..bc9003ce0 100644 --- a/.github/workflows/publish_to_pypi.yml +++ b/.github/workflows/publish_to_pypi.yml @@ -30,7 +30,7 @@ jobs: publish-to-pypi: name: >- Publish Python 🐍 distribution 📦 to PyPI - if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + if: startsWith(github.ref, 'refs/tags/v') # only publish to PyPI on tag pushes needs: - build runs-on: ubuntu-latest diff --git a/docs/extending.md b/docs/extending.md index dc0d4671b..a41eae4fc 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -26,14 +26,16 @@ Make sure that, in addition to the public Narwhals API, you also define: from `Narwhals.DataFrame` - `DataFrame.__narwhals_namespace__`: return an object which implements public top-level functions from `narwhals` (e.g. `narwhals.col`, `narwhals.concat`, ...) + - `DataFrame.__native_namespace__`: return a native namespace object which must have a + `from_dict` method - `LazyFrame.__narwhals_lazyframe__`: return an object which implements public methods from `Narwhals.LazyFrame` - `LazyFrame.__narwhals_namespace__`: return an object which implements public top-level functions from `narwhals` (e.g. `narwhals.col`, `narwhals.concat`, ...) + - `LazyFrame.__native_namespace__`: return a native namespace object which must have a + `from_dict` method - `Series.__narwhals_series__`: return an object which implements public methods from `Narwhals.Series` - - `Series.__narwhals_namespace__`: return an object which implements public top-level - functions from `narwhals` (e.g. `narwhals.col`, `narwhals.concat`, ...) If your library doesn't distinguish between lazy and eager, then it's OK for your dataframe object to implement both `__narwhals_dataframe__` and `__narwhals_lazyframe__`. 
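The protocol listed in `docs/extending.md` above is small enough to sketch. Everything in this snippet is hypothetical — no such backend exists — and it only illustrates which hooks narwhals looks for on an eager dataframe object:

```python
import types


class MyNamespace:
    """Stub: a real backend would implement top-level functions (col, concat, ...) here."""


class MyDataFrame:
    def __narwhals_dataframe__(self):
        # Return an object implementing the public narwhals.DataFrame methods.
        return self

    def __narwhals_namespace__(self):
        return MyNamespace()

    def __native_namespace__(self):
        # Per the docs above, the native namespace must expose `from_dict`.
        return types.SimpleNamespace(from_dict=lambda data: MyDataFrame())
```

A library that does not distinguish eager from lazy can return the same object from both `__narwhals_dataframe__` and `__narwhals_lazyframe__`, as the paragraph above notes.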
In fact, diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 13ff0da0d..f503d10da 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -347,9 +347,7 @@ def filter( self, *predicates: IntoArrowExpr, ) -> Self: - from narwhals._arrow.namespace import ArrowNamespace - - plx = ArrowNamespace(backend_version=self._backend_version) + plx = self.__narwhals_namespace__() expr = plx.all_horizontal(*predicates) # Safety: all_horizontal's expression only returns a single column. mask = expr._call(self)[0] diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index d3c2db109..cc8b46567 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -158,9 +158,7 @@ def len(self) -> Self: return reuse_series_implementation(self, "len", returns_scalar=True) def filter(self, *predicates: Any) -> Self: - from narwhals._arrow.namespace import ArrowNamespace - - plx = ArrowNamespace(backend_version=self._backend_version) + plx = self.__narwhals_namespace__() expr = plx.all_horizontal(*predicates) return reuse_series_implementation(self, "filter", other=expr) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 3f5926d22..40a94315e 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -9,7 +9,7 @@ from narwhals._arrow.utils import cast_for_truediv from narwhals._arrow.utils import floordiv_compat -from narwhals._arrow.utils import reverse_translate_dtype +from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import translate_dtype from narwhals._arrow.utils import validate_column_comparand from narwhals.dependencies import get_numpy @@ -23,7 +23,6 @@ from typing_extensions import Self from narwhals._arrow.dataframe import ArrowDataFrame - from narwhals._arrow.namespace import ArrowNamespace from narwhals.dtypes import DType @@ -265,11 +264,6 @@ def n_unique(self) -> int: def __native_namespace__(self) -> Any: # pragma: no cover return get_pyarrow() - def __narwhals_namespace__(self) -> ArrowNamespace: - from narwhals._arrow.namespace import ArrowNamespace - - return ArrowNamespace(backend_version=self._backend_version) - @property def name(self) -> str: return self._name @@ -369,7 +363,7 @@ def is_null(self) -> Self: def cast(self, dtype: DType) -> Self: pc = get_pyarrow_compute() ser = self._native_series - dtype = reverse_translate_dtype(dtype) + dtype = narwhals_to_native_dtype(dtype) return self._from_native_series(pc.cast(ser, dtype)) def null_count(self: Self) -> int: diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 465af61d4..a6b56a355 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -55,7 +55,7 @@ def translate_dtype(dtype: Any) -> dtypes.DType: raise AssertionError -def reverse_translate_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: +def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: from narwhals import dtypes pa = get_pyarrow() diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index f480575c5..a8472887b 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -44,6 +44,7 @@ def __init__( def __narwhals_expr__(self) -> None: ... 
def __narwhals_namespace__(self) -> DaskNamespace: # pragma: no cover + # Unused, just for compatibility with PandasLikeExpr from narwhals._dask.namespace import DaskNamespace return DaskNamespace(backend_version=self._backend_version) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index edd2eb496..55da051fe 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -69,8 +69,8 @@ def col(self, *column_names: str) -> DaskExpr: ) def lit(self, value: Any, dtype: dtypes.DType | None) -> DaskExpr: - # TODO @FBruzzesi: cast to dtype once `reverse_translate_dtype` is implemented. - # It should be enough to add `.astype(reverse_translate_dtype(dtype))` + # TODO @FBruzzesi: cast to dtype once `narwhals_to_native_dtype` is implemented. + # It should be enough to add `.astype(narwhals_to_native_dtype(dtype))` return DaskExpr( lambda df: [df._native_dataframe.assign(lit=value).loc[:, "lit"]], depth=0, diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 1f2a4500f..97e402793 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -260,9 +260,7 @@ def filter( self, *predicates: IntoPandasLikeExpr, ) -> Self: - from narwhals._pandas_like.namespace import PandasLikeNamespace - - plx = PandasLikeNamespace(self._implementation, self._backend_version) + plx = self.__narwhals_namespace__() expr = plx.all_horizontal(*predicates) # Safety: all_horizontal's expression only returns a single column. mask = expr._call(self)[0] diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 1f62dc640..4353ae712 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -226,9 +226,7 @@ def arg_true(self) -> Self: return reuse_series_implementation(self, "arg_true") def filter(self, *predicates: Any) -> Self: - from narwhals._pandas_like.namespace import PandasLikeNamespace - - plx = PandasLikeNamespace(self._implementation, self._backend_version) + plx = self.__narwhals_namespace__() expr = plx.all_horizontal(*predicates) return reuse_series_implementation(self, "filter", other=expr) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 81594c384..090fe9495 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -8,8 +8,8 @@ from typing import overload from narwhals._pandas_like.utils import int_dtype_mapper +from narwhals._pandas_like.utils import narwhals_to_native_dtype from narwhals._pandas_like.utils import native_series_from_iterable -from narwhals._pandas_like.utils import reverse_translate_dtype from narwhals._pandas_like.utils import to_datetime from narwhals._pandas_like.utils import translate_dtype from narwhals._pandas_like.utils import validate_column_comparand @@ -25,7 +25,6 @@ from typing_extensions import Self from narwhals._pandas_like.dataframe import PandasLikeDataFrame - from narwhals._pandas_like.namespace import PandasLikeNamespace from narwhals.dtypes import DType PANDAS_TO_NUMPY_DTYPE_NO_MISSING = { @@ -99,11 +98,6 @@ def __init__( else: self._use_copy_false = False - def __narwhals_namespace__(self) -> PandasLikeNamespace: - from narwhals._pandas_like.namespace import PandasLikeNamespace - - return PandasLikeNamespace(self._implementation, self._backend_version) - def __native_namespace__(self) -> Any: if self._implementation is Implementation.PANDAS: return get_pandas() @@ -181,7 +175,7 @@ def cast( dtype: Any, ) -> Self: ser = self._native_series - dtype = 
reverse_translate_dtype(dtype, ser.dtype, self._implementation) + dtype = narwhals_to_native_dtype(dtype, ser.dtype, self._implementation) return self._from_native_series(ser.astype(dtype)) def item(self: Self, index: int | None = None) -> Any: diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 71ebbdbce..e80654087 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -303,11 +303,20 @@ def get_dtype_backend(dtype: Any, implementation: Implementation) -> str: return "numpy" -def reverse_translate_dtype( # noqa: PLR0915 +def narwhals_to_native_dtype( # noqa: PLR0915 dtype: DType | type[DType], starting_dtype: Any, implementation: Implementation ) -> Any: from narwhals import dtypes + if "polars" in str(type(dtype)): + msg = ( + f"Expected Narwhals object, got: {type(dtype)}.\n\n" + "Perhaps you:\n" + "- Forgot a `nw.from_native` somewhere?\n" + "- Used `pl.Int64` instead of `nw.Int64`?" + ) + raise TypeError(msg) + dtype_backend = get_dtype_backend(starting_dtype, implementation) if isinstance_or_issubclass(dtype, dtypes.Float64): if dtype_backend == "pyarrow-nullable": @@ -413,6 +422,9 @@ def reverse_translate_dtype( # noqa: PLR0915 return "date32[pyarrow]" msg = "Date dtype only supported for pyarrow-backed data types in pandas" raise NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Enum): + msg = "Converting to Enum is not (yet) supported" + raise NotImplementedError(msg) msg = f"Unknown dtype: {dtype}" # pragma: no cover raise AssertionError(msg) diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index 98aac298e..4f1532823 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -3,10 +3,9 @@ from typing import TYPE_CHECKING from typing import Any -from narwhals._polars.namespace import PolarsNamespace from narwhals._polars.utils import extract_args_kwargs from narwhals._polars.utils import extract_native -from narwhals._polars.utils import reverse_translate_dtype +from narwhals._polars.utils import narwhals_to_native_dtype from narwhals.utils import Implementation if TYPE_CHECKING: @@ -23,12 +22,6 @@ def __init__(self, expr: Any) -> None: def __repr__(self) -> str: # pragma: no cover return "PolarsExpr" - def __narwhals_expr__(self) -> Self: # pragma: no cover - return self - - def __narwhals_namespace__(self) -> PolarsNamespace: # pragma: no cover - return PolarsNamespace(backend_version=self._backend_version) - def _from_native_expr(self, expr: Any) -> Self: return self.__class__(expr) @@ -43,7 +36,7 @@ def func(*args: Any, **kwargs: Any) -> Any: def cast(self, dtype: DType) -> Self: expr = self._native_expr - dtype = reverse_translate_dtype(dtype) + dtype = narwhals_to_native_dtype(dtype) return self._from_native_expr(expr.cast(dtype)) def __eq__(self, other: object) -> Self: # type: ignore[override] diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 683b83c12..5a34554a4 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -7,7 +7,7 @@ from narwhals import dtypes from narwhals._polars.utils import extract_args_kwargs -from narwhals._polars.utils import reverse_translate_dtype +from narwhals._polars.utils import narwhals_to_native_dtype from narwhals.dependencies import get_polars from narwhals.utils import Implementation @@ -82,7 +82,7 @@ def lit(self, value: Any, dtype: dtypes.DType | None = None) -> PolarsExpr: pl = get_polars() if dtype is not None: - return PolarsExpr(pl.lit(value, 
dtype=reverse_translate_dtype(dtype))) + return PolarsExpr(pl.lit(value, dtype=narwhals_to_native_dtype(dtype))) return PolarsExpr(pl.lit(value)) def mean(self, *column_names: str) -> Any: @@ -102,7 +102,7 @@ def by_dtype(self, dtypes: Iterable[dtypes.DType]) -> PolarsExpr: pl = get_polars() return PolarsExpr( - pl.selectors.by_dtype([reverse_translate_dtype(dtype) for dtype in dtypes]) + pl.selectors.by_dtype([narwhals_to_native_dtype(dtype) for dtype in dtypes]) ) def numeric(self) -> PolarsExpr: diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 4a5fd2b7b..e71520042 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -17,8 +17,7 @@ from narwhals._polars.dataframe import PolarsDataFrame from narwhals.dtypes import DType -from narwhals._polars.namespace import PolarsNamespace -from narwhals._polars.utils import reverse_translate_dtype +from narwhals._polars.utils import narwhals_to_native_dtype from narwhals._polars.utils import translate_dtype PL = get_polars() @@ -39,9 +38,6 @@ def __narwhals_series__(self) -> Self: def __native_namespace__(self) -> Any: return get_polars() - def __narwhals_namespace__(self) -> PolarsNamespace: - return PolarsNamespace(backend_version=self._backend_version) - def _from_native_series(self, series: Any) -> Self: return self.__class__(series, backend_version=self._backend_version) @@ -94,7 +90,7 @@ def __getitem__(self, item: int | slice | Sequence[int]) -> Any | Self: def cast(self, dtype: DType) -> Self: ser = self._native_series - dtype = reverse_translate_dtype(dtype) + dtype = narwhals_to_native_dtype(dtype) return self._from_native_series(ser.cast(dtype)) def __array__(self, dtype: Any = None, copy: bool | None = None) -> np.ndarray: diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 2dc92b3ad..7c7dbe0fa 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -68,7 +68,7 @@ def translate_dtype(dtype: Any) -> dtypes.DType: return dtypes.Unknown() -def reverse_translate_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: +def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: pl = get_polars() from narwhals import dtypes @@ -100,8 +100,9 @@ def reverse_translate_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: return pl.Object() if dtype == dtypes.Categorical: return pl.Categorical() - if dtype == dtypes.Enum: # pragma: no cover - return pl.Enum() + if dtype == dtypes.Enum: + msg = "Converting to Enum is not (yet) supported" + raise NotImplementedError(msg) if dtype == dtypes.Datetime: return pl.Datetime() if dtype == dtypes.Duration: diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index 9b56d6141..4d8da4293 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -1,9 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Any - -from narwhals.utils import isinstance_or_issubclass if TYPE_CHECKING: from typing_extensions import Self @@ -18,6 +15,8 @@ def is_numeric(cls: type[Self]) -> bool: return issubclass(cls, NumericType) def __eq__(self, other: DType | type[DType]) -> bool: # type: ignore[override] + from narwhals.utils import isinstance_or_issubclass + return isinstance_or_issubclass(other, type(self)) def __hash__(self) -> int: @@ -85,51 +84,3 @@ class Enum(DType): ... class Date(TemporalType): ... 
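Note: the "Expected Narwhals object" guard that `translate_dtype` (removed just below) used to perform now lives in the backend-level `narwhals_to_native_dtype` helpers, as the `_pandas_like/utils.py` hunk above shows. A minimal sketch of the behaviour this preserves, assuming pandas and polars are installed (illustrative only, not part of the patch):

```python
# Passing a Polars dtype where a Narwhals dtype is expected still raises
# a TypeError with a hint, now from narwhals_to_native_dtype.
import narwhals as nw
import pandas as pd
import polars as pl

s = nw.from_native(pd.Series([1, 2, 3]), series_only=True)
try:
    s.cast(pl.Int64)  # should be nw.Int64
except TypeError as exc:
    print(exc)
```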
- - -def translate_dtype(plx: Any, dtype: DType) -> Any: - if "polars" in str(type(dtype)): - msg = ( - f"Expected Narwhals object, got: {type(dtype)}.\n\n" - "Perhaps you:\n" - "- Forgot a `nw.from_native` somewhere?\n" - "- Used `pl.Int64` instead of `nw.Int64`?" - ) - raise TypeError(msg) - if dtype == Float64: - return plx.Float64 - if dtype == Float32: - return plx.Float32 - if dtype == Int64: - return plx.Int64 - if dtype == Int32: - return plx.Int32 - if dtype == Int16: - return plx.Int16 - if dtype == Int8: - return plx.Int8 - if dtype == UInt64: - return plx.UInt64 - if dtype == UInt32: - return plx.UInt32 - if dtype == UInt16: - return plx.UInt16 - if dtype == UInt8: - return plx.UInt8 - if dtype == String: - return plx.String - if dtype == Boolean: - return plx.Boolean - if dtype == Categorical: - return plx.Categorical - if dtype == Enum: - msg = "Converting to Enum is not (yet) supported" - raise NotImplementedError(msg) - if dtype == Datetime: - return plx.Datetime - if dtype == Duration: - return plx.Duration - if dtype == Date: - return plx.Date - msg = f"Unknown dtype: {dtype}" # pragma: no cover - raise AssertionError(msg) diff --git a/narwhals/expr.py b/narwhals/expr.py index 1de0af2d5..8b3f24f12 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -7,13 +7,12 @@ from typing import Literal from narwhals.dependencies import get_numpy -from narwhals.dtypes import DType -from narwhals.dtypes import translate_dtype from narwhals.utils import flatten if TYPE_CHECKING: from typing_extensions import Self + from narwhals.dtypes import DType from narwhals.typing import IntoExpr @@ -165,7 +164,7 @@ def cast( """ return self.__class__( - lambda plx: self._call(plx).cast(translate_dtype(plx, dtype)), + lambda plx: self._call(plx).cast(dtype), ) # --- binary --- @@ -4065,9 +4064,7 @@ def lit(value: Any, dtype: DType | None = None) -> Expr: msg = f"Nested datatypes are not supported yet. 
Got {value}" raise NotImplementedError(msg) - if dtype is None: - return Expr(lambda plx: plx.lit(value, dtype)) - return Expr(lambda plx: plx.lit(value, translate_dtype(plx, dtype))) + return Expr(lambda plx: plx.lit(value, dtype)) def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: diff --git a/narwhals/functions.py b/narwhals/functions.py index 45f273df8..13db8c34b 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -118,11 +118,11 @@ def from_dict( if implementation is Implementation.POLARS: if schema: from narwhals._polars.utils import ( - reverse_translate_dtype as polars_reverse_translate_dtype, + narwhals_to_native_dtype as polars_narwhals_to_native_dtype, ) schema = { - name: polars_reverse_translate_dtype(dtype) + name: polars_narwhals_to_native_dtype(dtype) for name, dtype in schema.items() } @@ -136,11 +136,11 @@ def from_dict( if schema: from narwhals._pandas_like.utils import ( - reverse_translate_dtype as pandas_like_reverse_translate_dtype, + narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype, ) schema = { - name: pandas_like_reverse_translate_dtype( + name: pandas_like_narwhals_to_native_dtype( schema[name], native_type, implementation ) for name, native_type in native_frame.dtypes.items() @@ -150,12 +150,12 @@ def from_dict( elif implementation is Implementation.PYARROW: if schema: from narwhals._arrow.utils import ( - reverse_translate_dtype as arrow_reverse_translate_dtype, + narwhals_to_native_dtype as arrow_narwhals_to_native_dtype, ) schema = native_namespace.schema( [ - (name, arrow_reverse_translate_dtype(dtype)) + (name, arrow_narwhals_to_native_dtype(dtype)) for name, dtype in schema.items() ] ) diff --git a/narwhals/selectors.py b/narwhals/selectors.py index fca1c4cdf..7c06a79c9 100644 --- a/narwhals/selectors.py +++ b/narwhals/selectors.py @@ -2,7 +2,6 @@ from typing import Any -from narwhals.dtypes import translate_dtype from narwhals.expr import Expr from narwhals.utils import flatten @@ -51,11 +50,7 @@ def by_dtype(*dtypes: Any) -> Expr: │ 4 ┆ 4.6 │ └─────┴─────┘ """ - return Selector( - lambda plx: plx.selectors.by_dtype( - [translate_dtype(plx, dtype) for dtype in flatten(dtypes)] - ) - ) + return Selector(lambda plx: plx.selectors.by_dtype(flatten(dtypes))) def numeric() -> Expr: diff --git a/narwhals/series.py b/narwhals/series.py index ca3fbbb09..d575ef707 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -7,8 +7,6 @@ from typing import Sequence from typing import overload -from narwhals.dtypes import translate_dtype - if TYPE_CHECKING: import numpy as np from typing_extensions import Self @@ -57,9 +55,6 @@ def __getitem__(self, idx: int | slice | Sequence[int]) -> Any | Self: def __native_namespace__(self) -> Any: return self._compliant_series.__native_namespace__() - def __narwhals_namespace__(self) -> Any: - return self._compliant_series.__narwhals_namespace__() - @property def shape(self) -> tuple[int]: """ @@ -281,11 +276,7 @@ def cast( 1 ] """ - return self._from_compliant_series( - self._compliant_series.cast( - translate_dtype(self.__narwhals_namespace__(), dtype) - ) - ) + return self._from_compliant_series(self._compliant_series.cast(dtype)) def to_frame(self) -> DataFrame[Any]: """ From 6324c10b2cb1d4104bba40247acacfd1cbf04a5b Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 13 Aug 2024 15:52:34 +0100 Subject: [PATCH 16/21] chore: rename _native_dataframe to _native_frame (#785) --- docs/how_it_works.md | 6 +- narwhals/_arrow/dataframe.py | 120 ++++++++++----------- 
narwhals/_arrow/expr.py | 2 +- narwhals/_arrow/group_by.py | 4 +- narwhals/_arrow/namespace.py | 6 +- narwhals/_dask/dataframe.py | 88 ++++++++-------- narwhals/_dask/expr.py | 4 +- narwhals/_dask/group_by.py | 6 +- narwhals/_dask/namespace.py | 12 +-- narwhals/_dask/selectors.py | 6 +- narwhals/_dask/utils.py | 4 +- narwhals/_interchange/dataframe.py | 8 +- narwhals/_pandas_like/dataframe.py | 162 ++++++++++++++--------------- narwhals/_pandas_like/expr.py | 2 +- narwhals/_pandas_like/group_by.py | 16 ++- narwhals/_pandas_like/namespace.py | 8 +- narwhals/_polars/dataframe.py | 58 +++++------ narwhals/_polars/group_by.py | 10 +- narwhals/_polars/namespace.py | 2 +- narwhals/_polars/utils.py | 2 +- narwhals/translate.py | 2 +- narwhals/utils.py | 28 ++--- 22 files changed, 264 insertions(+), 292 deletions(-) diff --git a/docs/how_it_works.md b/docs/how_it_works.md index b9b28992b..cda98a2b6 100644 --- a/docs/how_it_works.md +++ b/docs/how_it_works.md @@ -88,7 +88,7 @@ The `_call` method gives us that function! Let's see it in action. Note: the following examples use `PandasLikeDataFrame` and `PandasLikeSeries`. These are backed by actual `pandas.DataFrame`s and `pandas.Series` respectively and are Narwhals-compliant. We can access the -underlying pandas objects via `PandasLikeDataFrame._native_dataframe` and `PandasLikeSeries._native_series`. +underlying pandas objects via `PandasLikeDataFrame._native_frame` and `PandasLikeSeries._native_series`. ```python exec="1" result="python" session="pandas_impl" source="above" import narwhals as nw @@ -222,10 +222,10 @@ df_compliant = df._compliant_frame result = df_compliant.select(expr) ``` -We can then view the underlying pandas Dataframe which was produced by calling `._native_dataframe`: +We can then view the underlying pandas Dataframe which was produced by calling `._native_frame`: ```python exec="1" result="python" session="pandas_api_mapping" source="above" -print(result._native_dataframe) +print(result._native_frame) ``` which is the same as we'd have obtained by just using the Narwhals API directly: diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index f503d10da..7497c21f1 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -35,7 +35,7 @@ class ArrowDataFrame: def __init__( self, native_dataframe: Any, *, backend_version: tuple[int, ...] 
) -> None: - self._native_dataframe = native_dataframe + self._native_frame = native_dataframe self._implementation = Implementation.PYARROW self._backend_version = backend_version @@ -53,15 +53,15 @@ def __narwhals_dataframe__(self) -> Self: def __narwhals_lazyframe__(self) -> Self: return self - def _from_native_dataframe(self, df: Any) -> Self: + def _from_native_frame(self, df: Any) -> Self: return self.__class__(df, backend_version=self._backend_version) @property def shape(self) -> tuple[int, int]: - return self._native_dataframe.shape # type: ignore[no-any-return] + return self._native_frame.shape # type: ignore[no-any-return] def __len__(self) -> int: - return len(self._native_dataframe) + return len(self._native_frame) def rows( self, *, named: bool = False @@ -69,7 +69,7 @@ def rows( if not named: msg = "Unnamed rows are not yet supported on PyArrow tables" raise NotImplementedError(msg) - return self._native_dataframe.to_pylist() # type: ignore[no-any-return] + return self._native_frame.to_pylist() # type: ignore[no-any-return] def iter_rows( self, @@ -77,7 +77,7 @@ def iter_rows( named: bool = False, buffer_size: int = 512, ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: - df = self._native_dataframe + df = self._native_frame num_rows = df.num_rows if not named: @@ -96,7 +96,7 @@ def get_column(self, name: str) -> ArrowSeries: raise TypeError(msg) return ArrowSeries( - self._native_dataframe[name], + self._native_frame[name], name=name, backend_version=self._backend_version, ) @@ -120,7 +120,7 @@ def __getitem__( from narwhals._arrow.series import ArrowSeries return ArrowSeries( - self._native_dataframe[item], + self._native_frame[item], name=item, backend_version=self._backend_version, ) @@ -129,8 +129,8 @@ def __getitem__( and len(item) == 2 and isinstance(item[1], (list, tuple)) ): - return self._from_native_dataframe( - self._native_dataframe.take(item[0]).select(item[1]) + return self._from_native_frame( + self._native_frame.take(item[0]).select(item[1]) ) elif isinstance(item, tuple) and len(item) == 2: @@ -139,7 +139,7 @@ def __getitem__( # PyArrow columns are always strings col_name = item[1] if isinstance(item[1], str) else self.columns[item[1]] return ArrowSeries( - self._native_dataframe[col_name].take(item[0]), + self._native_frame[col_name].take(item[0]), name=col_name, backend_version=self._backend_version, ) @@ -149,9 +149,9 @@ def __getitem__( msg = "Slicing with step is not supported on PyArrow tables" raise NotImplementedError(msg) start = item.start or 0 - stop = item.stop or len(self._native_dataframe) - return self._from_native_dataframe( - self._native_dataframe.slice(item.start, stop - start), + stop = item.stop or len(self._native_frame) + return self._from_native_frame( + self._native_frame.slice(item.start, stop - start), ) elif isinstance(item, Sequence) or ( @@ -159,7 +159,7 @@ def __getitem__( and isinstance(item, np.ndarray) and item.ndim == 1 ): - return self._from_native_dataframe(self._native_dataframe.take(item)) + return self._from_native_frame(self._native_frame.take(item)) else: # pragma: no cover msg = f"Expected str or slice, got: {type(item)}" @@ -167,7 +167,7 @@ def __getitem__( @property def schema(self) -> dict[str, DType]: - schema = self._native_dataframe.schema + schema = self._native_frame.schema return { name: translate_dtype(dtype) for name, dtype in zip(schema.names, schema.types) @@ -178,7 +178,7 @@ def collect_schema(self) -> dict[str, DType]: @property def columns(self) -> list[str]: - return 
self._native_dataframe.schema.names # type: ignore[no-any-return] + return self._native_frame.schema.names # type: ignore[no-any-return] def select( self, @@ -188,16 +188,14 @@ def select( new_series = evaluate_into_exprs(self, *exprs, **named_exprs) if not new_series: # return empty dataframe, like Polars does - return self._from_native_dataframe( - self._native_dataframe.__class__.from_arrays([]) - ) + return self._from_native_frame(self._native_frame.__class__.from_arrays([])) names = [s.name for s in new_series] pa = get_pyarrow() df = pa.Table.from_arrays( broadcast_series(new_series), names=names, ) - return self._from_native_dataframe(df) + return self._from_native_frame(df) def with_columns( self, @@ -220,7 +218,7 @@ def with_columns( ) ) else: - to_concat.append(self._native_dataframe[name]) + to_concat.append(self._native_frame[name]) output_names.append(name) for s in new_column_name_to_new_column_map: to_concat.append( @@ -231,8 +229,8 @@ def with_columns( ) ) output_names.append(s) - df = self._native_dataframe.__class__.from_arrays(to_concat, names=output_names) - return self._from_native_dataframe(df) + df = self._native_frame.__class__.from_arrays(to_concat, names=output_names) + return self._from_native_frame(df) def group_by(self, *keys: str) -> ArrowGroupBy: from narwhals._arrow.group_by import ArrowGroupBy @@ -265,9 +263,9 @@ def join( n_bytes=8, columns=[*self.columns, *other.columns] ) - return self._from_native_dataframe( - self.with_columns(**{key_token: plx.lit(0, None)})._native_dataframe.join( - other.with_columns(**{key_token: plx.lit(0, None)})._native_dataframe, + return self._from_native_frame( + self.with_columns(**{key_token: plx.lit(0, None)})._native_frame.join( + other.with_columns(**{key_token: plx.lit(0, None)})._native_frame, keys=key_token, right_keys=key_token, join_type="inner", @@ -275,9 +273,9 @@ def join( ), ).drop(key_token) - return self._from_native_dataframe( - self._native_dataframe.join( - other._native_dataframe, + return self._from_native_frame( + self._native_frame.join( + other._native_frame, keys=left_on, right_keys=right_on, join_type=how_to_join_map[how], @@ -286,11 +284,11 @@ def join( ) def drop(self, *columns: str) -> Self: - return self._from_native_dataframe(self._native_dataframe.drop(list(columns))) + return self._from_native_frame(self._native_frame.drop(list(columns))) def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: if subset is None: - return self._from_native_dataframe(self._native_dataframe.drop_null()) + return self._from_native_frame(self._native_frame.drop_null()) subset = [subset] if isinstance(subset, str) else subset plx = self.__narwhals_namespace__() return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) @@ -302,7 +300,7 @@ def sort( descending: bool | Sequence[bool] = False, ) -> Self: flat_keys = flatten([*flatten([by]), *more_by]) - df = self._native_dataframe + df = self._native_frame if isinstance(descending, bool): order = "descending" if descending else "ascending" @@ -312,18 +310,18 @@ def sort( (key, "descending" if is_descending else "ascending") for key, is_descending in zip(flat_keys, descending) ] - return self._from_native_dataframe(df.sort_by(sorting=sorting)) + return self._from_native_frame(df.sort_by(sorting=sorting)) def to_pandas(self) -> Any: - return self._native_dataframe.to_pandas() + return self._native_frame.to_pandas() def to_numpy(self) -> Any: import numpy as np - return np.column_stack([col.to_numpy() for col in self._native_dataframe.columns]) + 
return np.column_stack([col.to_numpy() for col in self._native_frame.columns]) def to_dict(self, *, as_series: bool) -> Any: - df = self._native_dataframe + df = self._native_frame names_and_values = zip(df.column_names, df.columns) if as_series: @@ -338,10 +336,10 @@ def to_dict(self, *, as_series: bool) -> Any: def with_row_index(self, name: str) -> Self: pa = get_pyarrow() - df = self._native_dataframe + df = self._native_frame row_indices = pa.array(range(df.num_rows)) - return self._from_native_dataframe(df.append_column(name, row_indices)) + return self._from_native_frame(df.append_column(name, row_indices)) def filter( self, @@ -351,42 +349,38 @@ def filter( expr = plx.all_horizontal(*predicates) # Safety: all_horizontal's expression only returns a single column. mask = expr._call(self)[0] - return self._from_native_dataframe( - self._native_dataframe.filter(mask._native_series) - ) + return self._from_native_frame(self._native_frame.filter(mask._native_series)) def null_count(self) -> Self: pa = get_pyarrow() - df = self._native_dataframe + df = self._native_frame names_and_values = zip(df.column_names, df.columns) - return self._from_native_dataframe( + return self._from_native_frame( pa.table({name: [col.null_count] for name, col in names_and_values}) ) def head(self, n: int) -> Self: - df = self._native_dataframe + df = self._native_frame if n >= 0: - return self._from_native_dataframe(df.slice(0, n)) + return self._from_native_frame(df.slice(0, n)) else: num_rows = df.num_rows - return self._from_native_dataframe(df.slice(0, max(0, num_rows + n))) + return self._from_native_frame(df.slice(0, max(0, num_rows + n))) def tail(self, n: int) -> Self: - df = self._native_dataframe + df = self._native_frame if n >= 0: num_rows = df.num_rows - return self._from_native_dataframe(df.slice(max(0, num_rows - n))) + return self._from_native_frame(df.slice(max(0, num_rows - n))) else: - return self._from_native_dataframe(df.slice(abs(n))) + return self._from_native_frame(df.slice(abs(n))) def lazy(self) -> Self: return self def collect(self) -> ArrowDataFrame: - return ArrowDataFrame( - self._native_dataframe, backend_version=self._backend_version - ) + return ArrowDataFrame(self._native_frame, backend_version=self._backend_version) def clone(self) -> Self: msg = "clone is not yet supported on PyArrow tables" @@ -404,23 +398,23 @@ def item(self: Self, row: int | None = None, column: int | str | None = None) -> f" frame has shape {self.shape!r}" ) raise ValueError(msg) - return self._native_dataframe[0][0] + return self._native_frame[0][0] elif row is None or column is None: msg = "cannot call `.item()` with only one of `row` or `column`" raise ValueError(msg) _col = self.columns.index(column) if isinstance(column, str) else column - return self._native_dataframe[_col][row] + return self._native_frame[_col][row] def rename(self, mapping: dict[str, str]) -> Self: - df = self._native_dataframe + df = self._native_frame new_cols = [mapping.get(c, c) for c in df.column_names] - return self._from_native_dataframe(df.rename_columns(new_cols)) + return self._from_native_frame(df.rename_columns(new_cols)) def write_parquet(self, file: Any) -> Any: pp = get_pyarrow_parquet() - pp.write_table(self._native_dataframe, file) + pp.write_table(self._native_frame, file) def is_duplicated(self: Self) -> ArrowSeries: from narwhals._arrow.series import ArrowSeries @@ -428,7 +422,7 @@ def is_duplicated(self: Self) -> ArrowSeries: np = get_numpy() pa = get_pyarrow() pc = get_pyarrow_compute() - df = 
self._native_dataframe + df = self._native_frame columns = self.columns col_token = generate_unique_token(n_bytes=8, columns=columns) @@ -472,7 +466,7 @@ def unique( pa = get_pyarrow() pc = get_pyarrow_compute() - df = self._native_dataframe + df = self._native_frame if isinstance(subset, str): subset = [subset] @@ -490,13 +484,13 @@ def unique( .column(f"{col_token}_{agg_func}") ) - return self._from_native_dataframe(pc.take(df, keep_idx)) + return self._from_native_frame(pc.take(df, keep_idx)) keep_idx = self.select(*subset).is_unique() return self.filter(keep_idx) def gather_every(self: Self, n: int, offset: int = 0) -> Self: - return self._from_native_dataframe(self._native_dataframe[offset::n]) + return self._from_native_frame(self._native_frame[offset::n]) def to_arrow(self: Self) -> Any: - return self._native_dataframe + return self._native_frame diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index cc8b46567..ca9293b8b 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -56,7 +56,7 @@ def from_column_names( def func(df: ArrowDataFrame) -> list[ArrowSeries]: return [ ArrowSeries( - df._native_dataframe[column_name], + df._native_frame[column_name], name=column_name, backend_version=df._backend_version, ) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index b05b41b82..ecdfc02a6 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -23,7 +23,7 @@ def __init__(self, df: ArrowDataFrame, keys: list[str]) -> None: pa = get_pyarrow() self._df = df self._keys = list(keys) - self._grouped = pa.TableGroupBy(self._df._native_dataframe, list(self._keys)) + self._grouped = pa.TableGroupBy(self._df._native_frame, list(self._keys)) def agg( self, @@ -51,7 +51,7 @@ def agg( exprs, self._keys, output_names, - self._df._from_native_dataframe, + self._df._from_native_frame, ) def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]: diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 57bd5a4f1..ffb3f2d15 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -114,7 +114,7 @@ def len(self) -> ArrowExpr: return ArrowExpr( # pragma: no cover lambda df: [ ArrowSeries._from_iterable( - [len(df._native_dataframe)], + [len(df._native_frame)], name="len", backend_version=self._backend_version, ) @@ -133,7 +133,7 @@ def all(self) -> ArrowExpr: return ArrowExpr( lambda df: [ ArrowSeries( - df._native_dataframe[column_name], + df._native_frame[column_name], name=column_name, backend_version=df._backend_version, ) @@ -181,7 +181,7 @@ def concat( *, how: str = "vertical", ) -> ArrowDataFrame: - dfs: list[Any] = [item._native_dataframe for item in items] + dfs: list[Any] = [item._native_frame for item in items] if how == "horizontal": return ArrowDataFrame( diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 6774f762a..33f3c8b85 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -29,7 +29,7 @@ class DaskLazyFrame: def __init__( self, native_dataframe: Any, *, backend_version: tuple[int, ...] 
) -> None: - self._native_dataframe = native_dataframe + self._native_frame = native_dataframe self._backend_version = backend_version def __native_namespace__(self) -> Any: # pragma: no cover @@ -43,19 +43,19 @@ def __narwhals_namespace__(self) -> DaskNamespace: def __narwhals_lazyframe__(self) -> Self: return self - def _from_native_dataframe(self, df: Any) -> Self: + def _from_native_frame(self, df: Any) -> Self: return self.__class__(df, backend_version=self._backend_version) def with_columns(self, *exprs: DaskExpr, **named_exprs: DaskExpr) -> Self: - df = self._native_dataframe + df = self._native_frame new_series = parse_exprs_and_named_exprs(self, *exprs, **named_exprs) df = df.assign(**new_series) - return self._from_native_dataframe(df) + return self._from_native_frame(df) def collect(self) -> Any: from narwhals._pandas_like.dataframe import PandasLikeDataFrame - result = self._native_dataframe.compute() + result = self._native_frame.compute() return PandasLikeDataFrame( result, implementation=Implementation.PANDAS, @@ -64,7 +64,7 @@ def collect(self) -> Any: @property def columns(self) -> list[str]: - return self._native_dataframe.columns.tolist() # type: ignore[no-any-return] + return self._native_frame.columns.tolist() # type: ignore[no-any-return] def filter( self, @@ -76,7 +76,7 @@ def filter( expr = plx.all_horizontal(*predicates) # Safety: all_horizontal's expression only returns a single column. mask = expr._call(self)[0] - return self._from_native_dataframe(self._native_dataframe.loc[mask]) + return self._from_native_frame(self._native_frame.loc[mask]) def lazy(self) -> Self: return self @@ -90,17 +90,15 @@ def select( if exprs and all(isinstance(x, str) for x in exprs) and not named_exprs: # This is a simple slice => fastpath! - return self._from_native_dataframe(self._native_dataframe.loc[:, exprs]) + return self._from_native_frame(self._native_frame.loc[:, exprs]) new_series = parse_exprs_and_named_exprs(self, *exprs, **named_exprs) if not new_series: # return empty dataframe, like Polars does pd = get_pandas() - return self._from_native_dataframe( - dd.from_pandas( - pd.DataFrame(), npartitions=self._native_dataframe.npartitions - ) + return self._from_native_frame( + dd.from_pandas(pd.DataFrame(), npartitions=self._native_frame.npartitions) ) if all(getattr(expr, "_returns_scalar", False) for expr in exprs) and all( @@ -109,14 +107,14 @@ def select( df = dd.concat( [val.to_series().rename(name) for name, val in new_series.items()], axis=1 ) - return self._from_native_dataframe(df) + return self._from_native_frame(df) - df = self._native_dataframe.assign(**new_series).loc[:, list(new_series.keys())] - return self._from_native_dataframe(df) + df = self._native_frame.assign(**new_series).loc[:, list(new_series.keys())] + return self._from_native_frame(df) def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: if subset is None: - return self._from_native_dataframe(self._native_dataframe.dropna()) + return self._from_native_frame(self._native_frame.dropna()) subset = [subset] if isinstance(subset, str) else subset plx = self.__narwhals_namespace__() return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) @@ -124,29 +122,27 @@ def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: @property def schema(self) -> dict[str, DType]: return { - col: translate_dtype(self._native_dataframe.loc[:, col]) - for col in self._native_dataframe.columns + col: translate_dtype(self._native_frame.loc[:, col]) + for col in self._native_frame.columns } 
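Note: the rename is mechanical but user-visible for anyone inspecting internals. As the `how_it_works.md` hunk above now documents, the native object behind a compliant frame is reachable via `._native_frame` on every backend. A quick illustration with the pandas backend (not part of the patch):

```python
# After this patch the attribute is `_native_frame` across all backends.
import narwhals as nw
import pandas as pd

df = nw.from_native(pd.DataFrame({"a": [1, 2, 3]}), eager_only=True)
compliant = df._compliant_frame  # PandasLikeDataFrame
print(type(compliant._native_frame))  # <class 'pandas.core.frame.DataFrame'>
```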
def collect_schema(self) -> dict[str, DType]: return self.schema def drop(self: Self, *columns: str) -> Self: - return self._from_native_dataframe( - self._native_dataframe.drop(columns=list(columns)) - ) + return self._from_native_frame(self._native_frame.drop(columns=list(columns))) def with_row_index(self: Self, name: str) -> Self: # Implementation is based on the following StackOverflow reply: # https://stackoverflow.com/questions/60831518/in-dask-how-does-one-add-a-range-of-integersauto-increment-to-a-new-column/60852409#60852409 - return self._from_native_dataframe(add_row_index(self._native_dataframe, name)) + return self._from_native_frame(add_row_index(self._native_frame, name)) def rename(self: Self, mapping: dict[str, str]) -> Self: - return self._from_native_dataframe(self._native_dataframe.rename(columns=mapping)) + return self._from_native_frame(self._native_frame.rename(columns=mapping)) def head(self: Self, n: int) -> Self: - return self._from_native_dataframe( - self._native_dataframe.head(n=n, compute=False, npartitions=-1) + return self._from_native_frame( + self._native_frame.head(n=n, compute=False, npartitions=-1) ) def unique( @@ -162,7 +158,7 @@ def unique( and has no effect on the output. """ subset = flatten(subset) if subset else None - native_frame = self._native_dataframe + native_frame = self._native_frame if keep == "none": subset = subset or self.columns token = generate_unique_token(n_bytes=8, columns=subset) @@ -173,7 +169,7 @@ def unique( else: mapped_keep = {"any": "first"}.get(keep, keep) result = native_frame.drop_duplicates(subset=subset, keep=mapped_keep) - return self._from_native_dataframe(result) + return self._from_native_frame(result) def sort( self: Self, @@ -182,12 +178,12 @@ def sort( descending: bool | Sequence[bool] = False, ) -> Self: flat_keys = flatten([*flatten([by]), *more_by]) - df = self._native_dataframe + df = self._native_frame if isinstance(descending, bool): ascending: bool | list[bool] = not descending else: ascending = [not d for d in descending] - return self._from_native_dataframe(df.sort_values(flat_keys, ascending=ascending)) + return self._from_native_frame(df.sort_values(flat_keys, ascending=ascending)) def join( self: Self, @@ -207,9 +203,9 @@ def join( n_bytes=8, columns=[*self.columns, *other.columns] ) - return self._from_native_dataframe( - self._native_dataframe.assign(**{key_token: 0}).merge( - other._native_dataframe.assign(**{key_token: 0}), + return self._from_native_frame( + self._native_frame.assign(**{key_token: 0}).merge( + other._native_frame.assign(**{key_token: 0}), how="inner", left_on=key_token, right_on=key_token, @@ -223,33 +219,33 @@ def join( ) other_native = ( - other._native_dataframe.loc[:, right_on] + other._native_frame.loc[:, right_on] .rename( # rename to avoid creating extra columns in join columns=dict(zip(right_on, left_on)) # type: ignore[arg-type] ) .drop_duplicates() ) - df = self._native_dataframe.merge( + df = self._native_frame.merge( other_native, how="outer", indicator=indicator_token, left_on=left_on, right_on=left_on, ) - return self._from_native_dataframe( + return self._from_native_frame( df.loc[df[indicator_token] == "left_only"].drop(columns=[indicator_token]) ) if how == "semi": other_native = ( - other._native_dataframe.loc[:, right_on] + other._native_frame.loc[:, right_on] .rename( # rename to avoid creating extra columns in join columns=dict(zip(right_on, left_on)) # type: ignore[arg-type] ) .drop_duplicates() # avoids potential rows duplication from inner join ) - 
return self._from_native_dataframe( - self._native_dataframe.merge( + return self._from_native_frame( + self._native_frame.merge( other_native, how="inner", left_on=left_on, @@ -258,8 +254,8 @@ def join( ) if how == "left": - other_native = other._native_dataframe - result_native = self._native_dataframe.merge( + other_native = other._native_frame + result_native = self._native_frame.merge( other_native, how="left", left_on=left_on, @@ -272,11 +268,11 @@ def join( extra.append(right_key) elif right_key != left_key: extra.append(f"{right_key}_right") - return self._from_native_dataframe(result_native.drop(columns=extra)) + return self._from_native_frame(result_native.drop(columns=extra)) - return self._from_native_dataframe( - self._native_dataframe.merge( - other._native_dataframe, + return self._from_native_frame( + self._native_frame.merge( + other._native_frame, left_on=left_on, right_on=right_on, how=how, @@ -290,9 +286,7 @@ def group_by(self, *by: str) -> Any: return DaskLazyGroupBy(self, list(by)) def tail(self: Self, n: int) -> Self: - return self._from_native_dataframe( - self._native_dataframe.tail(n=n, compute=False) - ) + return self._from_native_frame(self._native_frame.tail(n=n, compute=False)) def gather_every(self: Self, n: int, offset: int) -> Self: row_index_token = generate_unique_token(n_bytes=8, columns=self.columns) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index a8472887b..62dc16460 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -56,9 +56,7 @@ def from_column_names( backend_version: tuple[int, ...], ) -> Self: def func(df: DaskLazyFrame) -> list[Any]: - return [ - df._native_dataframe.loc[:, column_name] for column_name in column_names - ] + return [df._native_frame.loc[:, column_name] for column_name in column_names] return cls( func, diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index 57e112c8c..8538c62d2 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -23,7 +23,7 @@ class DaskLazyGroupBy: def __init__(self, df: DaskLazyFrame, keys: list[str]) -> None: self._df = df self._keys = keys - self._grouped = self._df._native_dataframe.groupby( + self._grouped = self._df._native_frame.groupby( list(self._keys), dropna=False, ) @@ -54,10 +54,10 @@ def agg( self._grouped, exprs, self._keys, - self._from_native_dataframe, + self._from_native_frame, ) - def _from_native_dataframe(self, df: DaskLazyFrame) -> DaskLazyFrame: + def _from_native_frame(self, df: DaskLazyFrame) -> DaskLazyFrame: from narwhals._dask.dataframe import DaskLazyFrame return DaskLazyFrame( diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 55da051fe..afff9fee5 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -48,9 +48,7 @@ def __init__(self, *, backend_version: tuple[int, ...]) -> None: def all(self) -> DaskExpr: def func(df: DaskLazyFrame) -> list[Any]: - return [ - df._native_dataframe.loc[:, column_name] for column_name in df.columns - ] + return [df._native_frame.loc[:, column_name] for column_name in df.columns] return DaskExpr( func, @@ -72,7 +70,7 @@ def lit(self, value: Any, dtype: dtypes.DType | None) -> DaskExpr: # TODO @FBruzzesi: cast to dtype once `narwhals_to_native_dtype` is implemented. 
# It should be enough to add `.astype(narwhals_to_native_dtype(dtype))` return DaskExpr( - lambda df: [df._native_dataframe.assign(lit=value).loc[:, "lit"]], + lambda df: [df._native_frame.assign(lit=value).loc[:, "lit"]], depth=0, function_name="lit", root_names=None, @@ -114,12 +112,10 @@ def func(df: DaskLazyFrame) -> list[Any]: return [ dd.from_pandas( pd.Series([0], name="len"), - npartitions=df._native_dataframe.npartitions, + npartitions=df._native_frame.npartitions, ) ] - return [ - df._native_dataframe.loc[:, df.columns[0]].size.to_series().rename("len") - ] + return [df._native_frame.loc[:, df.columns[0]].size.to_series().rename("len")] # coverage bug? this is definitely hit return DaskExpr( # pragma: no cover diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 11d0f0c79..073b3abd8 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -21,9 +21,7 @@ def __init__(self: Self, *, backend_version: tuple[int, ...]) -> None: def by_dtype(self: Self, dtypes: list[DType | type[DType]]) -> DaskSelector: def func(df: DaskLazyFrame) -> list[Any]: return [ - df._native_dataframe[col] - for col in df.columns - if df.schema[col] in dtypes + df._native_frame[col] for col in df.columns if df.schema[col] in dtypes ] return DaskSelector( @@ -63,7 +61,7 @@ def boolean(self: Self) -> DaskSelector: def all(self: Self) -> DaskSelector: def func(df: DaskLazyFrame) -> list[Any]: - return [df._native_dataframe[col] for col in df.columns] + return [df._native_frame[col] for col in df.columns] return DaskSelector( func, diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index c0f77e2c5..27f0d7643 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -19,7 +19,7 @@ def maybe_evaluate(df: DaskLazyFrame, obj: Any) -> Any: raise NotImplementedError(msg) result = results[0] if not get_dask_expr()._expr.are_co_aligned( - df._native_dataframe._expr, result._expr + df._native_frame._expr, result._expr ): # pragma: no cover # are_co_aligned is a method which cheaply checks if two Dask expressions # have the same index, and therefore don't require index alignment. 
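Note: the `lit` TODO in the `_dask/namespace.py` hunk above describes its own fix. A toy sketch under stated assumptions: a Dask-level `narwhals_to_native_dtype` does not exist at this point in the series, so a hypothetical stand-in mapping is used here.

```python
# Toy sketch of the TODO in DaskNamespace.lit: cast the broadcast literal
# once a dask-level dtype translator exists. The helper below is a
# hypothetical stand-in, not the library's function.
import dask.dataframe as dd
import pandas as pd
import narwhals as nw


def narwhals_to_native_dtype(dtype):  # hypothetical stand-in
    return {nw.Int64: "int64", nw.Float64: "float64"}[dtype]


ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
lit_column = ddf.assign(lit=5).loc[:, "lit"].astype(narwhals_to_native_dtype(nw.Float64))
print(lit_column.compute())  # 5.0 repeated three times, dtype float64
```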
@@ -48,7 +48,7 @@ def parse_exprs_and_named_exprs( if hasattr(expr, "__narwhals_expr__"): _results = expr._call(df) elif isinstance(expr, str): - _results = [df._native_dataframe.loc[:, expr]] + _results = [df._native_frame.loc[:, expr]] else: # pragma: no cover msg = f"Expected expression or column name, got: {expr}" raise TypeError(msg) diff --git a/narwhals/_interchange/dataframe.py b/narwhals/_interchange/dataframe.py index c7fc5ea3d..bf1b17243 100644 --- a/narwhals/_interchange/dataframe.py +++ b/narwhals/_interchange/dataframe.py @@ -69,7 +69,7 @@ def map_interchange_dtype_to_narwhals_dtype( class InterchangeFrame: def __init__(self, df: Any) -> None: - self._native_dataframe = df + self._native_frame = df def __narwhals_dataframe__(self) -> Any: return self @@ -77,15 +77,15 @@ def __narwhals_dataframe__(self) -> Any: def __getitem__(self, item: str) -> InterchangeSeries: from narwhals._interchange.series import InterchangeSeries - return InterchangeSeries(self._native_dataframe.get_column_by_name(item)) + return InterchangeSeries(self._native_frame.get_column_by_name(item)) @property def schema(self) -> dict[str, dtypes.DType]: return { column_name: map_interchange_dtype_to_narwhals_dtype( - self._native_dataframe.get_column_by_name(column_name).dtype + self._native_frame.get_column_by_name(column_name).dtype ) - for column_name in self._native_dataframe.column_names() + for column_name in self._native_frame.column_names() } def __getattr__(self, attr: str) -> NoReturn: diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 97e402793..f6fb2489e 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -45,7 +45,7 @@ def __init__( backend_version: tuple[int, ...], ) -> None: self._validate_columns(native_dataframe.columns) - self._native_dataframe = native_dataframe + self._native_frame = native_dataframe self._implementation = implementation self._backend_version = backend_version @@ -71,7 +71,7 @@ def __native_namespace__(self) -> Any: raise AssertionError(msg) def __len__(self) -> int: - return len(self._native_dataframe) + return len(self._native_frame) def _validate_columns(self, columns: Sequence[str]) -> None: if len(columns) != len(set(columns)): @@ -83,7 +83,7 @@ def _validate_columns(self, columns: Sequence[str]) -> None: msg = "Please report a bug" # pragma: no cover raise AssertionError(msg) - def _from_native_dataframe(self, df: Any) -> Self: + def _from_native_frame(self, df: Any) -> Self: return self.__class__( df, implementation=self._implementation, @@ -94,7 +94,7 @@ def get_column(self, name: str) -> PandasLikeSeries: from narwhals._pandas_like.series import PandasLikeSeries return PandasLikeSeries( - self._native_dataframe.loc[:, name], + self._native_frame.loc[:, name], implementation=self._implementation, backend_version=self._backend_version, ) @@ -118,7 +118,7 @@ def __getitem__( from narwhals._pandas_like.series import PandasLikeSeries return PandasLikeSeries( - self._native_dataframe.loc[:, item], + self._native_frame.loc[:, item], implementation=self._implementation, backend_version=self._backend_version, ) @@ -129,13 +129,13 @@ def __getitem__( and isinstance(item[1], (tuple, list)) ): if all(isinstance(x, int) for x in item[1]): - return self._from_native_dataframe(self._native_dataframe.iloc[item]) + return self._from_native_frame(self._native_frame.iloc[item]) if all(isinstance(x, str) for x in item[1]): item = ( item[0], - self._native_dataframe.columns.get_indexer(item[1]), + 
self._native_frame.columns.get_indexer(item[1]), ) - return self._from_native_dataframe(self._native_dataframe.iloc[item]) + return self._from_native_frame(self._native_frame.iloc[item]) msg = ( f"Expected sequence str or int, got: {type(item[1])}" # pragma: no cover ) @@ -145,10 +145,10 @@ def __getitem__( from narwhals._pandas_like.series import PandasLikeSeries if isinstance(item[1], str): - item = (item[0], self._native_dataframe.columns.get_loc(item[1])) - native_series = self._native_dataframe.iloc[item] + item = (item[0], self._native_frame.columns.get_loc(item[1])) + native_series = self._native_frame.iloc[item] elif isinstance(item[1], int): - native_series = self._native_dataframe.iloc[item] + native_series = self._native_frame.iloc[item] else: # pragma: no cover msg = f"Expected str or int, got: {type(item[1])}" raise TypeError(msg) @@ -164,7 +164,7 @@ def __getitem__( and isinstance(item, np.ndarray) and item.ndim == 1 ): - return self._from_native_dataframe(self._native_dataframe.iloc[item]) + return self._from_native_frame(self._native_frame.iloc[item]) else: # pragma: no cover msg = f"Expected str or slice, got: {type(item)}" @@ -173,15 +173,15 @@ def __getitem__( # --- properties --- @property def columns(self) -> list[str]: - return self._native_dataframe.columns.tolist() # type: ignore[no-any-return] + return self._native_frame.columns.tolist() # type: ignore[no-any-return] def rows( self, *, named: bool = False ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: if not named: - return list(self._native_dataframe.itertuples(index=False, name=None)) + return list(self._native_frame.itertuples(index=False, name=None)) - return self._native_dataframe.to_dict(orient="records") # type: ignore[no-any-return] + return self._native_frame.to_dict(orient="records") # type: ignore[no-any-return] def iter_rows( self, @@ -195,19 +195,19 @@ def iter_rows( and has no effect on the output. """ if not named: - yield from self._native_dataframe.itertuples(index=False, name=None) + yield from self._native_frame.itertuples(index=False, name=None) else: - col_names = self._native_dataframe.columns + col_names = self._native_frame.columns yield from ( dict(zip(col_names, row)) - for row in self._native_dataframe.itertuples(index=False) + for row in self._native_frame.itertuples(index=False) ) # type: ignore[misc] @property def schema(self) -> dict[str, DType]: return { - col: translate_dtype(self._native_dataframe.loc[:, col]) - for col in self._native_dataframe.columns + col: translate_dtype(self._native_frame.loc[:, col]) + for col in self._native_frame.columns } def collect_schema(self) -> dict[str, DType]: @@ -221,36 +221,36 @@ def select( ) -> Self: if exprs and all(isinstance(x, str) for x in exprs) and not named_exprs: # This is a simple slice => fastpath! 
- return self._from_native_dataframe(self._native_dataframe.loc[:, exprs]) + return self._from_native_frame(self._native_frame.loc[:, exprs]) new_series = evaluate_into_exprs(self, *exprs, **named_exprs) if not new_series: # return empty dataframe, like Polars does - return self._from_native_dataframe(self._native_dataframe.__class__()) + return self._from_native_frame(self._native_frame.__class__()) new_series = broadcast_series(new_series) df = horizontal_concat( new_series, implementation=self._implementation, backend_version=self._backend_version, ) - return self._from_native_dataframe(df) + return self._from_native_frame(df) def drop_nulls(self, subset: str | list[str] | None) -> Self: if subset is None: - return self._from_native_dataframe(self._native_dataframe.dropna(axis=0)) + return self._from_native_frame(self._native_frame.dropna(axis=0)) subset = [subset] if isinstance(subset, str) else subset plx = self.__narwhals_namespace__() return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) def with_row_index(self, name: str) -> Self: row_index = create_native_series( - range(len(self._native_dataframe)), - index=self._native_dataframe.index, + range(len(self._native_frame)), + index=self._native_frame.index, implementation=self._implementation, backend_version=self._backend_version, ).alias(name) - return self._from_native_dataframe( + return self._from_native_frame( horizontal_concat( - [row_index._native_series, self._native_dataframe], + [row_index._native_series, self._native_frame], implementation=self._implementation, backend_version=self._backend_version, ) @@ -264,15 +264,15 @@ def filter( expr = plx.all_horizontal(*predicates) # Safety: all_horizontal's expression only returns a single column. mask = expr._call(self)[0] - _mask = validate_dataframe_comparand(self._native_dataframe.index, mask) - return self._from_native_dataframe(self._native_dataframe.loc[_mask]) + _mask = validate_dataframe_comparand(self._native_frame.index, mask) + return self._from_native_frame(self._native_frame.loc[_mask]) def with_columns( self, *exprs: IntoPandasLikeExpr, **named_exprs: IntoPandasLikeExpr, ) -> Self: - index = self._native_dataframe.index + index = self._native_frame.index new_columns = evaluate_into_exprs(self, *exprs, **named_exprs) if not new_columns and len(self) == 0: @@ -293,7 +293,7 @@ def with_columns( new_column_name_to_new_column_map = {s.name: s for s in new_columns} to_concat = [] # Make sure to preserve column order - for name in self._native_dataframe.columns: + for name in self._native_frame.columns: if name in new_column_name_to_new_column_map: to_concat.append( validate_dataframe_comparand( @@ -301,7 +301,7 @@ def with_columns( ) ) else: - to_concat.append(self._native_dataframe.loc[:, name]) + to_concat.append(self._native_frame.loc[:, name]) to_concat.extend( validate_dataframe_comparand(index, new_column_name_to_new_column_map[s]) for s in new_column_name_to_new_column_map @@ -313,18 +313,16 @@ def with_columns( backend_version=self._backend_version, ) else: - df = self._native_dataframe.copy(deep=False) + df = self._native_frame.copy(deep=False) for s in new_columns: df[s.name] = validate_dataframe_comparand(index, s) - return self._from_native_dataframe(df) + return self._from_native_frame(df) def rename(self, mapping: dict[str, str]) -> Self: - return self._from_native_dataframe(self._native_dataframe.rename(columns=mapping)) + return self._from_native_frame(self._native_frame.rename(columns=mapping)) def drop(self, *columns: str) -> Self: - 
return self._from_native_dataframe( - self._native_dataframe.drop(columns=list(columns)) - ) + return self._from_native_frame(self._native_frame.drop(columns=list(columns))) # --- transform --- def sort( @@ -334,17 +332,17 @@ def sort( descending: bool | Sequence[bool] = False, ) -> Self: flat_keys = flatten([*flatten([by]), *more_by]) - df = self._native_dataframe + df = self._native_frame if isinstance(descending, bool): ascending: bool | list[bool] = not descending else: ascending = [not d for d in descending] - return self._from_native_dataframe(df.sort_values(flat_keys, ascending=ascending)) + return self._from_native_frame(df.sort_values(flat_keys, ascending=ascending)) # --- convert --- def collect(self) -> PandasLikeDataFrame: return PandasLikeDataFrame( - self._native_dataframe, + self._native_frame, implementation=self._implementation, backend_version=self._backend_version, ) @@ -383,9 +381,9 @@ def join( n_bytes=8, columns=[*self.columns, *other.columns] ) - return self._from_native_dataframe( - self._native_dataframe.assign(**{key_token: 0}).merge( - other._native_dataframe.assign(**{key_token: 0}), + return self._from_native_frame( + self._native_frame.assign(**{key_token: 0}).merge( + other._native_frame.assign(**{key_token: 0}), how="inner", left_on=key_token, right_on=key_token, @@ -393,9 +391,9 @@ def join( ), ).drop(key_token) else: - return self._from_native_dataframe( - self._native_dataframe.merge( - other._native_dataframe, + return self._from_native_frame( + self._native_frame.merge( + other._native_frame, how="cross", suffixes=("", "_right"), ), @@ -407,14 +405,14 @@ def join( ) other_native = ( - other._native_dataframe.loc[:, right_on] + other._native_frame.loc[:, right_on] .rename( # rename to avoid creating extra columns in join columns=dict(zip(right_on, left_on)) # type: ignore[arg-type] ) .drop_duplicates() ) - return self._from_native_dataframe( - self._native_dataframe.merge( + return self._from_native_frame( + self._native_frame.merge( other_native, how="outer", indicator=indicator_token, @@ -427,14 +425,14 @@ def join( if how == "semi": other_native = ( - other._native_dataframe.loc[:, right_on] + other._native_frame.loc[:, right_on] .rename( # rename to avoid creating extra columns in join columns=dict(zip(right_on, left_on)) # type: ignore[arg-type] ) .drop_duplicates() # avoids potential rows duplication from inner join ) - return self._from_native_dataframe( - self._native_dataframe.merge( + return self._from_native_frame( + self._native_frame.merge( other_native, how="inner", left_on=left_on, @@ -443,8 +441,8 @@ def join( ) if how == "left": - other_native = other._native_dataframe - result_native = self._native_dataframe.merge( + other_native = other._native_frame + result_native = self._native_frame.merge( other_native, how="left", left_on=left_on, @@ -457,11 +455,11 @@ def join( extra.append(right_key) elif right_key != left_key: extra.append(f"{right_key}_right") - return self._from_native_dataframe(result_native.drop(columns=extra)) + return self._from_native_frame(result_native.drop(columns=extra)) - return self._from_native_dataframe( - self._native_dataframe.merge( - other._native_dataframe, + return self._from_native_frame( + self._native_frame.merge( + other._native_frame, left_on=left_on, right_on=right_on, how=how, @@ -472,10 +470,10 @@ def join( # --- partial reduction --- def head(self, n: int) -> Self: - return self._from_native_dataframe(self._native_dataframe.head(n)) + return self._from_native_frame(self._native_frame.head(n)) def 
tail(self, n: int) -> Self: - return self._from_native_dataframe(self._native_dataframe.tail(n)) + return self._from_native_frame(self._native_frame.tail(n)) def unique( self: Self, @@ -492,8 +490,8 @@ def unique( mapped_keep = {"none": False, "any": "first"}.get(keep, keep) subset = flatten(subset) if subset else None - return self._from_native_dataframe( - self._native_dataframe.drop_duplicates(subset=subset, keep=mapped_keep) + return self._from_native_frame( + self._native_frame.drop_duplicates(subset=subset, keep=mapped_keep) ) # --- lazy-only --- @@ -502,7 +500,7 @@ def lazy(self) -> Self: @property def shape(self) -> tuple[int, int]: - return self._native_dataframe.shape # type: ignore[no-any-return] + return self._native_frame.shape # type: ignore[no-any-return] def to_dict(self, *, as_series: bool = False) -> dict[str, Any]: from narwhals._pandas_like.series import PandasLikeSeries @@ -511,13 +509,13 @@ def to_dict(self, *, as_series: bool = False) -> dict[str, Any]: # TODO(Unassigned): should this return narwhals series? return { col: PandasLikeSeries( - self._native_dataframe.loc[:, col], + self._native_frame.loc[:, col], implementation=self._implementation, backend_version=self._backend_version, ) for col in self.columns } - return self._native_dataframe.to_dict(orient="list") # type: ignore[no-any-return] + return self._native_frame.to_dict(orient="list") # type: ignore[no-any-return] def to_numpy(self) -> Any: from narwhals._pandas_like.series import PANDAS_TO_NUMPY_DTYPE_MISSING @@ -526,48 +524,48 @@ def to_numpy(self) -> Any: # Series to numpy and let numpy find a common dtype. # If there aren't any dtypes where `to_numpy()` is "broken" (i.e. it # returns Object) then we just call `to_numpy()` on the DataFrame. - for dtype in self._native_dataframe.dtypes: + for dtype in self._native_frame.dtypes: if str(dtype) in PANDAS_TO_NUMPY_DTYPE_MISSING: import numpy as np return np.hstack([self[col].to_numpy()[:, None] for col in self.columns]) - return self._native_dataframe.to_numpy() + return self._native_frame.to_numpy() def to_pandas(self) -> Any: if self._implementation is Implementation.PANDAS: - return self._native_dataframe + return self._native_frame if self._implementation is Implementation.MODIN: # pragma: no cover - return self._native_dataframe._to_pandas() - return self._native_dataframe.to_pandas() # pragma: no cover + return self._native_frame._to_pandas() + return self._native_frame.to_pandas() # pragma: no cover def write_parquet(self, file: Any) -> Any: - self._native_dataframe.to_parquet(file) + self._native_frame.to_parquet(file) # --- descriptive --- def is_duplicated(self: Self) -> PandasLikeSeries: from narwhals._pandas_like.series import PandasLikeSeries return PandasLikeSeries( - self._native_dataframe.duplicated(keep=False), + self._native_frame.duplicated(keep=False), implementation=self._implementation, backend_version=self._backend_version, ) def is_empty(self: Self) -> bool: - return self._native_dataframe.empty # type: ignore[no-any-return] + return self._native_frame.empty # type: ignore[no-any-return] def is_unique(self: Self) -> PandasLikeSeries: from narwhals._pandas_like.series import PandasLikeSeries return PandasLikeSeries( - ~self._native_dataframe.duplicated(keep=False), + ~self._native_frame.duplicated(keep=False), implementation=self._implementation, backend_version=self._backend_version, ) def null_count(self: Self) -> PandasLikeDataFrame: return PandasLikeDataFrame( - self._native_dataframe.isna().sum(axis=0).to_frame().transpose(), + 
self._native_frame.isna().sum(axis=0).to_frame().transpose(), implementation=self._implementation, backend_version=self._backend_version, ) @@ -581,20 +579,20 @@ def item(self: Self, row: int | None = None, column: int | str | None = None) -> f" frame has shape {self.shape!r}" ) raise ValueError(msg) - return self._native_dataframe.iloc[0, 0] + return self._native_frame.iloc[0, 0] elif row is None or column is None: msg = "cannot call `.item()` with only one of `row` or `column`" raise ValueError(msg) _col = self.columns.index(column) if isinstance(column, str) else column - return self._native_dataframe.iloc[row, _col] + return self._native_frame.iloc[row, _col] def clone(self: Self) -> Self: - return self._from_native_dataframe(self._native_dataframe.copy()) + return self._from_native_frame(self._native_frame.copy()) def gather_every(self: Self, n: int, offset: int = 0) -> Self: - return self._from_native_dataframe(self._native_dataframe.iloc[offset::n]) + return self._from_native_frame(self._native_frame.iloc[offset::n]) def to_arrow(self: Self) -> Any: if self._implementation is Implementation.CUDF: # pragma: no cover @@ -602,4 +600,4 @@ def to_arrow(self: Self) -> Any: raise NotImplementedError(msg) pa = get_pyarrow() - return pa.Table.from_pandas(self._native_dataframe) + return pa.Table.from_pandas(self._native_frame) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 4353ae712..193b1786c 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -64,7 +64,7 @@ def from_column_names( def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [ PandasLikeSeries( - df._native_dataframe.loc[:, column_name], + df._native_frame.loc[:, column_name], implementation=df._implementation, backend_version=df._backend_version, ) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 129b047b5..11abc85c8 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -32,16 +32,16 @@ def __init__(self, df: PandasLikeDataFrame, keys: list[str]) -> None: self._df._implementation is Implementation.PANDAS and self._df._backend_version < (1, 0) ): # pragma: no cover - if self._df._native_dataframe.loc[:, self._keys].isna().any().any(): + if self._df._native_frame.loc[:, self._keys].isna().any().any(): msg = "Grouping by null values is not supported in pandas < 1.0.0" raise NotImplementedError(msg) - self._grouped = self._df._native_dataframe.groupby( + self._grouped = self._df._native_frame.groupby( list(self._keys), sort=False, as_index=True, ) else: - self._grouped = self._df._native_dataframe.groupby( + self._grouped = self._df._native_frame.groupby( list(self._keys), sort=False, as_index=True, @@ -75,13 +75,13 @@ def agg( exprs, self._keys, output_names, - self._from_native_dataframe, - dataframe_is_empty=self._df._native_dataframe.empty, + self._from_native_frame, + dataframe_is_empty=self._df._native_frame.empty, implementation=implementation, backend_version=self._df._backend_version, ) - def _from_native_dataframe(self, df: PandasLikeDataFrame) -> PandasLikeDataFrame: + def _from_native_frame(self, df: PandasLikeDataFrame) -> PandasLikeDataFrame: from narwhals._pandas_like.dataframe import PandasLikeDataFrame return PandasLikeDataFrame( @@ -100,9 +100,7 @@ def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]: category=FutureWarning, ) iterator = self._grouped.__iter__() - yield from ( - (key, self._from_native_dataframe(sub_df)) for (key, sub_df) in iterator - ) 
+ yield from ((key, self._from_native_frame(sub_df)) for (key, sub_df) in iterator) def agg_pandas( diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 13e2e99d3..1d07fde28 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -115,7 +115,7 @@ def all(self) -> PandasLikeExpr: return PandasLikeExpr( lambda df: [ PandasLikeSeries( - df._native_dataframe.loc[:, column_name], + df._native_frame.loc[:, column_name], implementation=self._implementation, backend_version=self._backend_version, ) @@ -134,7 +134,7 @@ def _lit_pandas_series(df: PandasLikeDataFrame) -> PandasLikeSeries: pandas_series = PandasLikeSeries._from_iterable( data=[value], name="lit", - index=df._native_dataframe.index[0:1], + index=df._native_frame.index[0:1], implementation=self._implementation, backend_version=self._backend_version, ) @@ -185,7 +185,7 @@ def len(self) -> PandasLikeExpr: return PandasLikeExpr( lambda df: [ PandasLikeSeries._from_iterable( - [len(df._native_dataframe)], + [len(df._native_frame)], name="len", index=[0], implementation=self._implementation, @@ -216,7 +216,7 @@ def concat( *, how: str = "vertical", ) -> PandasLikeDataFrame: - dfs: list[Any] = [item._native_dataframe for item in items] + dfs: list[Any] = [item._native_frame for item in items] if how == "horizontal": return PandasLikeDataFrame( horizontal_concat( diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index f137739fe..afef8187b 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -15,7 +15,7 @@ class PolarsDataFrame: def __init__(self, df: Any, *, backend_version: tuple[int, ...]) -> None: - self._native_dataframe = df + self._native_frame = df self._implementation = Implementation.POLARS self._backend_version = backend_version @@ -31,7 +31,7 @@ def __narwhals_namespace__(self) -> PolarsNamespace: def __native_namespace__(self) -> Any: return get_polars() - def _from_native_dataframe(self, df: Any) -> Self: + def _from_native_frame(self, df: Any) -> Self: return self.__class__(df, backend_version=self._backend_version) def _from_native_object(self, obj: Any) -> Any: @@ -41,7 +41,7 @@ def _from_native_object(self, obj: Any) -> Any: return PolarsSeries(obj, backend_version=self._backend_version) if isinstance(obj, pl.DataFrame): - return self._from_native_dataframe(obj) + return self._from_native_frame(obj) # scalar return obj @@ -52,30 +52,30 @@ def __getattr__(self, attr: str) -> Any: def func(*args: Any, **kwargs: Any) -> Any: args, kwargs = extract_args_kwargs(args, kwargs) # type: ignore[assignment] return self._from_native_object( - getattr(self._native_dataframe, attr)(*args, **kwargs) + getattr(self._native_frame, attr)(*args, **kwargs) ) return func @property def schema(self) -> dict[str, Any]: - schema = self._native_dataframe.schema + schema = self._native_frame.schema return {name: translate_dtype(dtype) for name, dtype in schema.items()} def collect_schema(self) -> dict[str, Any]: if self._backend_version < (1,): # pragma: no cover - schema = self._native_dataframe.schema + schema = self._native_frame.schema else: - schema = dict(self._native_dataframe.collect_schema()) + schema = dict(self._native_frame.collect_schema()) return {name: translate_dtype(dtype) for name, dtype in schema.items()} @property def shape(self) -> tuple[int, int]: - return self._native_dataframe.shape # type: ignore[no-any-return] + return self._native_frame.shape # type: ignore[no-any-return] def __getitem__(self, item: 
Any) -> Any: pl = get_polars() - result = self._native_dataframe.__getitem__(item) + result = self._native_frame.__getitem__(item) if isinstance(result, pl.Series): from narwhals._polars.series import PolarsSeries @@ -86,23 +86,23 @@ def get_column(self, name: str) -> Any: from narwhals._polars.series import PolarsSeries return PolarsSeries( - self._native_dataframe.get_column(name), backend_version=self._backend_version + self._native_frame.get_column(name), backend_version=self._backend_version ) def is_empty(self) -> bool: - return len(self._native_dataframe) == 0 + return len(self._native_frame) == 0 @property def columns(self) -> list[str]: - return self._native_dataframe.columns # type: ignore[no-any-return] + return self._native_frame.columns # type: ignore[no-any-return] def lazy(self) -> PolarsLazyFrame: return PolarsLazyFrame( - self._native_dataframe.lazy(), backend_version=self._backend_version + self._native_frame.lazy(), backend_version=self._backend_version ) def to_dict(self, *, as_series: bool) -> Any: - df = self._native_dataframe + df = self._native_frame if as_series: from narwhals._polars.series import PolarsSeries @@ -121,15 +121,13 @@ def group_by(self, *by: str) -> Any: def with_row_index(self, name: str) -> Any: if self._backend_version < (0, 20, 4): # pragma: no cover - return self._from_native_dataframe( - self._native_dataframe.with_row_count(name) - ) - return self._from_native_dataframe(self._native_dataframe.with_row_index(name)) + return self._from_native_frame(self._native_frame.with_row_count(name)) + return self._from_native_frame(self._native_frame.with_row_index(name)) class PolarsLazyFrame: def __init__(self, df: Any, *, backend_version: tuple[int, ...]) -> None: - self._native_dataframe = df + self._native_frame = df self._backend_version = backend_version def __repr__(self) -> str: # pragma: no cover @@ -144,37 +142,37 @@ def __narwhals_namespace__(self) -> PolarsNamespace: def __native_namespace__(self) -> Any: # pragma: no cover return get_polars() - def _from_native_dataframe(self, df: Any) -> Self: + def _from_native_frame(self, df: Any) -> Self: return self.__class__(df, backend_version=self._backend_version) def __getattr__(self, attr: str) -> Any: def func(*args: Any, **kwargs: Any) -> Any: args, kwargs = extract_args_kwargs(args, kwargs) # type: ignore[assignment] - return self._from_native_dataframe( - getattr(self._native_dataframe, attr)(*args, **kwargs) + return self._from_native_frame( + getattr(self._native_frame, attr)(*args, **kwargs) ) return func @property def columns(self) -> list[str]: - return self._native_dataframe.columns # type: ignore[no-any-return] + return self._native_frame.columns # type: ignore[no-any-return] @property def schema(self) -> dict[str, Any]: - schema = self._native_dataframe.schema + schema = self._native_frame.schema return {name: translate_dtype(dtype) for name, dtype in schema.items()} def collect_schema(self) -> dict[str, Any]: if self._backend_version < (1,): # pragma: no cover - schema = self._native_dataframe.schema + schema = self._native_frame.schema else: - schema = dict(self._native_dataframe.collect_schema()) + schema = dict(self._native_frame.collect_schema()) return {name: translate_dtype(dtype) for name, dtype in schema.items()} def collect(self) -> PolarsDataFrame: return PolarsDataFrame( - self._native_dataframe.collect(), backend_version=self._backend_version + self._native_frame.collect(), backend_version=self._backend_version ) def group_by(self, *by: str) -> Any: @@ -184,7 +182,5 @@ def 
group_by(self, *by: str) -> Any: def with_row_index(self, name: str) -> Any: if self._backend_version < (0, 20, 4): # pragma: no cover - return self._from_native_dataframe( - self._native_dataframe.with_row_count(name) - ) - return self._from_native_dataframe(self._native_dataframe.with_row_index(name)) + return self._from_native_frame(self._native_frame.with_row_count(name)) + return self._from_native_frame(self._native_frame.with_row_index(name)) diff --git a/narwhals/_polars/group_by.py b/narwhals/_polars/group_by.py index c0de75736..f03da610e 100644 --- a/narwhals/_polars/group_by.py +++ b/narwhals/_polars/group_by.py @@ -14,27 +14,27 @@ class PolarsGroupBy: def __init__(self, df: Any, keys: list[str]) -> None: self._compliant_frame = df self.keys = keys - self._grouped = df._native_dataframe.group_by(keys) + self._grouped = df._native_frame.group_by(keys) def agg(self, *aggs: Any, **named_aggs: Any) -> PolarsDataFrame: aggs, named_aggs = extract_args_kwargs(aggs, named_aggs) # type: ignore[assignment] - return self._compliant_frame._from_native_dataframe( # type: ignore[no-any-return] + return self._compliant_frame._from_native_frame( # type: ignore[no-any-return] self._grouped.agg(*aggs, **named_aggs), ) def __iter__(self) -> Any: for key, df in self._grouped: - yield tuple(key), self._compliant_frame._from_native_dataframe(df) + yield tuple(key), self._compliant_frame._from_native_frame(df) class PolarsLazyGroupBy: def __init__(self, df: Any, keys: list[str]) -> None: self._compliant_frame = df self.keys = keys - self._grouped = df._native_dataframe.group_by(keys) + self._grouped = df._native_frame.group_by(keys) def agg(self, *aggs: Any, **named_aggs: Any) -> PolarsLazyFrame: aggs, named_aggs = extract_args_kwargs(aggs, named_aggs) # type: ignore[assignment] - return self._compliant_frame._from_native_dataframe( # type: ignore[no-any-return] + return self._compliant_frame._from_native_frame( # type: ignore[no-any-return] self._grouped.agg(*aggs, **named_aggs), ) diff --git a/narwhals/_polars/namespace.py b/narwhals/_polars/namespace.py index 5a34554a4..ef6b00730 100644 --- a/narwhals/_polars/namespace.py +++ b/narwhals/_polars/namespace.py @@ -71,7 +71,7 @@ def concat( from narwhals._polars.dataframe import PolarsLazyFrame pl = get_polars() - dfs: list[Any] = [item._native_dataframe for item in items] + dfs: list[Any] = [item._native_frame for item in items] result = pl.concat(dfs, how=how) if isinstance(result, pl.DataFrame): return PolarsDataFrame(result, backend_version=items[0]._backend_version) diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 7c7dbe0fa..51f0b1898 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -13,7 +13,7 @@ def extract_native(obj: Any) -> Any: from narwhals._polars.series import PolarsSeries if isinstance(obj, (PolarsDataFrame, PolarsLazyFrame)): - return obj._native_dataframe + return obj._native_frame if isinstance(obj, PolarsSeries): return obj._native_series if isinstance(obj, PolarsExpr): diff --git a/narwhals/translate.py b/narwhals/translate.py index 00c2b05f0..2779822cd 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -60,7 +60,7 @@ def to_native( from narwhals.series import Series if isinstance(narwhals_object, BaseFrame): - return narwhals_object._compliant_frame._native_dataframe + return narwhals_object._compliant_frame._native_frame if isinstance(narwhals_object, Series): return narwhals_object._compliant_series._native_series diff --git a/narwhals/utils.py b/narwhals/utils.py 
index 2034c6feb..78299ba4b 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -177,23 +177,23 @@ def _validate_index(index: Any) -> None: if isinstance( getattr(lhs_any, "_compliant_frame", None), PandasLikeDataFrame ) and isinstance(getattr(rhs_any, "_compliant_frame", None), PandasLikeDataFrame): - _validate_index(lhs_any._compliant_frame._native_dataframe.index) - _validate_index(rhs_any._compliant_frame._native_dataframe.index) + _validate_index(lhs_any._compliant_frame._native_frame.index) + _validate_index(rhs_any._compliant_frame._native_frame.index) return lhs_any._from_compliant_dataframe( # type: ignore[no-any-return] - lhs_any._compliant_frame._from_native_dataframe( - lhs_any._compliant_frame._native_dataframe.loc[ - rhs_any._compliant_frame._native_dataframe.index + lhs_any._compliant_frame._from_native_frame( + lhs_any._compliant_frame._native_frame.loc[ + rhs_any._compliant_frame._native_frame.index ] ) ) if isinstance( getattr(lhs_any, "_compliant_frame", None), PandasLikeDataFrame ) and isinstance(getattr(rhs_any, "_compliant_series", None), PandasLikeSeries): - _validate_index(lhs_any._compliant_frame._native_dataframe.index) + _validate_index(lhs_any._compliant_frame._native_frame.index) _validate_index(rhs_any._compliant_series._native_series.index) return lhs_any._from_compliant_dataframe( # type: ignore[no-any-return] - lhs_any._compliant_frame._from_native_dataframe( - lhs_any._compliant_frame._native_dataframe.loc[ + lhs_any._compliant_frame._from_native_frame( + lhs_any._compliant_frame._native_frame.loc[ rhs_any._compliant_series._native_series.index ] ) @@ -202,11 +202,11 @@ def _validate_index(index: Any) -> None: getattr(lhs_any, "_compliant_series", None), PandasLikeSeries ) and isinstance(getattr(rhs_any, "_compliant_frame", None), PandasLikeDataFrame): _validate_index(lhs_any._compliant_series._native_series.index) - _validate_index(rhs_any._compliant_frame._native_dataframe.index) + _validate_index(rhs_any._compliant_frame._native_frame.index) return lhs_any._from_compliant_series( # type: ignore[no-any-return] lhs_any._compliant_series._from_native_series( lhs_any._compliant_series._native_series.loc[ - rhs_any._compliant_frame._native_dataframe.index + rhs_any._compliant_frame._native_frame.index ] ) ) @@ -256,8 +256,8 @@ def maybe_set_index(df: T, column_names: str | list[str]) -> T: df_any = cast(Any, df) if isinstance(getattr(df_any, "_compliant_frame", None), PandasLikeDataFrame): return df_any._from_compliant_dataframe( # type: ignore[no-any-return] - df_any._compliant_frame._from_native_dataframe( - df_any._compliant_frame._native_dataframe.set_index(column_names) + df_any._compliant_frame._from_native_frame( + df_any._compliant_frame._native_frame.set_index(column_names) ) ) return df @@ -293,8 +293,8 @@ def maybe_convert_dtypes(df: T, *args: bool, **kwargs: bool | str) -> T: df_any = cast(Any, df) if isinstance(getattr(df_any, "_compliant_frame", None), PandasLikeDataFrame): return df_any._from_compliant_dataframe( # type: ignore[no-any-return] - df_any._compliant_frame._from_native_dataframe( - df_any._compliant_frame._native_dataframe.convert_dtypes(*args, **kwargs) + df_any._compliant_frame._from_native_frame( + df_any._compliant_frame._native_frame.convert_dtypes(*args, **kwargs) ) ) return df From 6522a62a941008ee8fcfc24967eabfbf88d8d8fb Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 13 Aug 2024 18:04:39 +0200 Subject: [PATCH 17/21] feat: extend dataframe `drop` method (#773) 
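This extends `drop` with a `strict` keyword across all backends: with `strict=True` (the default), asking to drop a column that is not in the schema raises `ColumnNotFoundError`, while with `strict=False` missing names are silently ignored, matching Polars >= 1.0 behaviour. A minimal sketch of the resulting API (the data is illustrative; any eager backend behaves the same way):

    import pandas as pd

    import narwhals.stable.v1 as nw

    df = nw.from_native(pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
    df.drop("a").columns             # ['b']
    df.drop("a", "z", strict=False)  # the missing "z" is silently ignored
    df.drop("a", "z")                # raises ColumnNotFoundError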
--- narwhals/_arrow/dataframe.py | 16 +++++++--- narwhals/_dask/dataframe.py | 19 +++++++---- narwhals/_exceptions.py | 4 +++ narwhals/_pandas_like/dataframe.py | 18 +++++++---- narwhals/_polars/dataframe.py | 14 ++++++++ narwhals/dataframe.py | 24 +++++++++----- narwhals/utils.py | 19 +++++++++++ tests/frame/drop_test.py | 51 ++++++++++++++++++++++++++---- 8 files changed, 133 insertions(+), 32 deletions(-) create mode 100644 narwhals/_exceptions.py diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 7497c21f1..880e6d952 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -19,6 +19,7 @@ from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_unique_token +from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: from typing_extensions import Self @@ -264,14 +265,16 @@ def join( ) return self._from_native_frame( - self.with_columns(**{key_token: plx.lit(0, None)})._native_frame.join( + self.with_columns(**{key_token: plx.lit(0, None)}) + ._native_frame.join( other.with_columns(**{key_token: plx.lit(0, None)})._native_frame, keys=key_token, right_keys=key_token, join_type="inner", right_suffix="_right", - ), - ).drop(key_token) + ) + .drop([key_token]), + ) return self._from_native_frame( self._native_frame.join( @@ -283,8 +286,11 @@ def join( ), ) - def drop(self, *columns: str) -> Self: - return self._from_native_frame(self._native_frame.drop(list(columns))) + def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 + to_drop = parse_columns_to_drop( + compliant_frame=self, columns=columns, strict=strict + ) + return self._from_native_frame(self._native_frame.drop(to_drop)) def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: if subset is None: diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 33f3c8b85..cf3c6cc12 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -14,6 +14,7 @@ from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_unique_token +from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version if TYPE_CHECKING: @@ -129,8 +130,12 @@ def schema(self) -> dict[str, DType]: def collect_schema(self) -> dict[str, DType]: return self.schema - def drop(self: Self, *columns: str) -> Self: - return self._from_native_frame(self._native_frame.drop(columns=list(columns))) + def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 + to_drop = parse_columns_to_drop( + compliant_frame=self, columns=columns, strict=strict + ) + + return self._from_native_frame(self._native_frame.drop(columns=to_drop)) def with_row_index(self: Self, name: str) -> Self: # Implementation is based on the following StackOverflow reply: @@ -204,14 +209,16 @@ def join( ) return self._from_native_frame( - self._native_frame.assign(**{key_token: 0}).merge( + self._native_frame.assign(**{key_token: 0}) + .merge( other._native_frame.assign(**{key_token: 0}), how="inner", left_on=key_token, right_on=key_token, suffixes=("", "_right"), - ), - ).drop(key_token) + ) + .drop(columns=key_token), + ) if how == "anti": indicator_token = generate_unique_token( @@ -297,5 +304,5 @@ def gather_every(self: Self, n: int, offset: int) -> Self: pln.col(row_index_token) >= offset, # type: ignore[operator] (pln.col(row_index_token) - offset) % n == 0, # type: ignore[arg-type] ) - .drop(row_index_token) + 
.drop([row_index_token], strict=False) ) diff --git a/narwhals/_exceptions.py b/narwhals/_exceptions.py new file mode 100644 index 000000000..189954516 --- /dev/null +++ b/narwhals/_exceptions.py @@ -0,0 +1,4 @@ +from __future__ import annotations + + +class ColumnNotFoundError(Exception): ... diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index f6fb2489e..a8729eeec 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -24,6 +24,7 @@ from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_unique_token +from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: from typing_extensions import Self @@ -321,8 +322,11 @@ def with_columns( def rename(self, mapping: dict[str, str]) -> Self: return self._from_native_frame(self._native_frame.rename(columns=mapping)) - def drop(self, *columns: str) -> Self: - return self._from_native_frame(self._native_frame.drop(columns=list(columns))) + def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 + to_drop = parse_columns_to_drop( + compliant_frame=self, columns=columns, strict=strict + ) + return self._from_native_frame(self._native_frame.drop(columns=to_drop)) # --- transform --- def sort( @@ -382,14 +386,16 @@ def join( ) return self._from_native_frame( - self._native_frame.assign(**{key_token: 0}).merge( + self._native_frame.assign(**{key_token: 0}) + .merge( other._native_frame.assign(**{key_token: 0}), how="inner", left_on=key_token, right_on=key_token, suffixes=("", "_right"), - ), - ).drop(key_token) + ) + .drop(columns=key_token), + ) else: return self._from_native_frame( self._native_frame.merge( @@ -420,7 +426,7 @@ def join( right_on=left_on, ) .loc[lambda t: t[indicator_token] == "left_only"] - .drop(columns=[indicator_token]) + .drop(columns=indicator_token) ) if how == "semi": diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index afef8187b..2137f8913 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -8,6 +8,7 @@ from narwhals._polars.utils import translate_dtype from narwhals.dependencies import get_polars from narwhals.utils import Implementation +from narwhals.utils import parse_columns_to_drop if TYPE_CHECKING: from typing_extensions import Self @@ -124,6 +125,14 @@ def with_row_index(self, name: str) -> Any: return self._from_native_frame(self._native_frame.with_row_count(name)) return self._from_native_frame(self._native_frame.with_row_index(name)) + def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 + if self._backend_version < (1, 0, 0): # pragma: no cover + to_drop = parse_columns_to_drop( + compliant_frame=self, columns=columns, strict=strict + ) + return self._from_native_frame(self._native_frame.drop(to_drop)) + return self._from_native_frame(self._native_frame.drop(columns, strict=strict)) + class PolarsLazyFrame: def __init__(self, df: Any, *, backend_version: tuple[int, ...]) -> None: @@ -184,3 +193,8 @@ def with_row_index(self, name: str) -> Any: if self._backend_version < (0, 20, 4): # pragma: no cover return self._from_native_frame(self._native_frame.with_row_count(name)) return self._from_native_frame(self._native_frame.with_row_index(name)) + + def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 + if self._backend_version < (1, 0, 0): # pragma: no cover + return self._from_native_frame(self._native_frame.drop(columns)) + return 
self._from_native_frame(self._native_frame.drop(columns, strict=strict)) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 487767c34..bd6ef1155 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -137,9 +137,9 @@ def head(self, n: int) -> Self: def tail(self, n: int) -> Self: return self._from_compliant_dataframe(self._compliant_frame.tail(n)) - def drop(self, *columns: str | Iterable[str]) -> Self: + def drop(self, *columns: Iterable[str], strict: bool) -> Self: return self._from_compliant_dataframe( - self._compliant_frame.drop(*flatten(columns)) + self._compliant_frame.drop(columns, strict=strict) ) def unique( @@ -1286,12 +1286,14 @@ def tail(self, n: int = 5) -> Self: """ return super().tail(n) - def drop(self, *columns: str | Iterable[str]) -> Self: + def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: """ Remove columns from the dataframe. Arguments: *columns: Names of the columns that should be removed from the dataframe. + strict: Validate that all column names exist in the schema and throw an + exception if a column name does not exist in the schema. Examples: >>> import pandas as pd @@ -1349,7 +1351,7 @@ def drop(self, *columns: str | Iterable[str]) -> Self: │ 8.0 │ └─────┘ """ - return super().drop(*columns) + return super().drop(*flatten(columns), strict=strict) def unique( self, @@ -2743,13 +2745,19 @@ def tail(self, n: int = 5) -> Self: """ return super().tail(n) - def drop(self, *columns: str | Iterable[str]) -> Self: + def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: r""" Remove columns from the LazyFrame. Arguments: - *columns: Names of the columns that should be removed from the - dataframe. Accepts column selector input. + *columns: Names of the columns that should be removed from the dataframe. + strict: Validate that all column names exist in the schema and throw an + exception if a column name does not exist in the schema. + + Warning: + The `strict` argument is ignored for `polars<1.0.0`. + + Please consider upgrading to a newer version or switching to eager mode. 
Examples: >>> import pandas as pd @@ -2807,7 +2815,7 @@ def drop(self, *columns: str | Iterable[str]) -> Self: │ 8.0 │ └─────┘ """ - return super().drop(*flatten(columns)) + return super().drop(*flatten(columns), strict=strict) def unique( self, diff --git a/narwhals/utils.py b/narwhals/utils.py index 78299ba4b..512099cc5 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -12,6 +12,7 @@ from typing import cast from narwhals import dtypes +from narwhals._exceptions import ColumnNotFoundError from narwhals.dependencies import get_cudf from narwhals.dependencies import get_modin from narwhals.dependencies import get_pandas @@ -395,3 +396,21 @@ def generate_unique_token(n_bytes: int, columns: list[str]) -> str: # pragma: n "join operation" ) raise AssertionError(msg) + + +def parse_columns_to_drop( + compliant_frame: Any, + columns: Iterable[str], + strict: bool, # noqa: FBT001 +) -> list[str]: + cols = set(compliant_frame.columns) + to_drop = list(columns) + + if strict: + for d in to_drop: + if d not in cols: + msg = f'"{d}" not found' + raise ColumnNotFoundError(msg) + else: + to_drop = list(cols.intersection(set(to_drop))) + return to_drop diff --git a/tests/frame/drop_test.py b/tests/frame/drop_test.py index f22d148b1..547ddc748 100644 --- a/tests/frame/drop_test.py +++ b/tests/frame/drop_test.py @@ -1,21 +1,58 @@ from __future__ import annotations +from contextlib import nullcontext as does_not_raise from typing import Any +import polars as pl import pytest +from polars.exceptions import ColumnNotFoundError as PlColumnNotFoundError import narwhals.stable.v1 as nw +from narwhals._exceptions import ColumnNotFoundError +from narwhals.utils import parse_version @pytest.mark.parametrize( - ("drop", "left"), + ("to_drop", "expected"), [ - (["a"], ["b", "z"]), - (["a", "b"], ["z"]), + ("abc", ["b", "z"]), + (["abc"], ["b", "z"]), + (["abc", "b"], ["z"]), ], ) -def test_drop(constructor: Any, drop: list[str], left: list[str]) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} +def test_drop(constructor: Any, to_drop: list[str], expected: list[str]) -> None: + data = {"abc": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) - assert df.drop(drop).collect_schema().names() == left - assert df.drop(*drop).collect_schema().names() == left + assert df.drop(to_drop).collect_schema().names() == expected + if not isinstance(to_drop, str): + assert df.drop(*to_drop).collect_schema().names() == expected + + +@pytest.mark.parametrize( + ("strict", "context"), + [ + ( + True, + pytest.raises( + (ColumnNotFoundError, PlColumnNotFoundError), match='"z" not found' + ), + ), + (False, does_not_raise()), + ], +) +def test_drop_strict(request: Any, constructor: Any, strict: bool, context: Any) -> None: # noqa: FBT001 + if ( + "polars_lazy" in str(request) + and parse_version(pl.__version__) < (1, 0, 0) + and strict + ): + request.applymarker(pytest.mark.xfail) + + data = {"a": [1, 3, 2], "b": [4, 4, 6]} + to_drop = ["a", "z"] + + df = nw.from_native(constructor(data)) + + with context: + names_out = df.drop(to_drop, strict=strict).collect_schema().names() + assert names_out == ["b"] From 6075ec7fab33e32e29dcf7d57d7421ab1d1e8090 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 13 Aug 2024 19:19:52 +0100 Subject: [PATCH 18/21] feat: improve typing for `to_pandas` / `to_numpy` / `to_arrow` (#783) --- .pre-commit-config.yaml | 4 +- narwhals/_arrow/dataframe.py | 16 +++---- narwhals/_arrow/series.py | 16 ++++--- narwhals/_expression_parsing.py | 4 
+- narwhals/_pandas_like/dataframe.py | 8 ++-- narwhals/_pandas_like/series.py | 4 +- narwhals/dataframe.py | 14 +++--- narwhals/dependencies.py | 25 ++++++++++ narwhals/expr.py | 4 +- narwhals/series.py | 8 ++-- narwhals/translate.py | 19 ++++++-- pyproject.toml | 1 + utils/import_check.py | 75 ++++++++++++++++++++++++++++++ 13 files changed, 156 insertions(+), 42 deletions(-) create mode 100644 utils/import_check.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 757cff1db..9ccb3bf28 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,8 +31,8 @@ repos: additional_dependencies: [polars] - id: imports-are-banned name: import are banned (use `get_pandas` instead of `import pandas`) - entry: (?>> )import (pandas|polars|modin|cudf|pyarrow|dask) - language: pygrep + entry: python utils/import_check.py + language: python files: ^narwhals/ exclude: ^narwhals/dependencies\.py - repo: https://github.com/kynan/nbstripout diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 880e6d952..75b9068a0 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -12,10 +12,10 @@ from narwhals._arrow.utils import translate_dtype from narwhals._arrow.utils import validate_dataframe_comparand from narwhals._expression_parsing import evaluate_into_exprs -from narwhals.dependencies import get_numpy from narwhals.dependencies import get_pyarrow from narwhals.dependencies import get_pyarrow_compute from narwhals.dependencies import get_pyarrow_parquet +from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_unique_token @@ -155,11 +155,7 @@ def __getitem__( self._native_frame.slice(item.start, stop - start), ) - elif isinstance(item, Sequence) or ( - (np := get_numpy()) is not None - and isinstance(item, np.ndarray) - and item.ndim == 1 - ): + elif isinstance(item, Sequence) or (is_numpy_array(item) and item.ndim == 1): return self._from_native_frame(self._native_frame.take(item)) else: # pragma: no cover @@ -322,7 +318,7 @@ def to_pandas(self) -> Any: return self._native_frame.to_pandas() def to_numpy(self) -> Any: - import numpy as np + import numpy as np # ignore-banned-import return np.column_stack([col.to_numpy() for col in self._native_frame.columns]) @@ -423,9 +419,10 @@ def write_parquet(self, file: Any) -> Any: pp.write_table(self._native_frame, file) def is_duplicated(self: Self) -> ArrowSeries: + import numpy as np # ignore-banned-import + from narwhals._arrow.series import ArrowSeries - np = get_numpy() pa = get_pyarrow() pc = get_pyarrow_compute() df = self._native_frame @@ -468,7 +465,8 @@ def unique( and has no effect on the output. 
""" - np = get_numpy() + import numpy as np # ignore-banned-import + pa = get_pyarrow() pc = get_pyarrow_compute() diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 40a94315e..20513c39e 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -12,7 +12,6 @@ from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import translate_dtype from narwhals._arrow.utils import validate_column_comparand -from narwhals.dependencies import get_numpy from narwhals.dependencies import get_pandas from narwhals.dependencies import get_pyarrow from narwhals.dependencies import get_pyarrow_compute @@ -393,7 +392,8 @@ def is_in(self, other: Any) -> Self: return self._from_native_series(pc.is_in(ser, value_set=value_set)) def arg_true(self) -> Self: - np = get_numpy() + import numpy as np # ignore-banned-import + ser = self._native_series res = np.flatnonzero(ser) return self._from_iterable( @@ -465,7 +465,8 @@ def sample( *, with_replacement: bool = False, ) -> Self: - np = get_numpy() + import numpy as np # ignore-banned-import + pc = get_pyarrow_compute() ser = self._native_series num_rows = len(self) @@ -503,7 +504,8 @@ def is_unique(self: Self) -> ArrowSeries: return self.to_frame().is_unique().alias(self.name) def is_first_distinct(self: Self) -> Self: - np = get_numpy() + import numpy as np # ignore-banned-import + pa = get_pyarrow() pc = get_pyarrow_compute() @@ -520,7 +522,8 @@ def is_first_distinct(self: Self) -> Self: return self._from_native_series(pc.is_in(row_number, first_distinct_index)) def is_last_distinct(self: Self) -> Self: - np = get_numpy() + import numpy as np # ignore-banned-import + pa = get_pyarrow() pc = get_pyarrow_compute() @@ -567,9 +570,10 @@ def sort( def to_dummies( self: Self, *, separator: str = "_", drop_first: bool = False ) -> ArrowDataFrame: + import numpy as np # ignore-banned-import + from narwhals._arrow.dataframe import ArrowDataFrame - np = get_numpy() pa = get_pyarrow() series = self._native_series diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index dfc338dc9..4c642239d 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -11,7 +11,7 @@ from typing import cast from typing import overload -from narwhals.dependencies import get_numpy +from narwhals.dependencies import is_numpy_array from narwhals.utils import flatten if TYPE_CHECKING: @@ -170,7 +170,7 @@ def parse_into_expr( return namespace._create_expr_from_series(into_expr) # type: ignore[arg-type] if isinstance(into_expr, str): return namespace.col(into_expr) - if (np := get_numpy()) is not None and isinstance(into_expr, np.ndarray): + if is_numpy_array(into_expr): series = namespace._create_compliant_series(into_expr) return namespace._create_expr_from_series(series) # type: ignore[arg-type] msg = f"Expected IntoExpr, got {type(into_expr)}" # pragma: no cover diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index a8729eeec..b2a819a0e 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -18,9 +18,9 @@ from narwhals._pandas_like.utils import validate_dataframe_comparand from narwhals.dependencies import get_cudf from narwhals.dependencies import get_modin -from narwhals.dependencies import get_numpy from narwhals.dependencies import get_pandas from narwhals.dependencies import get_pyarrow +from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation from narwhals.utils import 
flatten from narwhals.utils import generate_unique_token @@ -161,9 +161,7 @@ def __getitem__( ) elif isinstance(item, (slice, Sequence)) or ( - (np := get_numpy()) is not None - and isinstance(item, np.ndarray) - and item.ndim == 1 + is_numpy_array(item) and item.ndim == 1 ): return self._from_native_frame(self._native_frame.iloc[item]) @@ -532,7 +530,7 @@ def to_numpy(self) -> Any: # returns Object) then we just call `to_numpy()` on the DataFrame. for dtype in self._native_frame.dtypes: if str(dtype) in PANDAS_TO_NUMPY_DTYPE_MISSING: - import numpy as np + import numpy as np # ignore-banned-import return np.hstack([self[col].to_numpy()[:, None] for col in self.columns]) return self._native_frame.to_numpy() diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 090fe9495..51b4cbd72 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -15,7 +15,6 @@ from narwhals._pandas_like.utils import validate_column_comparand from narwhals.dependencies import get_cudf from narwhals.dependencies import get_modin -from narwhals.dependencies import get_numpy from narwhals.dependencies import get_pandas from narwhals.dependencies import get_pyarrow from narwhals.dependencies import get_pyarrow_compute @@ -224,7 +223,8 @@ def is_in(self, other: Any) -> PandasLikeSeries: return self._from_native_series(res) def arg_true(self) -> PandasLikeSeries: - np = get_numpy() + import numpy as np # ignore-banned-import + ser = self._native_series res = np.flatnonzero(ser) return self._from_native_series( diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index bd6ef1155..039496744 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -11,8 +11,8 @@ from typing import TypeVar from typing import overload -from narwhals.dependencies import get_numpy from narwhals.dependencies import get_polars +from narwhals.dependencies import is_numpy_array from narwhals.schema import Schema from narwhals.utils import flatten @@ -21,6 +21,8 @@ from pathlib import Path import numpy as np + import pandas as pd + import pyarrow as pa from typing_extensions import Self from narwhals.group_by import GroupBy @@ -281,7 +283,7 @@ def lazy(self) -> LazyFrame[Any]: """ return super().lazy() - def to_pandas(self) -> Any: + def to_pandas(self) -> pd.DataFrame: """ Convert this DataFrame to a pandas DataFrame. @@ -343,7 +345,7 @@ def write_parquet(self, file: str | Path | BytesIO) -> Any: """ self._compliant_frame.write_parquet(file) - def to_numpy(self) -> Any: + def to_numpy(self) -> np.ndarray: """ Convert this DataFrame to a NumPy ndarray. @@ -551,9 +553,7 @@ def __getitem__( ) elif isinstance(item, (Sequence, slice)) or ( - (np := get_numpy()) is not None - and isinstance(item, np.ndarray) - and item.ndim == 1 + is_numpy_array(item) and item.ndim == 1 ): return self._from_compliant_dataframe(self._compliant_frame[item]) @@ -2026,7 +2026,7 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: """ return super().gather_every(n=n, offset=offset) - def to_arrow(self: Self) -> Any: + def to_arrow(self: Self) -> pa.Table: r""" Convert to arrow table. 
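With the conversion methods' return types pinned down, type checkers can follow `to_pandas` / `to_numpy` / `to_arrow` without narwhals paying any import cost at runtime. A minimal sketch of what this buys downstream (the function name is illustrative):

    from typing import Any

    import narwhals as nw

    def as_matrix(native_df: Any) -> Any:
        df = nw.from_native(native_df, eager_only=True)
        arr = df.to_numpy()  # now typed as np.ndarray rather than Any
        return arr.T         # ndarray attributes type-check without casts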
diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index 9eebdb703..663b5dde6 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -8,11 +8,14 @@ from typing import Any if TYPE_CHECKING: + import numpy as np + if sys.version_info >= (3, 10): from typing import TypeGuard else: from typing_extensions import TypeGuard import pandas as pd + import polars as pl def get_polars() -> Any: @@ -44,6 +47,8 @@ def get_pyarrow() -> Any: # pragma: no cover def get_pyarrow_compute() -> Any: # pragma: no cover """Get pyarrow.compute module (if pyarrow has already been imported - else return None).""" + # TODO(marco): remove this one, as it's at odds with the others, as it imports + # something new if "pyarrow" in sys.modules: import pyarrow.compute as pc @@ -85,6 +90,26 @@ def is_pandas_dataframe(df: Any) -> TypeGuard[pd.DataFrame]: return bool((pd := get_pandas()) is not None and isinstance(df, pd.DataFrame)) +def is_pandas_series(ser: Any) -> TypeGuard[pd.Series[Any]]: + """Check whether `ser` is a pandas Series without importing pandas.""" + return bool((pd := get_pandas()) is not None and isinstance(ser, pd.Series)) + + +def is_polars_dataframe(df: Any) -> TypeGuard[pl.DataFrame]: + """Check whether `df` is a Polars DataFrame without importing Polars.""" + return bool((pl := get_polars()) is not None and isinstance(df, pl.DataFrame)) + + +def is_polars_lazyframe(df: Any) -> TypeGuard[pl.LazyFrame]: + """Check whether `df` is a Polars LazyFrame without importing Polars.""" + return bool((pl := get_polars()) is not None and isinstance(df, pl.LazyFrame)) + + +def is_numpy_array(arr: Any) -> TypeGuard[np.ndarray]: + """Check whether `arr` is a NumPy array without importing NumPy.""" + return bool((np := get_numpy()) is not None and isinstance(arr, np.ndarray)) + + __all__ = [ "get_polars", "get_pandas", diff --git a/narwhals/expr.py b/narwhals/expr.py index 8b3f24f12..8d0f4956a 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -6,7 +6,7 @@ from typing import Iterable from typing import Literal -from narwhals.dependencies import get_numpy +from narwhals.dependencies import is_numpy_array from narwhals.utils import flatten if TYPE_CHECKING: @@ -4053,7 +4053,7 @@ def lit(value: Any, dtype: DType | None = None) -> Expr: └─────┴─────┘ """ - if (np := get_numpy()) is not None and isinstance(value, np.ndarray): + if is_numpy_array(value): msg = ( "numpy arrays are not supported as literal values. " "Consider using `with_columns` to create a new column from the array." diff --git a/narwhals/series.py b/narwhals/series.py index d575ef707..a826e5dd4 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -9,6 +9,8 @@ if TYPE_CHECKING: import numpy as np + import pandas as pd + import pyarrow as pa from typing_extensions import Self from narwhals.dataframe import DataFrame @@ -1322,7 +1324,7 @@ def n_unique(self) -> int: """ return self._compliant_series.n_unique() # type: ignore[no-any-return] - def to_numpy(self) -> Any: + def to_numpy(self) -> np.ndarray: """ Convert to numpy. @@ -1349,7 +1351,7 @@ def to_numpy(self) -> Any: """ return self._compliant_series.to_numpy() - def to_pandas(self) -> Any: + def to_pandas(self) -> pd.Series: """ Convert to pandas. @@ -2218,7 +2220,7 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: self._compliant_series.gather_every(n=n, offset=offset) ) - def to_arrow(self: Self) -> Any: + def to_arrow(self: Self) -> pa.Array: r""" Convert to arrow. 
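The new `is_*` helpers are `TypeGuard`s: a branch guarded by one of them narrows the type for the checker, while the heavy library is only looked up inside the helper itself. A minimal sketch of the intended pattern (the function is hypothetical):

    from typing import Any

    from narwhals.dependencies import is_numpy_array

    def first_value(obj: Any) -> Any:
        if is_numpy_array(obj):
            # narrowed to np.ndarray here, without this module ever
            # importing numpy itself
            return obj.ravel()[0]
        return obj[0]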
diff --git a/narwhals/translate.py b/narwhals/translate.py index 2779822cd..ed33b376b 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -16,6 +16,10 @@ from narwhals.dependencies import get_pandas from narwhals.dependencies import get_polars from narwhals.dependencies import get_pyarrow +from narwhals.dependencies import is_pandas_dataframe +from narwhals.dependencies import is_pandas_series +from narwhals.dependencies import is_polars_dataframe +from narwhals.dependencies import is_polars_lazyframe if TYPE_CHECKING: from narwhals.dataframe import DataFrame @@ -340,22 +344,27 @@ def from_native( # noqa: PLR0915 level="full", ) + # TODO(marco): write all of these in terms of `is_` rather + # than `get_` + walrus + # Polars - elif (pl := get_polars()) is not None and isinstance(native_object, pl.DataFrame): + elif is_polars_dataframe(native_object): if series_only: msg = "Cannot only use `series_only` with polars.DataFrame" raise TypeError(msg) + pl = get_polars() return DataFrame( PolarsDataFrame(native_object, backend_version=parse_version(pl.__version__)), level="full", ) - elif (pl := get_polars()) is not None and isinstance(native_object, pl.LazyFrame): + elif is_polars_lazyframe(native_object): if series_only: msg = "Cannot only use `series_only` with polars.LazyFrame" raise TypeError(msg) if eager_only or eager_or_interchange_only: msg = "Cannot only use `eager_only` or `eager_or_interchange_only` with polars.LazyFrame" raise TypeError(msg) + pl = get_polars() return LazyFrame( PolarsLazyFrame(native_object, backend_version=parse_version(pl.__version__)), level="full", @@ -370,10 +379,11 @@ def from_native( # noqa: PLR0915 ) # pandas - elif (pd := get_pandas()) is not None and isinstance(native_object, pd.DataFrame): + elif is_pandas_dataframe(native_object): if series_only: msg = "Cannot only use `series_only` with dataframe" raise TypeError(msg) + pd = get_pandas() return DataFrame( PandasLikeDataFrame( native_object, @@ -382,10 +392,11 @@ def from_native( # noqa: PLR0915 ), level="full", ) - elif (pd := get_pandas()) is not None and isinstance(native_object, pd.Series): + elif is_pandas_series(native_object): if not allow_series: msg = "Please set `allow_series=True`" raise TypeError(msg) + pd = get_pandas() return Series( PandasLikeSeries( native_object, diff --git a/pyproject.toml b/pyproject.toml index 854f8e9e4..1b9ca949c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ lint.ignore = [ "E501", "FIX", "ISC001", + "NPY002", "PD901", # This is a auxiliary library so dataframe variables have no concrete business meaning "PLR0911", "PLR0912", diff --git a/utils/import_check.py b/utils/import_check.py new file mode 100644 index 000000000..e8d776cde --- /dev/null +++ b/utils/import_check.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import ast +import sys + +BANNED_IMPORTS = { + "cudf", + "dask", + "dask.dataframe", + "dask_expr", + "duckdb", + "ibis", + "modin", + "numpy", + "pandas", + "polars", + "pyarrow", +} + + +class ImportPandasChecker(ast.NodeVisitor): + def __init__(self, file_name: str, lines: list[str]) -> None: + self.file_name = file_name + self.lines = lines + self.found_import = False + + def visit_If(self, node: ast.If) -> None: # noqa: N802 + # Check if the condition is `if TYPE_CHECKING` + if isinstance(node.test, ast.Name) and node.test.id == "TYPE_CHECKING": + # Skip the body of this if statement + return + self.generic_visit(node) + + def visit_Import(self, node: ast.Import) -> None: # noqa: N802 + for alias in 
node.names: + if ( + alias.name in BANNED_IMPORTS + and "# ignore-banned-import" not in self.lines[node.lineno - 1] + ): + print( # noqa: T201 + f"{self.file_name}:{node.lineno}:{node.col_offset}: found {alias.name} import" + ) + self.found_import = True + self.generic_visit(node) + + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: # noqa: N802 + if ( + node.module in BANNED_IMPORTS + and "# ignore-banned-import" not in self.lines[node.lineno - 1] + ): + print( # noqa: T201 + f"{self.file_name}:{node.lineno}:{node.col_offset}: found {node.module} import" + ) + self.found_import = True + self.generic_visit(node) + + +def check_import_pandas(filename: str) -> bool: + with open(filename) as file: + content = file.read() + tree = ast.parse(content, filename=filename) + + checker = ImportPandasChecker(filename, content.splitlines()) + checker.visit(tree) + + return checker.found_import + + +if __name__ == "__main__": + ret = 0 + for filename in sys.argv[1:]: + if not filename.endswith(".py"): + continue + ret |= check_import_pandas(filename) + sys.exit(ret) From 350fe7d39acddf49dd0ba89402c4f4cd6132edef Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 14 Aug 2024 08:17:36 +0100 Subject: [PATCH 19/21] feat: Support Arrow PyCapsule Interface for export (#786) --- docs/api-reference/dataframe.md | 1 + docs/api-reference/series.md | 2 ++ narwhals/dataframe.py | 25 ++++++++++++++ narwhals/series.py | 28 ++++++++++++++++ tests/frame/arrow_c_stream_test.py | 42 ++++++++++++++++++++++++ tests/series_only/arrow_c_stream_test.py | 41 +++++++++++++++++++++++ utils/check_api_reference.py | 6 ++-- 7 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 tests/frame/arrow_c_stream_test.py create mode 100644 tests/series_only/arrow_c_stream_test.py diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index 676f64076..b251b2a50 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -4,6 +4,7 @@ handler: python options: members: + - __arrow_c_stream__ - __getitem__ - clone - collect_schema diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 7b7f62b8a..f9cc2e6bb 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -4,6 +4,8 @@ handler: python options: members: + - __arrow_c_stream__ + - __getitem__ - abs - alias - all diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 039496744..dfcdce87b 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -15,6 +15,7 @@ from narwhals.dependencies import is_numpy_array from narwhals.schema import Schema from narwhals.utils import flatten +from narwhals.utils import parse_version if TYPE_CHECKING: from io import BytesIO @@ -249,6 +250,30 @@ def __repr__(self) -> str: # pragma: no cover + "┘" ) + def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: + """ + Export a DataFrame via the Arrow PyCapsule Interface. + + - if the underlying dataframe implements the interface, it'll return that + - else, it'll call `to_arrow` and then defer to PyArrow's implementation + + See [PyCapsule Interface](https://arrow.apache.org/docs/dev/format/CDataInterface/PyCapsuleInterface.html) + for more. 
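+
+        A minimal usage sketch (assuming PyArrow>=14 is installed; any
+        other PyCapsule consumer could stand in for `pa.table`):
+
+            >>> import pyarrow as pa  # doctest:+SKIP
+            >>> pa.table(df)  # doctest:+SKIP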
+ """ + native_frame = self._compliant_frame._native_frame + if hasattr(native_frame, "__arrow_c_stream__"): + return native_frame.__arrow_c_stream__(requested_schema=requested_schema) + try: + import pyarrow as pa # ignore-banned-import + except ModuleNotFoundError as exc: # pragma: no cover + msg = f"PyArrow>=14.0.0 is required for `DataFrame.__arrow_c_stream__` for object of type {type(native_frame)}" + raise ModuleNotFoundError(msg) from exc + if parse_version(pa.__version__) < (14, 0): # pragma: no cover + msg = f"PyArrow>=14.0.0 is required for `DataFrame.__arrow_c_stream__` for object of type {type(native_frame)}" + raise ModuleNotFoundError(msg) from None + pa_table = self.to_arrow() + return pa_table.__arrow_c_stream__(requested_schema=requested_schema) + def lazy(self) -> LazyFrame[Any]: """ Lazify the DataFrame (if possible). diff --git a/narwhals/series.py b/narwhals/series.py index a826e5dd4..3c79024c8 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -7,6 +7,8 @@ from typing import Sequence from typing import overload +from narwhals.utils import parse_version + if TYPE_CHECKING: import numpy as np import pandas as pd @@ -57,6 +59,32 @@ def __getitem__(self, idx: int | slice | Sequence[int]) -> Any | Self: def __native_namespace__(self) -> Any: return self._compliant_series.__native_namespace__() + def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: + """ + Export a Series via the Arrow PyCapsule Interface. + + Narwhals doesn't implement anything itself here: + + - if the underlying series implements the interface, it'll return that + - else, it'll call `to_arrow` and then defer to PyArrow's implementation + + See [PyCapsule Interface](https://arrow.apache.org/docs/dev/format/CDataInterface/PyCapsuleInterface.html) + for more. 
+ """ + native_series = self._compliant_series._native_series + if hasattr(native_series, "__arrow_c_stream__"): + return native_series.__arrow_c_stream__(requested_schema=requested_schema) + try: + import pyarrow as pa # ignore-banned-import + except ModuleNotFoundError as exc: # pragma: no cover + msg = f"PyArrow>=16.0.0 is required for `Series.__arrow_c_stream__` for object of type {type(native_series)}" + raise ModuleNotFoundError(msg) from exc + if parse_version(pa.__version__) < (16, 0): # pragma: no cover + msg = f"PyArrow>=16.0.0 is required for `Series.__arrow_c_stream__` for object of type {type(native_series)}" + raise ModuleNotFoundError(msg) + ca = pa.chunked_array([self.to_arrow()]) + return ca.__arrow_c_stream__(requested_schema=requested_schema) + @property def shape(self) -> tuple[int]: """ diff --git a/tests/frame/arrow_c_stream_test.py b/tests/frame/arrow_c_stream_test.py new file mode 100644 index 000000000..7a3403f69 --- /dev/null +++ b/tests/frame/arrow_c_stream_test.py @@ -0,0 +1,42 @@ +import polars as pl +import pyarrow as pa +import pyarrow.compute as pc +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version + + +@pytest.mark.skipif( + parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" +) +def test_arrow_c_stream_test() -> None: + df = nw.from_native(pl.Series([1, 2, 3]).to_frame("a"), eager_only=True) + result = pa.table(df) + expected = pa.table({"a": [1, 2, 3]}) + assert pc.all(pc.equal(result["a"], expected["a"])).as_py() + + +@pytest.mark.skipif( + parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" +) +def test_arrow_c_stream_test_invalid(monkeypatch: pytest.MonkeyPatch) -> None: + # "poison" the dunder method to make sure it actually got called above + monkeypatch.setattr( + "narwhals.dataframe.DataFrame.__arrow_c_stream__", lambda *_: 1 / 0 + ) + df = nw.from_native(pl.Series([1, 2, 3]).to_frame("a"), eager_only=True) + with pytest.raises(ZeroDivisionError, match="division by zero"): + pa.table(df) + + +@pytest.mark.skipif( + parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" +) +def test_arrow_c_stream_test_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + # Check that fallback to PyArrow works + monkeypatch.delattr("polars.DataFrame.__arrow_c_stream__") + df = nw.from_native(pl.Series([1, 2, 3]).to_frame("a"), eager_only=True) + result = pa.table(df) + expected = pa.table({"a": [1, 2, 3]}) + assert pc.all(pc.equal(result["a"], expected["a"])).as_py() diff --git a/tests/series_only/arrow_c_stream_test.py b/tests/series_only/arrow_c_stream_test.py new file mode 100644 index 000000000..9964d7408 --- /dev/null +++ b/tests/series_only/arrow_c_stream_test.py @@ -0,0 +1,41 @@ +import polars as pl +import pyarrow as pa +import pyarrow.compute as pc +import pytest + +import narwhals.stable.v1 as nw +from narwhals.utils import parse_version + + +@pytest.mark.skipif( + parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" +) +def test_arrow_c_stream_test() -> None: + s = nw.from_native(pl.Series([1, 2, 3]), series_only=True) + result = pa.chunked_array(s) + expected = pa.chunked_array([[1, 2, 3]]) + assert pc.all(pc.equal(result, expected)).as_py() + + +@pytest.mark.skipif( + parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" +) +def test_arrow_c_stream_test_invalid(monkeypatch: pytest.MonkeyPatch) -> None: + # "poison" the dunder method to make sure it actually got called above + 
monkeypatch.setattr("narwhals.series.Series.__arrow_c_stream__", lambda *_: 1 / 0) + s = nw.from_native(pl.Series([1, 2, 3]), series_only=True) + with pytest.raises(ZeroDivisionError, match="division by zero"): + pa.chunked_array(s) + + +@pytest.mark.skipif( + parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" +) +def test_arrow_c_stream_test_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + # Check that fallback to PyArrow works + monkeypatch.delattr("polars.Series.__arrow_c_stream__") + s = nw.from_native(pl.Series([1, 2, 3]).to_frame("a"), eager_only=True)["a"] + s.__arrow_c_stream__() + result = pa.chunked_array(s) + expected = pa.chunked_array([[1, 2, 3]]) + assert pc.all(pc.equal(result, expected)).as_py() diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 68c980086..f6e5303c4 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -45,13 +45,13 @@ documented = [ remove_prefix(i, " - ") for i in content.splitlines() - if i.startswith(" - ") + if i.startswith(" - ") and not i.startswith(" - _") ] if missing := set(top_level_functions).difference(documented): print("DataFrame: not documented") # noqa: T201 print(missing) # noqa: T201 ret = 1 -if extra := set(documented).difference(top_level_functions).difference({"__getitem__"}): +if extra := set(documented).difference(top_level_functions): print("DataFrame: outdated") # noqa: T201 print(extra) # noqa: T201 ret = 1 @@ -87,7 +87,7 @@ documented = [ remove_prefix(i, " - ") for i in content.splitlines() - if i.startswith(" - ") + if i.startswith(" - ") and not i.startswith(" - _") ] if ( missing := set(top_level_functions) From 6fbfb7783e510b5b27b5989b20738a4fe629f8ef Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 14 Aug 2024 08:22:42 +0100 Subject: [PATCH 20/21] chore: import overhaul (#788) --- CONTRIBUTING.md | 20 +++ narwhals/_arrow/dataframe.py | 27 +-- narwhals/_arrow/group_by.py | 8 +- narwhals/_arrow/namespace.py | 4 +- narwhals/_arrow/series.py | 266 ++++++++++++++++++----------- narwhals/_arrow/utils.py | 28 +-- narwhals/_dask/dataframe.py | 5 +- narwhals/_dask/namespace.py | 6 +- narwhals/_pandas_like/dataframe.py | 5 +- narwhals/_pandas_like/series.py | 8 +- narwhals/dataframe.py | 3 - narwhals/dependencies.py | 67 +++++--- narwhals/expr.py | 9 - narwhals/series.py | 6 - narwhals/translate.py | 42 ++--- narwhals/utils.py | 24 +-- tests/no_imports_test.py | 68 ++++++++ 17 files changed, 382 insertions(+), 214 deletions(-) create mode 100644 tests/no_imports_test.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 03d80fec9..b1eb91b0d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -67,6 +67,26 @@ Please adhere to the following guidelines: If Narwhals looks like underwater unicorn magic to you, then please read [how it works](https://narwhals-dev.github.io/narwhals/how-it-works/). +## Imports + +In Narwhals, we are very particular about imports. When it comes to importing +heavy third-party libraries (pandas, NumPy, Polars, etc...) please follow these rules: + +- Never import anything to do `isinstance` checks. Instead, just use the functions + in `narwhals.dependencies` (such as `is_pandas_dataframe`); +- If you need to import anything, do it in a place where you know that the import + is definitely available. For example, NumPy is a required dependency of PyArrow, + so it's OK to import NumPy to implement a PyArrow function - however, NumPy + should never be imported to implement a Polars function. 
The only exception is + for when there's simply no way around it by definition - for example, `Series.to_numpy` + always requires NumPy to be installed. +- Don't place a third-party import at the top of a file. Instead, place it in the + function where it's used, so that we minimise the chances of it being imported + unnecessarily. + +We're trying to be really lightweight and minimal-overhead, and +unnecessary imports can slow things down. + ## Happy contributing! Please remember to abide by the code of conduct, else you'll be conducted away from this project. diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 75b9068a0..865d17098 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -13,8 +13,6 @@ from narwhals._arrow.utils import validate_dataframe_comparand from narwhals._expression_parsing import evaluate_into_exprs from narwhals.dependencies import get_pyarrow -from narwhals.dependencies import get_pyarrow_compute -from narwhals.dependencies import get_pyarrow_parquet from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation from narwhals.utils import flatten @@ -182,12 +180,13 @@ def select( *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr, ) -> Self: + import pyarrow as pa # ignore-banned-import() + new_series = evaluate_into_exprs(self, *exprs, **named_exprs) if not new_series: # return empty dataframe, like Polars does return self._from_native_frame(self._native_frame.__class__.from_arrays([])) names = [s.name for s in new_series] - pa = get_pyarrow() df = pa.Table.from_arrays( broadcast_series(new_series), names=names, @@ -337,7 +336,8 @@ def to_dict(self, *, as_series: bool) -> Any: return {name: col.to_pylist() for name, col in names_and_values} def with_row_index(self, name: str) -> Self: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + df = self._native_frame row_indices = pa.array(range(df.num_rows)) @@ -354,7 +354,8 @@ def filter( return self._from_native_frame(self._native_frame.filter(mask._native_series)) def null_count(self) -> Self: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + df = self._native_frame names_and_values = zip(df.column_names, df.columns) @@ -415,16 +416,17 @@ def rename(self, mapping: dict[str, str]) -> Self: return self._from_native_frame(df.rename_columns(new_cols)) def write_parquet(self, file: Any) -> Any: - pp = get_pyarrow_parquet() + import pyarrow.parquet as pp # ignore-banned-import + pp.write_table(self._native_frame, file) def is_duplicated(self: Self) -> ArrowSeries: import numpy as np # ignore-banned-import + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() from narwhals._arrow.series import ArrowSeries - pa = get_pyarrow() - pc = get_pyarrow_compute() df = self._native_frame columns = self.columns @@ -443,9 +445,10 @@ def is_duplicated(self: Self) -> ArrowSeries: return ArrowSeries(is_duplicated, name="", backend_version=self._backend_version) def is_unique(self: Self) -> ArrowSeries: + import pyarrow.compute as pc # ignore-banned-import() + from narwhals._arrow.series import ArrowSeries - pc = get_pyarrow_compute() is_duplicated = self.is_duplicated()._native_series return ArrowSeries( @@ -464,11 +467,9 @@ def unique( The param `maintain_order` is only here for compatibility with the polars API and has no effect on the output. 
""" - import numpy as np # ignore-banned-import - - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() df = self._native_frame diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index ecdfc02a6..27c7ff368 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -8,8 +8,6 @@ from narwhals._expression_parsing import is_simple_aggregation from narwhals._expression_parsing import parse_into_exprs -from narwhals.dependencies import get_pyarrow -from narwhals.dependencies import get_pyarrow_compute from narwhals.utils import remove_prefix if TYPE_CHECKING: @@ -20,7 +18,8 @@ class ArrowGroupBy: def __init__(self, df: ArrowDataFrame, keys: list[str]) -> None: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + self._df = df self._keys = list(keys) self._grouped = pa.TableGroupBy(self._df._native_frame, list(self._keys)) @@ -79,7 +78,8 @@ def agg_arrow( output_names: list[str], from_dataframe: Callable[[Any], ArrowDataFrame], ) -> ArrowDataFrame: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + all_simple_aggs = True for expr in exprs: if not is_simple_aggregation(expr): diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index ffb3f2d15..bb90b1792 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -13,7 +13,6 @@ from narwhals._arrow.utils import horizontal_concat from narwhals._arrow.utils import vertical_concat from narwhals._expression_parsing import parse_into_exprs -from narwhals.dependencies import get_pyarrow from narwhals.utils import Implementation if TYPE_CHECKING: @@ -87,9 +86,10 @@ def _create_series_from_scalar(self, value: Any, series: ArrowSeries) -> ArrowSe ) def _create_compliant_series(self, value: Any) -> ArrowSeries: + import pyarrow as pa # ignore-banned-import() + from narwhals._arrow.series import ArrowSeries - pa = get_pyarrow() return ArrowSeries( native_series=pa.chunked_array([value]), name="", diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 20513c39e..fb15f3aaf 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -14,7 +14,6 @@ from narwhals._arrow.utils import validate_column_comparand from narwhals.dependencies import get_pandas from narwhals.dependencies import get_pyarrow -from narwhals.dependencies import get_pyarrow_compute from narwhals.utils import Implementation from narwhals.utils import generate_unique_token @@ -35,7 +34,8 @@ def __init__( self._backend_version = backend_version def _from_native_series(self, series: Any) -> Self: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + if isinstance(series, pa.Array): series = pa.chunked_array([series]) return self.__class__( @@ -52,7 +52,8 @@ def _from_iterable( *, backend_version: tuple[int, ...], ) -> Self: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + return cls( pa.chunked_array([data]), name=name, @@ -63,67 +64,78 @@ def __len__(self) -> int: return len(self._native_series) def __eq__(self, other: object) -> Self: # type: ignore[override] - pc = get_pyarrow_compute() + import pyarrow.compute as pc + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.equal(ser, other)) def __ne__(self, other: object) -> Self: # type: ignore[override] - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = 
self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.not_equal(ser, other)) def __ge__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.greater_equal(ser, other)) def __gt__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.greater(ser, other)) def __le__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.less_equal(ser, other)) def __lt__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.less(ser, other)) def __and__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.and_kleene(ser, other)) def __rand__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.and_kleene(other, ser)) def __or__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.or_kleene(ser, other)) def __ror__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.or_kleene(other, ser)) def __add__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + other = validate_column_comparand(other) return self._from_native_series(pc.add(self._native_series, other)) @@ -131,7 +143,8 @@ def __radd__(self, other: Any) -> Self: return self + other # type: ignore[no-any-return] def __sub__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + other = validate_column_comparand(other) return self._from_native_series(pc.subtract(self._native_series, other)) @@ -139,7 +152,8 @@ def __rsub__(self, other: Any) -> Self: return (self - other) * (-1) # type: ignore[no-any-return] def __mul__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + other = validate_column_comparand(other) return self._from_native_series(pc.multiply(self._native_series, other)) @@ -147,13 +161,15 @@ def __rmul__(self, other: Any) -> Self: return self * other # type: ignore[no-any-return] def __pow__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.power(ser, other)) def __rpow__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + 
ser = self._native_series other = validate_column_comparand(other) return self._from_native_series(pc.power(other, ser)) @@ -169,8 +185,9 @@ def __rfloordiv__(self, other: Any) -> Self: return self._from_native_series(floordiv_compat(other, ser)) def __truediv__(self, other: Any) -> Self: - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) if not isinstance(other, (pa.Array, pa.ChunkedArray)): @@ -179,8 +196,9 @@ def __truediv__(self, other: Any) -> Self: return self._from_native_series(pc.divide(*cast_for_truediv(ser, other))) def __rtruediv__(self, other: Any) -> Self: - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) if not isinstance(other, (pa.Array, pa.ChunkedArray)): @@ -189,7 +207,8 @@ def __rtruediv__(self, other: Any) -> Self: return self._from_native_series(pc.divide(*cast_for_truediv(other, ser))) def __mod__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) floor_div = (self // other)._native_series @@ -197,7 +216,8 @@ def __mod__(self, other: Any) -> Self: return self._from_native_series(res) def __rmod__(self, other: Any) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series other = validate_column_comparand(other) floor_div = (other // self)._native_series @@ -205,7 +225,8 @@ def __rmod__(self, other: Any) -> Self: return self._from_native_series(res) def __invert__(self) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._from_native_series(pc.invert(self._native_series)) def len(self) -> int: @@ -216,27 +237,33 @@ def filter(self, other: Any) -> Self: return self._from_native_series(self._native_series.filter(other)) def mean(self) -> int: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return pc.mean(self._native_series) # type: ignore[no-any-return] def min(self) -> int: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return pc.min(self._native_series) # type: ignore[no-any-return] def max(self) -> int: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return pc.max(self._native_series) # type: ignore[no-any-return] def sum(self) -> int: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return pc.sum(self._native_series) # type: ignore[no-any-return] def drop_nulls(self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._from_native_series(pc.drop_null(self._native_series)) def shift(self, n: int) -> Self: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + ca = self._native_series if n > 0: @@ -248,15 +275,18 @@ def shift(self, n: int) -> Self: return self._from_native_series(result) def std(self, ddof: int = 1) -> int: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return pc.stddev(self._native_series, ddof=ddof) # type: ignore[no-any-return] def count(self) -> int: - pc = get_pyarrow_compute() + import 
pyarrow.compute as pc # ignore-banned-import() + return pc.count(self._native_series) # type: ignore[no-any-return] def n_unique(self) -> int: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + unique_values = pc.unique(self._native_series) return pc.count(unique_values, mode="all") # type: ignore[no-any-return] @@ -302,35 +332,42 @@ def dtype(self) -> DType: return translate_dtype(self._native_series.type) def abs(self) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._from_native_series(pc.abs(self._native_series)) def cum_sum(self) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._from_native_series(pc.cumulative_sum(self._native_series)) def round(self, decimals: int) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._from_native_series( pc.round(self._native_series, decimals, round_mode="half_towards_infinity") ) def diff(self) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._from_native_series( pc.pairwise_diff(self._native_series.combine_chunks()) ) def any(self) -> bool: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return pc.any(self._native_series) # type: ignore[no-any-return] def all(self) -> bool: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return pc.all(self._native_series) # type: ignore[no-any-return] def is_between(self, lower_bound: Any, upper_bound: Any, closed: str = "both") -> Any: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series if closed == "left": ge = pc.greater_equal(ser, lower_bound) @@ -360,7 +397,8 @@ def is_null(self) -> Self: return self._from_native_series(ser.is_null()) def cast(self, dtype: DType) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series dtype = narwhals_to_native_dtype(dtype) return self._from_native_series(pc.cast(ser, dtype)) @@ -385,8 +423,9 @@ def tail(self, n: int) -> Self: return self._from_native_series(ser.slice(abs(n))) def is_in(self, other: Any) -> Self: - pc = get_pyarrow_compute() - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + value_set = pa.array(other) ser = self._native_series return self._from_native_series(pc.is_in(ser, value_set=value_set)) @@ -420,10 +459,10 @@ def value_counts( normalize: bool = False, ) -> ArrowDataFrame: """Parallel is unused, exists for compatibility""" - from narwhals._arrow.dataframe import ArrowDataFrame + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() - pc = get_pyarrow_compute() - pa = get_pyarrow() + from narwhals._arrow.dataframe import ArrowDataFrame index_name_ = "index" if self._name is None else self._name value_name_ = name or ("proportion" if normalize else "count") @@ -448,7 +487,7 @@ def value_counts( ) def zip_with(self: Self, mask: Self, other: Self) -> Self: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() return self._from_native_series( pc.replace_with_mask( @@ -466,8 +505,8 @@ def sample( with_replacement: bool = False, ) -> Self: import numpy as np # ignore-banned-import + import pyarrow.compute as pc # ignore-banned-import() - pc = 
get_pyarrow_compute() ser = self._native_series num_rows = len(self) @@ -479,17 +518,19 @@ def sample( return self._from_native_series(pc.take(ser, mask)) def fill_null(self: Self, value: Any) -> Self: - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series dtype = ser.type return self._from_native_series(pc.fill_null(ser, pa.scalar(value, dtype))) def to_frame(self: Self) -> ArrowDataFrame: + import pyarrow as pa # ignore-banned-import() + from narwhals._arrow.dataframe import ArrowDataFrame - pa = get_pyarrow() df = pa.Table.from_arrays([self._native_series], names=[self.name]) return ArrowDataFrame(df, backend_version=self._backend_version) @@ -505,9 +546,8 @@ def is_unique(self: Self) -> ArrowSeries: def is_first_distinct(self: Self) -> Self: import numpy as np # ignore-banned-import - - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() row_number = pa.array(np.arange(len(self))) col_token = generate_unique_token(n_bytes=8, columns=[self.name]) @@ -523,9 +563,8 @@ def is_first_distinct(self: Self) -> Self: def is_last_distinct(self: Self) -> Self: import numpy as np # ignore-banned-import - - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() row_number = pa.array(np.arange(len(self))) col_token = generate_unique_token(n_bytes=8, columns=[self.name]) @@ -543,7 +582,8 @@ def is_sorted(self: Self, *, descending: bool = False) -> bool: if not isinstance(descending, bool): msg = f"argument 'descending' should be boolean, found {type(descending)}" raise TypeError(msg) - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + ser = self._native_series if descending: return pc.all(pc.greater_equal(ser[:-1], ser[1:])) # type: ignore[no-any-return] @@ -551,13 +591,15 @@ def is_sorted(self: Self, *, descending: bool = False) -> bool: return pc.all(pc.less_equal(ser[:-1], ser[1:])) # type: ignore[no-any-return] def unique(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._from_native_series(pc.unique(self._native_series)) def sort( self: Self, *, descending: bool = False, nulls_last: bool = False ) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + series = self._native_series order = "descending" if descending else "ascending" null_placement = "at_end" if nulls_last else "at_start" @@ -571,11 +613,10 @@ def to_dummies( self: Self, *, separator: str = "_", drop_first: bool = False ) -> ArrowDataFrame: import numpy as np # ignore-banned-import + import pyarrow as pa # ignore-banned-import() from narwhals._arrow.dataframe import ArrowDataFrame - pa = get_pyarrow() - series = self._native_series da = series.dictionary_encode().combine_chunks() @@ -593,7 +634,8 @@ def quantile( quantile: float, interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"], ) -> Any: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return pc.quantile(self._native_series, q=quantile, interpolation=interpolation)[ 0 ] @@ -604,8 +646,8 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: def clip( self: Self, lower_bound: Any | None = None, upper_bound: Any | None = None ) -> 
Self: - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() arr = self._native_series arr = pc.max_element_wise(arr, pa.scalar(lower_bound, type=arr.type)) @@ -638,7 +680,8 @@ def __init__(self: Self, series: ArrowSeries) -> None: self._arrow_series = series def to_string(self: Self, format: str) -> ArrowSeries: # noqa: A002 - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + # PyArrow differs from other libraries in that %S also prints out # the fractional part of the second...:'( # https://arrow.apache.org/docs/python/generated/pyarrow.compute.strftime.html @@ -648,63 +691,72 @@ def to_string(self: Self, format: str) -> ArrowSeries: # noqa: A002 ) def date(self: Self) -> ArrowSeries: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + return self._arrow_series._from_native_series( self._arrow_series._native_series.cast(pa.date64()) ) def year(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.year(self._arrow_series._native_series) ) def month(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.month(self._arrow_series._native_series) ) def day(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.day(self._arrow_series._native_series) ) def hour(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.hour(self._arrow_series._native_series) ) def minute(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.minute(self._arrow_series._native_series) ) def second(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.second(self._arrow_series._native_series) ) def millisecond(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.millisecond(self._arrow_series._native_series) ) def microsecond(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + arr = self._arrow_series._native_series result = pc.add(pc.multiply(pc.millisecond(arr), 1000), pc.microsecond(arr)) return self._arrow_series._from_native_series(result) def nanosecond(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + arr = self._arrow_series._native_series result = pc.add( pc.multiply(self.microsecond()._native_series, 1000), pc.nanosecond(arr) @@ -712,14 +764,16 @@ def nanosecond(self: Self) -> ArrowSeries: return self._arrow_series._from_native_series(result) def ordinal_day(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.day_of_year(self._arrow_series._native_series) ) def total_minutes(self: Self) -> ArrowSeries: - pa = get_pyarrow() - pc = 
get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + arr = self._arrow_series._native_series unit = arr.type.unit @@ -736,8 +790,9 @@ def total_minutes(self: Self) -> ArrowSeries: ) def total_seconds(self: Self) -> ArrowSeries: - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + arr = self._arrow_series._native_series unit = arr.type.unit @@ -754,8 +809,9 @@ def total_seconds(self: Self) -> ArrowSeries: ) def total_milliseconds(self: Self) -> ArrowSeries: - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + arr = self._arrow_series._native_series unit = arr.type.unit @@ -778,8 +834,9 @@ def total_milliseconds(self: Self) -> ArrowSeries: ) def total_microseconds(self: Self) -> ArrowSeries: - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + arr = self._arrow_series._native_series unit = arr.type.unit @@ -801,8 +858,9 @@ def total_microseconds(self: Self) -> ArrowSeries: ) def total_nanoseconds(self: Self) -> ArrowSeries: - pa = get_pyarrow() - pc = get_pyarrow_compute() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() + arr = self._arrow_series._native_series unit = arr.type.unit @@ -825,7 +883,8 @@ def __init__(self, series: ArrowSeries) -> None: self._arrow_series = series def get_categories(self) -> ArrowSeries: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + ca = self._arrow_series._native_series # TODO(Unassigned): this looks potentially expensive - is there no better way? 
out = pa.chunked_array( @@ -841,7 +900,8 @@ def __init__(self: Self, series: ArrowSeries) -> None: def replace( self, pattern: str, value: str, *, literal: bool = False, n: int = 1 ) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + method = "replace_substring" if literal else "replace_substring_regex" return self._arrow_series._from_native_series( getattr(pc, method)( @@ -858,7 +918,8 @@ def replace_all( return self.replace(pattern, value, literal=literal, n=-1) def strip_chars(self: Self, characters: str | None = None) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + whitespace = " \t\n\r\v\f" return self._arrow_series._from_native_series( pc.utf8_trim( @@ -868,26 +929,30 @@ def strip_chars(self: Self, characters: str | None = None) -> ArrowSeries: ) def starts_with(self: Self, prefix: str) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.equal(self.slice(0, len(prefix))._native_series, prefix) ) def ends_with(self: Self, suffix: str) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.equal(self.slice(-len(suffix))._native_series, suffix) ) def contains(self: Self, pattern: str, *, literal: bool = False) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + check_func = pc.match_substring if literal else pc.match_substring_regex return self._arrow_series._from_native_series( check_func(self._arrow_series._native_series, pattern) ) def slice(self: Self, offset: int, length: int | None = None) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + stop = offset + length if length else None return self._arrow_series._from_native_series( pc.utf8_slice_codeunits( @@ -896,19 +961,22 @@ def slice(self: Self, offset: int, length: int | None = None) -> ArrowSeries: ) def to_datetime(self: Self, format: str | None = None) -> ArrowSeries: # noqa: A002 - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.strptime(self._arrow_series._native_series, format=format, unit="us") ) def to_uppercase(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.utf8_upper(self._arrow_series._native_series), ) def to_lowercase(self: Self) -> ArrowSeries: - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + return self._arrow_series._from_native_series( pc.utf8_lower(self._arrow_series._native_series), ) diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index a6b56a355..6f7517aeb 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -4,8 +4,6 @@ from typing import Any from narwhals import dtypes -from narwhals.dependencies import get_pyarrow -from narwhals.dependencies import get_pyarrow_compute from narwhals.utils import isinstance_or_issubclass if TYPE_CHECKING: @@ -13,7 +11,8 @@ def translate_dtype(dtype: Any) -> dtypes.DType: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + if pa.types.is_int64(dtype): return dtypes.Int64() if pa.types.is_int32(dtype): @@ -56,9 +55,9 @@ def translate_dtype(dtype: Any) -> dtypes.DType: def 
narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any: - from narwhals import dtypes + import pyarrow as pa # ignore-banned-import() - pa = get_pyarrow() + from narwhals import dtypes if isinstance_or_issubclass(dtype, dtypes.Float64): return pa.float64() @@ -143,7 +142,8 @@ def validate_dataframe_comparand( return NotImplemented if isinstance(other, ArrowSeries): if len(other) == 1: - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + value = other.item() if backend_version < (13,) and hasattr(value, "as_py"): # pragma: no cover value = value.as_py() @@ -159,7 +159,8 @@ def horizontal_concat(dfs: list[Any]) -> Any: Should be in namespace. """ - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + if not dfs: msg = "No dataframes to concatenate" # pragma: no cover raise AssertionError(msg) @@ -191,15 +192,16 @@ def vertical_concat(dfs: list[Any]) -> Any: msg = "unable to vstack, column names don't match" raise TypeError(msg) - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + return pa.concat_tables(dfs).combine_chunks() def floordiv_compat(left: Any, right: Any) -> Any: # The following lines are adapted from pandas' pyarrow implementation. # Ref: https://github.com/pandas-dev/pandas/blob/262fcfbffcee5c3116e86a951d8b693f90411e68/pandas/core/arrays/arrow/array.py#L124-L154 - pc = get_pyarrow_compute() - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() if isinstance(left, (int, float)): left = pa.scalar(left) @@ -237,8 +239,8 @@ def floordiv_compat(left: Any, right: Any) -> Any: def cast_for_truediv(arrow_array: Any, pa_object: Any) -> tuple[Any, Any]: # Lifted from: # https://github.com/pandas-dev/pandas/blob/262fcfbffcee5c3116e86a951d8b693f90411e68/pandas/core/arrays/arrow/array.py#L108-L122 - pc = get_pyarrow_compute() - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + import pyarrow.compute as pc # ignore-banned-import() # Ensure int / int -> float mirroring Python/Numpy behavior # as pc.divide_checked(int, int) -> int @@ -260,7 +262,7 @@ def broadcast_series(series: list[ArrowSeries]) -> list[Any]: if fast_path: return [s._native_series for s in series] - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() reshaped = [] for s, length in zip(series, lengths): diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index cf3c6cc12..99ed430a9 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -87,7 +87,7 @@ def select( *exprs: IntoDaskExpr, **named_exprs: IntoDaskExpr, ) -> Self: - dd = get_dask_dataframe() + import dask.dataframe as dd # ignore-banned-import if exprs and all(isinstance(x, str) for x in exprs) and not named_exprs: # This is a simple slice => fastpath! 
@@ -97,7 +97,8 @@ def select( if not new_series: # return empty dataframe, like Polars does - pd = get_pandas() + import pandas as pd # ignore-banned-import + return self._from_native_frame( dd.from_pandas(pd.DataFrame(), npartitions=self._native_frame.npartitions) ) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index afff9fee5..2baf1cf3f 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -10,8 +10,6 @@ from narwhals._dask.expr import DaskExpr from narwhals._dask.selectors import DaskSelectorNamespace from narwhals._expression_parsing import parse_into_exprs -from narwhals.dependencies import get_dask_dataframe -from narwhals.dependencies import get_pandas if TYPE_CHECKING: from narwhals._dask.dataframe import DaskLazyFrame @@ -104,8 +102,8 @@ def sum(self, *column_names: str) -> DaskExpr: ).sum() def len(self) -> DaskExpr: - pd = get_pandas() - dd = get_dask_dataframe() + import dask.dataframe as dd # ignore-banned-import + import pandas as pd # ignore-banned-import def func(df: DaskLazyFrame) -> list[Any]: if not df.columns: diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index b2a819a0e..c815b2a0c 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -19,7 +19,6 @@ from narwhals.dependencies import get_cudf from narwhals.dependencies import get_modin from narwhals.dependencies import get_pandas -from narwhals.dependencies import get_pyarrow from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation from narwhals.utils import flatten @@ -491,7 +490,6 @@ def unique( The param `maintain_order` is only here for compatibility with the polars API and has no effect on the output. """ - mapped_keep = {"none": False, "any": "first"}.get(keep, keep) subset = flatten(subset) if subset else None return self._from_native_frame( @@ -603,5 +601,6 @@ def to_arrow(self: Self) -> Any: msg = "`to_arrow` is not implemented for CuDF backend." raise NotImplementedError(msg) - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + return pa.Table.from_pandas(self._native_frame) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 51b4cbd72..3db0fb73a 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -16,8 +16,6 @@ from narwhals.dependencies import get_cudf from narwhals.dependencies import get_modin from narwhals.dependencies import get_pandas -from narwhals.dependencies import get_pyarrow -from narwhals.dependencies import get_pyarrow_compute from narwhals.utils import Implementation if TYPE_CHECKING: @@ -638,7 +636,8 @@ def to_arrow(self: Self) -> Any: msg = "`to_arrow` is not implemented for CuDF backend." 
raise NotImplementedError(msg) - pa = get_pyarrow() + import pyarrow as pa # ignore-banned-import() + return pa.Array.from_pandas(self._native_series) @property @@ -786,7 +785,8 @@ def microsecond(self) -> PandasLikeSeries: self._pandas_series._native_series.dtype ): # crazy workaround for https://github.com/pandas-dev/pandas/issues/59154 - pc = get_pyarrow_compute() + import pyarrow.compute as pc # ignore-banned-import() + native_series = self._pandas_series._native_series arr = native_series.array.__arrow_array__() result_arr = pc.add( diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index dfcdce87b..9aa7bb64c 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -1261,7 +1261,6 @@ def head(self, n: int = 5) -> Self: │ 3 ┆ 8 ┆ c │ └─────┴─────┴─────┘ """ - return super().head(n) def tail(self, n: int = 5) -> Self: @@ -1833,7 +1832,6 @@ def is_empty(self: Self) -> bool: >>> func(df_pd), func(df_pl) (False, False) """ - return self._compliant_frame.is_empty() # type: ignore[no-any-return] def is_unique(self: Self) -> Series: @@ -1939,7 +1937,6 @@ def null_count(self: Self) -> Self: │ 1 ┆ 1 ┆ 0 │ └─────┴─────┴─────┘ """ - return self._from_compliant_dataframe(self._compliant_frame.null_count()) def item(self: Self, row: int | None = None, column: int | str | None = None) -> Any: diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index 663b5dde6..e2d67f03c 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -14,8 +14,12 @@ from typing import TypeGuard else: from typing_extensions import TypeGuard + import cudf + import dask.dataframe as dd + import modin.pandas as mpd import pandas as pd import polars as pl + import pyarrow as pa def get_polars() -> Any: @@ -45,26 +49,6 @@ def get_pyarrow() -> Any: # pragma: no cover return sys.modules.get("pyarrow", None) -def get_pyarrow_compute() -> Any: # pragma: no cover - """Get pyarrow.compute module (if pyarrow has already been imported - else return None).""" - # TODO(marco): remove this one, as it's at odds with the others, as it imports - # something new - if "pyarrow" in sys.modules: - import pyarrow.compute as pc - - return pc - return None - - -def get_pyarrow_parquet() -> Any: # pragma: no cover - """Get pyarrow.parquet module (if pyarrow has already been imported - else return None).""" - if "pyarrow" in sys.modules: - import pyarrow.parquet as pp - - return pp - return None - - def get_numpy() -> Any: """Get numpy module (if already imported - else return None).""" return sys.modules.get("numpy", None) @@ -91,10 +75,35 @@ def is_pandas_dataframe(df: Any) -> TypeGuard[pd.DataFrame]: def is_pandas_series(ser: Any) -> TypeGuard[pd.Series[Any]]: - """Check whether `df` is a pandas Series without importing pandas.""" + """Check whether `ser` is a pandas Series without importing pandas.""" return bool((pd := get_pandas()) is not None and isinstance(ser, pd.Series)) +def is_modin_dataframe(df: Any) -> TypeGuard[mpd.DataFrame]: + """Check whether `df` is a modin DataFrame without importing modin.""" + return bool((pd := get_modin()) is not None and isinstance(df, pd.DataFrame)) + + +def is_modin_series(ser: Any) -> TypeGuard[mpd.Series]: + """Check whether `ser` is a modin Series without importing modin.""" + return bool((pd := get_modin()) is not None and isinstance(ser, pd.Series)) + + +def is_cudf_dataframe(df: Any) -> TypeGuard[cudf.DataFrame]: + """Check whether `df` is a cudf DataFrame without importing cudf.""" + return bool((pd := get_cudf()) is not None and isinstance(df, 
pd.DataFrame)) + + +def is_cudf_series(ser: Any) -> TypeGuard[pd.Series[Any]]: + """Check whether `ser` is a cudf Series without importing cudf.""" + return bool((pd := get_cudf()) is not None and isinstance(ser, pd.Series)) + + +def is_dask_dataframe(df: Any) -> TypeGuard[dd.DataFrame]: + """Check whether `df` is a Dask DataFrame without importing Dask.""" + return bool((dd := get_dask_dataframe()) is not None and isinstance(df, dd.DataFrame)) + + def is_polars_dataframe(df: Any) -> TypeGuard[pl.DataFrame]: """Check whether `df` is a Polars DataFrame without importing Polars.""" return bool((pl := get_polars()) is not None and isinstance(df, pl.DataFrame)) @@ -105,6 +114,21 @@ def is_polars_lazyframe(df: Any) -> TypeGuard[pl.LazyFrame]: return bool((pl := get_polars()) is not None and isinstance(df, pl.LazyFrame)) +def is_polars_series(ser: Any) -> TypeGuard[pl.Series]: + """Check whether `ser` is a Polars Series without importing Polars.""" + return bool((pl := get_polars()) is not None and isinstance(ser, pl.Series)) + + +def is_pyarrow_chunked_array(ser: Any) -> TypeGuard[pa.ChunkedArray]: + """Check whether `ser` is a PyArrow ChunkedArray without importing PyArrow.""" + return bool((pa := get_pyarrow()) is not None and isinstance(ser, pa.ChunkedArray)) + + +def is_pyarrow_table(df: Any) -> TypeGuard[pa.Table]: + """Check whether `df` is a PyArrow Table without importing PyArrow.""" + return bool((pa := get_pyarrow()) is not None and isinstance(df, pa.Table)) + + def is_numpy_array(arr: Any) -> TypeGuard[np.ndarray]: """Check whether `arr` is a NumPy Array without importing NumPy.""" return bool((np := get_numpy()) is not None and isinstance(arr, np.ndarray)) @@ -116,7 +140,6 @@ def is_numpy_array(arr: Any) -> TypeGuard[np.ndarray]: "get_modin", "get_cudf", "get_pyarrow", - "get_pyarrow_compute", "get_numpy", "is_pandas_dataframe", ] diff --git a/narwhals/expr.py b/narwhals/expr.py index 8d0f4956a..b89fb3d5a 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -162,7 +162,6 @@ def cast( │ 3.0 ┆ 8 │ └─────┴─────┘ """ - return self.__class__( lambda plx: self._call(plx).cast(dtype), ) @@ -499,7 +498,6 @@ def min(self) -> Self: │ 1 ┆ 3 │ └─────┴─────┘ """ - return self.__class__(lambda plx: self._call(plx).min()) def max(self) -> Self: @@ -1409,7 +1407,6 @@ def is_unique(self) -> Self: │ false ┆ true │ └───────┴───────┘ """ - return self.__class__(lambda plx: self._call(plx).is_unique()) def null_count(self) -> Self: @@ -1623,7 +1620,6 @@ def head(self, n: int = 10) -> Self: │ 2 │ └─────┘ """ - return self.__class__(lambda plx: self._call(plx).head(n)) def tail(self, n: int = 10) -> Self: @@ -1667,7 +1663,6 @@ def tail(self, n: int = 10) -> Self: │ 9 │ └─────┘ """ - return self.__class__(lambda plx: self._call(plx).tail(n)) def round(self, decimals: int = 0) -> Self: @@ -1719,7 +1714,6 @@ def round(self, decimals: int = 0) -> Self: │ 3.9 │ └─────┘ """ - return self.__class__(lambda plx: self._call(plx).round(decimals)) def len(self) -> Self: @@ -2242,7 +2236,6 @@ def contains(self, pattern: str, *, literal: bool = False) -> Expr: │ null ┆ null ┆ null ┆ null │ └───────────────────┴───────────────┴────────────────────────┴───────────────┘ """ - return self._expr.__class__( lambda plx: self._expr._call(plx).str.contains(pattern, literal=literal) ) @@ -3445,7 +3438,6 @@ def keep(self: Self) -> Expr: >>> func(df_pl).columns ['foo'] """ - return self._expr.__class__(lambda plx: self._expr._call(plx).name.keep()) def map(self: Self, function: Callable[[str], str]) -> Expr: @@ -3482,7 +3474,6 
@@ def map(self: Self, function: Callable[[str], str]) -> Expr: >>> func(df_pl).columns ['oof', 'RAB'] """ - return self._expr.__class__(lambda plx: self._expr._call(plx).name.map(function)) def prefix(self: Self, prefix: str) -> Expr: diff --git a/narwhals/series.py b/narwhals/series.py index 3c79024c8..a1bcae18b 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -1698,7 +1698,6 @@ def null_count(self: Self) -> int: >>> func(s_pl) 2 """ - return self._compliant_series.null_count() # type: ignore[no-any-return] def is_first_distinct(self: Self) -> Self: @@ -1969,7 +1968,6 @@ def zip_with(self: Self, mask: Self, other: Self) -> Self: 4 5 dtype: int64 """ - return self._from_compliant_series( self._compliant_series.zip_with( self._extract_native(mask), self._extract_native(other) @@ -2043,7 +2041,6 @@ def head(self: Self, n: int = 10) -> Self: 2 ] """ - return self._from_compliant_series(self._compliant_series.head(n)) def tail(self: Self, n: int = 10) -> Self: @@ -2084,7 +2081,6 @@ def tail(self: Self, n: int = 10) -> Self: 9 ] """ - return self._from_compliant_series(self._compliant_series.tail(n)) def round(self: Self, decimals: int = 0) -> Self: @@ -2200,7 +2196,6 @@ def to_dummies( │ 0 ┆ 1 │ └─────┴─────┘ """ - from narwhals.dataframe import DataFrame return DataFrame( @@ -2284,7 +2279,6 @@ def to_arrow(self: Self) -> pa.Array: 4 ] """ - return self._compliant_series.to_arrow() @property diff --git a/narwhals/translate.py b/narwhals/translate.py index ed33b376b..f396a8982 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -10,16 +10,23 @@ from narwhals.dependencies import get_cudf from narwhals.dependencies import get_dask -from narwhals.dependencies import get_dask_dataframe from narwhals.dependencies import get_dask_expr from narwhals.dependencies import get_modin from narwhals.dependencies import get_pandas from narwhals.dependencies import get_polars from narwhals.dependencies import get_pyarrow +from narwhals.dependencies import is_cudf_dataframe +from narwhals.dependencies import is_cudf_series +from narwhals.dependencies import is_dask_dataframe +from narwhals.dependencies import is_modin_dataframe +from narwhals.dependencies import is_modin_series from narwhals.dependencies import is_pandas_dataframe from narwhals.dependencies import is_pandas_series from narwhals.dependencies import is_polars_dataframe from narwhals.dependencies import is_polars_lazyframe +from narwhals.dependencies import is_polars_series +from narwhals.dependencies import is_pyarrow_chunked_array +from narwhals.dependencies import is_pyarrow_table if TYPE_CHECKING: from narwhals.dataframe import DataFrame @@ -369,7 +376,8 @@ def from_native( # noqa: PLR0915 PolarsLazyFrame(native_object, backend_version=parse_version(pl.__version__)), level="full", ) - elif (pl := get_polars()) is not None and isinstance(native_object, pl.Series): + elif is_polars_series(native_object): + pl = get_polars() if not allow_series: msg = "Please set `allow_series=True`" raise TypeError(msg) @@ -407,9 +415,8 @@ def from_native( # noqa: PLR0915 ) # Modin - elif (mpd := get_modin()) is not None and isinstance( - native_object, mpd.DataFrame - ): # pragma: no cover + elif is_modin_dataframe(native_object): # pragma: no cover + mpd = get_modin() if series_only: msg = "Cannot only use `series_only` with modin.DataFrame" raise TypeError(msg) @@ -421,9 +428,8 @@ def from_native( # noqa: PLR0915 ), level="full", ) - elif (mpd := get_modin()) is not None and isinstance( - native_object, mpd.Series - ): # pragma: no 
cover + elif is_modin_series(native_object): # pragma: no cover + mpd = get_modin() if not allow_series: msg = "Please set `allow_series=True`" raise TypeError(msg) @@ -437,9 +443,8 @@ def from_native( # noqa: PLR0915 ) # cuDF - elif (cudf := get_cudf()) is not None and isinstance( # pragma: no cover - native_object, cudf.DataFrame - ): + elif is_cudf_dataframe(native_object): # pragma: no cover + cudf = get_cudf() if series_only: msg = "Cannot only use `series_only` with cudf.DataFrame" raise TypeError(msg) @@ -451,9 +456,8 @@ def from_native( # noqa: PLR0915 ), level="full", ) - elif (cudf := get_cudf()) is not None and isinstance( - native_object, cudf.Series - ): # pragma: no cover + elif is_cudf_series(native_object): # pragma: no cover + cudf = get_cudf() if not allow_series: msg = "Please set `allow_series=True`" raise TypeError(msg) @@ -467,7 +471,8 @@ def from_native( # noqa: PLR0915 ) # PyArrow - elif (pa := get_pyarrow()) is not None and isinstance(native_object, pa.Table): + elif is_pyarrow_table(native_object): + pa = get_pyarrow() if series_only: msg = "Cannot only use `series_only` with arrow table" raise TypeError(msg) @@ -475,7 +480,8 @@ def from_native( # noqa: PLR0915 ArrowDataFrame(native_object, backend_version=parse_version(pa.__version__)), level="full", ) - elif (pa := get_pyarrow()) is not None and isinstance(native_object, pa.ChunkedArray): + elif is_pyarrow_chunked_array(native_object): + pa = get_pyarrow() if not allow_series: msg = "Please set `allow_series=True`" raise TypeError(msg) @@ -487,9 +493,7 @@ def from_native( # noqa: PLR0915 ) # Dask - elif (dd := get_dask_dataframe()) is not None and isinstance( - native_object, dd.DataFrame - ): + elif is_dask_dataframe(native_object): if series_only: msg = "Cannot only use `series_only` with dask DataFrame" raise TypeError(msg) diff --git a/narwhals/utils.py b/narwhals/utils.py index 512099cc5..1a0b752d9 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -18,6 +18,12 @@ from narwhals.dependencies import get_pandas from narwhals.dependencies import get_polars from narwhals.dependencies import get_pyarrow +from narwhals.dependencies import is_cudf_series +from narwhals.dependencies import is_modin_series +from narwhals.dependencies import is_pandas_dataframe +from narwhals.dependencies import is_pandas_series +from narwhals.dependencies import is_polars_series +from narwhals.dependencies import is_pyarrow_chunked_array from narwhals.translate import to_native if TYPE_CHECKING: @@ -95,7 +101,7 @@ def tupleify(arg: Any) -> Any: def _is_iterable(arg: Any | Iterable[Any]) -> bool: from narwhals.series import Series - if (pd := get_pandas()) is not None and isinstance(arg, (pd.Series, pd.DataFrame)): + if is_pandas_dataframe(arg) or is_pandas_series(arg): msg = f"Expected Narwhals class or scalar, got: {type(arg)}. Perhaps you forgot a `nw.from_native` somewhere?" 
raise TypeError(msg) if (pl := get_polars()) is not None and isinstance( @@ -352,19 +358,15 @@ def is_ordered_categorical(series: Series) -> bool: if series.dtype != dtypes.Categorical: return False native_series = to_native(series) - if (pl := get_polars()) is not None and isinstance(native_series, pl.Series): - return native_series.dtype.ordering == "physical" # type: ignore[no-any-return] - if (pd := get_pandas()) is not None and isinstance(native_series, pd.Series): + if is_polars_series(native_series): + return native_series.dtype.ordering == "physical" # type: ignore[attr-defined, no-any-return] + if is_pandas_series(native_series): return native_series.cat.ordered # type: ignore[no-any-return] - if (mpd := get_modin()) is not None and isinstance( - native_series, mpd.Series - ): # pragma: no cover + if is_modin_series(native_series): # pragma: no cover return native_series.cat.ordered # type: ignore[no-any-return] - if (cudf := get_cudf()) is not None and isinstance( - native_series, cudf.Series - ): # pragma: no cover + if is_cudf_series(native_series): # pragma: no cover return native_series.cat.ordered # type: ignore[no-any-return] - if (pa := get_pyarrow()) is not None and isinstance(native_series, pa.ChunkedArray): + if is_pyarrow_chunked_array(native_series): return native_series.type.ordered # type: ignore[no-any-return] # If it doesn't match any of the above, let's just play it safe and return False. return False # pragma: no cover diff --git a/tests/no_imports_test.py b/tests/no_imports_test.py new file mode 100644 index 000000000..a89ed0ed8 --- /dev/null +++ b/tests/no_imports_test.py @@ -0,0 +1,68 @@ +import sys + +import pandas as pd +import polars as pl +import pyarrow as pa +import pytest + +import narwhals.stable.v1 as nw + + +def test_polars(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delitem(sys.modules, "pandas") + monkeypatch.delitem(sys.modules, "numpy") + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.delitem(sys.modules, "dask", raising=False) + df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]}) + nw.from_native(df, eager_only=True).group_by("a").agg(nw.col("b").mean()).filter( + nw.col("a") > 1 + ) + assert "polars" in sys.modules + assert "pandas" not in sys.modules + assert "numpy" not in sys.modules + assert "pyarrow" not in sys.modules + assert "dask" not in sys.modules + + +def test_pandas(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delitem(sys.modules, "polars") + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.delitem(sys.modules, "dask", raising=False) + df = pd.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]}) + nw.from_native(df, eager_only=True).group_by("a").agg(nw.col("b").mean()).filter( + nw.col("a") > 1 + ) + assert "polars" not in sys.modules + assert "pandas" in sys.modules + assert "numpy" in sys.modules + assert "pyarrow" not in sys.modules + assert "dask" not in sys.modules + + +def test_dask(monkeypatch: pytest.MonkeyPatch) -> None: + pytest.importorskip("dask") + pytest.importorskip("dask_expr", exc_type=ImportError) + import dask.dataframe as dd + + monkeypatch.delitem(sys.modules, "polars") + monkeypatch.delitem(sys.modules, "pyarrow") + df = dd.from_pandas(pd.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})) + nw.from_native(df).group_by("a").agg(nw.col("b").mean()).filter(nw.col("a") > 1) + assert "polars" not in sys.modules + assert "pandas" in sys.modules + assert "numpy" in sys.modules + assert "pyarrow" not in sys.modules + assert "dask" in sys.modules + + +def test_pyarrow(monkeypatch: 
pytest.MonkeyPatch) -> None:
+    monkeypatch.delitem(sys.modules, "polars")
+    monkeypatch.delitem(sys.modules, "pandas")
+    monkeypatch.delitem(sys.modules, "dask", raising=False)
+    df = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
+    nw.from_native(df).group_by("a").agg(nw.col("b").mean()).filter(nw.col("a") > 1)
+    assert "polars" not in sys.modules
+    assert "pandas" not in sys.modules
+    assert "numpy" in sys.modules
+    assert "pyarrow" in sys.modules
+    assert "dask" not in sys.modules

From 885ef310b977f4bd618b2877dbb337e10d97d281 Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Wed, 14 Aug 2024 08:43:41 +0100
Subject: [PATCH 21/21] feat: add narwhals.new_series (#787)

---
 docs/api-reference/narwhals.md |  7 +--
 narwhals/__init__.py           |  2 +
 narwhals/_dask/dataframe.py    |  1 +
 narwhals/functions.py          | 94 ++++++++++++++++++++++++++++++++++
 narwhals/stable/v1.py          | 53 +++++++++++++++++++
 narwhals/utils.py              |  3 ++
 tests/frame/drop_test.py       |  4 +-
 tests/new_series_test.py       | 36 +++++++++++++
 8 files changed, 194 insertions(+), 6 deletions(-)
 create mode 100644 tests/new_series_test.py

diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md
index 16bc6621c..275a865c1 100644
--- a/docs/api-reference/narwhals.md
+++ b/docs/api-reference/narwhals.md
@@ -17,14 +17,15 @@ Here are the top-level functions available in Narwhals.
     - get_native_namespace
     - is_ordered_categorical
     - len
-    - maybe_align_index
-    - maybe_set_index
-    - maybe_convert_dtypes
     - lit
     - max
+    - maybe_align_index
+    - maybe_convert_dtypes
+    - maybe_set_index
     - mean
     - min
     - narwhalify
+    - new_series
     - sum
     - sum_horizontal
     - show_versions
diff --git a/narwhals/__init__.py b/narwhals/__init__.py
index d67a0587e..3e656c60a 100644
--- a/narwhals/__init__.py
+++ b/narwhals/__init__.py
@@ -36,6 +36,7 @@
 from narwhals.functions import concat
 from narwhals.functions import from_dict
 from narwhals.functions import get_level
+from narwhals.functions import new_series
 from narwhals.functions import show_versions
 from narwhals.schema import Schema
 from narwhals.series import Series
@@ -55,6 +56,7 @@
     "concat",
     "from_dict",
     "get_level",
+    "new_series",
     "to_native",
     "from_native",
     "is_ordered_categorical",
diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py
index 99ed430a9..35ef28bba 100644
--- a/narwhals/_dask/dataframe.py
+++ b/narwhals/_dask/dataframe.py
@@ -32,6 +32,7 @@ def __init__(
     ) -> None:
         self._native_frame = native_dataframe
         self._backend_version = backend_version
+        self._implementation = Implementation.DASK
 
     def __native_namespace__(self) -> Any:  # pragma: no cover
         return get_dask_dataframe()
diff --git a/narwhals/functions.py b/narwhals/functions.py
index 13db8c34b..d5d8be4db 100644
--- a/narwhals/functions.py
+++ b/narwhals/functions.py
@@ -48,6 +48,100 @@ def concat(
     )
 
 
+def new_series(
+    name: str,
+    values: Any,
+    dtype: DType | type[DType] | None = None,
+    *,
+    native_namespace: ModuleType,
+) -> Series:
+    """
+    Instantiate Narwhals Series from raw data.
+
+    Arguments:
+        name: Name of resulting Series.
+        values: Values to make Series from.
+        dtype: (Narwhals) dtype. If not provided, the native library
+            may auto-infer it from `values`.
+        native_namespace: The native library to use for Series creation.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import polars as pl
+        >>> import narwhals as nw
+        >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]}
+
+        Let's define a dataframe-agnostic function:
+
+        >>> @nw.narwhalify
+        ... def func(df):
+        ...     values = [4, 1, 2]
+        ...     native_namespace = nw.get_native_namespace(df)
+        ...     return nw.new_series("c", values, nw.Int32, native_namespace=native_namespace)
+
+        Let's see what happens when passing pandas / Polars input:
+
+        >>> func(pd.DataFrame(data))
+        0    4
+        1    1
+        2    2
+        Name: c, dtype: int32
+        >>> func(pl.DataFrame(data))  # doctest: +NORMALIZE_WHITESPACE
+        shape: (3,)
+        Series: 'c' [i32]
+        [
+           4
+           1
+           2
+        ]
+    """
+    implementation = Implementation.from_native_namespace(native_namespace)
+
+    if implementation is Implementation.POLARS:
+        if dtype:
+            from narwhals._polars.utils import (
+                narwhals_to_native_dtype as polars_narwhals_to_native_dtype,
+            )
+
+            dtype = polars_narwhals_to_native_dtype(dtype)
+
+        native_series = native_namespace.Series(name=name, values=values, dtype=dtype)
+    elif implementation in {
+        Implementation.PANDAS,
+        Implementation.MODIN,
+        Implementation.CUDF,
+    }:
+        if dtype:
+            from narwhals._pandas_like.utils import (
+                narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
+            )
+
+            dtype = pandas_like_narwhals_to_native_dtype(dtype, None, implementation)
+        native_series = native_namespace.Series(values, name=name, dtype=dtype)
+
+    elif implementation is Implementation.PYARROW:
+        if dtype:
+            from narwhals._arrow.utils import (
+                narwhals_to_native_dtype as arrow_narwhals_to_native_dtype,
+            )
+
+            dtype = arrow_narwhals_to_native_dtype(dtype)
+        native_series = native_namespace.chunked_array([values], type=dtype)
+
+    elif implementation is Implementation.DASK:
+        msg = "Dask support in Narwhals is lazy-only, so `new_series` is not supported"
+        raise NotImplementedError(msg)
+    else:  # pragma: no cover
+        try:
+            # implementation is UNKNOWN, Narwhals extension using this feature should
+            # implement a `new_series` function in the top-level namespace.
+            native_series = native_namespace.new_series(name, values, dtype)
+        except AttributeError as e:
+            msg = "Unknown namespace is expected to implement `Series` constructor."
+            raise AttributeError(msg) from e
+    return from_native(native_series, series_only=True).alias(name)
+
+
 def from_dict(
     data: dict[str, Any],
     schema: dict[str, DType] | Schema | None = None,
diff --git a/narwhals/stable/v1.py b/narwhals/stable/v1.py
index 1c1c91711..b5697753f 100644
--- a/narwhals/stable/v1.py
+++ b/narwhals/stable/v1.py
@@ -1469,6 +1469,58 @@ def get_level(
     return nw.get_level(obj)
 
 
+def new_series(
+    name: str,
+    values: Any,
+    dtype: DType | type[DType] | None = None,
+    *,
+    native_namespace: ModuleType,
+) -> Series:
+    """
+    Instantiate Narwhals Series from raw data.
+
+    Arguments:
+        name: Name of resulting Series.
+        values: Values to make Series from.
+        dtype: (Narwhals) dtype. If not provided, the native library
+            may auto-infer it from `values`.
+        native_namespace: The native library to use for Series creation.
+
+    Examples:
+        >>> import pandas as pd
+        >>> import polars as pl
+        >>> import narwhals.stable.v1 as nw
+        >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]}
+
+        Let's define a dataframe-agnostic function:
+
+        >>> @nw.narwhalify
+        ... def func(df):
+        ...     values = [4, 1, 2]
+        ...     native_namespace = nw.get_native_namespace(df)
+        ...     return nw.new_series("c", values, nw.Int32, native_namespace=native_namespace)
+
+        Let's see what happens when passing pandas / Polars input:
+
+        >>> func(pd.DataFrame(data))
+        0    4
+        1    1
+        2    2
+        Name: c, dtype: int32
+        >>> func(pl.DataFrame(data))  # doctest: +NORMALIZE_WHITESPACE
+        shape: (3,)
+        Series: 'c' [i32]
+        [
+           4
+           1
+           2
+        ]
+    """
+    return _stableify(
+        nw.new_series(name, values, dtype, native_namespace=native_namespace)
+    )
+
+
 def from_dict(
     data: dict[str, Any],
     schema: dict[str, DType] | Schema | None = None,
@@ -1573,4 +1625,5 @@
     "show_versions",
     "Schema",
     "from_dict",
+    "new_series",
 ]
diff --git a/narwhals/utils.py b/narwhals/utils.py
index 1a0b752d9..cc2a482c4 100644
--- a/narwhals/utils.py
+++ b/narwhals/utils.py
@@ -14,6 +14,7 @@
 from narwhals import dtypes
 from narwhals._exceptions import ColumnNotFoundError
 from narwhals.dependencies import get_cudf
+from narwhals.dependencies import get_dask_dataframe
 from narwhals.dependencies import get_modin
 from narwhals.dependencies import get_pandas
 from narwhals.dependencies import get_polars
 from narwhals.dependencies import get_pyarrow
@@ -43,6 +44,7 @@ class Implementation(Enum):
     CUDF = auto()
     PYARROW = auto()
     POLARS = auto()
+    DASK = auto()
 
     UNKNOWN = auto()
@@ -57,6 +59,7 @@ def from_native_namespace(
             get_cudf(): Implementation.CUDF,
             get_pyarrow(): Implementation.PYARROW,
             get_polars(): Implementation.POLARS,
+            get_dask_dataframe(): Implementation.DASK,
         }
         return mapping.get(native_namespace, Implementation.UNKNOWN)
diff --git a/tests/frame/drop_test.py b/tests/frame/drop_test.py
index 547ddc748..db039fcb2 100644
--- a/tests/frame/drop_test.py
+++ b/tests/frame/drop_test.py
@@ -33,9 +33,7 @@ def test_drop(constructor: Any, to_drop: list[str], expected: list[str]) -> None
     [
         (
             True,
-            pytest.raises(
-                (ColumnNotFoundError, PlColumnNotFoundError), match='"z" not found'
-            ),
+            pytest.raises((ColumnNotFoundError, PlColumnNotFoundError), match="z"),
         ),
         (False, does_not_raise()),
     ],
diff --git a/tests/new_series_test.py b/tests/new_series_test.py
new file mode 100644
index 000000000..8ddcabd40
--- /dev/null
+++ b/tests/new_series_test.py
@@ -0,0 +1,36 @@
+from typing import Any
+
+import pandas as pd
+import pytest
+
+import narwhals.stable.v1 as nw
+from tests.utils import compare_dicts
+
+
+def test_new_series(constructor_eager: Any) -> None:
+    s = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"]
+    result = nw.new_series("b", [4, 1, 2], native_namespace=nw.get_native_namespace(s))
+    expected = {"b": [4, 1, 2]}
+    # all supported libraries auto-infer this to be int64; we can always special-case
+    # something different if necessary
+    assert result.dtype == nw.Int64
+    compare_dicts(result.to_frame(), expected)
+
+    result = nw.new_series(
+        "b", [4, 1, 2], nw.Int32, native_namespace=nw.get_native_namespace(s)
+    )
+    expected = {"b": [4, 1, 2]}
+    assert result.dtype == nw.Int32
+    compare_dicts(result.to_frame(), expected)
+
+
+def test_new_series_dask() -> None:
+    pytest.importorskip("dask")
+    pytest.importorskip("dask_expr", exc_type=ImportError)
+    import dask.dataframe as dd
+
+    df = nw.from_native(dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]})))
+    with pytest.raises(
+        NotImplementedError, match="Dask support in Narwhals is lazy-only"
+    ):
+        nw.new_series("a", [1, 2, 3], native_namespace=nw.get_native_namespace(df))
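
For reference, here is a minimal end-to-end sketch of the `new_series` API added in this final patch. It is an illustrative snippet rather than part of the patch itself; it assumes pandas and PyArrow are installed, uses only functions shown in the diffs above (`nw.new_series`, `nw.from_native`, `nw.get_native_namespace`), and mirrors the assertions in `tests/new_series_test.py`:

```python
import pandas as pd
import pyarrow as pa

import narwhals.stable.v1 as nw

# Passing the namespace explicitly: each backend branch of `new_series`
# translates the Narwhals dtype to the native dtype before construction.
s_pd = nw.new_series("c", [4, 1, 2], nw.Int32, native_namespace=pd)
assert s_pd.dtype == nw.Int32

# The PYARROW branch wraps the values in a pa.chunked_array instead.
s_pa = nw.new_series("c", [4, 1, 2], nw.Int32, native_namespace=pa)
assert s_pa.dtype == nw.Int32
assert len(s_pa) == 3

# Dataframe-agnostic code can derive the namespace from existing data.
# Without an explicit dtype, all supported backends auto-infer int64 here.
df = nw.from_native(pd.DataFrame({"a": [1, 2, 3]}), eager_only=True)
s = nw.new_series("b", [7, 8, 9], native_namespace=nw.get_native_namespace(df))
assert s.dtype == nw.Int64
```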