From cb4a583b9b83c23687c09bbe0ad55ab43d790de9 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Sun, 11 Aug 2024 22:26:42 +0200 Subject: [PATCH] feat: dask lazyframe remaining methods (#778) * feat: dask dataframe remaining methods * gather_every in Expr --- narwhals/_dask/dataframe.py | 17 +++++++++++++++++ narwhals/_dask/expr.py | 10 ++++++++++ tests/expr_and_series/tail_test.py | 28 ++++++++++++++++++++++++++++ tests/frame/gather_every_test.py | 4 +--- tests/frame/tail_test.py | 6 +----- tests/series_only/tail_test.py | 14 -------------- 6 files changed, 57 insertions(+), 22 deletions(-) create mode 100644 tests/expr_and_series/tail_test.py delete mode 100644 tests/series_only/tail_test.py diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 776bcde20..685fc7b69 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -275,3 +275,20 @@ def group_by(self, *by: str) -> Any: from narwhals._dask.group_by import DaskLazyGroupBy return DaskLazyGroupBy(self, list(by)) + + def tail(self: Self, n: int) -> Self: + return self._from_native_dataframe( + self._native_dataframe.tail(n=n, compute=False) + ) + + def gather_every(self: Self, n: int, offset: int) -> Self: + row_index_token = generate_unique_token(n_bytes=8, columns=self.columns) + pln = self.__narwhals_namespace__() + return ( + self.with_row_index(name=row_index_token) + .filter( + pln.col(row_index_token) >= offset, # type: ignore[operator] + (pln.col(row_index_token) - offset) % n == 0, # type: ignore[arg-type] + ) + .drop(row_index_token) + ) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 632966e3c..83c160565 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -534,6 +534,16 @@ def func(_input: Any) -> Any: returns_scalar=False, ) + def tail(self: Self) -> NoReturn: + # We can't (yet?) allow methods which modify the index + msg = "`Expr.tail` is not supported for the Dask backend. Please use `LazyFrame.tail` instead." + raise NotImplementedError(msg) + + def gather_every(self: Self, n: int, offset: int = 0) -> NoReturn: + # We can't (yet?) allow methods which modify the index + msg = "`Expr.gather_every` is not supported for the Dask backend. Please use `LazyFrame.gather_every` instead." + raise NotImplementedError(msg) + @property def str(self: Self) -> DaskExprStringNamespace: return DaskExprStringNamespace(self) diff --git a/tests/expr_and_series/tail_test.py b/tests/expr_and_series/tail_test.py new file mode 100644 index 000000000..be17ffb4e --- /dev/null +++ b/tests/expr_and_series/tail_test.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import Any + +import pytest + +import narwhals as nw +from tests.utils import compare_dicts + + +@pytest.mark.parametrize("n", [2, -1]) +def test_head(constructor: Any, n: int, request: Any) -> None: + if "dask" in str(constructor): + request.applymarker(pytest.mark.xfail) + if "polars" in str(constructor) and n < 0: + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1, 2, 3]})) + result = df.select(nw.col("a").tail(n)) + expected = {"a": [2, 3]} + compare_dicts(result, expected) + + +@pytest.mark.parametrize("n", [2, -1]) +def test_head_series(constructor_eager: Any, n: int) -> None: + df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) + result = df.select(df["a"].tail(n)) + expected = {"a": [2, 3]} + compare_dicts(result, expected) diff --git a/tests/frame/gather_every_test.py b/tests/frame/gather_every_test.py index a75edaca8..90b06e3d6 100644 --- a/tests/frame/gather_every_test.py +++ b/tests/frame/gather_every_test.py @@ -10,9 +10,7 @@ @pytest.mark.parametrize("n", [1, 2, 3]) @pytest.mark.parametrize("offset", [1, 2, 3]) -def test_gather_every(constructor: Any, n: int, offset: int, request: Any) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_gather_every(constructor: Any, n: int, offset: int) -> None: df = nw.from_native(constructor(data)) result = df.gather_every(n=n, offset=offset) expected = {"a": data["a"][offset::n]} diff --git a/tests/frame/tail_test.py b/tests/frame/tail_test.py index 6a5a6b601..e279caba9 100644 --- a/tests/frame/tail_test.py +++ b/tests/frame/tail_test.py @@ -2,15 +2,11 @@ from typing import Any -import pytest - import narwhals.stable.v1 as nw from tests.utils import compare_dicts -def test_tail(constructor: Any, request: Any) -> None: - if "dask" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_tail(constructor: Any) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9]} diff --git a/tests/series_only/tail_test.py b/tests/series_only/tail_test.py deleted file mode 100644 index 058f45831..000000000 --- a/tests/series_only/tail_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pytest - -import narwhals.stable.v1 as nw - - -@pytest.mark.parametrize("n", [2, -1]) -def test_tail(constructor_eager: Any, n: int) -> None: - s = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)["a"] - - assert s.tail(n).to_list() == [2, 3]