Commit 165507a

support `shift` in `over` context

FBruzzesi committed Dec 26, 2024
2 parents bb458f8 + ecde246

Showing 18 changed files with 87 additions and 65 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/check_tpch_queries.yml
@@ -25,7 +25,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: local-install
run: uv pip install -e ".[dev]" --system
run: uv pip install -e ".[dev, dask]" --system
- name: generate-data
run: cd tpch && python generate_data.py
- name: tpch-tests
3 changes: 0 additions & 3 deletions .github/workflows/extremes.yml
@@ -48,7 +48,6 @@ jobs:
matrix:
python-version: ["3.8"]
os: [ubuntu-latest]

runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -87,7 +86,6 @@ jobs:
matrix:
python-version: ["3.9"]
os: [ubuntu-latest]

runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -117,7 +115,6 @@ jobs:
echo "$DEPS" | grep 'scipy==1.8.0'
echo "$DEPS" | grep 'scikit-learn==1.3.0'
echo "$DEPS" | grep 'dask==2024.7'
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow

21 changes: 7 additions & 14 deletions .github/workflows/pytest.yml
@@ -11,7 +11,6 @@ jobs:
matrix:
python-version: ["3.8"]
os: [windows-latest, ubuntu-latest]

runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -25,6 +24,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-reqs
# Python3.8 is technically at end-of-life, so we don't test everything
run: uv pip install -e ".[dev]" --system
- name: show-deps
run: uv pip freeze
@@ -36,7 +36,6 @@ jobs:
matrix:
python-version: ["3.10", "3.12"]
os: [windows-latest]

runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -50,22 +49,17 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: install-dask
run: uv pip install --upgrade dask[dataframe] --system
- name: install-modin
run: uv pip install --upgrade modin --system
run: uv pip install -e ".[dev, extra]" --system
- name: show-deps
run: uv pip freeze
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --runslow --cov-fail-under=95

pytest-coverage:
pytest-full-coverage:
strategy:
matrix:
python-version: ["3.9", "3.11", "3.13"]
os: [ubuntu-latest]

runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
@@ -79,18 +73,17 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: install-modin
run: uv pip install --upgrade modin[dask] --system
- name: show-deps
run: uv pip freeze
run: uv pip install -e ".[dev, extra]" --system
- name: install ibis
run: uv pip install ibis-framework>=6.0.0 rich packaging pyarrow_hotfix --system
# Ibis puts upper bounds on dependencies, and requires Python3.10+,
# which messes with other dependencies on lower Python versions
if: matrix.python-version == '3.11'
- name: show-deps
run: uv pip freeze
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 --runslow
- name: Run doctests
# reprs differ between versions, so we only run doctests on the latest Python
if: matrix.python-version == '3.13'
run: pytest narwhals --doctest-modules
3 changes: 2 additions & 1 deletion CONTRIBUTING.md
@@ -96,7 +96,8 @@ Here's how you can set up your local development environment to contribute.
uv venv -p 3.12 --seed
```
4. Activate it. On Linux, this is `. .venv/bin/activate`, on Windows `.\.venv\Scripts\activate`.
2. Install Narwhals: `uv pip install -e ".[dev, docs]"`
2. Install Narwhals: `uv pip install -e ".[dev, docs]"`. If you also want to test Dask, PySpark, and
Modin, you can install them too with `uv pip install -e ".[dev, docs, extra]"`.
3. Install a fork of griffe:
```
uv pip install git+https://github.com/MarcoGorelli/griffe.git@no-overloads
2 changes: 1 addition & 1 deletion narwhals/_arrow/dataframe.py
@@ -504,7 +504,7 @@ def filter(self: Self, *predicates: IntoArrowExpr, **constraints: Any) -> Self:
predicates, (plx.col(name) == v for name, v in constraints.items())
)
)
# Safety: all_horizontal's expression only returns a single column.
# `[0]` is safe as all_horizontal's expression only returns a single column
mask = expr._call(self)[0]._native_series
return self._from_native_frame(self._native_frame.filter(mask))

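For context, the keyword `constraints` shown in the signature above are sugar for equality predicates, combined with the other predicates via `all_horizontal` (a logical AND). A minimal sketch of the user-facing behaviour, assuming `narwhals` and `pyarrow` are installed:

```python
import narwhals as nw
import pyarrow as pa

tbl = pa.table({"a": [1, 2, 3], "b": ["x", "y", "x"]})
df = nw.from_native(tbl, eager_only=True)

# Equivalent to df.filter((nw.col("a") > 1) & (nw.col("b") == "x")):
result = df.filter(nw.col("a") > 1, b="x")
print(result.to_native())  # expect a single row: a=3, b="x"
```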
4 changes: 1 addition & 3 deletions narwhals/_arrow/namespace.py
@@ -461,11 +461,9 @@ def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]:
import pyarrow as pa
import pyarrow.compute as pc

from narwhals._arrow.namespace import ArrowNamespace
from narwhals._expression_parsing import parse_into_expr

plx = ArrowNamespace(backend_version=self._backend_version, version=self._version)

plx = df.__narwhals_namespace__()
condition = parse_into_expr(self._condition, namespace=plx)(df)[0]
try:
value_series = parse_into_expr(self._then_value, namespace=plx)(df)[0]
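Several files in this commit swap an explicitly constructed backend namespace for `df.__narwhals_namespace__()`. A minimal sketch of the protocol behind that pattern, using hypothetical stand-in classes rather than the real implementations:

```python
class ToyNamespace:
    # Stand-in for a backend expression namespace (e.g. ArrowNamespace).
    def __init__(self, backend_version: tuple[int, ...], version: str) -> None:
        self._backend_version = backend_version
        self._version = version


class ToyFrame:
    # Stand-in for a backend frame (e.g. ArrowDataFrame).
    def __init__(self, backend_version: tuple[int, ...], version: str) -> None:
        self._backend_version = backend_version
        self._version = version

    def __narwhals_namespace__(self) -> ToyNamespace:
        # The frame already carries everything a namespace needs, so callers
        # can write `plx = df.__narwhals_namespace__()` instead of importing
        # and constructing a backend-specific class by hand.
        return ToyNamespace(self._backend_version, self._version)
```

This keeps shared code free of per-backend imports and guarantees the namespace is built with the same `backend_version` and `version` as the frame it came from.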
6 changes: 2 additions & 4 deletions narwhals/_dask/dataframe.py
@@ -105,13 +105,11 @@ def filter(self, *predicates: DaskExpr, **constraints: Any) -> Self:
)
raise NotImplementedError(msg)

from narwhals._dask.namespace import DaskNamespace

plx = DaskNamespace(backend_version=self._backend_version, version=self._version)
plx = self.__narwhals_namespace__()
expr = plx.all_horizontal(
*chain(predicates, (plx.col(name) == v for name, v in constraints.items()))
)
# Safety: all_horizontal's expression only returns a single column.
# `[0]` is safe as all_horizontal's expression only returns a single column
mask = expr._call(self)[0]
return self._from_native_frame(self._native_frame.loc[mask])

4 changes: 1 addition & 3 deletions narwhals/_dask/namespace.py
@@ -416,11 +416,9 @@ def __init__(
self._version = version

def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]:
from narwhals._dask.namespace import DaskNamespace
from narwhals._expression_parsing import parse_into_expr

plx = DaskNamespace(backend_version=self._backend_version, version=self._version)

plx = df.__narwhals_namespace__()
condition = parse_into_expr(self._condition, namespace=plx)(df)[0]
condition = cast("dask_expr.Series", condition)
try:
3 changes: 1 addition & 2 deletions narwhals/_pandas_like/dataframe.py
@@ -412,7 +412,7 @@ def filter(self, *predicates: IntoPandasLikeExpr, **constraints: Any) -> Self:
predicates, (plx.col(name) == v for name, v in constraints.items())
)
)
# Safety: all_horizontal's expression only returns a single column.
# `[0]` is safe as all_horizontal's expression only returns a single column
mask = expr._call(self)[0]
_mask = validate_dataframe_comparand(self._native_frame.index, mask)
return self._from_native_frame(self._native_frame.loc[_mask])
@@ -1006,7 +1006,6 @@ def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Self:
]

plx = self.__native_namespace__()

return self._from_native_frame(
plx.concat([exploded_frame, *exploded_series], axis=1)[original_columns]
)
19 changes: 15 additions & 4 deletions narwhals/_pandas_like/expr.py
@@ -24,7 +24,7 @@
from narwhals.utils import Implementation
from narwhals.utils import Version

CUMULATIVE_FUNCTIONS_TO_PANDAS_EQUIVALENT = {
MANY_TO_MANY_AGG_FUNCTIONS_TO_PANDAS_EQUIVALENT = {
"col->cum_sum": "cumsum",
"col->cum_min": "cummin",
"col->cum_max": "cummax",
@@ -33,6 +33,7 @@
# Pandas cumcount counts nulls while Polars does not
# So, instead of using "cumcount" we use "cumsum" on notna() to get the same result
"col->cum_count": "cumsum",
"col->shift": "shift",
"col->rank": "rank",
}
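A quick illustration of the `cum_count` workaround described in the comment above, in plain pandas:

```python
import pandas as pd

s = pd.Series([1.0, None, 2.0])
# pandas' cumulative count includes nulls, whereas Polars' cum_count counts
# only non-null values; a cumsum over the notna() mask matches Polars:
print(s.notna().cumsum().tolist())  # [1, 1, 2]
```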

@@ -418,7 +419,7 @@ def alias(self, name: str) -> Self:
)

def over(self: Self, keys: list[str]) -> Self:
if self._function_name in CUMULATIVE_FUNCTIONS_TO_PANDAS_EQUIVALENT:
if self._function_name in MANY_TO_MANY_AGG_FUNCTIONS_TO_PANDAS_EQUIVALENT:

def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
if (
@@ -444,12 +445,22 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]:
plx = self.__narwhals_namespace__()
df = df.with_columns(~plx.col(*self._root_names).is_null())

if self._function_name == "col->shift":
kwargs = {"periods": self._kwargs.get("n", 1)}
elif self._function_name == "col->rank":
kwargs = {
"method": self._kwargs.get("method", "average"),
"ascending": not self._kwargs.get("descending", False),
}
else: # Cumulative operation
kwargs = {"skipna": True}

res_native = getattr(
df._native_frame.groupby(list(keys), as_index=False)[
self._root_names
],
CUMULATIVE_FUNCTIONS_TO_PANDAS_EQUIVALENT[self._function_name],
)(skipna=True)
MANY_TO_MANY_AGG_FUNCTIONS_TO_PANDAS_EQUIVALENT[self._function_name],
)(**kwargs)

result_frame = df._from_native_frame(
rename(
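For the newly supported `col->shift` case, here is a sketch of the pandas calls the branches above resolve to, i.e. the grouped analogues of expressions like `nw.col("x").shift(1).over("g")` (assuming pandas is installed):

```python
import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b", "b"], "x": [1, 2, 3, 4]})

# shift-over-groups lowers to a grouped shift with periods=n:
shifted = df.groupby("g", as_index=False)["x"].shift(periods=1)
print(shifted.tolist())  # [nan, 1.0, nan, 3.0]

# rank-over-groups maps Polars' `descending` flag onto pandas' `ascending`:
ranked = df.groupby("g", as_index=False)["x"].rank(method="average", ascending=True)
print(ranked.tolist())  # [1.0, 2.0, 1.0, 2.0]
```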
8 changes: 1 addition & 7 deletions narwhals/_pandas_like/namespace.py
@@ -503,15 +503,9 @@ def __init__(

def __call__(self, df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]:
from narwhals._expression_parsing import parse_into_expr
from narwhals._pandas_like.namespace import PandasLikeNamespace
from narwhals._pandas_like.utils import broadcast_align_and_extract_native

plx = PandasLikeNamespace(
implementation=self._implementation,
backend_version=self._backend_version,
version=self._version,
)

plx = df.__narwhals_namespace__()
condition = parse_into_expr(self._condition, namespace=plx)(df)[0]
try:
value_series = parse_into_expr(self._then_value, namespace=plx)(df)[0]
8 changes: 2 additions & 6 deletions narwhals/_spark_like/dataframe.py
@@ -104,20 +104,16 @@ def select(
return self._from_native_frame(self._native_frame.select(*new_columns_list))

def filter(self, *predicates: SparkLikeExpr) -> Self:
from narwhals._spark_like.namespace import SparkLikeNamespace

if (
len(predicates) == 1
and isinstance(predicates[0], list)
and all(isinstance(x, bool) for x in predicates[0])
):
msg = "`LazyFrame.filter` is not supported for PySpark backend with boolean masks."
raise NotImplementedError(msg)
plx = SparkLikeNamespace(
backend_version=self._backend_version, version=self._version
)
plx = self.__narwhals_namespace__()
expr = plx.all_horizontal(*predicates)
# Safety: all_horizontal's expression only returns a single column.
# `[0]` is safe as all_horizontal's expression only returns a single column
condition = expr._call(self)[0]
spark_df = self._native_frame.where(condition)
return self._from_native_frame(spark_df)
12 changes: 12 additions & 0 deletions narwhals/series.py
@@ -843,6 +843,9 @@ def any(self) -> Any:
Notes:
Only works on Series of data type Boolean.
Returns:
A boolean indicating if any values in the Series are True.
Examples:
>>> import pandas as pd
>>> import polars as pl
@@ -870,6 +873,9 @@ def any(self) -> Any:
def all(self) -> Any:
"""Return whether all values in the Series are True.
Returns:
A boolean indicating if all values in the Series are True.
Examples:
>>> import pandas as pd
>>> import polars as pl
@@ -1334,6 +1340,9 @@ def drop_nulls(self) -> Self:
def abs(self) -> Self:
"""Calculate the absolute value of each element.
Returns:
A new Series with the absolute values of the original elements.
Examples:
>>> import pandas as pd
>>> import polars as pl
@@ -1631,6 +1640,9 @@ def alias(self, name: str) -> Self:
Arguments:
name: The new name.
Returns:
A new Series with the updated name.
Examples:
>>> import pandas as pd
>>> import polars as pl
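The docstring additions above document existing behaviour rather than changing it; for reference, a tiny sketch of the four methods touched, assuming pandas is installed:

```python
import pandas as pd
import narwhals as nw

bools = nw.from_native(pd.Series([True, False, True]), series_only=True)
print(bools.any())  # True
print(bools.all())  # False

nums = nw.from_native(pd.Series([1, -2, 3], name="a"), series_only=True)
print(nums.abs().to_native().tolist())  # [1, 2, 3]
print(nums.alias("b").name)  # b
```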
13 changes: 8 additions & 5 deletions pyproject.toml
@@ -34,18 +34,21 @@ dev = [
"duckdb",
"pandas",
"polars",
"pre-commit",
"pyarrow",
"pyspark; python_version >= '3.9' and python_version < '3.12'",
"pyarrow-stubs",
"pytest",
"pytest-cov",
"pytest-randomly",
"pytest-env",
"hypothesis",
"hypothesis[numpy]",
"scikit-learn",
"typing_extensions",
]
extra = [ # heavier dependencies we don't necessarily need in every testing job
"scikit-learn",
"pyspark; python_version >= '3.9' and python_version < '3.12'",
"dask[dataframe]; python_version >= '3.9'",
"modin",
]
docs = [
"black", # required by mkdocstrings_handlers
@@ -175,6 +178,7 @@ env = [
plugins = ["covdefaults"]

[tool.coverage.report]
fail_under = 80 # This is just for local development, in CI we set it to 100
omit = [
'narwhals/typing.py',
'narwhals/stable/v1/typing.py',
@@ -183,14 +187,13 @@ omit = [
'narwhals/_ibis/*',
# the latest pyspark (3.5) doesn't officially support Python 3.12 and 3.13
'narwhals/_spark_like/*',
# we don't run these in every environment
'tests/spark_like_test.py',
]
exclude_also = [
"if sys.version_info() <",
"if (:?self._)?implementation is Implementation.MODIN",
"if .*implementation is Implementation.CUDF",
'request.applymarker\(pytest.mark.xfail',
'\w+._backend_version < ',
'backend_version <',
'if "cudf" in str\(constructor'
]
9 changes: 1 addition & 8 deletions tests/dtypes_test.py
@@ -203,7 +203,7 @@ def test_pandas_fixed_offset_1302() -> None:
def test_huge_int() -> None:
df = pl.DataFrame({"a": [1, 2, 3]})
if POLARS_VERSION >= (1, 18): # pragma: no cover
result = nw.from_native(df).schema
result = nw.from_native(df.select(pl.col("a").cast(pl.Int128))).schema
assert result["a"] == nw.Int128
else: # pragma: no cover
# Int128 was not available yet
@@ -221,13 +221,6 @@ def test_huge_int() -> None:
result = nw.from_native(rel).schema
assert result["a"] == nw.UInt128

if POLARS_VERSION >= (1, 18): # pragma: no cover
result = nw.from_native(df).schema
assert result["a"] == nw.UInt128
else: # pragma: no cover
# UInt128 was not available yet
pass

# TODO(unassigned): once other libraries support Int128/UInt128,
# add tests for them too

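The test fix above matters because a plain integer column defaults to `Int64`; `Int128` only appears after an explicit cast, and only on Polars >= 1.18, where the dtype was introduced. A hedged sketch:

```python
import narwhals as nw
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})
print(nw.from_native(df).schema["a"])  # Int64 -- not Int128

cast_df = df.select(pl.col("a").cast(pl.Int128))  # requires Polars >= 1.18
print(nw.from_native(cast_df).schema["a"])  # Int128
```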