narwhals-dev · MarcoGorelli · Jan 3, 2025 · Dec 19, 2024 · Dec 20, 2024 · Dec 27, 2024
diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md
@@ -32,6 +32,7 @@
         - is_first_distinct
         - is_in
         - is_last_distinct
+        - is_nan
         - is_null
         - is_unique
         - len

diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
@@ -38,6 +38,7 @@
         - is_first_distinct
         - is_in
         - is_last_distinct
+        - is_nan
         - is_null
         - is_sorted
         - is_unique

diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
@@ -312,6 +312,9 @@ def null_count(self: Self) -> Self:
     def is_null(self: Self) -> Self:
         return reuse_series_implementation(self, "is_null")
 
+    def is_nan(self: Self) -> Self:
+        return reuse_series_implementation(self, "is_nan")  # reuses series-level logic
+
     def is_between(self: Self, lower_bound: Any, upper_bound: Any, closed: str) -> Self:
         return reuse_series_implementation(
             self, "is_between", lower_bound, upper_bound, closed

diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
@@ -514,6 +514,12 @@ def is_null(self: Self) -> Self:
         ser = self._native_series
         return self._from_native_series(ser.is_null())
 
+    def is_nan(self: Self) -> Self:
+        """Flag NaN values using `pyarrow.compute.is_nan`."""
+        import pyarrow.compute as pc
+
+        return self._from_native_series(pc.is_nan(self._native_series))
+
     def cast(self: Self, dtype: DType) -> Self:
         import pyarrow.compute as pc
 

diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py
@@ -15,6 +15,7 @@
 from narwhals._pandas_like.utils import calculate_timestamp_datetime
 from narwhals._pandas_like.utils import native_to_narwhals_dtype
 from narwhals.exceptions import ColumnNotFoundError
+from narwhals.exceptions import InvalidOperationError
 from narwhals.typing import CompliantExpr
 from narwhals.utils import Implementation
 from narwhals.utils import generate_temporary_column_name
@@ -689,6 +690,20 @@ def is_null(self: Self) -> Self:
             returns_scalar=False,
         )
 
+    def is_nan(self: Self) -> Self:
+        """Flag NaN values; raise `InvalidOperationError` for non-numeric dtypes."""
+
+        def func(_input: dask_expr.Series) -> dask_expr.Series:
+            dtype = native_to_narwhals_dtype(_input, self._version, Implementation.DASK)
+            if dtype.is_numeric():
+                # NaN is the only float value that is not equal to itself.
+                return _input != _input  # noqa: PLR0124
+            msg = f"`is_nan` is not supported for dtype {dtype}"
+            raise InvalidOperationError(msg)
+
+        # The label passed to `_from_call` must match this method's name.
+        return self._from_call(func, "is_nan", returns_scalar=False)
+
     def len(self: Self) -> Self:
         return self._from_call(
             lambda _input: _input.size,

diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
@@ -306,6 +306,9 @@ def is_between(
     def is_null(self) -> Self:
         return reuse_series_implementation(self, "is_null")
 
+    def is_nan(self) -> Self:
+        return reuse_series_implementation(self, "is_nan")  # reuses series-level logic
+
     def fill_null(
         self,
         value: Any | None = None,

diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
@@ -20,6 +20,7 @@
 from narwhals._pandas_like.utils import set_axis
 from narwhals._pandas_like.utils import to_datetime
 from narwhals.dependencies import is_numpy_scalar
+from narwhals.exceptions import InvalidOperationError
 from narwhals.typing import CompliantSeries
 from narwhals.utils import Implementation
 from narwhals.utils import import_dtypes_module
@@ -614,8 +615,6 @@ def mean(self) -> Any:
         return ser.mean()
 
     def median(self) -> Any:
-        from narwhals.exceptions import InvalidOperationError
-
         if not self.dtype.is_numeric():
             msg = "`median` operation not supported for non-numeric input type."
             raise InvalidOperationError(msg)
@@ -654,6 +653,13 @@ def is_null(self) -> PandasLikeSeries:
         ser = self._native_series
         return self._from_native_series(ser.isna())
 
+    def is_nan(self) -> PandasLikeSeries:
+        if not self.dtype.is_numeric():  # NaN is only defined for numeric dtypes here
+            msg = f"`is_nan` is not supported for dtype {self.dtype}"
+            raise InvalidOperationError(msg)
+        native = self._native_series  # `x != x` is True only where x is NaN
+        return self._from_native_series(native != native)  # noqa: PLR0124
+
     def fill_null(
         self,
         value: Any | None = None,

diff --git a/narwhals/expr.py b/narwhals/expr.py
@@ -1921,6 +1921,73 @@ def is_null(self) -> Self:
         """
         return self.__class__(lambda plx: self._to_compliant_expr(plx).is_null())
 
+    def is_nan(self) -> Self:
+        """Indicate which values are NaN.
+
+        Returns:
+            A new expression.
+
+        Notes:
+            pandas, Polars and PyArrow handle null values differently. Polars and PyArrow
+            distinguish between NaN and Null, whereas pandas' non-nullable dtypes don't.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoFrameT
+            >>> data = {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
+            >>> df_pd = pd.DataFrame(data).astype({"a": "Int64"})
+            >>> df_pl = pl.DataFrame(data)
+            >>> df_pa = pa.table(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def agnostic_is_nan_columns(df_native: IntoFrameT) -> IntoFrameT:
+            ...     df = nw.from_native(df_native)
+            ...     return df.with_columns(
+            ...         a_is_nan=nw.col("a").is_nan(), b_is_nan=nw.col("b").is_nan()
+            ...     ).to_native()
+
+            We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_is_nan_columns`:
+
+            >>> agnostic_is_nan_columns(df_pd)
+                  a    b  a_is_nan  b_is_nan
+            0     2  2.0     False     False
+            1     4  4.0     False     False
+            2  <NA>  NaN      <NA>      True
+            3     3  3.0     False     False
+            4     5  5.0     False     False
+
+            >>> agnostic_is_nan_columns(df_pl)  # nan != null for polars
+            shape: (5, 4)
+            ┌──────┬─────┬──────────┬──────────┐
+            │ a    ┆ b   ┆ a_is_nan ┆ b_is_nan │
+            │ ---  ┆ --- ┆ ---      ┆ ---      │
+            │ i64  ┆ f64 ┆ bool     ┆ bool     │
+            ╞══════╪═════╪══════════╪══════════╡
+            │ 2    ┆ 2.0 ┆ false    ┆ false    │
+            │ 4    ┆ 4.0 ┆ false    ┆ false    │
+            │ null ┆ NaN ┆ false    ┆ true     │
+            │ 3    ┆ 3.0 ┆ false    ┆ false    │
+            │ 5    ┆ 5.0 ┆ false    ┆ false    │
+            └──────┴─────┴──────────┴──────────┘
+
+            >>> agnostic_is_nan_columns(df_pa)  # nan != null for pyarrow
+            pyarrow.Table
+            a: int64
+            b: double
+            a_is_nan: bool
+            b_is_nan: bool
+            ----
+            a: [[2,4,null,3,5]]
+            b: [[2,4,nan,3,5]]
+            a_is_nan: [[false,false,null,false,false]]
+            b_is_nan: [[false,false,true,false,false]]
+        """
+        return self.__class__(lambda plx: self._to_compliant_expr(plx).is_nan())
+
     def arg_true(self) -> Self:
         """Find elements where boolean expression is True.
 

diff --git a/narwhals/series.py b/narwhals/series.py
@@ -1915,6 +1915,57 @@ def is_null(self) -> Self:
         """
         return self._from_compliant_series(self._compliant_series.is_null())
 
+    def is_nan(self) -> Self:
+        """Returns a boolean Series indicating which values are NaN.
+
+        Notes:
+            pandas, Polars and PyArrow handle null values differently. Polars and PyArrow
+            distinguish between NaN and Null, whereas pandas' non-nullable dtypes don't.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+            >>> s = [1.0, 2.0, float("nan")]
+            >>> s_pd = pd.Series(s, dtype="float64")
+            >>> s_pl = pl.Series(s)
+            >>> s_pa = pa.chunked_array([s], type=pa.float64())
+
+            We define a series-agnostic function:
+
+            >>> def agnostic_is_nan_series(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.is_nan().to_native()
+
+            We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_is_nan_series`:
+
+            >>> agnostic_is_nan_series(s_pd)
+            0    False
+            1    False
+            2     True
+            dtype: bool
+            >>> agnostic_is_nan_series(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (3,)
+            Series: '' [bool]
+            [
+               false
+               false
+               true
+            ]
+            >>> agnostic_is_nan_series(s_pa)  # doctest: +NORMALIZE_WHITESPACE
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                false,
+                false,
+                true
+              ]
+            ]
+        """
+        return self._from_compliant_series(self._compliant_series.is_nan())
+
     def fill_null(
         self,
         value: Any | None = None,

diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from narwhals.exceptions import InvalidOperationError
+from tests.conftest import dask_lazy_p2_constructor
+from tests.conftest import pandas_constructor
+from tests.utils import Constructor
+from tests.utils import ConstructorEager
+from tests.utils import assert_equal_data
+
+NON_NULLABLE_CONSTRUCTORS = [pandas_constructor, dask_lazy_p2_constructor]
+
+
+def test_nan(constructor: Constructor) -> None:
+    # Self-division turns 0 into NaN (0 / 0) and 1 into 1.0.
+    frame = nw.from_native(constructor({"a": [0, 1, None]}))
+    result = frame.select(nw.col("a") / nw.col("a")).select(nw.col("a").is_nan())
+    if any(constructor is c for c in NON_NULLABLE_CONSTRUCTORS):
+        # Null values are coerced to NaN for non-nullable datatypes
+        expected = {"a": [True, False, True]}
+    else:
+        # Null are preserved and should be differentiated for nullable datatypes
+        expected = {"a": [True, False, None]}  # type: ignore[list-item]
+
+    assert_equal_data(result, expected)
+
+
+def test_nan_series(constructor_eager: ConstructorEager) -> None:
+    # Self-division turns 0 into NaN (0 / 0) and 1 into 1.0.
+    native = constructor_eager({"a": [0, 1, None]})
+    ratio = nw.col("a") / nw.col("a")
+    series = nw.from_native(native, eager_only=True).select(ratio)["a"]
+    result = {"a": series.is_nan()}
+    if any(constructor_eager is c for c in NON_NULLABLE_CONSTRUCTORS):
+        # Null values are coerced to NaN for non-nullable datatypes
+        expected = {"a": [True, False, True]}
+    else:
+        # Null are preserved for nullable datatypes
+        expected = {"a": [True, False, None]}  # type: ignore[list-item]
+
+    assert_equal_data(result, expected)
+
+
+def test_nan_non_float() -> None:
+    # String columns are not numeric, so `is_nan` must be rejected.
+    data = {"a": ["0", "1"]}
+    # Only the pandas and Dask backends are exercised here, in that order.
+    for constructor in (pandas_constructor, dask_lazy_p2_constructor):
+        df = nw.from_native(constructor(data))
+        # `select` alone is expected to raise; no explicit compute/collect follows.
+        with pytest.raises(InvalidOperationError, match="not supported"):
+            df.select(nw.col("a").is_nan())
-Original file line number
+Diff line change
@@ Expand Up / @@ -32,6 +32,7 @@ @@
             - is_first_distinct
             - is_in
             - is_last_distinct
+            - is_nan
             - is_null
             - is_unique
             - len
@@ Expand Down @@