Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add is_nan expression & series method #1625

Merged
merged 19 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- is_first_distinct
- is_in
- is_last_distinct
- is_nan
- is_null
- is_unique
- len
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
- is_first_distinct
- is_in
- is_last_distinct
- is_nan
- is_null
- is_sorted
- is_unique
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,9 @@ def null_count(self: Self) -> Self:
def is_null(self: Self) -> Self:
    """Build the `is_null` expression.

    Forwards the method name `"is_null"` to `reuse_series_implementation`.
    """
    method_name = "is_null"
    return reuse_series_implementation(self, method_name)

def is_nan(self: Self) -> Self:
    """Build the `is_nan` expression.

    Forwards the method name `"is_nan"` to `reuse_series_implementation`.
    """
    method_name = "is_nan"
    return reuse_series_implementation(self, method_name)

def is_between(self: Self, lower_bound: Any, upper_bound: Any, closed: str) -> Self:
return reuse_series_implementation(
self, "is_between", lower_bound, upper_bound, closed
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,12 @@ def is_null(self: Self) -> Self:
ser = self._native_series
return self._from_native_series(ser.is_null())

def is_nan(self: Self) -> Self:
    """Return a new series holding the result of `pyarrow.compute.is_nan`.

    The native series is passed to `pc.is_nan` and the result is wrapped with
    `_from_native_series`.
    """
    import pyarrow.compute as pc

    nan_mask = pc.is_nan(self._native_series)
    return self._from_native_series(nan_mask)

def cast(self: Self, dtype: DType) -> Self:
import pyarrow.compute as pc

Expand Down
15 changes: 15 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from narwhals._pandas_like.utils import calculate_timestamp_datetime
from narwhals._pandas_like.utils import native_to_narwhals_dtype
from narwhals.exceptions import ColumnNotFoundError
from narwhals.exceptions import InvalidOperationError
from narwhals.typing import CompliantExpr
from narwhals.utils import Implementation
from narwhals.utils import generate_temporary_column_name
Expand Down Expand Up @@ -689,6 +690,20 @@ def is_null(self: Self) -> Self:
returns_scalar=False,
)

def is_nan(self: Self) -> Self:
    """Build an expression that flags NaN values.

    Returns:
        The result of `_from_call` for a callable that compares the input
        series with itself, registered under the expression name `"is_nan"`.

    Raises:
        InvalidOperationError: raised from the wrapped callable when the
            input's dtype is not numeric.
    """

    def func(_input: dask_expr.Series) -> dask_expr.Series:
        dtype = native_to_narwhals_dtype(_input, self._version, Implementation.DASK)
        if dtype.is_numeric():
            # Self-inequality is the NaN test: under IEEE 754, NaN != NaN.
            return _input != _input  # noqa: PLR0124
        msg = f"`is_nan` is not supported for dtype {dtype}"
        raise InvalidOperationError(msg)

    return self._from_call(
        func,
        # Must name this operation; "is_null" here was a copy-paste slip.
        "is_nan",
        returns_scalar=False,
    )

def len(self: Self) -> Self:
return self._from_call(
lambda _input: _input.size,
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,9 @@ def is_between(
def is_null(self) -> Self:
    """Build the `is_null` expression.

    Forwards the method name `"is_null"` to `reuse_series_implementation`.
    """
    method_name = "is_null"
    return reuse_series_implementation(self, method_name)

def is_nan(self) -> Self:
    """Build the `is_nan` expression.

    Forwards the method name `"is_nan"` to `reuse_series_implementation`.
    """
    method_name = "is_nan"
    return reuse_series_implementation(self, method_name)

def fill_null(
self,
value: Any | None = None,
Expand Down
10 changes: 8 additions & 2 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from narwhals._pandas_like.utils import set_axis
from narwhals._pandas_like.utils import to_datetime
from narwhals.dependencies import is_numpy_scalar
from narwhals.exceptions import InvalidOperationError
from narwhals.typing import CompliantSeries
from narwhals.utils import Implementation
from narwhals.utils import import_dtypes_module
Expand Down Expand Up @@ -614,8 +615,6 @@ def mean(self) -> Any:
return ser.mean()

def median(self) -> Any:
from narwhals.exceptions import InvalidOperationError

if not self.dtype.is_numeric():
msg = "`median` operation not supported for non-numeric input type."
raise InvalidOperationError(msg)
Expand Down Expand Up @@ -654,6 +653,13 @@ def is_null(self) -> PandasLikeSeries:
ser = self._native_series
return self._from_native_series(ser.isna())

def is_nan(self) -> PandasLikeSeries:
ser = self._native_series
if self.dtype.is_numeric():
return self._from_native_series(ser != ser) # noqa: PLR0124
msg = f"`is_nan` is not supported for dtype {self.dtype}"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this error message be extended to include a suggestion?

"is_nan is not supported for dtype {self.dtype}, did you mean to use is_null?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lots of love for kind error messages 💚

raise InvalidOperationError(msg)

def fill_null(
self,
value: Any | None = None,
Expand Down
67 changes: 67 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1921,6 +1921,73 @@ def is_null(self) -> Self:
"""
return self.__class__(lambda plx: self._to_compliant_expr(plx).is_null())

def is_nan(self) -> Self:
    """Indicate which values are NaN.

    Returns:
        A new expression.

    Notes:
        pandas, Polars and PyArrow handle null values differently. Polars and
        PyArrow distinguish between NaN and null, whereas pandas does not for
        its non-nullable dtypes, where null values are coerced to NaN.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoFrameT
        >>> data = {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]}
        >>> df_pd = pd.DataFrame(data).astype({"a": "Int64"})
        >>> df_pl = pl.DataFrame(data)
        >>> df_pa = pa.table(data)

        Let's define a dataframe-agnostic function:

        >>> def agnostic_is_nan_columns(df_native: IntoFrameT) -> IntoFrameT:
        ...     df = nw.from_native(df_native)
        ...     return df.with_columns(
        ...         a_is_nan=nw.col("a").is_nan(), b_is_nan=nw.col("b").is_nan()
        ...     ).to_native()

        We can then pass any supported library such as pandas, Polars, or PyArrow
        to `agnostic_is_nan_columns`:

        >>> agnostic_is_nan_columns(df_pd)
              a    b  a_is_nan  b_is_nan
        0     2  2.0     False     False
        1     4  4.0     False     False
        2  <NA>  NaN      <NA>      True
        3     3  3.0     False     False
        4     5  5.0     False     False

        >>> agnostic_is_nan_columns(df_pl)  # nan != null for polars
        shape: (5, 4)
        ┌──────┬─────┬──────────┬──────────┐
        │ a    ┆ b   ┆ a_is_nan ┆ b_is_nan │
        │ ---  ┆ --- ┆ ---      ┆ ---      │
        │ i64  ┆ f64 ┆ bool     ┆ bool     │
        ╞══════╪═════╪══════════╪══════════╡
        │ 2    ┆ 2.0 ┆ false    ┆ false    │
        │ 4    ┆ 4.0 ┆ false    ┆ false    │
        │ null ┆ NaN ┆ false    ┆ true     │
        │ 3    ┆ 3.0 ┆ false    ┆ false    │
        │ 5    ┆ 5.0 ┆ false    ┆ false    │
        └──────┴─────┴──────────┴──────────┘

        >>> agnostic_is_nan_columns(df_pa)  # nan != null for pyarrow
        pyarrow.Table
        a: int64
        b: double
        a_is_nan: bool
        b_is_nan: bool
        ----
        a: [[2,4,null,3,5]]
        b: [[2,4,nan,3,5]]
        a_is_nan: [[false,false,null,false,false]]
        b_is_nan: [[false,false,true,false,false]]
    """
    return self.__class__(lambda plx: self._to_compliant_expr(plx).is_nan())

def arg_true(self) -> Self:
"""Find elements where boolean expression is True.

Expand Down
51 changes: 51 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1915,6 +1915,57 @@ def is_null(self) -> Self:
"""
return self._from_compliant_series(self._compliant_series.is_null())

def is_nan(self) -> Self:
    """Return a boolean Series indicating which values are NaN.

    Returns:
        A new Series, wrapping the result of the compliant series' `is_nan`.

    Notes:
        pandas, Polars and PyArrow handle null values differently. Polars and
        PyArrow distinguish between NaN and null, whereas pandas does not for
        its non-nullable dtypes, where null values are coerced to NaN.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoSeriesT
        >>> s = [1.0, 2.0, float("nan")]
        >>> s_pd = pd.Series(s, dtype="float64")
        >>> s_pl = pl.Series(s)
        >>> s_pa = pa.chunked_array([s], type=pa.float64())

        We define a series-agnostic function:

        >>> def agnostic_is_nan_series(s_native: IntoSeriesT) -> IntoSeriesT:
        ...     s = nw.from_native(s_native, series_only=True)
        ...     return s.is_nan().to_native()

        We can then pass pandas, Polars, or PyArrow to `agnostic_is_nan_series`:

        >>> agnostic_is_nan_series(s_pd)
        0    False
        1    False
        2     True
        dtype: bool
        >>> agnostic_is_nan_series(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (3,)
        Series: '' [bool]
        [
           false
           false
           true
        ]
        >>> agnostic_is_nan_series(s_pa)  # doctest: +NORMALIZE_WHITESPACE
        <pyarrow.lib.ChunkedArray object at ...>
        [
          [
            false,
            false,
            true
          ]
        ]
    """
    return self._from_compliant_series(self._compliant_series.is_nan())

def fill_null(
self,
value: Any | None = None,
Expand Down
54 changes: 54 additions & 0 deletions tests/expr_and_series/is_nan_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from __future__ import annotations

import pytest

import narwhals.stable.v1 as nw
from narwhals.exceptions import InvalidOperationError
from tests.conftest import dask_lazy_p2_constructor
from tests.conftest import pandas_constructor
from tests.utils import Constructor
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

NON_NULLABLE_CONSTRUCTORS = [pandas_constructor, dask_lazy_p2_constructor]


def test_nan(constructor: Constructor) -> None:
    """Check `Expr.is_nan` on `a / a` for every constructor."""
    # Dividing the column by itself: the expected mask below marks the 0 / 0
    # entry as NaN and the 1 / 1 entry as not NaN.
    frame = nw.from_native(constructor({"a": [0, 1, None]}))
    ratios = frame.select(nw.col("a") / nw.col("a"))
    result = ratios.select(nw.col("a").is_nan())

    coerces_null_to_nan = any(constructor is c for c in NON_NULLABLE_CONSTRUCTORS)
    # Non-nullable datatypes coerce the null to NaN (flagged True); nullable
    # datatypes preserve the null, which must stay distinct from NaN.
    last_flag = True if coerces_null_to_nan else None
    expected = {"a": [True, False, last_flag]}

    assert_equal_data(result, expected)


def test_nan_series(constructor_eager: ConstructorEager) -> None:
    """Check `Series.is_nan` on `a / a` for every eager constructor."""
    native = constructor_eager({"a": [0, 1, None]})
    ratios = nw.from_native(native, eager_only=True).select(nw.col("a") / nw.col("a"))
    result = {"a": ratios["a"].is_nan()}

    coerces_null_to_nan = any(
        constructor_eager is c for c in NON_NULLABLE_CONSTRUCTORS
    )
    # Non-nullable datatypes coerce the null to NaN (flagged True); nullable
    # datatypes preserve the null.
    last_flag = True if coerces_null_to_nan else None
    expected = {"a": [True, False, last_flag]}

    assert_equal_data(result, expected)


def test_nan_non_float() -> None:
    """`is_nan` on a string column must raise for the pandas and Dask constructors."""
    data = {"a": ["0", "1"]}
    # Same order as before: pandas first, then Dask.
    for make_frame in (pandas_constructor, dask_lazy_p2_constructor):
        df = nw.from_native(make_frame(data))
        with pytest.raises(InvalidOperationError, match="not supported"):
            df.select(nw.col("a").is_nan())
Loading