Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add is_nan expression & series method #1625

Merged
merged 19 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/expr.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- is_first_distinct
- is_in
- is_last_distinct
- is_nan
- is_null
- is_unique
- len
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
- is_first_distinct
- is_in
- is_last_distinct
- is_nan
- is_null
- is_sorted
- is_unique
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_arrow/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,9 @@ def null_count(self: Self) -> Self:
def is_null(self: Self) -> Self:
    """Build the `is_null` expression.

    Delegates to `reuse_series_implementation`, passing the series-level
    method name "is_null".

    Returns:
        Whatever `reuse_series_implementation` produces for this expression.
    """
    method_name = "is_null"
    return reuse_series_implementation(self, method_name)

def is_nan(self: Self) -> Self:
    """Build the `is_nan` expression.

    Delegates to `reuse_series_implementation`, passing the series-level
    method name "is_nan".

    Returns:
        Whatever `reuse_series_implementation` produces for this expression.
    """
    method_name = "is_nan"
    return reuse_series_implementation(self, method_name)

def is_between(self: Self, lower_bound: Any, upper_bound: Any, closed: str) -> Self:
return reuse_series_implementation(
self,
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_arrow/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,11 @@ def is_null(self: Self) -> Self:
ser = self._native_series
return self._from_native_series(ser.is_null())

def is_nan(self: Self) -> Self:
    """Flag NaN entries of the wrapped PyArrow data.

    Returns:
        A new series wrapping the result of `pyarrow.compute.is_nan`
        applied to the native series.
    """
    import pyarrow.compute as pc

    nan_mask = pc.is_nan(self._native_series)
    return self._from_native_series(nan_mask)

def cast(self: Self, dtype: DType) -> Self:
import pyarrow.compute as pc

Expand Down
15 changes: 15 additions & 0 deletions narwhals/_dask/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from narwhals._pandas_like.utils import calculate_timestamp_datetime
from narwhals._pandas_like.utils import native_to_narwhals_dtype
from narwhals.exceptions import ColumnNotFoundError
from narwhals.exceptions import InvalidOperationError
from narwhals.typing import CompliantExpr
from narwhals.utils import Implementation
from narwhals.utils import generate_temporary_column_name
Expand Down Expand Up @@ -706,6 +707,20 @@ def is_null(self: Self) -> Self:
returns_scalar=self._returns_scalar,
)

def is_nan(self: Self) -> Self:
    """Flag NaN values of a numeric Dask-backed expression.

    Returns:
        A new expression created through `_from_call`, registered under the
        function name "is_nan".

    Raises:
        InvalidOperationError: Raised by the inner function when the input
            dtype is not numeric.
    """

    def func(_input: dask_expr.Series) -> dask_expr.Series:
        dtype = native_to_narwhals_dtype(_input, self._version, self._implementation)
        if not dtype.is_numeric():
            msg = f"`.is_nan` only supported for numeric dtypes and not {dtype}, did you mean `.is_null`?"
            raise InvalidOperationError(msg)
        # NaN compares unequal to itself, so self-inequality marks NaN entries.
        return _input != _input  # noqa: PLR0124

    # The function name must be "is_nan"; the original passed "is_null",
    # a copy-paste slip from the neighbouring method.
    return self._from_call(
        func,
        "is_nan",
        returns_scalar=self._returns_scalar,
    )

def len(self: Self) -> Self:
return self._from_call(
lambda _input: _input.size,
Expand Down
3 changes: 3 additions & 0 deletions narwhals/_pandas_like/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,9 @@ def is_between(
def is_null(self) -> Self:
    """Build the `is_null` expression.

    Delegates to `reuse_series_implementation`, passing the series-level
    method name "is_null".

    Returns:
        Whatever `reuse_series_implementation` produces for this expression.
    """
    method_name = "is_null"
    return reuse_series_implementation(self, method_name)

def is_nan(self) -> Self:
    """Build the `is_nan` expression.

    Delegates to `reuse_series_implementation`, passing the series-level
    method name "is_nan".

    Returns:
        Whatever `reuse_series_implementation` produces for this expression.
    """
    method_name = "is_nan"
    return reuse_series_implementation(self, method_name)

def fill_null(
self,
value: Any | None = None,
Expand Down
10 changes: 8 additions & 2 deletions narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from narwhals._pandas_like.utils import set_index
from narwhals._pandas_like.utils import to_datetime
from narwhals.dependencies import is_numpy_scalar
from narwhals.exceptions import InvalidOperationError
from narwhals.typing import CompliantSeries
from narwhals.utils import Implementation
from narwhals.utils import import_dtypes_module
Expand Down Expand Up @@ -623,8 +624,6 @@ def mean(self) -> Any:
return ser.mean()

def median(self) -> Any:
from narwhals.exceptions import InvalidOperationError

if not self.dtype.is_numeric():
msg = "`median` operation not supported for non-numeric input type."
raise InvalidOperationError(msg)
Expand Down Expand Up @@ -663,6 +662,13 @@ def is_null(self) -> PandasLikeSeries:
ser = self._native_series
return self._from_native_series(ser.isna())

def is_nan(self) -> PandasLikeSeries:
    """Flag NaN entries of a numeric series.

    Returns:
        A new series wrapping the element-wise self-inequality of the
        native series.

    Raises:
        InvalidOperationError: If the series dtype is not numeric.
    """
    if not self.dtype.is_numeric():
        msg = f"`.is_nan` only supported for numeric dtype and not {self.dtype}, did you mean `.is_null`?"
        raise InvalidOperationError(msg)

    native = self._native_series
    # NaN compares unequal to itself, so self-inequality marks NaN entries.
    nan_mask = native != native  # noqa: PLR0124
    return self._from_native_series(nan_mask)

def fill_null(
self,
value: Any | None = None,
Expand Down
9 changes: 9 additions & 0 deletions narwhals/_polars/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,15 @@ def ewm_mean(
)
return self._from_native_expr(native_expr)

def is_nan(self: Self) -> Self:
    """Flag NaN values of the wrapped Polars expression.

    Returns:
        A new expression wrapping the native `is_nan` result. For backend
        versions below (1, 18) the result is additionally gated on
        `is_not_null()` through `pl.when(...).then(...)`.
    """
    native = self._native_expr
    if self._backend_version < (1, 18):  # pragma: no cover
        import polars as pl

        # NOTE(review): presumably older Polars did not yield null for null
        # inputs, hence the explicit not-null gate - confirm against the
        # Polars changelog.
        not_null = native.is_not_null()
        gated = pl.when(not_null).then(native.is_nan())
        return self._from_native_expr(gated)
    return self._from_native_expr(native.is_nan())

def rolling_var(
self: Self,
window_size: int,
Expand Down
13 changes: 13 additions & 0 deletions narwhals/_polars/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,19 @@ def __rpow__(self: Self, other: PolarsSeries | Any) -> Self:
def __invert__(self: Self) -> Self:
    """Apply `__invert__` to the native series and re-wrap the result.

    Returns:
        A new series built from the inverted native series.
    """
    inverted = self._native_series.__invert__()
    return self._from_native_series(inverted)

def is_nan(self: Self) -> Self:
    """Flag NaN values of the wrapped Polars series.

    Returns:
        A new series wrapping the native `is_nan` result. For backend
        versions below (1, 18) the result is computed through
        `pl.select(pl.when(...).then(...))`, gated on `is_not_null()`.
    """
    native = self._native_series

    if self._backend_version < (1, 18):  # pragma: no cover
        # Imported only on the fallback path that actually needs it, the
        # same way the expression-level `is_nan` does.
        import polars as pl

        # NOTE(review): presumably older Polars did not yield null for null
        # inputs, hence the explicit not-null gate - confirm against the
        # Polars changelog.
        gated = pl.select(pl.when(native.is_not_null()).then(native.is_nan()))
        return self._from_native_series(gated[native.name])
    return self._from_native_series(native.is_nan())

def median(self: Self) -> Any:
from narwhals.exceptions import InvalidOperationError

Expand Down
64 changes: 64 additions & 0 deletions narwhals/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1925,6 +1925,70 @@ def is_null(self) -> Self:
"""
return self.__class__(lambda plx: self._to_compliant_expr(plx).is_null())

def is_nan(self) -> Self:
    """Indicate which values are NaN.

    Returns:
        A new expression.

    Notes:
        pandas handles null values differently from Polars and PyArrow.
        See [null_handling](../pandas_like_concepts/null_handling.md/)
        for reference.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoFrameT
        >>> data = {"orig": [0.0, None, 2.0]}
        >>> df_pd = pd.DataFrame(data).astype({"orig": "Float64"})
        >>> df_pl = pl.DataFrame(data)
        >>> df_pa = pa.table(data)

        Let's define a dataframe-agnostic function:

        >>> def agnostic_self_div_is_nan(df_native: IntoFrameT) -> IntoFrameT:
        ...     df = nw.from_native(df_native)
        ...     return df.with_columns(
        ...         divided=nw.col("orig") / nw.col("orig"),
        ...         divided_is_nan=(nw.col("orig") / nw.col("orig")).is_nan(),
        ...     ).to_native()

        We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_self_div_is_nan`:

        >>> print(agnostic_self_div_is_nan(df_pd))
           orig  divided  divided_is_nan
        0   0.0      NaN            True
        1  <NA>     <NA>            <NA>
        2   2.0      1.0           False

        >>> print(agnostic_self_div_is_nan(df_pl))
        shape: (3, 3)
        ┌──────┬─────────┬────────────────┐
        │ orig ┆ divided ┆ divided_is_nan │
        │ ---  ┆ ---     ┆ ---            │
        │ f64  ┆ f64     ┆ bool           │
        ╞══════╪═════════╪════════════════╡
        │ 0.0  ┆ NaN     ┆ true           │
        │ null ┆ null    ┆ null           │
        │ 2.0  ┆ 1.0     ┆ false          │
        └──────┴─────────┴────────────────┘

        >>> print(agnostic_self_div_is_nan(df_pa))
        pyarrow.Table
        orig: double
        divided: double
        divided_is_nan: bool
        ----
        orig: [[0,null,2]]
        divided: [[nan,null,1]]
        divided_is_nan: [[true,null,false]]

    """
    # Defer to the backend: the compliant expression for the namespace `plx`
    # supplies the actual `is_nan` implementation.
    return self.__class__(lambda plx: self._to_compliant_expr(plx).is_nan())

def arg_true(self) -> Self:
"""Find elements where boolean expression is True.

Expand Down
53 changes: 53 additions & 0 deletions narwhals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2437,6 +2437,59 @@ def is_null(self) -> Self:
"""
return self._from_compliant_series(self._compliant_series.is_null())

def is_nan(self) -> Self:
    """Return a boolean Series indicating which values are NaN.

    Returns:
        A boolean Series indicating which values are NaN.

    Notes:
        pandas handles null values differently from Polars and PyArrow.
        See [null_handling](../pandas_like_concepts/null_handling.md/)
        for reference.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoSeriesT

        >>> data = [0.0, None, 2.0]
        >>> s_pd = pd.Series(data, dtype="Float64")
        >>> s_pl = pl.Series(data)
        >>> s_pa = pa.chunked_array([data], type=pa.float64())

        >>> def agnostic_is_nan(s_native: IntoSeriesT) -> IntoSeriesT:
        ...     s = nw.from_native(s_native, series_only=True)
        ...     return s.is_nan().to_native()

        >>> print(agnostic_is_nan(s_pd))
        0    False
        1     <NA>
        2    False
        dtype: boolean

        >>> print(agnostic_is_nan(s_pl))  # doctest: +NORMALIZE_WHITESPACE
        shape: (3,)
        Series: '' [bool]
        [
           false
           null
           false
        ]

        >>> print(agnostic_is_nan(s_pa))  # doctest: +NORMALIZE_WHITESPACE
        [
          [
            false,
            null,
            false
          ]
        ]
    """
    # Defer to the backend-specific compliant series and re-wrap its result.
    return self._from_compliant_series(self._compliant_series.is_nan())

def fill_null(
self,
value: Any | None = None,
Expand Down
119 changes: 119 additions & 0 deletions tests/expr_and_series/is_nan_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from __future__ import annotations

from typing import Any

import pytest

import narwhals.stable.v1 as nw
from tests.conftest import dask_lazy_p1_constructor
from tests.conftest import dask_lazy_p2_constructor
from tests.conftest import modin_constructor
from tests.conftest import pandas_constructor
from tests.utils import Constructor
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

# Constructors for which the tests below expect null inputs to show up as NaN
# (i.e. `is_nan` returns True rather than null for a missing value).
NON_NULLABLE_CONSTRUCTORS = [
pandas_constructor,
dask_lazy_p1_constructor,
dask_lazy_p2_constructor,
modin_constructor,
]


def test_nan(constructor: Constructor) -> None:
    """Check `Expr.is_nan` on an int column, its float cast and a 0/0 column."""
    data_na = {"int": [0, 1, None]}
    df = nw.from_native(constructor(data_na)).with_columns(
        float=nw.col("int").cast(nw.Float64), float_na=nw.col("int") / nw.col("int")
    )
    column_names = ("int", "float", "float_na")
    result = df.select(**{name: nw.col(name).is_nan() for name in column_names})

    non_nullable = any(constructor is c for c in NON_NULLABLE_CONSTRUCTORS)
    # Non-nullable datatypes coerce null to NaN (-> True); nullable datatypes
    # preserve the null, which must stay distinguishable from NaN (-> None).
    missing = True if non_nullable else None
    expected: dict[str, list[Any]] = {
        "int": [False, False, missing],
        "float": [False, False, missing],
        "float_na": [True, False, missing],
    }

    assert_equal_data(result, expected)


def test_nan_series(constructor_eager: ConstructorEager) -> None:
    """Check `Series.is_nan` on an int column, its float cast and a 0/0 column."""
    data_na = {"int": [0, 1, None]}
    df = nw.from_native(constructor_eager(data_na), eager_only=True).with_columns(
        float=nw.col("int").cast(nw.Float64), float_na=nw.col("int") / nw.col("int")
    )

    column_names = ("int", "float", "float_na")
    result = {name: df[name].is_nan() for name in column_names}

    non_nullable = any(constructor_eager is c for c in NON_NULLABLE_CONSTRUCTORS)
    # Non-nullable datatypes coerce null to NaN (-> True); nullable datatypes
    # preserve the null, which must stay distinguishable from NaN (-> None).
    missing = True if non_nullable else None
    expected: dict[str, list[Any]] = {
        "int": [False, False, missing],
        "float": [False, False, missing],
        "float_na": [True, False, missing],
    }

    assert_equal_data(result, expected)


def test_nan_non_float(constructor: Constructor) -> None:
    """Check that `Expr.is_nan` on a string column raises a backend-specific error."""
    from polars.exceptions import InvalidOperationError as PlInvalidOperationError
    from pyarrow.lib import ArrowNotImplementedError

    from narwhals.exceptions import InvalidOperationError as NwInvalidOperationError

    df = nw.from_native(constructor({"a": ["x", "y"]}))

    # Polars and PyArrow raise their own native exceptions; every other
    # backend is expected to raise the narwhals error.
    constructor_name = str(constructor)
    exc: type[Exception]
    if "polars" in constructor_name:
        exc = PlInvalidOperationError
    elif "pyarrow_table" in constructor_name:
        exc = ArrowNotImplementedError
    else:
        exc = NwInvalidOperationError

    with pytest.raises(exc):
        df.select(nw.col("a").is_nan()).lazy().collect()


def test_nan_non_float_series(constructor_eager: ConstructorEager) -> None:
    """Check that `Series.is_nan` on a string column raises a backend-specific error."""
    from polars.exceptions import InvalidOperationError as PlInvalidOperationError
    from pyarrow.lib import ArrowNotImplementedError

    from narwhals.exceptions import InvalidOperationError as NwInvalidOperationError

    df = nw.from_native(constructor_eager({"a": ["x", "y"]}), eager_only=True)

    # Polars and PyArrow raise their own native exceptions; every other
    # backend is expected to raise the narwhals error.
    constructor_name = str(constructor_eager)
    exc: type[Exception]
    if "polars" in constructor_name:
        exc = PlInvalidOperationError
    elif "pyarrow_table" in constructor_name:
        exc = ArrowNotImplementedError
    else:
        exc = NwInvalidOperationError

    with pytest.raises(exc):
        df["a"].is_nan()
Loading