Skip to content

Commit

Permalink
Allow None when nan_as_null=False in column constructor (#15709)
Browse files Browse the repository at this point in the history
Fixes: #15708 

This PR fixes an issue where an error was raised when `None` is present and `nan_as_null=False`. The bug was caused by using `pd.isna`, which returns `True` for `nan`, `None`, and `pd.NA`, whereas this check should only detect `np.nan` and not `None` or `pd.NA`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)

URL: #15709
  • Loading branch information
galipremsagar authored May 15, 2024
1 parent 04d247c commit fa9d028
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 34 deletions.
22 changes: 20 additions & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1411,6 +1411,13 @@ def column_empty_like(
return column_empty(row_count, dtype, masked)


def _has_any_nan(arbitrary):
return any(
((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x))
for x in np.asarray(arbitrary)
)


def column_empty_like_same_mask(
column: ColumnBase, dtype: Dtype
) -> ColumnBase:
Expand Down Expand Up @@ -1948,9 +1955,20 @@ def as_column(
raise TypeError(
f"Cannot convert a {inferred_dtype} of object type"
)
elif nan_as_null is False and (
pd.isna(arbitrary).any()
elif inferred_dtype == "boolean":
if cudf.get_option("mode.pandas_compatible"):
if dtype != np.dtype("bool") or pd.isna(arbitrary).any():
raise MixedTypeError(
f"Cannot have mixed values with {inferred_dtype}"
)
elif nan_as_null is False and _has_any_nan(arbitrary):
raise MixedTypeError(
f"Cannot have mixed values with {inferred_dtype}"
)
elif (
nan_as_null is False
and inferred_dtype not in ("decimal", "empty")
and _has_any_nan(arbitrary)
):
# Decimal can hold float("nan")
# All np.nan is not restricted by type
Expand Down
38 changes: 11 additions & 27 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4008,44 +4008,28 @@ def test_diff(dtype, period, data_empty):

@pytest.mark.parametrize("df", _dataframe_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_dataframe_isnull_isna(df, nan_as_null):
if nan_as_null is False and (
df.select_dtypes(object).isna().any().any()
and not df.select_dtypes(object).isna().all().all()
):
with pytest.raises(MixedTypeError):
cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
else:
gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
@pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"])
def test_dataframe_isnull_isna_and_reverse(df, nan_as_null, api_call):
def detect_nan(x):
# Check if the input is a float and if it is nan
return x.apply(lambda v: isinstance(v, float) and np.isnan(v))

assert_eq(df.isnull(), gdf.isnull())
assert_eq(df.isna(), gdf.isna())

# Test individual columns
for col in df:
assert_eq(df[col].isnull(), gdf[col].isnull())
assert_eq(df[col].isna(), gdf[col].isna())


@pytest.mark.parametrize("df", _dataframe_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_dataframe_notna_notnull(df, nan_as_null):
nan_contains = df.select_dtypes(object).apply(detect_nan)
if nan_as_null is False and (
df.select_dtypes(object).isna().any().any()
and not df.select_dtypes(object).isna().all().all()
nan_contains.any().any() and not nan_contains.all().all()
):
with pytest.raises(MixedTypeError):
cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
else:
gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)

assert_eq(df.notnull(), gdf.notnull())
assert_eq(df.notna(), gdf.notna())
assert_eq(getattr(df, api_call)(), getattr(gdf, api_call)())

# Test individual columns
for col in df:
assert_eq(df[col].notnull(), gdf[col].notnull())
assert_eq(df[col].notna(), gdf[col].notna())
assert_eq(
getattr(df[col], api_call)(), getattr(gdf[col], api_call)()
)


def test_ndim():
Expand Down
23 changes: 18 additions & 5 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,8 +774,9 @@ def test_round_nan_as_null_false(series, decimal):
@pytest.mark.parametrize("ps", _series_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_series_isnull_isna(ps, nan_as_null):
nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
if nan_as_null is False and (
ps.isna().any() and not ps.isna().all() and ps.dtype == object
nan_contains.any() and not nan_contains.all() and ps.dtype == object
):
with pytest.raises(MixedTypeError):
cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
Expand All @@ -789,8 +790,9 @@ def test_series_isnull_isna(ps, nan_as_null):
@pytest.mark.parametrize("ps", _series_na_data())
@pytest.mark.parametrize("nan_as_null", [True, False, None])
def test_series_notnull_notna(ps, nan_as_null):
nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
if nan_as_null is False and (
ps.isna().any() and not ps.isna().all() and ps.dtype == object
nan_contains.any() and not nan_contains.all() and ps.dtype == object
):
with pytest.raises(MixedTypeError):
cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
Expand Down Expand Up @@ -2356,12 +2358,23 @@ def test_multi_dim_series_error():

def test_bool_series_mixed_dtype_error():
ps = pd.Series([True, False, None])
all_bool_ps = pd.Series([True, False, True], dtype="object")
# ps now has `object` dtype, which
# isn't supported by `cudf`.
with cudf.option_context("mode.pandas_compatible", True):
with pytest.raises(TypeError):
cudf.Series(ps)
with pytest.raises(TypeError):
cudf.from_pandas(ps)
with pytest.raises(TypeError):
cudf.Series(ps, dtype=bool)
expected = cudf.Series(all_bool_ps, dtype=bool)
assert_eq(expected, all_bool_ps.astype(bool))
nan_bools_mix = pd.Series([True, False, True, np.nan], dtype="object")
gs = cudf.Series(nan_bools_mix, nan_as_null=True)
assert_eq(gs.to_pandas(nullable=True), nan_bools_mix.astype("boolean"))
with pytest.raises(TypeError):
cudf.Series(ps, nan_as_null=False)
with pytest.raises(TypeError):
cudf.from_pandas(ps, nan_as_null=False)
cudf.Series(nan_bools_mix, nan_as_null=False)


@pytest.mark.parametrize(
Expand Down

0 comments on commit fa9d028

Please sign in to comment.