diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 371c91dd96f..1785eb834b2 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1411,6 +1411,18 @@ def column_empty_like(
     return column_empty(row_count, dtype, masked)
 
 
+def _has_any_nan(arbitrary):
+    """Return True if any element of ``arbitrary`` is a floating-point NaN.
+
+    Only Python ``float`` and NumPy floating scalars are tested, so other
+    missing-value markers such as ``None`` or ``pd.NA`` are not counted.
+    """
+    return any(
+        isinstance(x, (float, np.floating)) and np.isnan(x)
+        for x in np.asarray(arbitrary)
+    )
+
+
 def column_empty_like_same_mask(
     column: ColumnBase, dtype: Dtype
 ) -> ColumnBase:
@@ -1948,9 +1960,20 @@ def as_column(
                 raise TypeError(
                     f"Cannot convert a {inferred_dtype} of object type"
                 )
-            elif nan_as_null is False and (
-                pd.isna(arbitrary).any()
+            elif inferred_dtype == "boolean":
+                if cudf.get_option("mode.pandas_compatible"):
+                    if dtype != np.dtype("bool") or pd.isna(arbitrary).any():
+                        raise MixedTypeError(
+                            f"Cannot have mixed values with {inferred_dtype}"
+                        )
+                elif nan_as_null is False and _has_any_nan(arbitrary):
+                    raise MixedTypeError(
+                        f"Cannot have mixed values with {inferred_dtype}"
+                    )
+            elif (
+                nan_as_null is False
                 and inferred_dtype not in ("decimal", "empty")
+                and _has_any_nan(arbitrary)
             ):
                 # Decimal can hold float("nan")
                 # All np.nan is not restricted by type
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 96301670e9c..8b18e53d320 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -4008,44 +4008,28 @@ def test_diff(dtype, period, data_empty):
 
 @pytest.mark.parametrize("df", _dataframe_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
-def test_dataframe_isnull_isna(df, nan_as_null):
-    if nan_as_null is False and (
-        df.select_dtypes(object).isna().any().any()
-        and not df.select_dtypes(object).isna().all().all()
-    ):
-        with pytest.raises(MixedTypeError):
-            cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
-    else:
-        gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
+@pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"])
+def test_dataframe_isnull_isna_and_reverse(df, nan_as_null, api_call):
+    def detect_nan(x):
+        # Check if the input is a float and if it is nan
+        return x.apply(lambda v: isinstance(v, float) and np.isnan(v))
 
-        assert_eq(df.isnull(), gdf.isnull())
-        assert_eq(df.isna(), gdf.isna())
-
-        # Test individual columns
-        for col in df:
-            assert_eq(df[col].isnull(), gdf[col].isnull())
-            assert_eq(df[col].isna(), gdf[col].isna())
-
-
-@pytest.mark.parametrize("df", _dataframe_na_data())
-@pytest.mark.parametrize("nan_as_null", [True, False, None])
-def test_dataframe_notna_notnull(df, nan_as_null):
+    nan_contains = df.select_dtypes(object).apply(detect_nan)
     if nan_as_null is False and (
-        df.select_dtypes(object).isna().any().any()
-        and not df.select_dtypes(object).isna().all().all()
+        nan_contains.any().any() and not nan_contains.all().all()
     ):
         with pytest.raises(MixedTypeError):
             cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
     else:
         gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
 
-        assert_eq(df.notnull(), gdf.notnull())
-        assert_eq(df.notna(), gdf.notna())
+        assert_eq(getattr(df, api_call)(), getattr(gdf, api_call)())
 
         # Test individual columns
         for col in df:
-            assert_eq(df[col].notnull(), gdf[col].notnull())
-            assert_eq(df[col].notna(), gdf[col].notna())
+            assert_eq(
+                getattr(df[col], api_call)(), getattr(gdf[col], api_call)()
+            )
 
 
 def test_ndim():
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 08a6173d3f5..9aeae566730 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -774,8 +774,9 @@ def test_round_nan_as_null_false(series, decimal):
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_isnull_isna(ps, nan_as_null):
+    nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
     if nan_as_null is False and (
-        ps.isna().any() and not ps.isna().all() and ps.dtype == object
+        nan_contains.any() and not nan_contains.all() and ps.dtype == object
     ):
         with pytest.raises(MixedTypeError):
             cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
@@ -789,8 +790,9 @@ def test_series_isnull_isna(ps, nan_as_null):
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_notnull_notna(ps, nan_as_null):
+    nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
     if nan_as_null is False and (
-        ps.isna().any() and not ps.isna().all() and ps.dtype == object
+        nan_contains.any() and not nan_contains.all() and ps.dtype == object
     ):
         with pytest.raises(MixedTypeError):
             cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
@@ -2356,12 +2358,23 @@ def test_multi_dim_series_error():
 
 def test_bool_series_mixed_dtype_error():
     ps = pd.Series([True, False, None])
+    all_bool_ps = pd.Series([True, False, True], dtype="object")
     # ps now has `object` dtype, which
     # isn't supported by `cudf`.
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.raises(TypeError):
+            cudf.Series(ps)
+        with pytest.raises(TypeError):
+            cudf.from_pandas(ps)
+        with pytest.raises(TypeError):
+            cudf.Series(ps, dtype=bool)
+        expected = cudf.Series(all_bool_ps, dtype=bool)
+        assert_eq(expected, all_bool_ps.astype(bool))
+    nan_bools_mix = pd.Series([True, False, True, np.nan], dtype="object")
+    gs = cudf.Series(nan_bools_mix, nan_as_null=True)
+    assert_eq(gs.to_pandas(nullable=True), nan_bools_mix.astype("boolean"))
     with pytest.raises(TypeError):
-        cudf.Series(ps, nan_as_null=False)
-    with pytest.raises(TypeError):
-        cudf.from_pandas(ps, nan_as_null=False)
+        cudf.Series(nan_bools_mix, nan_as_null=False)
 
 
 @pytest.mark.parametrize(