From 82aac668062842fe2e5f501a381c5a8dcb9bb9d0 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 16 Apr 2024 14:34:28 -0700 Subject: [PATCH] use pandas NA --- src/nested_pandas/series/dtype.py | 5 +-- src/nested_pandas/series/ext_array.py | 10 ++++- src/nested_pandas/series/na.py | 55 ------------------------ tests/nested_pandas/series/test_dtype.py | 3 +- tests/nested_pandas/series/test_na.py | 50 --------------------- 5 files changed, 12 insertions(+), 111 deletions(-) delete mode 100644 src/nested_pandas/series/na.py delete mode 100644 tests/nested_pandas/series/test_na.py diff --git a/src/nested_pandas/series/dtype.py b/src/nested_pandas/series/dtype.py index 3559b7b..7ae5302 100644 --- a/src/nested_pandas/series/dtype.py +++ b/src/nested_pandas/series/dtype.py @@ -13,7 +13,6 @@ from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype -from nested_pandas.series.na import NA, NAType from nested_pandas.series.utils import is_pa_type_a_list __all__ = ["NestedDtype"] @@ -29,9 +28,9 @@ class NestedDtype(ExtensionDtype): """Attributes to use as metadata for __eq__ and __hash__""" @property - def na_value(self) -> NAType: + def na_value(self) -> "pd.api.typing.NAType": """The missing value for this dtype""" - return NA + return pd.NA type = pd.DataFrame """The type of the array's elements, always pd.DataFrame""" diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index df0b99f..efce5f0 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -160,8 +160,16 @@ def to_numpy(self, dtype: None = None, copy: bool = False, na_value: Any = no_de # Hack with np.empty is the only way to force numpy to create 1-d array of objects result = np.empty(shape=array.shape, dtype=object) + + # TODO: delete block once confirmed NA is better to return + # fill NaNs - Fill with an empty dictionary using the same columns + # nan_vals = pd.isna(array) + # if sum(nan_vals) > 0: + # 
na_idx = np.where(nan_vals) + # array[na_idx] = {key: [] for key in array[np.where(~nan_vals)[0][0]].keys()} + # We do copy=False here because user's 'copy' is already handled by ArrowExtensionArray.to_numpy - result[:] = [pd.DataFrame(value, copy=False) for value in array] + result[:] = [pd.DataFrame(value, copy=False) if not pd.isna(value) else pd.NA for value in array] return result def __setitem__(self, key, value) -> None: diff --git a/src/nested_pandas/series/na.py b/src/nested_pandas/series/na.py deleted file mode 100644 index 0b77bb4..0000000 --- a/src/nested_pandas/series/na.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Missing value for NestedDtype - -It i something between pandas' NA and NaN -""" - -__all__ = ["NAType", "NA"] - - -class _NAType: - pass - - -class NAType: - """Singleton class representing missing value for NestedDtype. - - It doesn't implement most of the arithmetics and boolean logic operations, - because they are ambiguous for missing values. - - The implementation is inspired both by pandas' NA and float number NaN. - - `NA` is a singleton instance of this class. 
- """ - - _instance = None - - def __new__(cls, *args, **kwargs): - """Create a new instance of NAType.""" - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __repr__(self) -> str: - return "" - - def __format__(self, format_spec) -> str: - try: - return self.__repr__().__format__(format_spec) - except ValueError: - return self.__repr__() - - def __bool__(self): - raise TypeError("boolean value of NA is ambiguous") - - def __eq__(self, other): - return False - - def __ne__(self, other): - return True - - def __hash__(self): - return 0 - - -NA = NAType() -"""Missed value for NestedDtype, a singleton instance of `NAType` class.""" diff --git a/tests/nested_pandas/series/test_dtype.py b/tests/nested_pandas/series/test_dtype.py index 97aa32d..2b6b3b2 100644 --- a/tests/nested_pandas/series/test_dtype.py +++ b/tests/nested_pandas/series/test_dtype.py @@ -3,7 +3,6 @@ import pytest from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.ext_array import NestedExtensionArray -from nested_pandas.series.na import NA @pytest.mark.parametrize( @@ -62,7 +61,7 @@ def test_from_fields(): def test_na_value(): """Test that NestedDtype.na_value is a singleton instance of NAType.""" dtype = NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64()))])) - assert dtype.na_value is NA + assert dtype.na_value is pd.NA def test_fields(): diff --git a/tests/nested_pandas/series/test_na.py b/tests/nested_pandas/series/test_na.py deleted file mode 100644 index b3a81b3..0000000 --- a/tests/nested_pandas/series/test_na.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest -from nested_pandas.series.na import NA - - -def test_na_is_singleton(): - """Test that NA is a singleton instance""" - assert NA is NA - - -def test_na_repr(): - """Test that NA has the correct representation.""" - assert repr(NA) == "" - - -def test_na_format(): - """Test that NA has the correct format.""" - assert f"{NA}" == "" - - -def test_na_bool(): - """Test 
that NA raises TypeError when converted to bool.""" - with pytest.raises(TypeError): - bool(NA) - - -def test_na_eq(): - """Test that NA is not equal to anything.""" - assert NA != 1 - assert NA != 1.0 - assert NA != "1" - assert NA != NA - - -def test_na_neq(): - """Test that NA is not equal to anything.""" - assert NA != 1 - assert NA != 1.0 - assert NA != "1" - assert [] != NA - assert {} != NA - assert NA != () - assert set() != NA - assert NA != NA - assert object() != NA - - -def test_hash(): - """Test that hash(NA) is always the same.""" - assert hash(NA) == hash(NA) - assert {NA, NA} == {NA}