From c5ece93665ced51458d00af06a95b533703e7ed3 Mon Sep 17 00:00:00 2001 From: Marigold Date: Thu, 14 Nov 2024 10:07:32 +0100 Subject: [PATCH] wip --- lib/repack/owid/repack/__init__.py | 29 ++--------------------------- lib/repack/tests/test_repack.py | 8 ++++++-- 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/lib/repack/owid/repack/__init__.py b/lib/repack/owid/repack/__init__.py index 2eb4c0ca7e2..38029175a0f 100644 --- a/lib/repack/owid/repack/__init__.py +++ b/lib/repack/owid/repack/__init__.py @@ -76,7 +76,6 @@ def repack_series(s: pd.Series) -> pd.Series: def to_int(s: pd.Series) -> pd.Series: # values could be integers or strings - # s = _to_float64(s) s = s.astype("Float64") v = s.astype("Int64") @@ -93,7 +92,6 @@ def shrink_integer(s: pd.Series) -> pd.Series: Take an Int64 series and make it as small as possible. """ assert s.dtype == "Int64" - # assert s.dtype.name.replace("[pyarrow]", "") in ("Int64", "int64", "UInt64", "uint64"), s.dtype if s.isnull().all(): # shrink all NaNs to Int8 @@ -114,17 +112,6 @@ def shrink_integer(s: pd.Series) -> pd.Series: return s -# def _to_float64(s: pd.Series) -> pd.Series: -# """Convert pandas series to float if possible. It handles object types and string types as well.""" -# # Directly convert to float64[pyarrow] -# if "float" in s.dtype.name.lower(): -# return s.astype("Float64") -# else: -# # Convert object types to float first and then to Float64 -# # TODO: is this necessary? -# return s.astype("Float64") - - def to_float(s: pd.Series) -> pd.Series: return shrink_float(s.astype("Float64")) @@ -154,10 +141,7 @@ def to_category(s: pd.Series) -> pd.Series: return s.astype("category") -# TODO: can it be simplified? -def series_eq( - lhs: pd.Series, rhs: pd.Series, cast: Optional[Any] = None, rtol: float = 1e-5, atol: float = 1e-8 -) -> bool: +def series_eq(lhs: pd.Series, rhs: pd.Series, rtol: float = 1e-5, atol: float = 1e-8) -> bool: """ Check that series are equal, but unlike normal floating point checks where NaN != NaN, we want missing or null values to be reported as equal to each @@ -168,16 +152,7 @@ def series_eq( if len(lhs) != len(rhs): return False - # improve performance by calling native astype method - if cast == float: - func = lambda s: s.astype(float) # noqa: E731 - elif cast is None: - func = lambda s: s # noqa: E731 - else: - # NOTE: this would be extremely slow in practice - func = lambda s: s.apply(cast) # noqa: E731 - - return np.allclose(func(lhs), func(rhs), rtol=rtol, atol=atol, equal_nan=True) + return np.allclose(lhs, rhs, rtol=rtol, atol=atol, equal_nan=True) def _safe_dtype(dtype: Any) -> str: diff --git a/lib/repack/tests/test_repack.py b/lib/repack/tests/test_repack.py index 3dbba13b3a5..b09ad955cda 100644 --- a/lib/repack/tests/test_repack.py +++ b/lib/repack/tests/test_repack.py @@ -228,11 +228,15 @@ def test_repack_float64_all_nans(): def test_series_eq(): a = pd.Series([1, np.nan], dtype="float64") b = pd.Series([2, np.nan], dtype="float64") - assert not repack.series_eq(a, b, cast=float) + assert not repack.series_eq(a, b) a = pd.Series([1, np.nan], dtype="float64") b = pd.Series([1, np.nan], dtype="float64") - assert repack.series_eq(a, b, cast=float) + assert repack.series_eq(a, b) + + a = pd.Series([1, np.nan], dtype="float64") + b = pd.Series([1, np.nan], dtype="float64").astype("Float64") + assert repack.series_eq(a, b) def test_repack_object_np_str():