Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Marigold committed Nov 14, 2024
1 parent e2d5d04 commit c5ece93
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 29 deletions.
29 changes: 2 additions & 27 deletions lib/repack/owid/repack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ def repack_series(s: pd.Series) -> pd.Series:

def to_int(s: pd.Series) -> pd.Series:
# values could be integers or strings
# s = _to_float64(s)
s = s.astype("Float64")
v = s.astype("Int64")

Expand All @@ -93,7 +92,6 @@ def shrink_integer(s: pd.Series) -> pd.Series:
Take an Int64 series and make it as small as possible.
"""
assert s.dtype == "Int64"
# assert s.dtype.name.replace("[pyarrow]", "") in ("Int64", "int64", "UInt64", "uint64"), s.dtype

if s.isnull().all():
# shrink all NaNs to Int8
Expand All @@ -114,17 +112,6 @@ def shrink_integer(s: pd.Series) -> pd.Series:
return s


# def _to_float64(s: pd.Series) -> pd.Series:
# """Convert pandas series to float if possible. It handles object types and string types as well."""
# # Directly convert to float64[pyarrow]
# if "float" in s.dtype.name.lower():
# return s.astype("Float64")
# else:
# # Convert object types to float first and then to Float64
# # TODO: is this necessary?
# return s.astype("Float64")


def to_float(s: pd.Series) -> pd.Series:
    """Cast the series to the nullable ``Float64`` dtype and pass it to ``shrink_float``."""
    as_float64 = s.astype("Float64")
    return shrink_float(as_float64)

Expand Down Expand Up @@ -154,10 +141,7 @@ def to_category(s: pd.Series) -> pd.Series:
return s.astype("category")


# TODO: can it be simplified?
def series_eq(
lhs: pd.Series, rhs: pd.Series, cast: Optional[Any] = None, rtol: float = 1e-5, atol: float = 1e-8
) -> bool:
def series_eq(lhs: pd.Series, rhs: pd.Series, rtol: float = 1e-5, atol: float = 1e-8) -> bool:
"""
Check that series are equal, but unlike normal floating point checks where
NaN != NaN, we want missing or null values to be reported as equal to each
Expand All @@ -168,16 +152,7 @@ def series_eq(
if len(lhs) != len(rhs):
return False

# improve performance by calling native astype method
if cast == float:
func = lambda s: s.astype(float) # noqa: E731
elif cast is None:
func = lambda s: s # noqa: E731
else:
# NOTE: this would be extremely slow in practice
func = lambda s: s.apply(cast) # noqa: E731

return np.allclose(func(lhs), func(rhs), rtol=rtol, atol=atol, equal_nan=True)
return np.allclose(lhs, rhs, rtol=rtol, atol=atol, equal_nan=True)


def _safe_dtype(dtype: Any) -> str:
Expand Down
8 changes: 6 additions & 2 deletions lib/repack/tests/test_repack.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,15 @@ def test_repack_float64_all_nans():
# Exercises repack.series_eq on float64 series that contain a NaN in the last position.
def test_series_eq():
# First elements differ (1 vs 2), so the two series must not compare equal.
a = pd.Series([1, np.nan], dtype="float64")
b = pd.Series([2, np.nan], dtype="float64")
# NOTE(review): this text is a rendered diff without +/- markers; the `cast=float`
# call and the call right after it look like the removed and added versions of the
# same assertion -- confirm against the commit before treating both as live code.
assert not repack.series_eq(a, b, cast=float)
assert not repack.series_eq(a, b)

# Same values with NaN in the same position: the series must compare equal.
a = pd.Series([1, np.nan], dtype="float64")
b = pd.Series([1, np.nan], dtype="float64")
# NOTE(review): as above, the `cast=float` call is presumably the pre-change line.
assert repack.series_eq(a, b, cast=float)
assert repack.series_eq(a, b)

# Same values again, but the right-hand side is converted to the "Float64" dtype
# while the left-hand side stays "float64"; they must still compare equal.
a = pd.Series([1, np.nan], dtype="float64")
b = pd.Series([1, np.nan], dtype="float64").astype("Float64")
assert repack.series_eq(a, b)


def test_repack_object_np_str():
Expand Down

0 comments on commit c5ece93

Please sign in to comment.