Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Marigold committed Nov 14, 2024
1 parent 9c51749 commit 7eb73fc
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 3 deletions.
2 changes: 1 addition & 1 deletion etl/steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ def is_dirty(self) -> bool:
return False

# NOTE(review): this block comes from a diff view ("1 addition & 1 deletion")
# whose +/- markers were lost. The two returns below are presumably the removed
# line followed by the added line, not two live statements -- confirm against
# the repository before relying on either.
def has_existing_data(self) -> bool:
# Presumably the pre-change check: the destination directory exists AND
# contains an "index.json" file.
return self._dest_dir.is_dir() and (self._dest_dir / "index.json").exists()
# Presumably the post-change check: only the destination directory has to
# exist. As written here this line is unreachable.
return self._dest_dir.is_dir()

def can_execute(self, archive_ok: bool = True) -> bool:
sp = self._search_path
Expand Down
12 changes: 10 additions & 2 deletions lib/repack/owid/repack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,17 @@ def repack_series(s: pd.Series) -> pd.Series:
return s


def _to_float(s: pd.Series) -> pd.Series:
"""Convert series to Float64. Replace numpy NaNs with NA. This can
happen when original series is an object and contains 'nan' string."""
s = s.astype("Float64")
s = s.mask(np.isnan(s), pd.NA)
return s


def to_int(s: pd.Series) -> pd.Series:
# values could be integers or strings
s = s.astype("Float64")
s = _to_float(s)
v = s.astype("Int64")

# casting to float converts strings to floats, that doesn't work with float64[pyarrow]
Expand Down Expand Up @@ -113,7 +121,7 @@ def shrink_integer(s: pd.Series) -> pd.Series:


def to_float(s: pd.Series) -> pd.Series:
    """Convert ``s`` to a nullable float series and hand it to ``shrink_float``.

    The conversion goes through ``_to_float`` rather than a bare
    ``astype("Float64")`` so that numpy NaNs (e.g. produced by a ``'nan'``
    string in an object series) become ``pd.NA`` before shrinking.

    Args:
        s: Series whose values can be cast to ``Float64``.

    Returns:
        Whatever ``shrink_float`` returns for the normalised ``Float64`` series.
    """
    return shrink_float(_to_float(s))


def shrink_float(s: pd.Series) -> pd.Series:
Expand Down
12 changes: 12 additions & 0 deletions lib/repack/tests/test_repack.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,18 @@ def test_repack_float_object_to_float32():
assert v.dtype == "Float32"


def test_repack_object_with_nan_string():
    """Object series containing the string 'nan' repack to a numeric dtype.

    The 'nan' entry must end up as exactly one null value, both when the
    other values are integers and when they include a float.
    """
    scenarios = (
        ([1, 2, "nan"], "UInt8"),
        ([1, 2.2, "nan"], "Float32"),
    )
    for raw_values, expected_dtype in scenarios:
        packed = repack.repack_series(pd.Series(raw_values, dtype="object"))
        assert packed.dtype == expected_dtype
        assert packed.isnull().sum() == 1


def test_repack_category():
s = pd.Series(["a", "b", "c", None])
assert s.dtype == np.object_
Expand Down

0 comments on commit 7eb73fc

Please sign in to comment.