Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Marigold committed Nov 14, 2024
1 parent c5ece93 commit 9c51749
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 1 deletion.
2 changes: 1 addition & 1 deletion etl/steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ def is_dirty(self) -> bool:
return False

def has_existing_data(self) -> bool:
    """Return whether the destination directory already holds data.

    The step counts as having existing data only when ``self._dest_dir``
    is a directory AND it contains an ``index.json`` file.

    Returns:
        True if both conditions hold, False otherwise.
    """
    dest_dir = self._dest_dir
    # A bare directory is not sufficient: require index.json as well.
    # NOTE(review): presumably index.json marks a fully written dataset, so
    # an empty or partially written directory is not treated as data -
    # confirm against the code that writes the dataset.
    return dest_dir.is_dir() and (dest_dir / "index.json").exists()

def can_execute(self, archive_ok: bool = True) -> bool:
sp = self._search_path
Expand Down
7 changes: 7 additions & 0 deletions lib/repack/tests/test_repack.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,18 +315,25 @@ def test_to_safe_types_with_nan():
df = pd.DataFrame(
{
"int_col": [1, 2, 3],
"int_with_nan": [1, np.nan, 3],
"nullable_int_with_nan": [1, np.nan, 3],
"float_col": [1.1, np.nan, 3.3],
"cat_col": pd.Categorical(["a", None, "c"]),
}
)
df.set_index("float_col", inplace=True)
df["nullable_int_with_nan"] = df["nullable_int_with_nan"].astype("Int32")

# Apply the to_safe_types function
df_safe = repack.to_safe_types(df)

# Check that NaN values are handled correctly
assert df_safe.index.dtype == "Float64"
assert df_safe["int_col"].dtype == "Int64"
# NOTE: a plain int column containing NaN ends up as Float64, whereas a nullable-int column
# (e.g. Int32) containing NA stays an integer and becomes Int64. The nullable case is the one
# that matters for us, since we store everything as nullable types.
assert df_safe["int_with_nan"].dtype == "Float64"
assert df_safe["nullable_int_with_nan"].dtype == "Int64"
assert df_safe["cat_col"].dtype == "string[pyarrow]"

# Ensure that the NA value in 'cat_col' remains pd.NA and not the string "NA"
Expand Down

0 comments on commit 9c51749

Please sign in to comment.