Skip to content

Commit

Permalink
Merge pull request #8 from camsys/data-type-op-pd-cat
Browse files Browse the repository at this point in the history
relax progressive checks for categorical dtypes
  • Loading branch information
i-am-sijia authored Feb 6, 2024
2 parents a9dd383 + 54afbb0 commit ae126f1
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 16 deletions.
33 changes: 29 additions & 4 deletions activitysim/core/workflow/checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,14 +958,21 @@ def restore_from(self, location: Path, checkpoint_name: str = LAST_CHECKPOINT):
self.load(checkpoint_name, store=from_store)
logger.debug(f"checkpoint.restore_from of {checkpoint_name} complete")

def check_against(self, location: Path, checkpoint_name: str):
def check_against(
self, location: Path, checkpoint_name: str, strict_categoricals: bool = False
):
"""
Check that the tables in this State match those in an archived pipeline.
Parameters
----------
location : Path-like
checkpoint_name : str
strict_categoricals : bool, default False
If True, check that categorical columns have the same categories
in both the current state and the checkpoint. Otherwise, a mismatch
in categorical dtype alone is tolerated: the comparison is retried with
categorical checks disabled, and if the values themselves match, a
warning is issued instead of raising an error.
Raises
------
Expand Down Expand Up @@ -1033,9 +1040,27 @@ def check_against(self, location: Path, checkpoint_name: str):
local_table[ref_table.columns], ref_table, check_dtype=False
)
except Exception as err:
raise AssertionError(
f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}"
)
if not strict_categoricals:
try:
pd.testing.assert_frame_equal(
local_table[ref_table.columns],
ref_table,
check_dtype=False,
check_categorical=False,
)
except Exception as err2:
raise AssertionError(
f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}\nfrom: {str(err2)}"
)
else:
warnings.warn(
f"checkpoint {checkpoint_name!r} table {table_name!r}, "
f"values match but categorical dtype does not"
)
else:
raise AssertionError(
f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}"
)
else:
logger.info(f"table {table_name!r}: ok")

Expand Down
27 changes: 17 additions & 10 deletions activitysim/core/workflow/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -1082,16 +1082,23 @@ def extend_table(self, table_name, df, axis=0):
missing_df_str_columns = []

# union categoricals
for c in table_df.columns.intersection(df.columns):
if isinstance(table_df[c].dtype, pd.api.types.CategoricalDtype):
if isinstance(df[c].dtype, pd.api.types.CategoricalDtype):
from pandas.api.types import union_categoricals

uc = union_categoricals([table_df[c], df[c]])
table_df[c] = pd.Categorical(
table_df[c], categories=uc.categories
)
df[c] = pd.Categorical(df[c], categories=uc.categories)
for c in table_df.columns:
if c in df.columns:
if isinstance(table_df[c].dtype, pd.api.types.CategoricalDtype):
if isinstance(df[c].dtype, pd.api.types.CategoricalDtype):
from pandas.api.types import union_categoricals

uc = union_categoricals([table_df[c], df[c]])
table_df[c] = pd.Categorical(
table_df[c], categories=uc.categories
)
df[c] = pd.Categorical(df[c], categories=uc.categories)
else:
# when the existing categorical type has an empty string as a category,
# we will use that as the missing value instead of NaN
if isinstance(table_df[c].dtype, pd.api.types.CategoricalDtype):
if "" in table_df[c].cat.categories:
missing_df_str_columns.append(c)

# preserve existing column order
df = pd.concat([table_df, df], sort=False, axis=axis)
Expand Down
4 changes: 2 additions & 2 deletions activitysim/examples/external_example_manifest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
#

prototype_mtc:
url: https://github.com/jpn--/activitysim-prototype-mtc/archive/refs/tags/v1.3.1.tar.gz
url: https://github.com/ActivitySim/activitysim-prototype-mtc/archive/refs/tags/v1.3.1.tar.gz
sha256: ec53c6e72da1444bd5808de8c644cea75db284dfcc419b776575ba532b3ccb87
assets:
test/prototype_mtc_reference_pipeline.zip:
url: https://github.com/jpn--/activitysim-prototype-mtc/releases/download/v1.3.1/prototype_mtc_reference_pipeline.zip
url: https://github.com/ActivitySim/activitysim-prototype-mtc/releases/download/v1.3.1/prototype_mtc_reference_pipeline.zip
sha256: 394e5b403d4c61d5214493cefe161432db840ba4967c23c999d914178d43a1f0

estimation_example:
Expand Down

0 comments on commit ae126f1

Please sign in to comment.