From ccc195c070db0a7241711f9277b34448bfe9df2a Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 2 Feb 2024 11:11:08 -0600 Subject: [PATCH 1/3] relax progressive checks for categorical dtypes --- activitysim/core/workflow/checkpoint.py | 33 ++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/activitysim/core/workflow/checkpoint.py b/activitysim/core/workflow/checkpoint.py index b553460a2..8771b4129 100644 --- a/activitysim/core/workflow/checkpoint.py +++ b/activitysim/core/workflow/checkpoint.py @@ -958,7 +958,9 @@ def restore_from(self, location: Path, checkpoint_name: str = LAST_CHECKPOINT): self.load(checkpoint_name, store=from_store) logger.debug(f"checkpoint.restore_from of {checkpoint_name} complete") - def check_against(self, location: Path, checkpoint_name: str): + def check_against( + self, location: Path, checkpoint_name: str, strict_categoricals: bool = False + ): """ Check that the tables in this State match those in an archived pipeline. @@ -966,6 +968,11 @@ def check_against(self, location: Path, checkpoint_name: str): ---------- location : Path-like checkpoint_name : str + strict_categoricals : bool, default False + If True, check that categorical columns have the same categories + in both the current state and the checkpoint. Otherwise, the dtypes + of categorical columns are ignored, and only the values themselves are + checked to confirm they match. Raises ------ @@ -1033,9 +1040,27 @@ def check_against(self, location: Path, checkpoint_name: str): local_table[ref_table.columns], ref_table, check_dtype=False ) except Exception as err: - raise AssertionError( - f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}" - ) + if not strict_categoricals: + try: + pd.testing.assert_frame_equal( + local_table[ref_table.columns], + ref_table, + check_dtype=False, + check_categorical=False, + ) + except Exception as err2: + raise AssertionError( + f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}" + ) + else: + warnings.warn( + f"checkpoint {checkpoint_name!r} table {table_name!r}, " + f"values match but categorical dtype does not" + ) + else: + raise AssertionError( + f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}" + ) else: logger.info(f"table {table_name!r}: ok") From f81211ad3fe3283d8f82e8a02b5cba70715ff320 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 2 Feb 2024 11:34:02 -0600 Subject: [PATCH 2/3] update pointers --- activitysim/examples/external_example_manifest.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/activitysim/examples/external_example_manifest.yaml b/activitysim/examples/external_example_manifest.yaml index 0be270636..5face8e8e 100644 --- a/activitysim/examples/external_example_manifest.yaml +++ b/activitysim/examples/external_example_manifest.yaml @@ -13,11 +13,11 @@ # prototype_mtc: - url: https://github.com/jpn--/activitysim-prototype-mtc/archive/refs/tags/v1.3.1.tar.gz + url: https://github.com/ActivitySim/activitysim-prototype-mtc/archive/refs/tags/v1.3.1.tar.gz sha256: ec53c6e72da1444bd5808de8c644cea75db284dfcc419b776575ba532b3ccb87 assets: test/prototype_mtc_reference_pipeline.zip: - url: https://github.com/jpn--/activitysim-prototype-mtc/releases/download/v1.3.1/prototype_mtc_reference_pipeline.zip + url: https://github.com/ActivitySim/activitysim-prototype-mtc/releases/download/v1.3.1/prototype_mtc_reference_pipeline.zip sha256: 394e5b403d4c61d5214493cefe161432db840ba4967c23c999d914178d43a1f0 estimation_example: From 54afbb0f466df2cc324070d2da81e62eecfb5dba Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Mon, 5 Feb 2024 18:10:02 -0600 Subject: [PATCH 3/3] use empty string as NaN --- activitysim/core/workflow/checkpoint.py | 2 +- activitysim/core/workflow/state.py | 27 ++++++++++++++++--------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/activitysim/core/workflow/checkpoint.py b/activitysim/core/workflow/checkpoint.py index 8771b4129..b7fe5900a 100644 --- a/activitysim/core/workflow/checkpoint.py +++ b/activitysim/core/workflow/checkpoint.py @@ -1050,7 +1050,7 @@ def check_against( ) except Exception as err2: raise AssertionError( - f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}" + f"checkpoint {checkpoint_name!r} table {table_name!r}, {str(err)}\nfrom: {str(err2)}" ) else: warnings.warn( diff --git a/activitysim/core/workflow/state.py b/activitysim/core/workflow/state.py index 479b032c7..e00e20c94 100644 --- a/activitysim/core/workflow/state.py +++ b/activitysim/core/workflow/state.py @@ -1082,16 +1082,23 @@ def extend_table(self, table_name, df, axis=0): missing_df_str_columns = [] # union categoricals - for c in table_df.columns.intersection(df.columns): - if isinstance(table_df[c].dtype, pd.api.types.CategoricalDtype): - if isinstance(df[c].dtype, pd.api.types.CategoricalDtype): - from pandas.api.types import union_categoricals - - uc = union_categoricals([table_df[c], df[c]]) - table_df[c] = pd.Categorical( - table_df[c], categories=uc.categories - ) - df[c] = pd.Categorical(df[c], categories=uc.categories) + for c in table_df.columns: + if c in df.columns: + if isinstance(table_df[c].dtype, pd.api.types.CategoricalDtype): + if isinstance(df[c].dtype, pd.api.types.CategoricalDtype): + from pandas.api.types import union_categoricals + + uc = union_categoricals([table_df[c], df[c]]) + table_df[c] = pd.Categorical( + table_df[c], categories=uc.categories + ) + df[c] = pd.Categorical(df[c], categories=uc.categories) + else: + # when the existing categorical type has an empty string as a category, + # we will use that as the missing value instead of NaN + if isinstance(table_df[c].dtype, pd.api.types.CategoricalDtype): + if "" in table_df[c].cat.categories: + missing_df_str_columns.append(c) # preserve existing column order df = pd.concat([table_df, df], sort=False, axis=axis)