From 2bb3859a3327b873df2beb176db015ad423ae540 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:34:19 -0800 Subject: [PATCH] Update `compare_df` to display the diff report on column differences not just rows (#2040) Prevents false positives when columns are missing in output ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/2040 --- python/morpheus/morpheus/utils/compare_df.py | 5 +++-- scripts/compare_data_files.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/morpheus/morpheus/utils/compare_df.py b/python/morpheus/morpheus/utils/compare_df.py index 605b515edf..5541e0ecd4 100755 --- a/python/morpheus/morpheus/utils/compare_df.py +++ b/python/morpheus/morpheus/utils/compare_df.py @@ -130,6 +130,7 @@ def compare_df(df_a: pd.DataFrame, total_rows = len(df_a_filtered) diff_rows = len(df_a_filtered) - int(comparison.count_matching_rows()) + diff_cols = len(extra_columns) + len(missing_columns) if (comparison.matches()): logger.info("Results match validation dataset") @@ -141,7 +142,7 @@ def compare_df(df_a: pd.DataFrame, mismatch_df = merged.loc[mismatched_idx] - if diff_rows > 0: + if diff_rows > 0 or diff_cols > 0: logger.debug("Results do not match. Diff %d/%d (%f %%). First 10 mismatched rows:", diff_rows, total_rows, @@ -160,5 +161,5 @@ def compare_df(df_a: pd.DataFrame, "matching_cols": list(same_columns), "extra_cols": list(extra_columns), "missing_cols": list(missing_columns), - "diff_cols": len(extra_columns) + len(missing_columns) + "diff_cols": diff_cols } diff --git a/scripts/compare_data_files.py b/scripts/compare_data_files.py index b1a53f4fa1..3250d9d65b 100755 --- a/scripts/compare_data_files.py +++ b/scripts/compare_data_files.py @@ -66,7 +66,7 @@ def main(): abs_tol=args.abs_tol, rel_tol=args.rel_tol) - if results['diff_rows'] > 0: + if results['diff_rows'] > 0 or results['diff_cols'] > 0: sys.exit(1)