Skip to content

Commit

Permalink
🐛 don't report spurious metadata differences in datadiff
Browse files Browse the repository at this point in the history
Sorts YAML keys before comparison, so that the diff is invariant to field
order.
  • Loading branch information
larsyencken committed Mar 4, 2024
1 parent f32be2c commit a423ebd
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
12 changes: 6 additions & 6 deletions etl/datadiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,10 +325,10 @@ def cli(
ds_a = _match_dataset(path_to_ds_a, path)
ds_b = _match_dataset(path_to_ds_b, path)

if ds_a and ds_b and ds_a.metadata.source_checksum == ds_b.metadata.source_checksum:
# skip if they have the same source checksum, note that we're not comparing checksum of actual data
# to improve performance. Source checksum should be enough
continue
# if ds_a and ds_b and ds_a.metadata.source_checksum == ds_b.metadata.source_checksum:
# # skip if they have the same source checksum, note that we're not comparing checksum of actual data
# # to improve performance. Source checksum should be enough
# continue

lines = []

Expand Down Expand Up @@ -389,8 +389,8 @@ def _index_equals(table_a: pd.DataFrame, table_b: pd.DataFrame, sample: int = 10

def _dict_diff(dict_a: Dict[str, Any], dict_b: Dict[str, Any], tabs: int = 0, **kwargs) -> str:
"""Convert dictionaries into YAML and compare them using difflib. Return colored diff as a string."""
meta_a = yaml_dump(dict_a, **kwargs)
meta_b = yaml_dump(dict_b, **kwargs)
meta_a = yaml_dump(dict_a, sort_keys=True, **kwargs)
meta_b = yaml_dump(dict_b, sort_keys=True, **kwargs)

lines = difflib.ndiff(meta_a.splitlines(keepends=True), meta_b.splitlines(keepends=True)) # type: ignore
# do not print lines that are identical
Expand Down
3 changes: 2 additions & 1 deletion etl/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,15 @@ def yaml_dump(
strip_lines: bool = True,
replace_confusing_ascii: bool = False,
width: int = 120,
sort_keys: bool = False,
) -> Optional[str]:
"""Alternative to yaml.dump which produces good looking multi-line strings and preserves ordering
of keys (unless sort_keys is True). If strip_lines is True, all lines in the string will be
stripped and all tabs will be replaced by two spaces."""
# strip lines, otherwise YAML won't output strings in literal format
if strip_lines:
d = _strip_lines_in_dict(d)
s = yaml.dump(d, stream=stream, sort_keys=False, allow_unicode=True, Dumper=_MyDumper, width=width)
s = yaml.dump(d, stream=stream, sort_keys=sort_keys, allow_unicode=True, Dumper=_MyDumper, width=width)
if replace_confusing_ascii:
assert s, "replace_confusing_ascii does not work for streams"
s = (
Expand Down

0 comments on commit a423ebd

Please sign in to comment.