diff --git a/etl/datadiff.py b/etl/datadiff.py index 251d753fb71..2a253b8e482 100644 --- a/etl/datadiff.py +++ b/etl/datadiff.py @@ -325,10 +325,10 @@ def cli( ds_a = _match_dataset(path_to_ds_a, path) ds_b = _match_dataset(path_to_ds_b, path) - if ds_a and ds_b and ds_a.metadata.source_checksum == ds_b.metadata.source_checksum: - # skip if they have the same source checksum, note that we're not comparing checksum of actual data - # to improve performance. Source checksum should be enough - continue + # if ds_a and ds_b and ds_a.metadata.source_checksum == ds_b.metadata.source_checksum: + # # skip if they have the same source checksum, note that we're not comparing checksum of actual data + # # to improve performance. Source checksum should be enough + # continue lines = [] @@ -389,8 +389,8 @@ def _index_equals(table_a: pd.DataFrame, table_b: pd.DataFrame, sample: int = 10 def _dict_diff(dict_a: Dict[str, Any], dict_b: Dict[str, Any], tabs: int = 0, **kwargs) -> str: """Convert dictionaries into YAML and compare them using difflib. Return colored diff as a string.""" - meta_a = yaml_dump(dict_a, **kwargs) - meta_b = yaml_dump(dict_b, **kwargs) + meta_a = yaml_dump(dict_a, sort_keys=True, **kwargs) + meta_b = yaml_dump(dict_b, sort_keys=True, **kwargs) lines = difflib.ndiff(meta_a.splitlines(keepends=True), meta_b.splitlines(keepends=True)) # type: ignore # do not print lines that are identical diff --git a/etl/files.py b/etl/files.py index 4f427ffc0d1..73e567b236c 100644 --- a/etl/files.py +++ b/etl/files.py @@ -156,6 +156,7 @@ def yaml_dump( strip_lines: bool = True, replace_confusing_ascii: bool = False, width: int = 120, + sort_keys: bool = False, ) -> Optional[str]: """Alternative to yaml.dump which produces good looking multi-line strings and perserves ordering of keys. If strip_lines is True, all lines in the string will be stripped and all tabs will be @@ -163,7 +164,7 @@ def yaml_dump( # strip lines, otherwise YAML won't output strings in literal format if strip_lines: d = _strip_lines_in_dict(d) - s = yaml.dump(d, stream=stream, sort_keys=False, allow_unicode=True, Dumper=_MyDumper, width=width) + s = yaml.dump(d, stream=stream, sort_keys=sort_keys, allow_unicode=True, Dumper=_MyDumper, width=width) if replace_confusing_ascii: assert s, "replace_confusing_ascii does not work for streams" s = (