From 392a8814a3e96b6762611f7c8eae4669087f3de5 Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 8 Feb 2024 17:11:41 -0600 Subject: [PATCH] Completely re-write schema comparison summary Now generates a changelog based on the Major/Minor/Patch changes Fix issues with name for withStandard schema Improve comparison between schema entries to handle inherited attributes --- hed/schema/hed_schema_entry.py | 22 +- hed/schema/schema_compare.py | 509 ++++++++---------- hed/schema/schema_io/base2schema.py | 1 + .../schema_tests/schema_compare.mediawiki | 51 ++ .../schema_tests/schema_compare2.mediawiki | 62 +++ tests/models/test_basic_search.py | 27 +- tests/schema/test_schema_compare.py | 106 ++-- 7 files changed, 427 insertions(+), 351 deletions(-) create mode 100644 tests/data/schema_tests/schema_compare.mediawiki create mode 100644 tests/data/schema_tests/schema_compare2.mediawiki diff --git a/hed/schema/hed_schema_entry.py b/hed/schema/hed_schema_entry.py index 4be137da..3ad99b2e 100644 --- a/hed/schema/hed_schema_entry.py +++ b/hed/schema/hed_schema_entry.py @@ -115,13 +115,8 @@ def section_key(self): def __eq__(self, other): if self.name != other.name: return False - if self.attributes != other.attributes: - # We only want to compare known attributes - self_attr = self.get_known_attributes() - other_attr = other.get_known_attributes() - # We can no longer be sure on the order of attribute values, since owl formatting has no order - if self_attr != other_attr and not self._compare_attributes_no_order(self_attr, other_attr): - return False + if not self._compare_attributes_no_order(self.attributes, other.attributes): + return False if self.description != other.description: return False return True @@ -138,8 +133,9 @@ def get_known_attributes(self): @staticmethod def _compare_attributes_no_order(left, right): - left = {name: (set(value.split(",")) if isinstance(value, str) else value) for (name, value) in left.items()} - right = {name: (set(value.split(",")) if isinstance(value, str) else value) for (name, value) in right.items()} + if left != right: + left = {name: (set(value.split(",")) if isinstance(value, str) else value) for (name, value) in left.items()} + right = {name: (set(value.split(",")) if isinstance(value, str) else value) for (name, value) in right.items()} return left == right @@ -235,6 +231,7 @@ def get_conversion_factor(self, unit_name): if HedKey.ConversionFactor in self.attributes: return float(self.derivative_units.get(unit_name)) + class HedTagEntry(HedSchemaEntry): """ A single tag entry in the HedSchema. """ def __init__(self, *args, **kwargs): @@ -252,6 +249,13 @@ def __init__(self, *args, **kwargs): # Descendent tags below this one self.children = {} + def __eq__(self, other): + if not super().__eq__(other): + return False + if not self._compare_attributes_no_order(self.inherited_attributes, other.inherited_attributes): + return False + return True + def has_attribute(self, attribute, return_value=False): """ Returns th existence or value of an attribute in this entry. diff --git a/hed/schema/schema_compare.py b/hed/schema/schema_compare.py index f128306d..b2d4455c 100644 --- a/hed/schema/schema_compare.py +++ b/hed/schema/schema_compare.py @@ -1,7 +1,7 @@ from hed.schema.hed_schema import HedSchema, HedKey from hed.schema.hed_schema_constants import HedSectionKey +from collections import defaultdict -# This is still in design, means header attributes, epilogue, and prologue MiscSection = "misc" SectionEntryNames = { @@ -12,6 +12,7 @@ HedSectionKey.UnitModifiers: "Unit Modifier", HedSectionKey.Properties: "Property", HedSectionKey.Attributes: "Attribute", + MiscSection: "Misc Metadata" } SectionEntryNamesPlural = { @@ -22,154 +23,64 @@ HedSectionKey.UnitModifiers: "Unit Modifiers", HedSectionKey.Properties: "Properties", HedSectionKey.Attributes: "Attributes", + MiscSection: "Misc Metadata" } -def find_matching_tags(schema1, schema2, output='raw', sections=(HedSectionKey.Tags,), - include_summary=True): - """ - Compare the tags in two library schemas. This finds tags with the same term. +def find_matching_tags(schema1, schema2, sections=(HedSectionKey.Tags,), return_string=True): + """Compare the tags in two library schemas. This finds tags with the same term. Parameters: schema1 (HedSchema): The first schema to be compared. schema2 (HedSchema): The second schema to be compared. - output (str): Defaults to returning a python object dicts. - 'string' returns a single string - 'dict' returns a json style dictionary sections(list): the list of sections to compare. By default, just the tags section. If None, checks all sections including header, prologue, and epilogue. - include_summary(bool): If True, adds the 'summary' dict to the dict return option, and prints it with the - string option. Lists the names of all the nodes that are missing or different. + return_string(bool): If False, returns the raw python dictionary(for tools etc. possible use) Returns: - dict, json style dict, or str: A dictionary containing matching entries in the Tags section of both schemas. + str or dict: Returns a formatted string or python dict """ matches, _, _, unequal_entries = compare_schemas(schema1, schema2, sections=sections) + header_summary = _get_tag_name_summary((matches, unequal_entries)) + # Combine the two dictionaries for section_key, section_dict in matches.items(): section_dict.update(unequal_entries[section_key]) - header_summary = _get_tag_name_summary((matches, unequal_entries)) - - if output == 'string': - final_string = "" - if include_summary: - final_string += _pretty_print_header(header_summary) - if sections is None: - sections = HedSectionKey - for section_key in sections: - type_name = SectionEntryNames[section_key] - entries = matches[section_key] - if not entries: - continue - final_string += f"{type_name} differences:\n" - final_string += _pretty_print_diff_all(entries, type_name=type_name) + "\n" + if return_string: + final_string = "Nodes with matching names:\n" + final_string += _pretty_print_header(header_summary) + # Do we actually want this...? I'm just going to remove and add back later if needed. + # for section_key, entries in matches.items(): + # type_name = SectionEntryNames[section_key] + # if not entries: + # continue + # final_string += f"{type_name} differences:\n" + # final_string += _pretty_print_diff_all(entries, type_name=type_name) + "\n" return final_string - elif output == 'dict': - output_dict = {} - if include_summary: - output_dict["summary"] = {str(key): value for key, value in header_summary.items()} - - for section_name, section_entries in matches.items(): - output_dict[str(section_name)] = {} - for key, (entry1, entry2) in section_entries.items(): - output_dict[str(section_name)][key] = _dict_diff_entries(entry1, entry2) - return output_dict return matches -def compare_differences(schema1, schema2, output='raw', attribute_filter=None, sections=(HedSectionKey.Tags,), - include_summary=True): - """ - Compare the tags in two schemas, this finds any differences - - Parameters: - schema1 (HedSchema): The first schema to be compared. - schema2 (HedSchema): The second schema to be compared. - output (str): 'raw' (default) returns a tuple of python object dicts with raw results. - 'string' returns a single string - 'dict' returns a json-style python dictionary that can be converted to JSON - attribute_filter (str, optional): The attribute to filter entries by. - Entries without this attribute are skipped. - The most common use would be HedKey.InLibrary - If it evaluates to False, no filtering is performed. - sections(list or None): the list of sections to compare. By default, just the tags section. - If None, checks all sections including header, prologue, and epilogue. - include_summary(bool): If True, adds the 'summary' dict to the dict return option, and prints it with the - string option. Lists the names of all the nodes that are missing or different. - - Returns: - tuple, str or dict: - - Tuple with dict entries (not_in_schema1, not_in_schema1, unequal_entries). - - Formatted string with the output ready for printing. - - A Python dictionary with the output ready to be converted to JSON (for web output). +def _pretty_print_header(summary_dict): + output_string = "" + first_entry = True + for section_key, tag_names in summary_dict.items(): + if not tag_names: + continue + type_name = SectionEntryNamesPlural[section_key] + if not first_entry: + output_string += "\n" + output_string += f"{type_name}: " - Notes: The underlying dictionaries are: - - not_in_schema1(dict): Entries present in schema2 but not in schema1. - - not_in_schema2(dict): Entries present in schema1 but not in schema2. - - unequal_entries(dict): Entries that differ between the two schemas. + output_string += ", ".join(sorted(tag_names)) - """ - _, not_in_1, not_in_2, unequal_entries = compare_schemas(schema1, schema2, attribute_filter=attribute_filter, - sections=sections) - - if sections is None: - sections = HedSectionKey - - header_summary = _get_tag_name_summary((not_in_1, not_in_2, unequal_entries)) - if output == 'string': - final_string = "" - if include_summary: - final_string += _pretty_print_header(header_summary) - if not final_string: - return final_string - final_string = ("Overall summary:\n================\n" + final_string + \ - "\n\n\nSummary details:\n================\n\n") - for section_key in sections: - val1, val2, val3 = unequal_entries[section_key], not_in_1[section_key], not_in_2[section_key] - type_name = SectionEntryNames[section_key] - if val1 or val2 or val3: - final_string += f"{type_name} differences:\n" - if val1: - final_string += _pretty_print_diff_all(val1, type_name=type_name) + "\n" - if val2: - final_string += _pretty_print_missing_all(val2, "Schema1", type_name) + "\n" - if val3: - final_string += _pretty_print_missing_all(val3, "Schema2", type_name) + "\n" - final_string += "\n\n" - return final_string - elif output == 'dict': - # todo: clean this part up - output_dict = {} - current_section = {} - if include_summary: - output_dict["summary"] = {str(key): value for key, value in header_summary.items()} - - output_dict["unequal"] = current_section - for section_name, section_entries in unequal_entries.items(): - current_section[str(section_name)] = {} - for key, (entry1, entry2) in section_entries.items(): - current_section[str(section_name)][key] = _dict_diff_entries(entry1, entry2) - - current_section = {} - output_dict["not_in_1"] = current_section - for section_name, section_entries in not_in_1.items(): - current_section[str(section_name)] = {} - for key, entry in section_entries.items(): - current_section[str(section_name)][key] = _entry_to_dict(entry) - - current_section = {} - output_dict["not_in_2"] = current_section - for section_name, section_entries in not_in_2.items(): - current_section[str(section_name)] = {} - for key, entry in section_entries.items(): - current_section[str(section_name)][key] = _entry_to_dict(entry) - return output_dict - return not_in_1, not_in_2, unequal_entries + output_string += "\n" + first_entry = False + return output_string def compare_schemas(schema1, schema2, attribute_filter=HedKey.InLibrary, sections=(HedSectionKey.Tags,)): - """ - Compare two schemas section by section. + """ Compare two schemas section by section. + The function records matching entries, entries present in one schema but not in the other, and unequal entries. Parameters: @@ -179,7 +90,7 @@ def compare_schemas(schema1, schema2, attribute_filter=HedKey.InLibrary, section Entries without this attribute are skipped. The most common use would be HedKey.InLibrary If it evaluates to False, no filtering is performed. - sections(list): the list of sections to compare. By default, just the tags section. + sections(list or None): the list of sections to compare. By default, just the tags section. If None, checks all sections including header, prologue, and epilogue. Returns: @@ -216,16 +127,16 @@ def compare_schemas(schema1, schema2, attribute_filter=HedKey.InLibrary, section section1 = schema1[section_key] section2 = schema2[section_key] - attribute = 'short_tag_name' if section_key == HedSectionKey.Tags else 'name' + name_attribute = 'short_tag_name' if section_key == HedSectionKey.Tags else 'name' # Get the name we're comparing things by for entry in section1.all_entries: if not attribute_filter or entry.has_attribute(attribute_filter): - dict1[getattr(entry, attribute)] = entry + dict1[getattr(entry, name_attribute)] = entry for entry in section2.all_entries: if not attribute_filter or entry.has_attribute(attribute_filter): - dict2[getattr(entry, attribute)] = entry + dict2[getattr(entry, name_attribute)] = entry # Find keys present in dict1 but not in dict2, and vice versa not_in_schema2[section_key] = {key: dict1[key] for key in dict1 if key not in dict2} @@ -243,185 +154,227 @@ def compare_schemas(schema1, schema2, attribute_filter=HedKey.InLibrary, section def _get_tag_name_summary(tag_dicts): + """Combines the given dicts, so the output is section_key:list of keys""" out_dict = {section_key: [] for section_key in HedSectionKey} for tag_dict in tag_dicts: for section_key, section in tag_dict.items(): - if section_key == MiscSection: - continue out_dict[section_key].extend(section.keys()) return out_dict -def _pretty_print_header(summary_dict): - - output_string = "" - first_entry = True - for section_key, tag_names in summary_dict.items(): - if not tag_names: - continue - type_name = SectionEntryNamesPlural[section_key] - if not first_entry: - output_string += "\n" - output_string += f"{type_name}: " +def _group_changes_by_section_with_unique_tags(change_dict): + """Similar to above, but on the patch note changes""" + organized_changes = defaultdict(set) + for change in change_dict: + section_key = change['section'] + tag = change['tag'] + organized_changes[section_key].add(tag) + return dict(organized_changes) - output_string += ", ".join(sorted(tag_names)) - - output_string += "\n" - first_entry = False - return output_string - - -def _pretty_print_entry(entry): - """ Returns the contents of a HedSchemaEntry object as a list of strings. - - Parameters: - entry (HedSchemaEntry): The HedSchemaEntry object to be displayed. - - Returns: - List of strings representing the entry. - """ - # Initialize the list with the name of the entry - output = [f"\tName: {entry.name}"] - # Add the description to the list if it exists - if entry.description is not None: - output.append(f"\tDescription: {entry.description}") +def _sort_changes_by_severity(changes_dict): + """Sort the changelist by severity""" + for section in changes_dict.values(): + order = {'Major': 1, 'Minor': 2, 'Patch': 3, 'Unknown': 4} + section.sort(key=lambda x: order.get(x['change_type'], order['Unknown'])) - # Iterate over all attributes and add them to the list - for attr_key, attr_value in entry.attributes.items(): - output.append(f"\tAttribute: {attr_key} - Value: {attr_value}") - return output - - -def _entry_to_dict(entry): - """ - Returns the contents of a HedSchemaEntry object as a dictionary. - - Parameters: - entry (HedSchemaEntry): The HedSchemaEntry object to be displayed. - - Returns: - Dictionary representing the entry. - """ - output = { - "Name": entry.name, - "Description": entry.description, - "Attributes": entry.attributes - } - return output - - -def _dict_diff_entries(entry1, entry2): +def gather_schema_changes(schema1, schema2, attribute_filter=None): """ - Returns the differences between two HedSchemaEntry objects as a dictionary. + Compare two schemas section by section, generated a changelog Parameters: - entry1 (HedSchemaEntry or str): The first entry. - entry2 (HedSchemaEntry or str): The second entry. + schema1 (HedSchema): The first schema to be compared. + schema2 (HedSchema): The second schema to be compared. + attribute_filter (str, optional): The attribute to filter entries by. + Entries without this attribute are skipped. + The most common use would be HedKey.InLibrary + If it evaluates to False, no filtering is performed. Returns: - Dictionary representing the differences. - """ - diff_dict = {} - - if isinstance(entry1, str): - # Handle special case ones like prologue - if entry1 != entry2: - diff_dict["value"] = { - "Schema1": entry1, - "Schema2": entry2 - } - else: - if entry1.name != entry2.name: - diff_dict["name"] = { - "Schema1": entry1.name, - "Schema2": entry2.name - } - - # Checking if both entries have the same description - if entry1.description != entry2.description: - diff_dict["description"] = { - "Schema1": entry1.description, - "Schema2": entry2.description - } - - # Comparing attributes - for attr in set(entry1.attributes.keys()).union(entry2.attributes.keys()): - if entry1.attributes.get(attr) != entry2.attributes.get(attr): - diff_dict[attr] = { - "Schema1": entry1.attributes.get(attr), - "Schema2": entry2.attributes.get(attr) - } - - return diff_dict - - -def _pretty_print_diff_entry(entry1, entry2): + changelog(dict): A dict organized by section with the changes """ - Returns the differences between two HedSchemaEntry objects as a list of strings. + _, not_in_1, not_in_2, unequal_entries = compare_schemas(schema1, schema2, attribute_filter=attribute_filter, + sections=None) + change_dict = defaultdict(list) + + # Items removed from schema + for section_key, section in not_in_2.items(): + for tag, _ in section.items(): + type_name = SectionEntryNamesPlural[section_key] + if section_key == HedSectionKey.Tags: + change_dict[section_key].append( + {'change_type': 'Major', 'change': f'Tag {tag} deleted from {type_name}', + 'tag': tag}) + else: + # Only here for completeness - these aren't in the list + change_dict[section_key].append( + {'change_type': 'Unknown', 'change': f"Item {tag} removed from {type_name}", + 'tag': tag}) + + # Items added to schema + for section_key, section in not_in_1.items(): + for tag, _ in section.items(): + type_name = SectionEntryNamesPlural[section_key] + change_dict[section_key].append({'change_type': 'Minor', 'change': f'Item {tag} added to {type_name}', + 'tag': tag}) + + # Now the much more complex comparing an individual tag changes + for section_key, changes in unequal_entries.items(): + if section_key == MiscSection: + for misc_section, (value1, value2) in changes.items(): + # todo: consider fine grained header changes + change_dict[section_key].append( + {'change_type': 'Patch', 'change': f'{misc_section} changed from "{value1}" to "{value2}"', + 'tag': misc_section}) + continue + for tag, (entry1, entry2) in changes.items(): + if section_key == HedSectionKey.UnitClasses: + for unit in entry1.units: + if unit not in entry2.units: + change_dict[section_key].append( + {'change_type': 'Major', 'change': f'Unit {unit} removed from {entry1.name}', + 'tag': tag}) + for unit in entry2.units: + if unit not in entry1.units: + change_dict[section_key].append( + {'change_type': 'Patch', 'change': f'Unit {unit} added to {entry2.name}', + 'tag': tag}) + if section_key == HedSectionKey.Tags: + for unit_class in entry1.unit_classes: + if unit_class not in entry2.unit_classes: + change_dict[section_key].append( + {'change_type': 'Major', + 'change': f'Unit class {unit_class} removed from {entry1.short_tag_name}', + 'tag': tag}) + for unit_class in entry2.unit_classes: + if unit_class not in entry1.unit_classes: + change_dict[section_key].append( + {'change_type': 'Patch', + 'change': f'Unit class {unit_class} added to {entry2.short_tag_name}', + 'tag': tag}) + + for value_class in entry1.value_classes: + if value_class not in entry2.value_classes: + change_dict[section_key].append( + {'change_type': 'Unknown', + 'change': f'Value class {value_class} removed from {entry1.short_tag_name}', + 'tag': tag}) + for value_class in entry2.value_classes: + if value_class not in entry1.value_classes: + change_dict[section_key].append( + {'change_type': 'Minor', + 'change': f'Value class {value_class} added to {entry2.short_tag_name}', + 'tag': tag}) + + if entry1.long_tag_name != entry2.long_tag_name: + change_dict[section_key].append( + {'change_type': 'Minor', 'change': f'Tag {entry1.short_tag_name} moved in schema', + 'tag': tag}) + + suggested_tag1 = sorted(entry1.inherited_attributes.get(HedKey.SuggestedTag, "").split(",")) + suggested_tag2 = sorted(entry2.inherited_attributes.get(HedKey.SuggestedTag, "").split(",")) + if suggested_tag1 != suggested_tag2: + change_dict[section_key].append( + {'change_type': 'Patch', 'change': f'Suggested tag changed on {entry1.name}', + 'tag': tag}) + + related_tag1 = sorted(entry1.inherited_attributes.get(HedKey.RelatedTag, "").split(",")) + related_tag2 = sorted(entry2.inherited_attributes.get(HedKey.RelatedTag, "").split(",")) + if related_tag1 != related_tag2: + change_dict[section_key].append( + {'change_type': 'Patch', 'change': f'Related tag changed on {entry1.name}', + 'tag': tag}) + + _check_other_attributes(entry1, entry2, tag, section_key, change_dict) + if entry1.description != entry2.description: + change_dict[section_key].append({'change_type': 'Patch', 'change': f'Description of {tag} modified', + 'tag': tag}) + + _sort_changes_by_severity(change_dict) + return change_dict + + +def pretty_print_change_dict(change_dict, title="Schema changes"): + """Formats the change_dict into a string. Parameters: - entry1 (HedSchemaEntry): The first entry. - entry2 (HedSchemaEntry): The second entry. + change_dict(dict): The result from calling gather_schema_changes + title(str): Optional header to add, a default on will be added otherwise. Returns: - List of strings representing the differences. + changelog(str): the changes listed out by section """ - diff_dict = _dict_diff_entries(entry1, entry2) - diff_lines = [] - - for key, value in diff_dict.items(): - diff_lines.append(f"\t{key}:") - for schema, val in value.items(): - diff_lines.append(f"\t\t{schema}: {val}") + final_strings = [] + if change_dict: + final_strings.append(title) + for section_key, section_dict in change_dict.items(): + name = SectionEntryNamesPlural.get(section_key, section_key) + final_strings.append(f"{name}:") + for item in section_dict: + change, tag, change_type = item['change'], item['tag'], item['change_type'] + final_strings.append(f"\t{tag} ({change_type}): {change}") + return "\n".join(final_strings) - return diff_lines - -def _pretty_print_diff_all(entries, type_name=""): - """ - Formats the differences between pairs of HedSchemaEntry objects. +def compare_differences(schema1, schema2, attribute_filter=None, title=""): + """Compare the tags in two schemas, this finds any differences Parameters: - entries (dict): A dictionary where each key maps to a pair of HedSchemaEntry objects. - type_name(str): The type to identify this as, such as Tag + schema1 (HedSchema): The first schema to be compared. + schema2 (HedSchema): The second schema to be compared. + attribute_filter (str, optional): The attribute to filter entries by. + Entries without this attribute are skipped. + The most common use would be HedKey.InLibrary + If it evaluates to False, no filtering is performed. + title(str): Optional header to add, a default on will be added otherwise. + Returns: - diff_string(str): The differences found in the dict + changelog(str): the changes listed out by section """ - output = [] - if not type_name.endswith(" "): - type_name += " " - if not entries: - return "" - for key, (entry1, entry2) in entries.items(): - output.append(f"{type_name}'{key}':") - output += _pretty_print_diff_entry(entry1, entry2) - output.append("") + changelog = gather_schema_changes(schema1, schema2, attribute_filter=attribute_filter) + if not title: + title = f"Differences between {schema1.name} and {schema2.name}" + changelog_string = pretty_print_change_dict(changelog, title=title) - return "\n".join(output) + return changelog_string -def _pretty_print_missing_all(entries, schema_name, type_name): - """ - Formats the missing entries from schema_name. - - Parameters: - entries (dict): A dictionary where each key maps to a pair of HedSchemaEntry objects. - schema_name(str): The name these entries are missing from - type_name(str): The type to identify this as, such as Tag - Returns: - diff_string(str): The differences found in the dict - """ - output = [] - if not entries: - return "" - if not type_name.endswith(" "): - type_name += " " - for key, entry in entries.items(): - output.append(f"{type_name}'{key}' not in '{schema_name}':") - output += _pretty_print_entry(entry) - output.append("") - - return "\n".join(output) +def _check_other_attributes(entry1, entry2, tag, section_key, change_dict): + """Compare non specialized attributes""" + already_checked_attributes = [HedKey.RelatedTag, HedKey.SuggestedTag, HedKey.ValueClass, HedKey.UnitClass] + unique_keys = set(entry1.attributes.keys()).union(entry2.attributes.keys()) + if section_key == HedSectionKey.Tags: + unique_inherited_keys = set(entry1.inherited_attributes.keys()).union(entry2.inherited_attributes.keys()) + else: + unique_inherited_keys = unique_keys + # Combine unique keys from both attributes and inherited attributes, then remove already checked attributes + all_unique_keys = unique_keys.union(unique_inherited_keys).difference(already_checked_attributes) + + for key in all_unique_keys: + is_inherited = key in unique_inherited_keys + is_direct = key in unique_keys + + if section_key == HedSectionKey.Tags: + value1 = entry1.inherited_attributes.get(key) + value2 = entry2.inherited_attributes.get(key) + else: + value1 = entry1.attributes.get(key) + value2 = entry2.attributes.get(key) + + if value1 != value2: + if is_inherited and not is_direct: + change_dict[section_key].append({ + "change_type": "Minor", + "change": f"Inherited attribute '{key}' modified from '{value1}' to '{value2}'", + "tag": tag, + "section": section_key + }) + else: + change_dict[section_key].append({ + "change_type": "Patch", + "change": f"Attribute '{key}' modified from '{value1}' to '{value2}'", + "tag": tag, + "section": section_key + }) diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py index 2f48775c..bc193eaf 100644 --- a/hed/schema/schema_io/base2schema.py +++ b/hed/schema/schema_io/base2schema.py @@ -114,6 +114,7 @@ def _load(self): # Copy the non-alterable cached schema self._schema = copy.deepcopy(base_version) self._schema.filename = self.filename + self._schema.name = self.name # Manually set name here as we don't want to pass it to load_schema_version self._schema.header_attributes = saved_attr self._loading_merged = False diff --git a/tests/data/schema_tests/schema_compare.mediawiki b/tests/data/schema_tests/schema_compare.mediawiki new file mode 100644 index 00000000..397f26d8 --- /dev/null +++ b/tests/data/schema_tests/schema_compare.mediawiki @@ -0,0 +1,51 @@ +HED version="1.1.0" library="compare" withStandard="8.2.0" unmerged="True" + +'''Prologue''' + +!# start schema + +'''Tag1''' +* Moving-tag +* Moving-tag-changed +* Deleting-tag +* RemovingUnitClass {takesValue} +** # {unitClass=accelerationUnits, unitClass=physicalLengthUnits} +* AddingUnitClass {takesValue} +** # {unitClass=physicalLengthUnits} + + +'''Tag2''' +* RelatedTagChanging {relatedTag=Tag1, relatedTag=Tag2} +** InheritedRelatedTagChanging +* SuggestedTagChanging {suggestedTag=Tag1, suggestedTag=Tag2} +* RemovingValueClass {takesValue} +** # {valueClass=numericClass, valueClass=nameClass} +* AddingValueClass {takesValue} +** # {valueClass=numericClass} + +'''Tag3''' +* DescriptionChanging [TagDescription] + +'''Tag4'''{extensionAllowed} +* ExtensionAllowedRemoval1 +* ExtensionAllowedRemoval2 + +!# end schema + +'''Unit classes''' +* addingUnit +** unit1 +* removingUnit +** unit3 +** unit4 +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' +'''Epilogue''' +Epilogue Removed + +!# end hed diff --git a/tests/data/schema_tests/schema_compare2.mediawiki b/tests/data/schema_tests/schema_compare2.mediawiki new file mode 100644 index 00000000..2747d72b --- /dev/null +++ b/tests/data/schema_tests/schema_compare2.mediawiki @@ -0,0 +1,62 @@ +HED version="1.2.0" library="compare" withStandard="8.2.0" unmerged="True" + +'''Prologue''' +Prologue Added + +!# start schema + +'''Tag1''' +* RemovingUnitClass {takesValue} +** # {unitClass=accelerationUnits} +* AddingUnitClass {takesValue} +** # {unitClass=accelerationUnits, unitClass=physicalLengthUnits} + +'''Tag2''' +* Moving-tag +* Moving-tag-changed {requireChild} +** AddedTag + +* RemovingValueClass {takesValue} +** # {valueClass=numericClass} +* AddingValueClass {takesValue} +** # {valueClass=numericClass, valueClass=nameClass} + +* RelatedTagChanging {relatedTag=Tag1} +** InheritedRelatedTagChanging +* SuggestedTagChanging {suggestedTag=Tag2} + +'''Tag3''' +* DescriptionChanging [TagDescriptionNew] + +'''Tag4''' +* ExtensionAllowedRemoval1 + +'''Tag5''' +* ExtensionAllowedRemoval2 + +!# end schema + +'''Unit classes''' +* addingUnit +** unit1 +** unit2 +* removingUnit +** unit4 +* NewUnitClass + +'''Unit modifiers''' +* NewUnitModifier + +'''Value classes''' +* NewValueClass + +'''Schema attributes''' +* NewAttribute + +'''Properties''' +* NewProperty + +'''Epilogue''' + + +!# end hed diff --git a/tests/models/test_basic_search.py b/tests/models/test_basic_search.py index 519c9bae..0124a0f5 100644 --- a/tests/models/test_basic_search.py +++ b/tests/models/test_basic_search.py @@ -8,7 +8,7 @@ from hed.models.basic_search import find_words, check_parentheses, reverse_and_flip_parentheses, \ construct_delimiter_map, verify_search_delimiters, find_matching import numpy as np - +from hed.models.df_util import convert_to_form class TestNewSearch(unittest.TestCase): @classmethod @@ -26,11 +26,34 @@ def test_find_matching_results(self): result1 = basic_search.find_matching(self.df, "(Face, Item-interval/1)") result2 = basic_search.find_matching(self.df, "(Face, Item-interval/1*)") - # Add assertions self.assertTrue(np.sum(result1) > 0, "result1 should have some true values") self.assertTrue(np.sum(result2) > 0, "result2 should have some true values") self.assertTrue(np.sum(result1) < np.sum(result2), "result1 should have fewer true values than result2") + # Verify we get the same results in both tag forms + df_copy = self.df.copy() + convert_to_form(df_copy, self.schema, "long_tag") + + result1b = basic_search.find_matching(self.df, "(Face, Item-interval/1)") + result2b = basic_search.find_matching(self.df, "(Face, Item-interval/1*)") + + self.assertTrue(np.sum(result1b) > 0, "result1 should have some true values") + self.assertTrue(np.sum(result2b) > 0, "result2 should have some true values") + self.assertTrue(np.sum(result1b) < np.sum(result2b), "result1 should have fewer true values than result2") + self.assertTrue(result1.equals(result1b)) + self.assertTrue(result2.equals(result2b)) + + convert_to_form(df_copy, self.schema, "short_tag") + + result1b = basic_search.find_matching(self.df, "(Face, Item-interval/1)") + result2b = basic_search.find_matching(self.df, "(Face, Item-interval/1*)") + + self.assertTrue(np.sum(result1b) > 0, "result1 should have some true values") + self.assertTrue(np.sum(result2b) > 0, "result2 should have some true values") + self.assertTrue(np.sum(result1b) < np.sum(result2b), "result1 should have fewer true values than result2") + self.assertTrue(result1.equals(result1b)) + self.assertTrue(result2.equals(result2b)) + class TestFindWords(unittest.TestCase): def test_basic(self): diff --git a/tests/schema/test_schema_compare.py b/tests/schema/test_schema_compare.py index f6b1ceed..31302c66 100644 --- a/tests/schema/test_schema_compare.py +++ b/tests/schema/test_schema_compare.py @@ -1,21 +1,27 @@ import unittest import json +import copy + from hed.schema import HedKey, HedSectionKey -from hed.schema.schema_compare import compare_schemas, find_matching_tags, \ - _pretty_print_diff_all, _pretty_print_missing_all, compare_differences -from hed import load_schema_version +from hed.schema.schema_compare import compare_schemas +from hed.schema.schema_compare import gather_schema_changes, find_matching_tags, pretty_print_change_dict, compare_differences +from hed import load_schema_version, load_schema from . import util_create_schemas - +import os class TestSchemaComparison(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.base_data = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/schema_tests/') + def test_find_matching_tags(self): # create entries for schema1 schema1 = util_create_schemas.load_schema1() schema2 = util_create_schemas.load_schema2() - result = find_matching_tags(schema1, schema2) + result = find_matching_tags(schema1, schema2, return_string=False) # Check if the result is correct self.assertEqual(len(result[HedSectionKey.Tags]), 3) self.assertIn("TestNode", result[HedSectionKey.Tags]) @@ -24,27 +30,9 @@ def test_find_matching_tags(self): self.assertNotIn("TestNode4", result[HedSectionKey.Tags]) self.assertNotIn("TestNode5", result[HedSectionKey.Tags]) - # Test with include_summary=True - match_string = find_matching_tags(schema1, schema2, output='string', include_summary=True) + match_string = find_matching_tags(schema1, schema2) self.assertIsInstance(match_string, str) self.assertIn("Tags:", match_string) - # print(match_string) - - json_style_dict = find_matching_tags(schema1, schema2, output='dict', include_summary=True) - self.assertIsInstance(json_style_dict, dict) - self.assertIn("summary", json_style_dict) - - result_string = json.dumps(json_style_dict, indent=4) - self.assertIsInstance(result_string, str) - - # Optionally, you can also test the case without include_summary - match_string_no_summary = find_matching_tags(schema1, schema2, output='string', include_summary=False) - self.assertIsInstance(match_string_no_summary, str) - self.assertNotIn("Tags:", match_string_no_summary) - - json_style_dict_no_summary = find_matching_tags(schema1, schema2, output='dict', include_summary=False) - self.assertIsInstance(json_style_dict_no_summary, dict) - self.assertNotIn("summary", json_style_dict_no_summary) def test_compare_schemas(self): schema1 = util_create_schemas.load_schema1() @@ -67,11 +55,22 @@ def test_compare_schemas(self): self.assertEqual(len(unequal_entries[HedSectionKey.Tags]), 1) # No unequal entries should be found self.assertIn("TestNode3", unequal_entries[HedSectionKey.Tags]) + def test_compare_and_summarize_schemas_test(self): + schema1 = load_schema(os.path.join(self.base_data, "schema_compare.mediawiki"), name="Schema1") + schema2 = load_schema(os.path.join(self.base_data, "schema_compare2.mediawiki"), name="Schema2") + + result = gather_schema_changes(schema1, schema2) + self.assertEqual(sum(len(x) for x in result.values()), 30) + schema_string = pretty_print_change_dict(result, title=f"Differences between {schema1.name} and {schema2.name}") + # this test may need updating if the text format changes + found_issues = schema_string.count("):") + self.assertEqual(found_issues, 30) + def test_compare_differences(self): schema1 = util_create_schemas.load_schema1() schema2 = util_create_schemas.load_schema2() - not_in_schema1, not_in_schema2, unequal_entries = compare_differences(schema1, schema2) + _, not_in_schema1, not_in_schema2, unequal_entries = compare_schemas(schema1, schema2) self.assertEqual(len(not_in_schema2[HedSectionKey.Tags]), 1) # One tag not in schema2 self.assertIn("TestNode4", not_in_schema2[HedSectionKey.Tags]) # "TestNode4" is not in schema2 @@ -82,49 +81,32 @@ def test_compare_differences(self): self.assertEqual(len(unequal_entries[HedSectionKey.Tags]), 1) # No unequal entries should be found self.assertIn("TestNode3", unequal_entries[HedSectionKey.Tags]) - # Test with include_summary=True, string output - diff_string_with_summary = compare_differences(schema1, schema2, output='string', include_summary=True) + diff_string_with_summary = compare_differences(schema1, schema2) self.assertIsInstance(diff_string_with_summary, str) self.assertIn("Tags:", diff_string_with_summary) - # print(diff_string_with_summary) - - # Test with include_summary=True, dict output - diff_dict_with_summary = compare_differences(schema1, schema2, output='dict', include_summary=True) - self.assertIsInstance(diff_dict_with_summary, dict) - self.assertIn("summary", diff_dict_with_summary) - - # Optionally, test without include_summary, string output - diff_string_no_summary = compare_differences(schema1, schema2, output='string', include_summary=False) - self.assertIsInstance(diff_string_no_summary, str) - self.assertNotIn("Tags:", diff_string_no_summary) - - # Optionally, test without include_summary, dict output - diff_dict_no_summary = compare_differences(schema1, schema2, output='dict', include_summary=False) - self.assertIsInstance(diff_dict_no_summary, dict) - self.assertNotIn("summary", diff_dict_no_summary) def test_compare_score_lib_versions(self): schema1 = load_schema_version("score_1.0.0") schema2 = load_schema_version("score_1.1.0") - not_in_schema1, not_in_schema2, unequal_entries = compare_differences(schema1, schema2, - attribute_filter=HedKey.InLibrary) - - + _, not_in_schema1, not_in_schema2, unequal_entries = compare_schemas(schema1, schema2, + attribute_filter=HedKey.InLibrary) + self.assertEqual(len(not_in_schema1[HedSectionKey.Tags]), 21) self.assertEqual(len(not_in_schema2[HedSectionKey.Tags]), 10) - self.assertEqual(len(unequal_entries[HedSectionKey.Tags]), 61) - - diff_string = compare_differences(schema1, schema1, attribute_filter=HedKey.InLibrary, output='string', - sections=None) + self.assertEqual(len(unequal_entries[HedSectionKey.Tags]), 80) + + diff_string = compare_differences(schema1, schema2, attribute_filter=HedKey.InLibrary) + # Do a half-hearted check that all the above showed up in the output + self.assertTrue(diff_string) + for item in not_in_schema1[HedSectionKey.Tags].keys(): + self.assertIn(item, diff_string) + for item in not_in_schema2[HedSectionKey.Tags].keys(): + self.assertIn(item, diff_string) + for item in unequal_entries[HedSectionKey.Tags].keys(): + self.assertIn(item, diff_string) + + def test_compare_identical_schemas(self): + schema1 = load_schema_version("score_1.0.0") + schema2 = copy.deepcopy(schema1) + diff_string = compare_differences(schema1, schema2, attribute_filter=HedKey.InLibrary) self.assertFalse(diff_string) - diff_string = compare_differences(schema1, schema2, attribute_filter=HedKey.InLibrary, output='string', - sections=None) - - self.assertIsInstance(diff_string, str) - - json_style_dict = compare_differences(schema1, schema2, attribute_filter=HedKey.InLibrary, output='dict', - sections=None) - self.assertIsInstance(json_style_dict, dict) - - result_string = json.dumps(json_style_dict, indent=4) - self.assertIsInstance(result_string, str)