diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py index 45302f89..000e910d 100644 --- a/hed/errors/exceptions.py +++ b/hed/errors/exceptions.py @@ -49,6 +49,8 @@ class HedExceptions: CANNOT_PARSE_RDF = "CANNOT_PARSE_RDF" SCHEMA_LOAD_FAILED = "SCHEMA_LOAD_FAILED" + SCHEMA_TAG_TSV_BAD_PARENT = "SCHEMA_TAG_TSV_BAD_PARENT" + class HedFileError(Exception): """Exception raised when a file cannot be parsed due to being malformed, file IO, etc.""" diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index 73600457..99354386 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -65,6 +65,7 @@ def load_schema(hed_path, schema_namespace=None, schema=None, name=None): hed_path (str): A filepath or url to open a schema from. If loading a TSV file, this should be a single filename where: Template: basename.tsv, where files are named basename_Struct.tsv, basename_Tag.tsv, etc. + Alternatively, you can point to a directory containing the .tsv files. schema_namespace (str or None): The name_prefix all tags in this schema will accept. schema(HedSchema or None): A hed schema to merge this new file into It must be a with-standard schema with the same value. 
diff --git a/hed/schema/schema_io/df2schema.py b/hed/schema/schema_io/df2schema.py index 5b26c19f..e49bd62e 100644 --- a/hed/schema/schema_io/df2schema.py +++ b/hed/schema/schema_io/df2schema.py @@ -7,14 +7,14 @@ from hed.schema.schema_io import ontology_util from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, HedExceptions -from hed.schema.schema_io.text2schema import SchemaLoaderText +from hed.schema.schema_io.base2schema import SchemaLoader import pandas as pd import hed.schema.hed_schema_df_constants as constants from hed.errors import error_reporter from hed.schema.schema_io import text_util -class SchemaLoaderDF(SchemaLoaderText): +class SchemaLoaderDF(SchemaLoader): """ Load dataframe schemas from filenames Expected usage is SchemaLoaderDF.load(filenames) @@ -139,17 +139,82 @@ def _read_schema(self, dataframe): """ self._schema._initialize_attributes(HedSectionKey.Tags) known_parent_tags = {"HedTag": []} - level_adj = 0 - for row_number, row in dataframe[constants.TAG_KEY].iterrows(): - # skip blank rows, though there shouldn't be any - if not any(row): - continue - parent_tag = row[constants.subclass_of] - org_parent_tags = known_parent_tags.get(parent_tag, []).copy() + iterations = 0 + # Handle this over multiple iterations in case tags have parent tags listed later in the file. + # A properly formatted .tsv file will never have parents after the child. 
+ current_rows = list(dataframe[constants.TAG_KEY].iterrows()) + while current_rows: + iterations += 1 + next_round_rows = [] + for row_number, row in current_rows: + # skip blank rows, though there shouldn't be any + if not any(row): + continue + + parent_tag = row[constants.subclass_of] + org_parent_tags = known_parent_tags.get(parent_tag) + tag_entry = self._create_tag_entry(org_parent_tags, row_number, row) + if not tag_entry: + # This will have already raised an error + continue + + # If this is NOT a rooted tag and we have no parent, try it in another round. + if org_parent_tags is None and not tag_entry.has_attribute(HedKey.Rooted): + next_round_rows.append((row_number, row)) + continue + + tag_entry = self._add_tag_entry(tag_entry, row_number, row) + if tag_entry: + known_parent_tags[tag_entry.short_tag_name] = tag_entry.name.split("/") + + if len(next_round_rows) == len(current_rows): + for row_number, row in current_rows: + tag_name = self._get_tag_name(row) + msg = (f"Cannot resolve parent tag. " + f"There is probably an issue with circular parent tags of {tag_name} on row {row_number}.") + self._add_fatal_error(row_number, row, msg, HedExceptions.SCHEMA_TAG_TSV_BAD_PARENT) + break + current_rows = next_round_rows + + def _add_tag_entry(self, tag_entry, row_number, row): + try: + rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) + if rooted_entry: + parent_tags = rooted_entry.long_tag_name.split("/") + # Create the entry again for rooted tags, to get the full name. 
+ tag_entry = self._create_tag_entry(parent_tags, row_number, row) + except HedFileError as e: + self._add_fatal_error(row_number, row, e.message, e.code) + return None + + tag_entry = self._add_to_dict(row_number, row, tag_entry, HedSectionKey.Tags) + + return tag_entry + + def _create_tag_entry(self, parent_tags, row_number, row): + """ Create a tag entry(does not add to dict) - tag_entry, parent_tags, _ = self._add_tag_meta(org_parent_tags, row_number, row, level_adj) - if tag_entry: - known_parent_tags[tag_entry.short_tag_name] = parent_tags.copy() + Parameters: + parent_tags (list): A list of parent tags in order. + row_number (int): The row number to report errors as + row (str or pd.Series): A tag row or pandas series(depends on format) + + Returns: + HedSchemaEntry or None: The created entry, or None if the row has no tag name (a fatal error is reported). + + Notes: + Includes attributes and description. + """ + tag_name = self._get_tag_name(row) + if tag_name: + if parent_tags: + long_tag_name = "/".join(parent_tags) + "/" + tag_name + else: + long_tag_name = tag_name + return self._create_entry(row_number, row, HedSectionKey.Tags, long_tag_name) + + self._add_fatal_error(row_number, row, "No tag name found in row.", + error_code=HedExceptions.GENERIC_ERROR) def _read_section(self, df, section_key): self._schema._initialize_attributes(section_key) @@ -185,11 +250,11 @@ def _read_attribute_section(self, df, annotation_property=False, section_key=Hed def _get_tag_name(self, row): base_tag_name = row[constants.name] if base_tag_name.endswith("-#"): - return "#", 0 - return base_tag_name, 0 + return "#" + return base_tag_name def _create_entry(self, row_number, row, key_class, full_tag_name=None): - element_name, _ = self._get_tag_name(row) + element_name = self._get_tag_name(row) if full_tag_name: element_name = full_tag_name @@ -224,6 +289,14 @@ def _get_tag_attributes(self, row_number, row): except ValueError as e: self._add_fatal_error(row_number, str(row), str(e)) + def _add_to_dict(self, row_number, row, entry, 
key_class): + if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema: + self._add_fatal_error(row_number, row, + "Library tag in unmerged schema has InLibrary attribute", + HedExceptions.IN_LIBRARY_IN_UNMERGED) + + return self._add_to_dict_base(entry, key_class) + def load_dataframes(filenames): dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames) diff --git a/hed/schema/schema_io/text2schema.py b/hed/schema/schema_io/text2schema.py deleted file mode 100644 index 7db35ec7..00000000 --- a/hed/schema/schema_io/text2schema.py +++ /dev/null @@ -1,107 +0,0 @@ -""" -Create a HedSchema object from a .mediawiki file. -""" - -from abc import abstractmethod -from hed.schema.hed_schema_constants import HedSectionKey, HedKey -from hed.errors.exceptions import HedFileError, HedExceptions -from hed.schema.schema_io.base2schema import SchemaLoader - - -class SchemaLoaderText(SchemaLoader): - """ Intermediate class to handle text based formats(tsv, wiki) - - Cannot be used directly - """ - def __init__(self, filename, schema_as_string=None, schema=None, file_format=None, name=""): - super().__init__(filename, schema_as_string, schema, file_format, name) - self._no_name_msg = f"No tag name found in row." - self._no_name_error = HedExceptions.GENERIC_ERROR - - def _add_tag_meta(self, parent_tags, row_number, row, level_adj): - tag_entry = self._add_tag_line(parent_tags, row_number, row) - if not tag_entry: - # This will have already raised an error - return None, parent_tags, level_adj - - try: - rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) - if rooted_entry: - parent_tags = rooted_entry.long_tag_name.split("/") - level_adj = len(parent_tags) - # Create the entry again for rooted tags, to get the full name. 
- tag_entry = self._add_tag_line(parent_tags, row_number, row) - except HedFileError as e: - self._add_fatal_error(row_number, row, e.message, e.code) - return None, parent_tags, level_adj - - tag_entry = self._add_to_dict(row_number, row, tag_entry, HedSectionKey.Tags) - - if tag_entry.name.endswith("/#"): - parent_tags.append("#") - else: - parent_tags.append(tag_entry.short_tag_name) - - return tag_entry, parent_tags, level_adj - - def _add_tag_line(self, parent_tags, row_number, row): - """ Add a tag to the dictionaries. - - Parameters: - parent_tags (list): A list of parent tags in order. - row_number (int): The row number to report errors as - row (str or pd.Series): A tag row or pandas series(depends on format) - - Returns: - HedSchemaEntry: The entry for the added tag. - - Notes: - Includes attributes and description. - """ - tag_name, _ = self._get_tag_name(row) - if tag_name: - if parent_tags: - long_tag_name = "/".join(parent_tags) + "/" + tag_name - else: - long_tag_name = tag_name - return self._create_entry(row_number, row, HedSectionKey.Tags, long_tag_name) - - self._add_fatal_error(row_number, row, self._no_name_msg, error_code=self._no_name_error) - - def _add_to_dict(self, row_number, row, entry, key_class): - if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema: - self._add_fatal_error(row_number, row, - "Library tag in unmerged schema has InLibrary attribute", - HedExceptions.IN_LIBRARY_IN_UNMERGED) - - return self._add_to_dict_base(entry, key_class) - - @abstractmethod - def _create_entry(self, row_number, row, key_class, full_tag_name=None): - """ Create a tag entry from the given row - - Parameters: - row_number (int): The row number to report errors as - row (str or pd.Series): A tag row or pandas series(depends on format) - key_class(HedSectionKey): The HedSectionKey for this object - full_tag_name (str): The full long form tag name, overrides value found in row. 
- - Returns: - HedSchemaEntry or None: The entry for the added tag. - """ - raise NotImplementedError("Required in subclass") - - @abstractmethod - def _get_tag_name(self, row): - """ Returns the tag name for the given row - - Parameters: - row (str or pd.Series): A tag row or pandas series(depends on format) - - Returns: - entry_name(str): The tag name for the given row - - Notes: - Should be set to add a fatal error if no name returned - """ - raise NotImplementedError("Required in subclass") diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py index 7cb71b34..4d937434 100644 --- a/hed/schema/schema_io/wiki2schema.py +++ b/hed/schema/schema_io/wiki2schema.py @@ -3,11 +3,11 @@ """ import re -from hed.schema.hed_schema_constants import HedSectionKey +from hed.schema.hed_schema_constants import HedSectionKey, HedKey from hed.errors.exceptions import HedFileError, HedExceptions from hed.errors import error_reporter from hed.schema.schema_io import wiki_constants -from hed.schema.schema_io.text2schema import SchemaLoaderText +from hed.schema.schema_io.base2schema import SchemaLoader from hed.schema.schema_io.wiki_constants import HedWikiSection, SectionStarts, SectionNames from hed.schema.schema_io import text_util @@ -34,7 +34,7 @@ ] -class SchemaLoaderWiki(SchemaLoaderText): +class SchemaLoaderWiki(SchemaLoader): """ Load MediaWiki schemas from filenames or strings. 
Expected usage is SchemaLoaderWiki.load(filename) @@ -45,8 +45,6 @@ class SchemaLoaderWiki(SchemaLoaderText): def __init__(self, filename, schema_as_string=None, schema=None, file_format=None, name=""): super().__init__(filename, schema_as_string, schema, file_format, name) self._schema.source_format = ".mediawiki" - self._no_name_msg = "Schema term is empty or the line is malformed", - self._no_name_error = HedExceptions.WIKI_DELIMITERS_INVALID def _open_file(self): if self.filename: @@ -151,22 +149,29 @@ def _read_schema(self, lines): self._schema._initialize_attributes(HedSectionKey.Tags) parent_tags = [] level_adj = 0 - for line_number, line in lines: - if line.startswith(wiki_constants.ROOT_TAG): + for row_number, row in lines: + if row.startswith(wiki_constants.ROOT_TAG): parent_tags = [] level_adj = 0 else: - level = self._get_tag_level(line) + level_adj + level = self._get_tag_level(row) + level_adj if level < len(parent_tags): parent_tags = parent_tags[:level] elif level > len(parent_tags): - self._add_fatal_error(line_number, line, + self._add_fatal_error(row_number, row, "Line has too many *'s at front. You cannot skip a level.", HedExceptions.WIKI_LINE_START_INVALID) continue # Create the entry - tag_entry, parent_tags, level_adj = self._add_tag_meta(parent_tags, line_number, line, level_adj) + tag_entry = self._create_tag_entry(parent_tags, row_number, row) + if not tag_entry: + # This will have already raised an error + continue + + tag_entry, level_adj = self._add_tag_entry(tag_entry, row_number, row, level_adj) + if tag_entry: + parent_tags = tag_entry.name.split("/") def _read_unit_classes(self, lines): """Add the unit classes section. 
@@ -468,3 +473,52 @@ def _split_lines_into_sections(self, wiki_lines): strings_for_section[current_section].append((line_number + 1, line)) return strings_for_section + + def _add_tag_entry(self, tag_entry, row_number, row, level_adj): + try: + rooted_entry = self.find_rooted_entry(tag_entry, self._schema, self._loading_merged) + if rooted_entry: + parent_tags = rooted_entry.long_tag_name.split("/") + level_adj = len(parent_tags) + # Create the entry again for rooted tags, to get the full name. + tag_entry = self._create_tag_entry(parent_tags, row_number, row) + except HedFileError as e: + self._add_fatal_error(row_number, row, e.message, e.code) + return None, level_adj + + tag_entry = self._add_to_dict(row_number, row, tag_entry, HedSectionKey.Tags) + + return tag_entry, level_adj + + def _create_tag_entry(self, parent_tags, row_number, row): + """ Create a tag entry(does not add to schema) + + Parameters: + parent_tags (list): A list of parent tags in order. + row_number (int): The row number to report errors as + row (str or pd.Series): A tag row or pandas series(depends on format) + + Returns: + HedSchemaEntry: The entry for the added tag. + + Notes: + Includes attributes and description. 
+ """ + tag_name, _ = self._get_tag_name(row) + if tag_name: + if parent_tags: + long_tag_name = "/".join(parent_tags) + "/" + tag_name + else: + long_tag_name = tag_name + return self._create_entry(row_number, row, HedSectionKey.Tags, long_tag_name) + + self._add_fatal_error(row_number, row, "Schema term is empty or the line is malformed" + , error_code=HedExceptions.WIKI_DELIMITERS_INVALID) + + def _add_to_dict(self, row_number, row, entry, key_class): + if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema: + self._add_fatal_error(row_number, row, + "Library tag in unmerged schema has InLibrary attribute", + HedExceptions.IN_LIBRARY_IN_UNMERGED) + + return self._add_to_dict_base(entry, key_class) diff --git a/tests/schema/test_hed_schema_io_df.py b/tests/schema/test_hed_schema_io_df.py index d8d37059..a73dafc0 100644 --- a/tests/schema/test_hed_schema_io_df.py +++ b/tests/schema/test_hed_schema_io_df.py @@ -1,10 +1,12 @@ import unittest import shutil - -from hed.schema.hed_schema_io import load_schema, load_schema_version, from_dataframes - import os +import pandas as pd +from hed.errors import HedExceptions, HedFileError +from hed.schema.hed_schema_io import load_schema, load_schema_version, from_dataframes from hed.schema.schema_io.df2schema import SchemaLoaderDF +from hed.schema import hed_schema_df_constants as df_constants +from hed.schema.schema_io.ontology_util import create_empty_dataframes class TestHedSchemaDF(unittest.TestCase): @@ -85,3 +87,89 @@ def test_save_load_location2(self): reloaded_schema = load_schema(output_location) self.assertEqual(schema, reloaded_schema) + + def _create_structure_df(self): + data = {"hedId": ["HED_0060010"], + "rdfs:label": ["LangHeader"], + "Attributes": ['version="1.0.0", library="lang", withStandard="8.3.0", unmerged="True"'], + "omn:SubClassOf": ["HedHeader"], + "dc:description": [""], + "omn:EquivalentTo": ['HedHeader and (inHedSchema some LangSchema) and (version 
value "1.0.0") and (library value "lang") and (withStandard value "8.3.0") and (unmerged value "True")']} + + df = pd.DataFrame(data) + return df + + def _add_tag_row(self, tag_df, name, parent): + new_row = {col_name: "" for col_name in tag_df.columns} + new_row[df_constants.name] = name + new_row[df_constants.subclass_of] = parent + return pd.concat([tag_df, pd.DataFrame([new_row])], ignore_index=True) + + def test_loading_out_of_order(self): + # Verify loading a .tsv file that defines a child before its parent works + dataframes = create_empty_dataframes() + struct_df = self._create_structure_df() + tag_df = pd.DataFrame([], columns=df_constants.tag_columns, dtype=str) + + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagNameParent", "HedTag") + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagNameChild", "MadeUpLongTagNameParent") + + dataframes[df_constants.STRUCT_KEY] = struct_df + dataframes[df_constants.TAG_KEY] = tag_df + + loaded_schema = from_dataframes(dataframes) + issues = loaded_schema.check_compliance(check_for_warnings=False) + self.assertEqual(len(issues), 0) + breakHere = 3 + + self.assertEqual(loaded_schema.tags['MadeUpLongTagNameChild'].name, + "MadeUpLongTagNameParent/MadeUpLongTagNameChild") + self.assertEqual(loaded_schema.tags['MadeUpLongTagNameParent'].name, + "MadeUpLongTagNameParent") + + tag_df = pd.DataFrame([], columns=df_constants.tag_columns, dtype=str) + + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagNameChild", "MadeUpLongTagNameParent") + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagNameParent", "HedTag") + + dataframes[df_constants.TAG_KEY] = tag_df + + loaded_out_of_order = from_dataframes(dataframes) + issues = loaded_out_of_order.check_compliance(check_for_warnings=False) + self.assertEqual(len(issues), 0) + self.assertEqual(loaded_out_of_order.tags['MadeUpLongTagNameChild'].name, + "MadeUpLongTagNameParent/MadeUpLongTagNameChild") + self.assertEqual(loaded_out_of_order.tags['MadeUpLongTagNameParent'].name, + 
"MadeUpLongTagNameParent") + self.assertEqual(loaded_schema, loaded_out_of_order) + + def test_loading_circular(self): + # Verify a circular reference properly reports an error + dataframes = create_empty_dataframes() + struct_df = self._create_structure_df() + tag_df = pd.DataFrame([], columns=df_constants.tag_columns, dtype=str) + + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagNameParent", "MadeUpLongTagNameChild") + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagNameChild", "MadeUpLongTagNameParent") + + dataframes[df_constants.STRUCT_KEY] = struct_df + dataframes[df_constants.TAG_KEY] = tag_df + + with self.assertRaises(HedFileError) as error: + _ = from_dataframes(dataframes) + self.assertEqual(error.exception.args[0], HedExceptions.SCHEMA_TAG_TSV_BAD_PARENT) + + dataframes = create_empty_dataframes() + struct_df = self._create_structure_df() + tag_df = pd.DataFrame([], columns=df_constants.tag_columns, dtype=str) + + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagName1", "MadeUpLongTagName2") + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagName2", "MadeUpLongTagName3") + tag_df = self._add_tag_row(tag_df, "MadeUpLongTagName3", "MadeUpLongTagName1") + + dataframes[df_constants.STRUCT_KEY] = struct_df + dataframes[df_constants.TAG_KEY] = tag_df + + with self.assertRaises(HedFileError) as error: + _ = from_dataframes(dataframes) + self.assertEqual(error.exception.args[0], HedExceptions.SCHEMA_TAG_TSV_BAD_PARENT) \ No newline at end of file