diff --git a/docs/requirements.txt b/docs/requirements.txt index 1e05aebd..94d6f9e9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -9,3 +9,4 @@ myst-parser>=1.0.0 Sphinx>=5.2.2 sphinx_rtd_theme>=1.0.0 wordcloud==1.9.3 +rdflib>=6 diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index 67922025..657aefbb 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -25,6 +25,12 @@ def val_error_empty_group(tag): return f"HED tags cannot be empty. Extra delimiters found: '{tag}'" +@hed_tag_error(OnsetErrors.HED_ONSET_WITH_NO_COLUMN, actual_code=ValidationErrors.ONSET_OFFSET_INSET_ERROR) +def val_error_hed_onset_with_no_column(tag): + return f"Cannot have Temporal tags without an 'Onset' column. Found tag: '{tag}'" + + + @hed_tag_error(ValidationErrors.TAG_EXTENDED, has_sub_tag=True, default_severity=ErrorSeverity.WARNING) def val_error_tag_extended(tag, problem_tag): return f"Hed tag is extended. '{problem_tag}' in {tag}" diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index 7f9a6443..5dc32737 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -166,7 +166,7 @@ class OnsetErrors: ONSET_TAG_OUTSIDE_OF_GROUP = "ONSET_TAG_OUTSIDE_OF_GROUP" INSET_BEFORE_ONSET = "INSET_BEFORE_ONSET" ONSET_SAME_DEFS_ONE_ROW = "ONSET_SAME_DEFS_ONE_ROW" - + HED_ONSET_WITH_NO_COLUMN = 'HED_ONSET_WITH_NO_COLUMN' class ColumnErrors: INVALID_COLUMN_REF = "INVALID_COLUMN_REF" diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py index 7cce60d6..aff5420c 100644 --- a/hed/errors/exceptions.py +++ b/hed/errors/exceptions.py @@ -45,6 +45,8 @@ class HedExceptions: SCHEMA_DUPLICATE_NAMES = "SCHEMA_DUPLICATE_NAMES" + CANNOT_PARSE_RDF = "CANNOT_PARSE_RDF" + class HedFileError(Exception): """Exception raised when a file cannot be parsed due to being malformed, file IO, etc.""" diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index a58cc9c4..d32413aa 100644 --- 
a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -1,10 +1,13 @@ import json +import os from hed.schema.hed_schema_constants import HedKey, HedSectionKey from hed.schema import hed_schema_constants as constants from hed.schema.schema_io import schema_util from hed.schema.schema_io.schema2xml import Schema2XML from hed.schema.schema_io.schema2wiki import Schema2Wiki +from hed.schema.schema_io.schema2owl import Schema2Owl +from hed.schema.schema_io.owl_constants import ext_to_format from hed.schema.hed_schema_section import HedSchemaSection, HedSchemaTagSection, HedSchemaUnitClassSection from hed.errors import ErrorHandler from hed.errors.error_types import ValidationErrors @@ -208,6 +211,11 @@ def valid_prefixes(self): # =============================================== # Creation and saving functions # =============================================== + + # todo: we may want to collapse these 6 functions into one like this + # def serialize(self, filename=None, save_merged=False, file_format=whatever is default): + # pass + def get_as_mediawiki_string(self, save_merged=False): """ Return the schema to a mediawiki string. Parameters: save_merged (bool): If true, this will save the schema as a merged schema if it is a "withStandard" schema. If it is not a "withStandard" schema, this setting has no effect. Returns: str: The schema as a string in mediawiki format. """ output_strings = Schema2Wiki.process_schema(self, save_merged) return '\n'.join(output_strings) + def get_as_owl_string(self, save_merged=False, file_format="owl"): + """ Return the schema as an OWL string. + + Parameters: + save_merged (bool): If true, this will save the schema as a merged schema if it is a "withStandard" schema. + If it is not a "withStandard" schema, this setting has no effect. + file_format(str or None): Override format from filename extension. + Accepts any value rdflib accepts(We fully support "turtle", "xml"("owl" also accepted) and "json-ld") + Other values should work, but aren't as fully supported. + Returns: + str: The schema as a string in the requested RDF format. + + :raises rdflib.plugin.PluginException: + - Invalid format of file_format. 
Make sure you use a supported RDF format. + """ + if file_format == "owl": + file_format = "xml" + rdf_data = Schema2Owl.process_schema(self, save_merged) + return rdf_data.serialize(format=file_format) + def get_as_xml_string(self, save_merged=True): """ Return the schema to an XML string. @@ -234,39 +262,69 @@ """ xml_tree = Schema2XML.process_schema(self, save_merged) - return schema_util._xml_element_2_str(xml_tree) + return schema_util.xml_element_2_str(xml_tree) - def save_as_mediawiki(self, filename=None, save_merged=False): + def save_as_mediawiki(self, filename, save_merged=False): """ Save as mediawiki to a file. filename: str - If present, move the resulting file to this location. + save location save_merged: bool If true, this will save the schema as a merged schema if it is a "withStandard" schema. If it is not a "withStandard" schema, this setting has no effect. - Returns: - str: The newly created schema filename. + :raises OSError: + - File cannot be saved for some reason """ output_strings = Schema2Wiki.process_schema(self, save_merged) - local_wiki_file = schema_util.write_strings_to_file(output_strings, ".mediawiki") - return schema_util.move_file(local_wiki_file, filename) + with open(filename, mode='w', encoding='utf-8') as opened_file: + for string in output_strings: + opened_file.write(string) + opened_file.write('\n') - def save_as_xml(self, filename=None, save_merged=True): + def save_as_owl(self, filename, save_merged=False, file_format=None): + """ Save as owl to a file. + + filename: str + Save the file here + save_merged: bool + If true, this will save the schema as a merged schema if it is a "withStandard" schema. + If it is not a "withStandard" schema, this setting has no effect. 
+ file_format(str or None): Required for owl formatted files other than the following: + .ttl: turtle + .owl: xml + .json-ld: json-ld + + :raises OSError: + - File cannot be saved for some reason + + :raises rdflib.plugin.PluginException: + - Invalid format of file_format. Make sure you use a supported RDF format. + """ + ext = os.path.splitext(filename.lower())[1] + if ext in ext_to_format and file_format is None: + file_format = ext_to_format[ext] + if file_format == "owl": + file_format = "xml" + rdf_data = Schema2Owl.process_schema(self, save_merged) + rdf_data.serialize(filename, format=file_format) + + def save_as_xml(self, filename, save_merged=True): """ Save as XML to a file. filename: str - If present, move the resulting file to this location. + save location save_merged: bool If true, this will save the schema as a merged schema if it is a "withStandard" schema. If it is not a "withStandard" schema, this setting has no effect. - Returns: - str: The name of the newly created schema file. + :raises OSError: + - File cannot be saved for some reason """ xml_tree = Schema2XML.process_schema(self, save_merged) - local_xml_file = schema_util.write_xml_tree_2_xml_file(xml_tree, ".xml") - return schema_util.move_file(local_xml_file, filename) + with open(filename, mode='w', encoding='utf-8') as opened_file: + xml_string = schema_util.xml_element_2_str(xml_tree) + opened_file.write(xml_string) def set_schema_prefix(self, schema_namespace): """ Set library namespace associated for this schema. 
diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py index 60a1a934..d53b8c59 100644 --- a/hed/schema/hed_schema_constants.py +++ b/hed/schema/hed_schema_constants.py @@ -43,15 +43,7 @@ class HedKey: Rooted = "rooted" DeprecatedFrom = "deprecatedFrom" ConversionFactor = "conversionFactor" - - # All known properties - BoolProperty = 'boolProperty' - UnitClassProperty = 'unitClassProperty' - UnitProperty = 'unitProperty' - UnitModifierProperty = 'unitModifierProperty' - ValueClassProperty = 'valueClassProperty' - ElementProperty = 'elementProperty' - IsInheritedProperty = 'isInheritedProperty' + Reserved = "reserved" SIUnit = 'SIUnit' UnitSymbol = 'unitSymbol' @@ -68,6 +60,17 @@ class HedKey: # Node attributes InLibrary = "inLibrary" + # All known properties + BoolProperty = 'boolProperty' + UnitClassProperty = 'unitClassProperty' + UnitProperty = 'unitProperty' + UnitModifierProperty = 'unitModifierProperty' + ValueClassProperty = 'valueClassProperty' + ElementProperty = 'elementProperty' + NodeProperty = 'nodeProperty' + IsInheritedProperty = 'isInheritedProperty' + + VERSION_ATTRIBUTE = 'version' LIBRARY_ATTRIBUTE = 'library' diff --git a/hed/schema/hed_schema_entry.py b/hed/schema/hed_schema_entry.py index 936943e8..de066dbc 100644 --- a/hed/schema/hed_schema_entry.py +++ b/hed/schema/hed_schema_entry.py @@ -119,7 +119,8 @@ def __eq__(self, other): # We only want to compare known attributes self_attr = self.get_known_attributes() other_attr = other.get_known_attributes() - if self_attr != other_attr: + # We can no longer be sure on the order of attribute values, since owl formatting has no order + if self_attr != other_attr and not self._compare_attributes_no_order(self_attr, other_attr): return False if self.description != other.description: return False @@ -135,6 +136,13 @@ def get_known_attributes(self): return {key: value for key, value in self.attributes.items() if not self._unknown_attributes or key not in 
self._unknown_attributes} + @staticmethod + def _compare_attributes_no_order(left, right): + left = {name: (set(value.split(",")) if isinstance(value, str) else value) for (name, value) in left.items()} + right = {name: (set(value.split(",")) if isinstance(value, str) else value) for (name, value) in right.items()} + + return left == right + class UnitClassEntry(HedSchemaEntry): """ A single unit class entry in the HedSchema. """ diff --git a/hed/schema/hed_schema_io.py b/hed/schema/hed_schema_io.py index 027c63d6..63a751d8 100644 --- a/hed/schema/hed_schema_io.py +++ b/hed/schema/hed_schema_io.py @@ -4,6 +4,7 @@ import functools from hed.schema.schema_io.xml2schema import SchemaLoaderXML from hed.schema.schema_io.wiki2schema import SchemaLoaderWiki +from hed.schema.schema_io.owl2schema import SchemaLoaderOWL from hed.schema import hed_cache from hed.errors.exceptions import HedFileError, HedExceptions @@ -11,6 +12,7 @@ from hed.schema.hed_schema_group import HedSchemaGroup from hed.schema.schema_validation_util import validate_version_string from collections import defaultdict +from hed.schema.schema_io.owl_constants import ext_to_format MAX_MEMORY_CACHE = 40 @@ -20,8 +22,10 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche """ Create a schema from the given string. Parameters: - schema_string (str): An XML or mediawiki file as a single long string. + schema_string (str): An XML, mediawiki or OWL, file as a single long string schema_format (str): The schema format of the source schema string. + Allowed normal values: .mediawiki, .xml + Allowed owl values: xml, owl, pretty-xml, turtle (or any other value rdflib supports) schema_namespace (str, None): The name_prefix all tags in this schema will accept. schema(HedSchema or None): A hed schema to merge this new file into It must be a with-standard schema with the same value. 
@@ -45,6 +49,8 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche hed_schema = SchemaLoaderXML.load(schema_as_string=schema_string, schema=schema) elif schema_format.endswith(".mediawiki"): hed_schema = SchemaLoaderWiki.load(schema_as_string=schema_string, schema=schema) + elif schema_format: + hed_schema = SchemaLoaderOWL.load(schema_as_string=schema_string, schema=schema, file_format=schema_format) else: raise HedFileError(HedExceptions.INVALID_EXTENSION, "Unknown schema extension", filename=schema_format) @@ -54,14 +60,18 @@ def from_string(schema_string, schema_format=".xml", schema_namespace=None, sche return hed_schema -def load_schema(hed_path=None, schema_namespace=None, schema=None): +def load_schema(hed_path, schema_namespace=None, schema=None, file_format=None): """ Load a schema from the given file or URL path. Parameters: - hed_path (str or None): A filepath or url to open a schema from. + hed_path (str): A filepath or url to open a schema from. schema_namespace (str or None): The name_prefix all tags in this schema will accept. schema(HedSchema or None): A hed schema to merge this new file into It must be a with-standard schema with the same value. + file_format(str or None): Required for owl formatted files other than the following: + .ttl: turtle + .owl: xml + .json-ld: json-ld Returns: HedSchema: The loaded schema. 
@@ -76,11 +86,15 @@ def load_schema(hed_path=None, schema_namespace=None, schema=None): raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file path passed to HedSchema.load_file", filename=hed_path) + ext = os.path.splitext(hed_path.lower())[1] is_url = hed_cache._check_if_url(hed_path) - if is_url: file_as_string = schema_util.url_to_string(hed_path) hed_schema = from_string(file_as_string, schema_format=os.path.splitext(hed_path.lower())[1]) + elif ext in ext_to_format: + hed_schema = SchemaLoaderOWL.load(hed_path, schema=schema, file_format=ext_to_format[ext]) + elif file_format: + hed_schema = SchemaLoaderOWL.load(hed_path, schema=schema, file_format=file_format) elif hed_path.lower().endswith(".xml"): hed_schema = SchemaLoaderXML.load(hed_path, schema=schema) elif hed_path.lower().endswith(".mediawiki"): diff --git a/hed/schema/hed_schema_section.py b/hed/schema/hed_schema_section.py index 7a866fc0..708dcf0d 100644 --- a/hed/schema/hed_schema_section.py +++ b/hed/schema/hed_schema_section.py @@ -151,6 +151,7 @@ def _finalize_section(self, hed_schema): class HedSchemaUnitClassSection(HedSchemaSection): def _check_if_duplicate(self, name_key, new_entry): + """Allow adding units to existing unit classes, using a placeholder one with no attributes.""" if name_key in self and len(new_entry.attributes) == 1\ and HedKey.InLibrary in new_entry.attributes: return self.all_names[name_key] diff --git a/hed/schema/schema_io/base2schema.py b/hed/schema/schema_io/base2schema.py index 3bbac873..c3a68b21 100644 --- a/hed/schema/schema_io/base2schema.py +++ b/hed/schema/schema_io/base2schema.py @@ -14,7 +14,7 @@ class SchemaLoader(ABC): SchemaLoaderXML(filename) will load just the header_attributes """ - def __init__(self, filename, schema_as_string=None, schema=None): + def __init__(self, filename, schema_as_string=None, schema=None, file_format=None): """Loads the given schema from one of the two parameters. 
Parameters: @@ -22,10 +22,12 @@ def __init__(self, filename, schema_as_string=None, schema=None): schema_as_string(str or None): A full schema as text or None schema(HedSchema or None): A hed schema to merge this new file into It must be a with-standard schema with the same value. + file_format(str or None): The format of this file if needed(only for owl currently) """ if schema_as_string and filename: raise HedFileError(HedExceptions.BAD_PARAMETERS, "Invalid parameters to schema creation.", filename) + self.file_format = file_format self.filename = filename self.schema_as_string = schema_as_string self.appending_to_schema = False @@ -68,7 +70,7 @@ def schema(self): return self._schema @classmethod - def load(cls, filename=None, schema_as_string=None, schema=None): + def load(cls, filename=None, schema_as_string=None, schema=None, file_format=None): """ Loads and returns the schema, including partnered schema if applicable. Parameters: @@ -76,11 +78,13 @@ def load(cls, filename=None, schema_as_string=None, schema=None): schema_as_string(str or None): A full schema as text or None schema(HedSchema or None): A hed schema to merge this new file into It must be a with-standard schema with the same value. + file_format(str or None): If this is an owl file being loaded, this is the format. + Allowed values include: turtle, json-ld, and owl(xml) Returns: schema(HedSchema): The new schema """ - loader = cls(filename, schema_as_string, schema) + loader = cls(filename, schema_as_string, schema, file_format) return loader._load() def _load(self): diff --git a/hed/schema/schema_io/owl2schema.py b/hed/schema/schema_io/owl2schema.py new file mode 100644 index 00000000..8f5d6efa --- /dev/null +++ b/hed/schema/schema_io/owl2schema.py @@ -0,0 +1,290 @@ +""" +This module is used to create a HedSchema object from an OWL file or graph. 
+""" + + +from hed.errors.exceptions import HedFileError, HedExceptions +from hed.schema.hed_schema_constants import HedSectionKey, HedKey +from hed.schema import schema_validation_util +from .base2schema import SchemaLoader +import rdflib +from rdflib.exceptions import ParserError +from rdflib import Graph, RDF, RDFS, Literal, URIRef, OWL, XSD +from collections import defaultdict + +from hed.schema.schema_io.owl_constants import HED, HEDT, HEDU, HEDUM + + +class SchemaLoaderOWL(SchemaLoader): + """ Loads XML schemas from filenames or strings. + + Expected usage is SchemaLoaderXML.load(filename) + + SchemaLoaderXML(filename) will load just the header_attributes + """ + def __init__(self, filename, schema_as_string=None, schema=None, file_format=None): + if schema_as_string and not file_format: + raise HedFileError(HedExceptions.BAD_PARAMETERS, + "Must pass a file_format if loading owl schema as a string.", + filename) + super().__init__(filename, schema_as_string, schema, file_format) + + self.graph = None + # When loading, this stores rooted tag name -> full root path pairs + self._rooted_cache = {} + + def _open_file(self): + """Parses a Turtle/owl/etc file and returns the RDF graph.""" + + graph = rdflib.Graph() + try: + if self.filename: + graph.parse(self.filename, format=self.file_format) + else: + graph.parse(data=self.schema_as_string, format=self.file_format) + except FileNotFoundError as fnf_error: + raise HedFileError(HedExceptions.FILE_NOT_FOUND, str(fnf_error), self.filename) + except ParserError as parse_error: + raise HedFileError(HedExceptions.CANNOT_PARSE_RDF, str(parse_error), self.filename) + + return graph + + def _read_prologue(self): + """Reads the Prologue section from the ontology.""" + prologue = self.graph.value(subject=HED.Prologue, predicate=HED.elementValue, any=False) + return str(prologue) if prologue else "" + + def _read_epilogue(self): + """Reads the Epilogue section from the ontology.""" + epilogue = 
self.graph.value(subject=HED.Epilogue, predicate=HED.elementValue, any=False) + return str(epilogue) if epilogue else "" + + def _get_header_attributes(self, graph): + """Parses header attributes from an RDF graph into a dictionary.""" + header_attributes = {} + for s, _, _ in graph.triples((None, RDF.type, HED.HeaderMember)): + label = graph.value(s, RDFS.label) + if label: + header_attribute = graph.value(s, HED.HeaderAttribute) + header_attributes[str(label)] = str(header_attribute) if header_attribute else None + return header_attributes + + def _parse_data(self): + self.graph = self.input_data + self.graph.bind("hed", HED) + self.graph.bind("hedt", HEDT) + self.graph.bind("hedu", HEDU) + self.graph.bind("hedum", HEDUM) + + + self._schema.epilogue = self._read_epilogue() + self._schema.prologue = self._read_prologue() + self._get_header_attributes(self.graph) + self._read_properties() + self._read_attributes() + self._read_units() + self._read_section(HedSectionKey.ValueClasses, HED.HedValueClass) + self._read_section(HedSectionKey.UnitModifiers, HED.HedUnitModifier) + self._read_tags() + + breakHere = 3 + + def get_local_names_from_uris(parent_chain, tag_uri): + """ + Extracts local names from URIs using RDFlib's n3() method. + """ + full_names = [] + for uri in parent_chain + [tag_uri]: + # Serialize the URI into N3 format and extract the local name + name = uri.n3(namespace_manager=HED.namespace_manager).split(':')[-1] + full_names.append(name) + + return full_names + + def sort_classes_by_hierarchy(self, classes): + """ + Sorts all tags based on assembled full name + + Returns: + list of tuples. 
+ Left Tag URI, right side is parent labels(not including self) + """ + parent_chains = [] + full_tag_names = [] + for tag_uri in classes: + parent_chain = self._get_parent_chain(tag_uri) + parent_chain = [uri.n3(namespace_manager=self.graph.namespace_manager).split(':')[-1] for uri in parent_chain + [tag_uri]] + # parent_chain = [self.graph.value(p, RDFS.label) or p for p in parent_chain + [tag_uri]] + full_tag_names.append("/".join(parent_chain)) + parent_chains.append((tag_uri, parent_chain[:-1])) + + # Sort parent_chains by full_tag_names. + _, parent_chains = zip(*sorted(zip(full_tag_names, parent_chains))) + + return parent_chains + + def _get_parent_chain(self, cls): + """ Recursively builds the parent chain for a given class. """ + parent = self.graph.value(subject=cls, predicate=HED.hasHedParent) + if parent is None: + return [] + return self._get_parent_chain(parent) + [parent] + + def _parse_uri(self, uri, key_class, name=None): + if name: + label = name + else: + label = self.graph.value(subject=uri, predicate=RDFS.label) + if not label: + raise ValueError(f"Empty label value found in owl file in uri {uri}") + label = str(label) + + tag_entry = self._schema._create_tag_entry(label, key_class) + + description = self.graph.value(subject=uri, predicate=RDFS.comment) + if description: + tag_entry.description = str(description) + + section = self._schema._sections[key_class] + valid_attributes = section.valid_attributes + + new_values = defaultdict(list) + for predicate, obj in self.graph.predicate_objects(subject=uri): + # Convert predicate URI to a readable string, assuming it's in a known namespace + attr_name = predicate.n3(self.graph.namespace_manager).split(':')[1] + + if attr_name in valid_attributes: + if isinstance(obj, URIRef): + attr_value = obj.n3(self.graph.namespace_manager).split(':')[1] + else: + attr_value = str(obj) + + new_values[attr_name].append(attr_value) + + for name, value in new_values.items(): + value = ",".join(value) + if value 
== "true": + value = True + tag_entry._set_attribute_value(name, value) + + return tag_entry + + def _get_classes_with_subproperty(self, subproperty_uri, base_type): + """Iterates over all classes that have a specified rdfs:subPropertyOf.""" + classes = set() + for s in self.graph.subjects(RDF.type, base_type): + if (s, RDFS.subPropertyOf, subproperty_uri) in self.graph: + classes.add(s) + return classes + + def _get_all_subclasses(self, base_type): + """ + Recursively finds all subclasses of the given base_type. + """ + subclasses = set() + for subclass in self.graph.subjects(RDFS.subClassOf, base_type): + subclasses.add(subclass) + subclasses.update(self._get_all_subclasses(subclass)) + return subclasses + + def _get_classes(self, base_type): + """ + Retrieves all instances of the given base_type, including instances of its subclasses. + """ + classes = set() + # Add instances of the base type + for s in self.graph.subjects(RDF.type, base_type): + classes.add(s) + # Add instances of all subclasses + for subclass in self._get_all_subclasses(base_type): + for s in self.graph.subjects(RDF.type, subclass): + classes.add(s) + return classes + + def _read_properties(self): + key_class = HedSectionKey.Properties + self._schema._initialize_attributes(key_class) + prop_uris = self._get_classes_with_subproperty(HED.schemaProperty, OWL.AnnotationProperty) + for uri in prop_uris: + new_entry = self._parse_uri(uri, key_class) + self._add_to_dict(new_entry, key_class) + + def _read_attributes(self): + key_class = HedSectionKey.Attributes + self._schema._initialize_attributes(key_class) + prop_uris = self._get_classes_with_subproperty(HED.schemaAttributeDatatypeProperty, OWL.DatatypeProperty) + prop_uris.update(self._get_classes_with_subproperty(HED.schemaAttributeObjectProperty, OWL.ObjectProperty)) + + for uri in prop_uris: + new_entry = self._parse_uri(uri, key_class) + self._add_to_dict(new_entry, key_class) + + def _read_section(self, key_class, node_uri): + 
self._schema._initialize_attributes(key_class) + classes = self._get_classes(node_uri) + for uri in classes: + new_entry = self._parse_uri(uri, key_class) + self._add_to_dict(new_entry, key_class) + + def _read_units(self): + self._schema._initialize_attributes(HedSectionKey.UnitClasses) + self._schema._initialize_attributes(HedSectionKey.Units) + key_class = HedSectionKey.UnitClasses + classes = self._get_classes(HED.HedUnitClass) + unit_classes = {} + for uri in classes: + new_entry = self._parse_uri(uri, key_class) + self._add_to_dict(new_entry, key_class) + unit_classes[uri] = new_entry + + + + key_class = HedSectionKey.Units + units = self._get_classes(HED.HedUnit) + for uri in units: + new_entry = self._parse_uri(uri, key_class) + self._add_to_dict(new_entry, key_class) + unit_class_uri = self.graph.value(subject=uri, predicate=HED.unitClass) + class_entry = unit_classes.get(unit_class_uri) + class_entry.add_unit(new_entry) + breakHere = 3 + + def _add_tag_internal(self, uri, parent_tags): + tag_name = self.graph.value(uri, RDFS.label) + if not tag_name: + raise ValueError(f"No label for uri {uri}") + tag_name = str(tag_name) + parents_and_child = parent_tags + [tag_name] + if parent_tags and parents_and_child[0] in self._rooted_cache: + full_tag = "/".join([self._rooted_cache[parents_and_child[0]]] + parents_and_child[1:]) + else: + full_tag = "/".join(parents_and_child) + + tag_entry = self._parse_uri(uri, HedSectionKey.Tags, full_tag) + + rooted_entry = schema_validation_util.find_rooted_entry(tag_entry, self._schema, self._loading_merged) + if rooted_entry: + loading_from_chain = rooted_entry.name + "/" + tag_entry.short_tag_name + loading_from_chain_short = tag_entry.short_tag_name + self._rooted_cache[tag_entry.short_tag_name] = loading_from_chain + full_tag = full_tag.replace(loading_from_chain_short, loading_from_chain) + tag_entry = self._parse_uri(uri, HedSectionKey.Tags, full_tag) + + self._add_to_dict(tag_entry, HedSectionKey.Tags) + + def 
_read_tags(self): + """Populates a dictionary of dictionaries associated with tags and their attributes.""" + classes = self._get_classes(HED.HedTag) + classes.update(self._get_classes(HED.HedPlaceholder)) + sorted_classes = self.sort_classes_by_hierarchy(classes) + self._schema._initialize_attributes(HedSectionKey.Tags) + for uri, parents in sorted_classes: + self._add_tag_internal(uri, parents) + + def _add_to_dict(self, entry, key_class): + if entry.has_attribute(HedKey.InLibrary) and not self._loading_merged and not self.appending_to_schema: + raise HedFileError(HedExceptions.IN_LIBRARY_IN_UNMERGED, + f"Library tag in unmerged schema has InLibrary attribute", + self._schema.filename) + + return self._add_to_dict_base(entry, key_class) diff --git a/hed/schema/schema_io/owl_constants.py b/hed/schema/schema_io/owl_constants.py new file mode 100644 index 00000000..8d450d90 --- /dev/null +++ b/hed/schema/schema_io/owl_constants.py @@ -0,0 +1,51 @@ +from rdflib import Namespace + +from hed.schema.hed_schema_constants import HedSectionKey + + +# Default file associations(notably owl maps to XML format, as we already use XML) +ext_to_format = { + ".ttl": "turtle", + ".owl": "xml", + ".json-ld": "json-ld" +} + +# Core schema structural types in owl +HED = Namespace("https://purl.org/hed#") +# Tags +HEDT = Namespace("https://purl.org/hed/tag#") +# Unit classes, value classes, and units +HEDU = Namespace("https://purl.org/hed/aux#") +# Unit Modifiers +HEDUM = Namespace("https://purl.org/hed/aux/unit_modifier#") + +# Some of this stuff may be commented back in later if needed + +# SECTION_ELEMENT_NAME = { +# HedSectionKey.Tags: "StartSchemaSection", +# HedSectionKey.UnitClasses: "UnitClassSection", +# HedSectionKey.Units: "UnitSection", +# HedSectionKey.UnitModifiers: "UnitModifiersSection", +# HedSectionKey.ValueClasses: "ValueClassesSection", +# HedSectionKey.Attributes: "AttributesSection", +# HedSectionKey.Properties: "PropertiesSection", +# } +# +# 
SECTION_ELEMENT_TYPE = { +# HedSectionKey.Tags: "HedStartSchemaSection", +# HedSectionKey.UnitClasses: "HedUnitClassSection", +# HedSectionKey.Units: "HedUnitSection", +# HedSectionKey.UnitModifiers: "HedUnitModifiersSection", +# HedSectionKey.ValueClasses: "HedValueClassesSection", +# HedSectionKey.Attributes: "HedAttributesSection", +# HedSectionKey.Properties: "HedPropertiesSection", +# } + +ELEMENT_NAMES = { + HedSectionKey.Tags: "HedTag", + HedSectionKey.Units: "HedUnit", + HedSectionKey.UnitClasses: "HedUnitClass", + HedSectionKey.UnitModifiers: "HedUnitModifier", + HedSectionKey.ValueClasses: "HedValueClass", +} + diff --git a/hed/schema/schema_io/schema2owl.py b/hed/schema/schema_io/schema2owl.py new file mode 100644 index 00000000..0b683942 --- /dev/null +++ b/hed/schema/schema_io/schema2owl.py @@ -0,0 +1,314 @@ +"""Allows output of HedSchema objects as .xml format""" + +from hed.schema.hed_schema_constants import HedSectionKey, HedKey +from hed.schema.schema_io import owl_constants +from hed.schema.schema_io.schema2base import Schema2Base +from rdflib import Graph, RDF, RDFS, Literal, URIRef, OWL, XSD + +from hed.schema.schema_io.owl_constants import HED, HEDT, HEDU, HEDUM + +HED_URIS = { + None: HED, + HedSectionKey.Tags: HEDT, + HedSectionKey.UnitClasses: HEDU, + HedSectionKey.Units: HEDU, + HedSectionKey.UnitModifiers: HEDUM, + HedSectionKey.ValueClasses: HEDU, + HedSectionKey.Attributes: HED, + HedSectionKey.Properties: HED, +} + +HED_ATTR = { + "unitClass": HEDU, + "valueClass": HEDU, + "unit": HEDU, + "unitModifier": HEDUM, + "property": HED, + "suggestedTag": HEDT, + "relatedTag": HEDT, + "rooted": HEDT, +} + +float_attributes = {"conversionFactor"} + +hed_keys_with_types = { + HedKey.ExtensionAllowed: XSD["boolean"], + HedKey.Recommended: XSD["boolean"], + HedKey.Required: XSD["boolean"], + HedKey.RequireChild: XSD["boolean"], + HedKey.TagGroup: XSD["boolean"], + HedKey.TakesValue: XSD["boolean"], + HedKey.TopLevelTagGroup: XSD["boolean"], + 
HedKey.Unique: XSD["boolean"], + HedKey.UnitClass: HED["HedUnitClass"], + HedKey.ValueClass: HED["HedValueClass"], + HedKey.RelatedTag: HED["HedTag"], + HedKey.SuggestedTag: HED["HedTag"], + HedKey.Rooted: HED["HedTag"], + HedKey.DeprecatedFrom: XSD["string"], + HedKey.ConversionFactor: XSD["string"], + HedKey.Reserved: XSD["boolean"], + HedKey.SIUnit: XSD["boolean"], + HedKey.UnitSymbol: XSD["boolean"], + HedKey.DefaultUnits: HED["HedUnit"], + HedKey.UnitPrefix: XSD["boolean"], + HedKey.SIUnitModifier: XSD["boolean"], + HedKey.SIUnitSymbolModifier: XSD["boolean"], + HedKey.AllowedCharacter: XSD["string"], + HedKey.InLibrary: XSD["string"] +} + +object_properties = {key for key, value in hed_keys_with_types.items() if value.startswith(HED)} + + +class Schema2Owl(Schema2Base): + def __init__(self): + super().__init__() + self.owl_graph = Graph() + self.output = self.owl_graph + self.owl_graph.bind("hed", HED) + self.owl_graph.bind("hedt", HEDT) + self.owl_graph.bind("hedu", HEDU) + self.owl_graph.bind("hedum", HEDUM) + + # ========================================= + # Required baseclass function + # ========================================= + def _output_header(self, attributes, prologue): + # Create a dictionary mapping label names to property URIs + property_uris = { + "library": HED.Library, + "unmerged": HED.Unmerged, + "version": HED.Version, + "withStandard": HED.WithStandard, + "xmlns:xsi": HED.XSI, + "xsi:noNamespaceSchemaLocation": HED.XSINoNamespaceSchemaLocation + } + + for attrib_label, attrib_value in attributes.items(): + prop_uri = property_uris.get(attrib_label) + if prop_uri: + self.owl_graph.add((prop_uri, RDF.type, HED.HeaderMember)) + self.owl_graph.add((prop_uri, RDFS.label, Literal(attrib_label))) + self.owl_graph.add((prop_uri, HED.HeaderAttribute, Literal(attrib_value))) + + self.owl_graph.add((HED.Prologue, RDF.type, HED.HedElement)) + self.owl_graph.add((HED.Prologue, RDFS.label, Literal("epilogue"))) + if prologue: + 
self.owl_graph.add((HED.Prologue, HED["elementValue"], Literal(prologue))) + + def _output_footer(self, epilogue): + self.owl_graph.add((HED.Epilogue, RDF.type, HED.HedElement)) + self.owl_graph.add((HED.Epilogue, RDFS.label, Literal("epilogue"))) + if epilogue: + self.owl_graph.add((HED.Epilogue, HED["elementValue"], Literal(epilogue))) + + def _start_section(self, key_class): + return None + + def _end_tag_section(self): + pass + + def _write_attributes(self, entry_uri, entry): + for attribute, value in entry.attributes.items(): + is_bool = entry.attribute_has_property(attribute, "boolProperty") \ + or entry.section_key == HedSectionKey.Attributes + + if self._attribute_disallowed(attribute): + continue + + if is_bool: + self.owl_graph.add((entry_uri, HED[attribute], Literal(True))) + + elif attribute in float_attributes: + # Treat as a string for now + self.owl_graph.add((entry_uri, HED[attribute], Literal(value))) + else: + # Todo: further develop this if needed or merge into base tools + values = value.split(",") + for val2 in values: + clean_value = val2 + if attribute in HED_ATTR: + attribute_uri = HED_ATTR[attribute][clean_value] + else: + attribute_uri = Literal(clean_value) + + self.owl_graph.add((entry_uri, HED[attribute], attribute_uri)) + + def _add_entry(self, base_uri, tag_name, label, comment, parent=None, entry=None, + tag_type=HED.HedTag, unit_class_uri=None): + is_takes_value = entry.has_attribute("takesValue") + if is_takes_value: + tag_type = HED.HedPlaceholder + tag_name = entry.short_tag_name + "-Placeholder" + label = "#" + + tag_name = sanitize_for_turtle(tag_name) + uri = f"{base_uri}{tag_name}" + hed_tag_uri = URIRef(uri) + + self.owl_graph.add((hed_tag_uri, RDF.type, tag_type)) + self.owl_graph.add((hed_tag_uri, RDFS.label, Literal(label))) + if comment: + self.owl_graph.add((hed_tag_uri, RDFS.comment, Literal(comment))) + # Don't store the parent in unmerged rooted nodes + if parent is not None and (HedKey.Rooted not in entry.attributes 
or self._save_merged): + parent_uri = HEDT[parent] + self.owl_graph.add((hed_tag_uri, HED.hasHedParent, parent_uri)) + if unit_class_uri is not None: + self.owl_graph.add((hed_tag_uri, HED.unitClass, unit_class_uri)) + self._write_attributes(hed_tag_uri, entry) + return hed_tag_uri + + def _add_property(self, base_uri, name, label, comment, entry, + data_type, sub_type): + name = sanitize_for_turtle(name) + uri = f"{base_uri}{name}" + hed_tag_uri = URIRef(uri) + + self.owl_graph.add((hed_tag_uri, RDF.type, data_type)) + self.owl_graph.add((hed_tag_uri, RDFS.subPropertyOf, sub_type)) + self.owl_graph.add((hed_tag_uri, RDFS.range, XSD.boolean)) + self.owl_graph.add((hed_tag_uri, RDFS.label, Literal(label))) + self.owl_graph.add((hed_tag_uri, RDFS.comment, Literal(comment))) + self._write_attributes(hed_tag_uri, entry) + + return hed_tag_uri + + def _get_element_domains(self, entry): + domain_table = {HedKey.ValueClassProperty: "HedValueClass", + HedKey.UnitModifierProperty: "HedUnitModifier", + HedKey.UnitProperty: "HedUnit", + HedKey.ElementProperty: "HedElement", + HedKey.UnitClassProperty: "HedUnitClass", + HedKey.NodeProperty: "HedTag" + } + domains = [] + for attribute in entry.attributes: + if attribute in domain_table: + domains.append(domain_table[attribute]) + + if not domains: + domains.append(domain_table[HedKey.NodeProperty]) + + return domains + + def _add_attribute(self, base_uri, name, label, comment, entry): + domains = self._get_element_domains(entry) + name = sanitize_for_turtle(name) + uri = f"{base_uri}{name}" + hed_tag_uri = URIRef(uri) + data_type = OWL.ObjectProperty + sub_type = HED.schemaAttributeObjectProperty + if name not in object_properties: + data_type = OWL.DatatypeProperty + sub_type = HED.schemaAttributeDatatypeProperty + self.owl_graph.add((hed_tag_uri, RDF.type, data_type)) + for domain in domains: + self.owl_graph.add((hed_tag_uri, RDFS.domain, HED[domain])) + self.owl_graph.add((hed_tag_uri, RDFS.subPropertyOf, sub_type)) + 
self.owl_graph.add((hed_tag_uri, RDFS.range, hed_keys_with_types[name])) + self.owl_graph.add((hed_tag_uri, RDFS.label, Literal(label))) + self.owl_graph.add((hed_tag_uri, RDFS.comment, Literal(comment))) + self._write_attributes(hed_tag_uri, entry) + + return hed_tag_uri + + def _write_tag_entry(self, tag_entry, parent_node=None, level=0): + """ + Creates a tag node and adds it to the parent. + + Parameters + ---------- + tag_entry: HedTagEntry + The entry for that tag we want to write out + parent_node: Any + Unused + level: Any + Unused + + Returns + ------- + SubElement + The added node + """ + tag_name = tag_entry.short_tag_name + parent = tag_entry.parent + if parent: + parent = parent.short_tag_name + comment = tag_entry.description + return self._add_entry( + HEDT, + tag_name=tag_name, + label=tag_name, + comment=comment, + parent=parent, + entry=tag_entry + ) + + def _write_entry(self, entry, parent_node=None, include_props=True): + """ + Creates an entry node and adds it to the parent. + + Parameters: + entry(HedSchemaEntry): The entry for that tag we want to write out + parent_node(str): URI for unit class owner, if this is a unit + include_props(bool): Add the description and attributes to new node. 
+ Returns: + str: The added URI + """ + key_class = entry.section_key + prefix = HED_URIS[key_class] + name = entry.name + comment = entry.description + if key_class == HedSectionKey.Attributes: + uri = self._add_attribute( + prefix, + name=name, + label=name, + comment=comment, + entry=entry + ) + elif key_class == HedSectionKey.Properties: + uri = self._add_property( + prefix, + name=name, + label=name, + comment=comment, + entry=entry, + data_type=OWL.AnnotationProperty, + sub_type=HED.schemaProperty + ) + else: + unit_class_uri = None + if key_class == HedSectionKey.Units: + unit_class_uri = parent_node + uri = self._add_entry( + prefix, + tag_name=name, + label=name, + comment=comment, + entry=entry, + tag_type=HED[owl_constants.ELEMENT_NAMES[key_class]], + unit_class_uri=unit_class_uri + ) + return uri + + +import re + + +def sanitize_for_turtle(name): + """ Sanitizes a string to be a valid IRIREF in Turtle, based on the SPARQL grammar. + + Excludes: `control characters, space, <, >, double quote, {, }, |, ^, backtick, and backslash.` + Replacing them with underscores + + Parameters: + name (str): The string to sanitize. + + Returns: + str: A sanitized string suitable for use as an IRIREF in Turtle. + """ + invalid_chars_pattern = r'[\x00-\x20<>"{}\|^`\\]' + return re.sub(invalid_chars_pattern, '_', name) diff --git a/hed/schema/schema_io/schema_util.py b/hed/schema/schema_io/schema_util.py index d2bf0721..67a73c1f 100644 --- a/hed/schema/schema_io/schema_util.py +++ b/hed/schema/schema_io/schema_util.py @@ -1,6 +1,5 @@ """ Utilities for writing content to files and for other file manipulation.""" -import shutil import tempfile import os import urllib.request @@ -72,65 +71,7 @@ def url_to_string(resource_url): return url_data -def write_strings_to_file(output_strings, extension=None): - """ Write output strings to a temporary file. - - Parameters: - output_strings ([str], str): Strings to output one per line. 
- extension (str): File extension of the temporary file. - - Returns: - file: Opened temporary file. - - """ - if isinstance(output_strings, str): - output_strings = [output_strings] - with tempfile.NamedTemporaryFile(suffix=extension, delete=False, mode='w', encoding='utf-8') as opened_file: - for string in output_strings: - opened_file.write(string) - opened_file.write('\n') - return opened_file.name - - -def move_file(input_path, target_path): - """ - If target_path is not empty, move input file to target file - - Parameters: - input_path(str): Path to an existing file - target_path(str or None): Path to move this file to - If None, the function does nothing and returns input_path - - Returns: - filepath(str): the original or moved filepath - """ - if target_path: - directory = os.path.dirname(target_path) - if directory and not os.path.exists(directory): - os.makedirs(directory) - shutil.move(input_path, target_path) - return target_path - return input_path - - -def write_xml_tree_2_xml_file(xml_tree, extension=".xml"): - """ Write an XML element tree object into an XML file. - - Parameters: - xml_tree (Element): An element representing an XML file. - extension (string): The file extension to use for the temporary file. - - Returns: - str: Name of the temporary file. - - """ - with tempfile.NamedTemporaryFile(suffix=extension, mode='w', delete=False, encoding='utf-8') as hed_xml_file: - xml_string = _xml_element_2_str(xml_tree) - hed_xml_file.write(xml_string) - return hed_xml_file.name - - -def _xml_element_2_str(elem): +def xml_element_2_str(elem): """ Convert an XML element to an XML string. 
Parameters: diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py index b4547831..e29906d3 100644 --- a/hed/schema/schema_io/wiki2schema.py +++ b/hed/schema/schema_io/wiki2schema.py @@ -43,8 +43,8 @@ class SchemaLoaderWiki(SchemaLoader): SchemaLoaderWiki(filename) will load just the header_attributes """ - def __init__(self, filename, schema_as_string=None, schema=None): - super().__init__(filename, schema_as_string, schema) + def __init__(self, filename, schema_as_string=None, schema=None, file_format=None): + super().__init__(filename, schema_as_string, schema, file_format) self.fatal_errors = [] def _open_file(self): diff --git a/hed/schema/schema_io/xml2schema.py b/hed/schema/schema_io/xml2schema.py index 91ba402e..131faf35 100644 --- a/hed/schema/schema_io/xml2schema.py +++ b/hed/schema/schema_io/xml2schema.py @@ -21,22 +21,13 @@ class SchemaLoaderXML(SchemaLoader): SchemaLoaderXML(filename) will load just the header_attributes """ - def __init__(self, filename, schema_as_string=None, schema=None): - super().__init__(filename, schema_as_string, schema) + def __init__(self, filename, schema_as_string=None, schema=None, file_format=None): + super().__init__(filename, schema_as_string, schema, file_format) self._root_element = None self._parent_map = {} def _open_file(self): - """Parses an XML file and returns the root element. - - Parameters - ---------- - Returns - ------- - RestrictedElement - The root element of the HED XML file. 
- - """ + """Parses an XML file and returns the root element.""" try: if self.filename: hed_xml_tree = ElementTree.parse(self.filename) @@ -49,13 +40,7 @@ def _open_file(self): return root def _get_header_attributes(self, root_element): - """ - Gets the schema attributes form the XML root node - - Returns - ------- - attribute_dict: {str: str} - """ + """Gets the schema attributes from the XML root node""" return self._reformat_xsd_attrib(root_element.attrib) def _parse_data(self): @@ -128,17 +113,7 @@ def _add_tags_recursive(self, new_tags, parent_tags): self._add_tags_recursive(child_tags, parents_and_child) def _populate_tag_dictionaries(self, tag_section): - """Populates a dictionary of dictionaries associated with tags and their attributes. - - Parameters - ---------- - - Returns - ------- - {} - A dictionary of dictionaries that has been populated with dictionaries associated with tag attributes. - - """ + """Populates a dictionary of dictionaries associated with tags and their attributes.""" self._schema._initialize_attributes(HedSectionKey.Tags) root_tags = tag_section.findall("node") @@ -146,18 +121,7 @@ def _populate_tag_dictionaries(self, tag_section): def _populate_unit_class_dictionaries(self, unit_section): """Populates a dictionary of dictionaries associated with all the unit classes, unit class units, and unit - class default units. - - Parameters - ---------- - - Returns - ------- - {} - A dictionary of dictionaries associated with all the unit classes, unit class units, and unit class - default units. - - """ + class default units.""" self._schema._initialize_attributes(HedSectionKey.UnitClasses) self._schema._initialize_attributes(HedSectionKey.Units) def_element_name = xml_constants.ELEMENT_NAMES[HedSectionKey.UnitClasses] @@ -169,9 +133,8 @@ class default units. 
if unit_class_entry is None: continue element_units = self._get_elements_by_name(xml_constants.UNIT_CLASS_UNIT_ELEMENT, unit_class_element) - element_unit_names = [self._get_element_tag_value(element) for element in element_units] - for unit, element in zip(element_unit_names, element_units): + for element in element_units: unit_class_unit_entry = self._parse_node(element, HedSectionKey.Units) self._add_to_dict(unit_class_unit_entry, HedSectionKey.Units) unit_class_entry.add_unit(unit_class_unit_entry) diff --git a/hed/validator/onset_validator.py b/hed/validator/onset_validator.py index 94be9d7e..f1819636 100644 --- a/hed/validator/onset_validator.py +++ b/hed/validator/onset_validator.py @@ -63,3 +63,20 @@ def _handle_onset_or_offset(self, def_tag, onset_offset_tag): del self._onsets[full_def_name.lower()] return [] + + @staticmethod + def check_for_banned_tags(hed_string): + """ Returns an issue for every tag found from the banned list + + Parameters: + hed_string(HedString): the string to check + + Returns: + list: The validation issues associated with the characters. Each issue is dictionary. + """ + banned_tag_list = DefTagNames.TEMPORAL_KEYS + issues = [] + for tag in hed_string.get_all_tags(): + if tag in banned_tag_list: + issues += ErrorHandler.format_error(OnsetErrors.HED_ONSET_WITH_NO_COLUMN, tag) + return issues diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index 751af961..aad30283 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -28,8 +28,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): Validate the input data using the schema Parameters: - data (BaseInput or pd.DataFrame): Input data to be validated. - If a dataframe, it is assumed to be assembled already. + data (BaseInput): Input data to be validated. 
def_dicts(list of DefDict or DefDict): all definitions to use for validation name(str): The name to report errors from this file as error_handler (ErrorHandler): Error context to use. Creates a new one if None @@ -41,22 +40,27 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): if error_handler is None: error_handler = ErrorHandler() + if not isinstance(data, BaseInput): + raise TypeError("Invalid type passed to spreadsheet validator. Can only validate BaseInput objects.") + error_handler.push_error_context(ErrorContext.FILE_NAME, name) - self._hed_validator = HedValidator(self._schema, def_dicts=def_dicts) - self._onset_validator = OnsetValidator() - onset_filtered = None # Adjust to account for 1 based row_adj = 1 - if isinstance(data, BaseInput): - # Adjust to account for column names - if data.has_column_names: - row_adj += 1 - issues += self._validate_column_structure(data, error_handler, row_adj) - onset_filtered = data.series_filtered - data = data.dataframe_a + # Adjust to account for column names + if data.has_column_names: + row_adj += 1 + issues += self._validate_column_structure(data, error_handler, row_adj) + onset_filtered = data.series_filtered + df = data.dataframe_a + + self._hed_validator = HedValidator(self._schema, def_dicts=def_dicts) + if data.onsets is not None: + self._onset_validator = OnsetValidator() + else: + self._onset_validator = None # Check the rows of the input data - issues += self._run_checks(data, onset_filtered, error_handler=error_handler, row_adj=row_adj) + issues += self._run_checks(df, onset_filtered, error_handler=error_handler, row_adj=row_adj) error_handler.pop_error_context() issues = sort_issues(issues) @@ -98,7 +102,10 @@ def _run_checks(self, hed_df, onset_filtered, error_handler, row_adj): if row_string: error_handler.push_error_context(ErrorContext.HED_STRING, row_string) new_column_issues = self._hed_validator.run_full_string_checks(row_string) - new_column_issues += 
self._onset_validator.validate_temporal_relations(row_string) + if self._onset_validator is not None: + new_column_issues += self._onset_validator.validate_temporal_relations(row_string) + else: + new_column_issues += OnsetValidator.check_for_banned_tags(row_string) error_handler.add_context_and_filter(new_column_issues) error_handler.pop_error_context() issues += new_column_issues diff --git a/requirements.txt b/requirements.txt index 799f90f3..587a5823 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ pandas>=1.3.5 portalocker>=2.7.0 semantic_version>=2.10.0 wordcloud==1.9.3 -jsonschema==4.18.4 \ No newline at end of file +jsonschema==4.18.4 +rdflib>=6 diff --git a/spec_tests/hed-examples b/spec_tests/hed-examples index a4b01682..3508e5de 160000 --- a/spec_tests/hed-examples +++ b/spec_tests/hed-examples @@ -1 +1 @@ -Subproject commit a4b016822b4666285b92715917355ec6bd2ae9d1 +Subproject commit 3508e5de848ba63a78a8a59771c2b0e072ac4d11 diff --git a/spec_tests/hed-specification b/spec_tests/hed-specification index 570ae3e5..9a22f156 160000 --- a/spec_tests/hed-specification +++ b/spec_tests/hed-specification @@ -1 +1 @@ -Subproject commit 570ae3e56c042c05a6f488e3cfe56fb70d1fda72 +Subproject commit 9a22f1563501d47bf99a80f3a0f3d6c8725872a5 diff --git a/spec_tests/validate_bids.py b/spec_tests/validate_bids.py index 7cb02d4e..0ee85293 100644 --- a/spec_tests/validate_bids.py +++ b/spec_tests/validate_bids.py @@ -26,9 +26,9 @@ def test_validation(self): bids_data = BidsDataset(dataset_path) issues = bids_data.validate(check_for_warnings=False) if issues: - self.fail_count.append(issues) + self.fail_count.append((directory, issues)) print(f"{len(self.fail_count)} tests got an unexpected result") - print("\n".join(get_printable_issue_string(issue, skip_filename=False) for issue in self.fail_count)) + print("\n".join(get_printable_issue_string(issue, f"Errors in directory: {title}", skip_filename=False) for title, issue in self.fail_count)) 
self.assertEqual(0, len(self.fail_count)) if __name__ == '__main__': diff --git a/tests/schema/test_hed_schema_io.py b/tests/schema/test_hed_schema_io.py index ef60c5c8..40beb123 100644 --- a/tests/schema/test_hed_schema_io.py +++ b/tests/schema/test_hed_schema_io.py @@ -1,10 +1,12 @@ import unittest +import rdflib + from hed.errors import HedFileError from hed.errors.error_types import SchemaErrors from hed.schema import load_schema, HedSchemaGroup, load_schema_version, HedSchema from hed.schema.hed_schema_io import parse_version_list, _load_schema_version - +from tests.schema.test_schema_converters import with_temp_file, get_temp_filename import os from hed.errors import HedExceptions @@ -91,7 +93,7 @@ def test_load_schema_version_merged(self): # Verify this cannot be saved with self.assertRaises(HedFileError): - schemas3.save_as_mediawiki() + schemas3.save_as_mediawiki("filename") def test_load_and_verify_tags(self): # Load 'testlib' by itself @@ -310,36 +312,40 @@ def _base_merging_test(self, files): s1 = files[i] s2 = files[i + 1] self.assertEqual(s1, s2) + filename1 = get_temp_filename(".xml") + filename2 = get_temp_filename(".xml") try: - path1 = s1.save_as_xml(save_merged=save_merged) - path2 = s2.save_as_xml(save_merged=save_merged) - result = filecmp.cmp(path1, path2) + s1.save_as_xml(filename1, save_merged=save_merged) + s2.save_as_xml(filename2, save_merged=save_merged) + result = filecmp.cmp(filename1, filename2) # print(s1.filename) # print(s2.filename) self.assertTrue(result) - reload1 = load_schema(path1) - reload2 = load_schema(path2) + reload1 = load_schema(filename1) + reload2 = load_schema(filename2) self.assertEqual(reload1, reload2) except Exception: self.assertTrue(False) finally: - os.remove(path1) - os.remove(path2) + os.remove(filename1) + os.remove(filename2) try: - path1 = s1.save_as_mediawiki(save_merged=save_merged) - path2 = s2.save_as_mediawiki(save_merged=save_merged) - result = filecmp.cmp(path1, path2) + filename1 = 
get_temp_filename(".mediawiki") + filename2 = get_temp_filename(".mediawiki") + s1.save_as_mediawiki(filename1, save_merged=save_merged) + s2.save_as_mediawiki(filename2, save_merged=save_merged) + result = filecmp.cmp(filename1, filename2) self.assertTrue(result) - reload1 = load_schema(path1) - reload2 = load_schema(path2) + reload1 = load_schema(filename1) + reload2 = load_schema(filename2) self.assertEqual(reload1, reload2) except Exception: self.assertTrue(False) finally: - os.remove(path1) - os.remove(path2) + os.remove(filename1) + os.remove(filename2) lines1 = s1.get_as_mediawiki_string(save_merged=save_merged) lines2 = s2.get_as_mediawiki_string(save_merged=save_merged) @@ -376,13 +382,11 @@ def test_saving_merged_rooted_sorting(self): self._base_merging_test(files) - def test_saving_bad_sort(self): + @with_temp_file(".mediawiki") + def test_saving_bad_sort(self, filename): loaded_schema = load_schema(os.path.join(self.full_base_folder, "bad_sort_test.mediawiki")) - filename = loaded_schema.save_as_mediawiki() - try: - reloaded_schema = load_schema(filename) - finally: - os.remove(filename) + loaded_schema.save_as_mediawiki(filename) + reloaded_schema = load_schema(filename) self.assertEqual(loaded_schema, reloaded_schema) @@ -423,17 +427,17 @@ def _base_added_class_tests(self, schema): def test_saving_merged2(self): s1 = load_schema(os.path.join(self.full_base_folder, "add_all_types.mediawiki")) self._base_added_class_tests(s1) - path1 = "" - path2 = "" for save_merged in [True, False]: + path1 = get_temp_filename(".xml") + path2 = get_temp_filename(".mediawiki") try: - path1 = s1.save_as_xml(save_merged=save_merged) + s1.save_as_xml(path1, save_merged=save_merged) s2 = load_schema(path1) self.assertEqual(s1, s2) self._base_added_class_tests(s2) - path2 = s1.save_as_mediawiki(save_merged=save_merged) - s2 = load_schema(path1) + s1.save_as_mediawiki(path2, save_merged=save_merged) + s2 = load_schema(path2) self.assertEqual(s1, s2) 
self._base_added_class_tests(s2) finally: @@ -565,3 +569,82 @@ def test_triple_prefixes(self): """Test that libraries with triple prefixes are handled correctly.""" self.assertEqual(parse_version_list(["test:score", "ol:otherlib", "test:testlib", "abc:anotherlib"]), {"test": "test:score,testlib", "ol": "ol:otherlib", "abc": "abc:anotherlib"}) + + +class TestOwlBase(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.base_schema = schema.load_schema_version("8.2.0") + + @with_temp_file(".owl") + def test_schema2xml(self, filename): + self.base_schema.save_as_owl(filename) + loaded_schema = schema.load_schema(filename) + + self.assertEqual(loaded_schema, self.base_schema) + + self.base_schema.save_as_owl(filename, save_merged=True) + loaded_schema = schema.load_schema(filename) + + self.assertEqual(loaded_schema, self.base_schema) + + @with_temp_file(".ttl") + def test_schema2turtle(self, filename): + self.base_schema.save_as_owl(filename) + loaded_schema = schema.load_schema(filename) + + self.assertEqual(loaded_schema, self.base_schema) + + self.base_schema.save_as_owl(filename, save_merged=True) + loaded_schema = schema.load_schema(filename) + + self.assertEqual(loaded_schema, self.base_schema) + + @with_temp_file(".json-ld") + def test_schema2jsonld(self, filename): + self.base_schema.save_as_owl(filename) + loaded_schema = schema.load_schema(filename) + + self.assertEqual(loaded_schema, self.base_schema) + + self.base_schema.save_as_owl(filename, save_merged=True) + loaded_schema = schema.load_schema(filename) + + self.assertEqual(loaded_schema, self.base_schema) + + def test_schema2owlstring(self): + owl_string = self.base_schema.get_as_owl_string(file_format="turtle") + loaded_schema = schema.from_string(owl_string, schema_format="turtle") + + self.assertEqual(loaded_schema, self.base_schema) + + owl_string = self.base_schema.get_as_owl_string(save_merged=True, file_format="turtle") + loaded_schema = schema.from_string(owl_string, 
schema_format="turtle") + + self.assertEqual(loaded_schema, self.base_schema) + + def test_schema2bad_filename(self): + with self.assertRaises(OSError): + self.base_schema.save_as_owl("", file_format="xml") + with self.assertRaises(OSError): + self.base_schema.save_as_owl("/////////", file_format="xml") + + def test_schema2bad_filename_rdf_format(self): + with self.assertRaises(rdflib.plugin.PluginException): + self.base_schema.save_as_owl("valid_filename.invalid_extension") + with self.assertRaises(rdflib.plugin.PluginException): + self.base_schema.save_as_owl("") + with self.assertRaises(rdflib.plugin.PluginException): + self.base_schema.save_as_owl("", file_format="unknown") + + +class TestOwlLibRooted(TestOwlBase): + @classmethod + def setUpClass(cls): + cls.base_schema = schema.load_schema_version("testlib_2.0.0") + + +class TestOwlLib(TestOwlBase): + @classmethod + def setUpClass(cls): + cls.base_schema = schema.load_schema_version("score_1.1.0") diff --git a/tests/schema/test_schema_converters.py b/tests/schema/test_schema_converters.py index 30cacaba..2708f134 100644 --- a/tests/schema/test_schema_converters.py +++ b/tests/schema/test_schema_converters.py @@ -3,6 +3,30 @@ import os from hed import schema +import tempfile +import functools + + +def get_temp_filename(extension): + with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file: + filename = temp_file.name + return filename + +# Function wrapper to create and clean up a single schema for testing +def with_temp_file(extension): + def decorator(test_func): + @functools.wraps(test_func) + def wrapper(*args, **kwargs): + # Create a temporary file with the given extension + filename = get_temp_filename(extension) + try: + # Call the test function with the filename + return test_func(*args, filename=filename, **kwargs) + finally: + # Clean up: Remove the temporary file + os.remove(filename) + return wrapper + return decorator class TestConverterBase(unittest.TestCase): @@ -17,21 +41,17 
@@ def setUpClass(cls): cls.wiki_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.wiki_file) cls.hed_schema_wiki = schema.load_schema(cls.wiki_file) - def test_schema2xml(self): - saved_filename = self.hed_schema_xml.save_as_xml() - try: - loaded_schema = schema.load_schema(saved_filename) - finally: - os.remove(saved_filename) + @with_temp_file(".xml") + def test_schema2xml(self, filename): + self.hed_schema_xml.save_as_xml(filename) + loaded_schema = schema.load_schema(filename) self.assertEqual(loaded_schema, self.hed_schema_xml) - def test_schema2wiki(self): - saved_filename = self.hed_schema_xml.save_as_mediawiki() - try: - loaded_schema = schema.load_schema(saved_filename) - finally: - os.remove(saved_filename) + @with_temp_file(".mediawiki") + def test_schema2wiki(self, filename): + self.hed_schema_xml.save_as_mediawiki(filename) + loaded_schema = schema.load_schema(filename) self.assertEqual(loaded_schema, self.hed_schema_xml) @@ -50,23 +70,19 @@ def test_schema_as_string_wiki(self): string_schema = schema.from_string(hed_schema_as_string, schema_format=".mediawiki") self.assertEqual(string_schema, self.hed_schema_wiki) - def test_wikischema2xml(self): - saved_filename = self.hed_schema_wiki.save_as_xml() - try: - loaded_schema = schema.load_schema(saved_filename) - finally: - os.remove(saved_filename) + @with_temp_file(".xml") + def test_wikischema2xml(self, filename): + self.hed_schema_wiki.save_as_xml(filename) + loaded_schema = schema.load_schema(filename) wiki_schema_copy = copy.deepcopy(self.hed_schema_wiki) self.assertEqual(loaded_schema, wiki_schema_copy) - def test_wikischema2wiki(self): - saved_filename = self.hed_schema_wiki.save_as_mediawiki() - try: - loaded_schema = schema.load_schema(saved_filename) - finally: - os.remove(saved_filename) + @with_temp_file(".mediawiki") + def test_wikischema2wiki(self, filename): + self.hed_schema_wiki.save_as_mediawiki(filename) + loaded_schema = schema.load_schema(filename) 
self.assertEqual(loaded_schema, self.hed_schema_wiki) @@ -159,12 +175,6 @@ class TestDuplicateUnitClass(TestComplianceBase): expected_issues = 1 -# class TestSchemaComplianceOld(TestComplianceBase): -# xml_file = '../data/legacy_xml/HED7.1.1.xml' -# wiki_file = '../data/legacy_xml/HED7.2.0.mediawiki' -# can_compare = False -# expected_issues = 1 - class TestConverterSavingPrefix(unittest.TestCase): xml_file = '../data/schema_tests/HED8.0.0t.xml' @@ -175,11 +185,9 @@ def setUpClass(cls): cls.hed_schema_xml = schema.load_schema(cls.xml_file) cls.hed_schema_xml_prefix = schema.load_schema(cls.xml_file, schema_namespace="tl:") - def test_saving_prefix(self): - saved_filename = self.hed_schema_xml_prefix.save_as_xml() - try: - loaded_schema = schema.load_schema(saved_filename) - finally: - os.remove(saved_filename) + @with_temp_file(".xml") + def test_saving_prefix(self, filename): + self.hed_schema_xml_prefix.save_as_xml(filename) + loaded_schema = schema.load_schema(filename) self.assertEqual(loaded_schema, self.hed_schema_xml) diff --git a/tests/validator/test_onset_validator.py b/tests/validator/test_onset_validator.py index 2b60d391..42e6c958 100644 --- a/tests/validator/test_onset_validator.py +++ b/tests/validator/test_onset_validator.py @@ -312,6 +312,18 @@ def test_onset_two_in_one_line(self): self._test_issues_base(test_strings, test_issues, expected_context, placeholder_def_only=False) + def test_check_for_banned_tags(self): + hed_string = HedString("Event, (Duration/Short, Label/Example)", self.hed_schema) + issues = OnsetValidator.check_for_banned_tags(hed_string) + self.assertEqual(len(issues), 0) + + hed_string = HedString("Onset, (Offset, Event)", self.hed_schema) + issues = OnsetValidator.check_for_banned_tags(hed_string) + self.assertEqual(len(issues), 2) + + hed_string = HedString("(Onset, Duration/Long), Label/Example", self.hed_schema) + issues = OnsetValidator.check_for_banned_tags(hed_string) + self.assertEqual(len(issues), 1) if __name__ == 
'__main__': unittest.main() diff --git a/tests/validator/test_spreadsheet_validator.py b/tests/validator/test_spreadsheet_validator.py index 1b1f57eb..9c0691d4 100644 --- a/tests/validator/test_spreadsheet_validator.py +++ b/tests/validator/test_spreadsheet_validator.py @@ -5,12 +5,14 @@ import unittest from hed import load_schema_version, load_schema from hed.validator import SpreadsheetValidator -from hed import SpreadsheetInput +from hed import TabularInput, SpreadsheetInput +from hed.errors.error_types import ValidationErrors + class TestSpreadsheetValidation(unittest.TestCase): @classmethod def setUpClass(cls): - cls.schema = load_schema_version("8.1.0") + cls.schema = load_schema_version("8.2.0") cls.validator = SpreadsheetValidator(cls.schema) base = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/') cls.base_data_dir = base @@ -45,3 +47,50 @@ def test_basic_validate(self): issues = file_input.validate(self.schema) self.assertTrue(len(issues), 1) + def test_invalid_onset_invalid_column(self): + def_dict = "(Definition/DefaultOnset, (Event))" + base_df = pd.DataFrame({ + 'HED': ["Event, (Age/5, Label/Example)", "Age/1, Label/Example", "Age/3, (Event)"] + }) + + self.df_with_onset = base_df.copy() + self.df_with_onset['onset'] = [1, 2, 3] + self.df_without_onset = base_df.copy() + + # No tags in either of these + issues = self.validator.validate(TabularInput(self.df_without_onset), def_dicts=def_dict) + self.assertEqual(len(issues), 0) + + issues = self.validator.validate(TabularInput(self.df_with_onset), def_dicts=def_dict) + self.assertEqual(len(issues), 1) + self.assertEqual(issues[0]['code'], ValidationErrors.HED_UNKNOWN_COLUMN) + + base_has_tags_df = pd.DataFrame({ + 'HED': ["(Onset, Def/DefaultOnset)", "(Inset, Def/DefaultOnset), (Event, Age/2)", "(Offset, Def/DefaultOnset), (Age/4)"] + }) + + self.df_with_onset_has_tags = base_has_tags_df.copy() + self.df_with_onset_has_tags['onset'] = [1, 2, 3] + self.df_without_onset_has_tags = 
base_has_tags_df.copy() + + issues = self.validator.validate(TabularInput(self.df_without_onset_has_tags), def_dicts=def_dict) + self.assertEqual(len(issues), 3) + self.assertEqual(issues[0]['code'], ValidationErrors.ONSET_OFFSET_INSET_ERROR) + issues = self.validator.validate(TabularInput(self.df_with_onset_has_tags), def_dicts=def_dict) + self.assertEqual(len(issues), 1) + self.assertEqual(issues[0]['code'], ValidationErrors.HED_UNKNOWN_COLUMN) + + base_has_tags_unordered_df = pd.DataFrame({ + 'HED': ["(Onset, Def/DefaultOnset)", "(Offset, Def/DefaultOnset), (Age/4)", "(Inset, Def/DefaultOnset), (Event, Age/2)"] + }) + self.df_with_onset_has_tags_unordered = base_has_tags_unordered_df.copy() + self.df_with_onset_has_tags_unordered['onset'] = [1, 2, 3] + self.df_without_onset_has_tags_unordered = base_has_tags_unordered_df.copy() + + issues = self.validator.validate(TabularInput(self.df_without_onset_has_tags_unordered), def_dicts=def_dict) + self.assertEqual(len(issues), 3) + self.assertEqual(issues[0]['code'], ValidationErrors.ONSET_OFFSET_INSET_ERROR) + issues = self.validator.validate(TabularInput(self.df_with_onset_has_tags_unordered), def_dicts=def_dict) + self.assertEqual(len(issues), 2) + self.assertEqual(issues[0]['code'], ValidationErrors.HED_UNKNOWN_COLUMN) + self.assertEqual(issues[1]['code'], ValidationErrors.ONSET_OFFSET_INSET_ERROR) \ No newline at end of file