Skip to content

Commit

Permalink
Minor tweaks to spreadsheet script/code
Browse files Browse the repository at this point in the history
Better error on no schemas changed
Load an empty spreadsheet if no sheet found for .tsv format (this may need adjustment)
  • Loading branch information
IanCa committed May 23, 2024
1 parent aff55a7 commit 317c381
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 26 deletions.
4 changes: 2 additions & 2 deletions hed/schema/hed_schema_df_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@
# Column-name constants shared by the schema spreadsheet sheets.
subclass_of = "omn:SubClassOf"
attributes = "Attributes"
description = "dc:description"
equivalent_to = "omn:EquivalentTo"  # "omn:" prefix to match subclass_of (was misspelled "owm:")
has_unit_class = "hasUnitClass"

# Column orderings for the structure, tag, and unit sheets.
struct_columns = [hed_id, name, attributes, subclass_of, description]
tag_columns = [hed_id, name, level, subclass_of, attributes, description, equivalent_to]
unit_columns = [hed_id, name, subclass_of, has_unit_class, attributes, description, equivalent_to]

# The columns for unit class, value class, and unit modifier
Expand Down
9 changes: 5 additions & 4 deletions hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import io
import os

import hed.schema.schema_io.ontology_util
from hed.schema.schema_io import ontology_util
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io.base2schema import SchemaLoader
Expand Down Expand Up @@ -282,7 +282,7 @@ def _get_tag_attributes(self, row_number, row):
dict: Dictionary of attributes.
"""
try:
return hed.schema.schema_io.ontology_util.get_attributes_from_row(row)
return ontology_util.get_attributes_from_row(row)
except ValueError as e:
self._add_fatal_error(row_number, str(row), str(e))

Expand All @@ -297,12 +297,13 @@ def _add_to_dict(self, line_number, line, entry, key_class):

def load_dataframes(filenames):
    """Load the schema spreadsheet files into a dict of dataframes.

    Parameters:
        filenames (str or list or dict): Passed through to
            SchemaLoaderDF.convert_filenames_to_dict to get a {key: filename} mapping.

    Returns:
        dict: {sheet_key: DataFrame}.  Pre-populated with empty dataframes so any
        sheet whose file is missing or unreadable still has a valid blank frame.
    """
    dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames)
    # Start from blank frames so a missing file leaves a valid empty sheet
    # rather than a None entry.
    dataframes = ontology_util.create_empty_dataframes()
    for key, filename in dict_filenames.items():
        try:
            dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
        except OSError:
            # todo: consider if we want to report this error(we probably do)
            pass  # We will use a blank one for this
    return dataframes


Expand Down
16 changes: 16 additions & 0 deletions hed/schema/schema_io/ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,3 +399,19 @@ def get_attributes_from_row(row):
else:
attr_string = ""
return parse_attribute_string(attr_string)


def create_empty_dataframes():
    """Returns the default empty dataframes"""
    # Map each sheet key to its expected column layout, then build one
    # empty string-typed frame per sheet.
    column_sets = {
        constants.STRUCT_KEY: constants.struct_columns,
        constants.TAG_KEY: constants.tag_columns,
        constants.UNIT_KEY: constants.unit_columns,
        constants.UNIT_CLASS_KEY: constants.other_columns,
        constants.UNIT_MODIFIER_KEY: constants.other_columns,
        constants.VALUE_CLASS_KEY: constants.other_columns,
        constants.ANNOTATION_KEY: constants.property_columns,
        constants.DATA_KEY: constants.property_columns,
        constants.OBJECT_KEY: constants.property_columns,
        constants.ATTRIBUTE_PROPERTY_KEY: constants.property_columns_reduced,
    }
    return {key: pd.DataFrame(columns=cols, dtype=str) for key, cols in column_sets.items()}
16 changes: 2 additions & 14 deletions hed/schema/schema_io/schema2df.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Allows output of HedSchema objects as .mediawiki format"""

from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.schema.schema_io.ontology_util import get_library_name_and_id, remove_prefix
from hed.schema.schema_io.ontology_util import get_library_name_and_id, remove_prefix, create_empty_dataframes
from hed.schema.schema_io.schema2base import Schema2Base
import pandas as pd
import hed.schema.hed_schema_df_constants as constants
Expand Down Expand Up @@ -56,18 +56,7 @@ def _get_object_id(self, object_name, base_id=0, include_prefix=False):
# Required baseclass function
# =========================================
def _initialize_output(self):
    """Reset self.output to a fresh set of empty per-sheet dataframes.

    Required baseclass hook, called before conversion begins.  Uses the
    shared helper so the sheet layout stays in sync with df2schema's loader.
    """
    self.output = create_empty_dataframes()
    self._tag_rows = []

def _create_and_add_object_row(self, base_object, attributes="", description=""):
Expand Down Expand Up @@ -327,4 +316,3 @@ def _calculate_attribute_type(attribute_entry):
elif any(attribute in object_ranges for attribute in attributes):
return "object"
return "data"

6 changes: 5 additions & 1 deletion hed/scripts/convert_and_update_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ def convert_and_update(filenames, set_ids):
schema_files = sort_base_schemas(filenames)
all_issues = validate_all_schemas(schema_files)

if all_issues or not schema_files:
if not schema_files:
print("No schema file changes found in the file list")
return 0

if all_issues:
print("Did not attempt to update schemas due to validation failures")
return 1

Expand Down
14 changes: 11 additions & 3 deletions hed/scripts/script_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,20 @@ def validate_schema(file_path):
"""
validation_issues = []
try:
_, extension = os.path.splitext(file_path)
if extension.lower() != extension:
error_message = f"Only fully lowercase extensions are allowed for schema files. " \
f"Invalid extension on file: {file_path}"
validation_issues.append(error_message)
return validation_issues

base_schema = load_schema(file_path)
issues = base_schema.check_compliance()
issues = [issue for issue in issues if issue["code"] != SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED]
if issues:
error_message = get_printable_issue_string(issues, title=file_path)
validation_issues.append(error_message)
return validation_issues

mediawiki_string = base_schema.get_as_mediawiki_string()
reloaded_schema = from_string(mediawiki_string, schema_format=".mediawiki")
Expand Down Expand Up @@ -47,7 +55,7 @@ def validate_schema(file_path):

def add_extension(basename, extension):
    """Generate the final name for a given extension. Only .tsv varies notably.

    Parameters:
        basename (str): Path/name of the schema, without extension.
        extension (str): The extension to append, e.g. ".xml" or ".tsv".

    Returns:
        str: basename + extension, except .tsv schemas are placed in a
        "hedtsv" subfolder named after the schema instead of getting a suffix.

    Raises:
        TypeError: If extension is None (string concatenation fails).
    """
    # Extension case is validated elsewhere (validate_schema rejects uppercase),
    # so an exact match against ".tsv" is intended here.
    if extension == ".tsv":
        parent_path, basename = os.path.split(basename)
        return os.path.join(parent_path, "hedtsv", basename)
    return basename + extension
Expand All @@ -74,10 +82,10 @@ def sort_base_schemas(filenames):
schema_files = defaultdict(set)
for file_path in filenames:
basename, extension = os.path.splitext(file_path)
if extension.lower() == ".xml" or extension.lower() == ".mediawiki":
if extension == ".xml" or extension == ".mediawiki":
schema_files[basename].add(extension)
continue
elif extension.lower() == ".tsv":
elif extension == ".tsv":
tsv_basename = basename.rpartition("_")[0]
full_parent_path, real_basename = os.path.split(tsv_basename)
full_parent_path, real_basename2 = os.path.split(full_parent_path)
Expand Down
18 changes: 16 additions & 2 deletions tests/scripts/test_script_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import shutil
from hed import load_schema_version
from hed.scripts.script_util import add_extension, sort_base_schemas, validate_all_schema_formats
from hed.scripts.script_util import add_extension, sort_base_schemas, validate_all_schema_formats, validate_schema


class TestAddExtension(unittest.TestCase):
Expand All @@ -25,9 +25,10 @@ def test_empty_extension(self):

def test_none_extension(self):
    """Test behavior with None as extension."""
    # "filename" + None raises TypeError (str + NoneType) in Python 3.
    with self.assertRaises(TypeError):
        add_extension("filename", None)


class TestSortBaseSchemas(unittest.TestCase):
def test_mixed_file_types(self):
filenames = [
Expand Down Expand Up @@ -119,3 +120,16 @@ def test_error_no_error(self):
def tearDownClass(cls):
"""Remove the entire directory created for testing to ensure a clean state."""
shutil.rmtree(cls.base_path) # This will delete the directory and all its contents


class TestValidateSchema(unittest.TestCase):
    def test_load_invalid_extension(self):
        # Extensions containing any uppercase letters must be rejected with
        # the lowercase-only error message.
        marker = "Only fully lowercase extensions "
        for bad_name in ("does_not_matter.MEDIAWIKI", "does_not_matter.Mediawiki",
                         "does_not_matter.XML", "does_not_matter.Xml",
                         "does_not_matter.TSV"):
            self.assertIn(marker, validate_schema(bad_name)[0])
        # Fully lowercase extensions pass the extension check (other issues
        # may still be reported, but not this message).
        for ok_name in ("does_not_matter.tsv", "does_not_matter.xml",
                        "does_not_matter.mediawiki"):
            self.assertNotIn(marker, validate_schema(ok_name)[0])

0 comments on commit 317c381

Please sign in to comment.