Skip to content

Commit

Permalink
Minor tweaks to spreadsheet script/code
Browse files Browse the repository at this point in the history
Better error on no schemas changed
Load an empty spreadsheet if no sheet found for .tsv format (this may need adjustment)
  • Loading branch information
IanCa committed May 23, 2024
1 parent aff55a7 commit 317c381
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 26 deletions.
4 changes: 2 additions & 2 deletions hed/schema/hed_schema_df_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@
# Column-name constants shared by the schema spreadsheet sheets.
subclass_of = "omn:SubClassOf"
attributes = "Attributes"
description = "dc:description"
equivalent_to = "omn:EquivalentTo"  # "omn:" prefix to match subclass_of (was misspelled "owm:")
has_unit_class = "hasUnitClass"

# Column orderings for the structure, tag, and unit sheets.
struct_columns = [hed_id, name, attributes, subclass_of, description]
tag_columns = [hed_id, name, level, subclass_of, attributes, description, equivalent_to]
unit_columns = [hed_id, name, subclass_of, has_unit_class, attributes, description, equivalent_to]

# The columns for unit class, value class, and unit modifier
Expand Down
9 changes: 5 additions & 4 deletions hed/schema/schema_io/df2schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import io
import os

import hed.schema.schema_io.ontology_util
from hed.schema.schema_io import ontology_util
from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.errors.exceptions import HedFileError, HedExceptions
from hed.schema.schema_io.base2schema import SchemaLoader
Expand Down Expand Up @@ -282,7 +282,7 @@ def _get_tag_attributes(self, row_number, row):
dict: Dictionary of attributes.
"""
try:
return hed.schema.schema_io.ontology_util.get_attributes_from_row(row)
return ontology_util.get_attributes_from_row(row)
except ValueError as e:
self._add_fatal_error(row_number, str(row), str(e))

Expand All @@ -297,12 +297,13 @@ def _add_to_dict(self, line_number, line, entry, key_class):

def load_dataframes(filenames):
    """Load the schema spreadsheet files into a dict of dataframes.

    Parameters:
        filenames (str or list or dict): Passed through to
            SchemaLoaderDF.convert_filenames_to_dict to get a {key: filename} mapping.

    Returns:
        dict: {sheet_key: DataFrame}.  Pre-populated with empty dataframes so any
        sheet whose file is missing or unreadable still has a valid blank frame.
    """
    dict_filenames = SchemaLoaderDF.convert_filenames_to_dict(filenames)
    # Start from blank frames so a missing file leaves a valid empty sheet
    # rather than a None entry.
    dataframes = ontology_util.create_empty_dataframes()
    for key, filename in dict_filenames.items():
        try:
            dataframes[key] = pd.read_csv(filename, sep="\t", dtype=str, na_filter=False)
        except OSError:
            # todo: consider if we want to report this error(we probably do)
            pass  # We will use a blank one for this
    return dataframes


Expand Down
16 changes: 16 additions & 0 deletions hed/schema/schema_io/ontology_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,3 +399,19 @@ def get_attributes_from_row(row):
else:
attr_string = ""
return parse_attribute_string(attr_string)


def create_empty_dataframes():
    """Returns the default empty dataframes"""
    # Map each sheet key to its expected column layout, then build one
    # empty string-typed frame per sheet.
    column_sets = {
        constants.STRUCT_KEY: constants.struct_columns,
        constants.TAG_KEY: constants.tag_columns,
        constants.UNIT_KEY: constants.unit_columns,
        constants.UNIT_CLASS_KEY: constants.other_columns,
        constants.UNIT_MODIFIER_KEY: constants.other_columns,
        constants.VALUE_CLASS_KEY: constants.other_columns,
        constants.ANNOTATION_KEY: constants.property_columns,
        constants.DATA_KEY: constants.property_columns,
        constants.OBJECT_KEY: constants.property_columns,
        constants.ATTRIBUTE_PROPERTY_KEY: constants.property_columns_reduced,
    }
    return {key: pd.DataFrame(columns=cols, dtype=str) for key, cols in column_sets.items()}
16 changes: 2 additions & 14 deletions hed/schema/schema_io/schema2df.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Allows output of HedSchema objects as .mediawiki format"""

from hed.schema.hed_schema_constants import HedSectionKey, HedKey
from hed.schema.schema_io.ontology_util import get_library_name_and_id, remove_prefix
from hed.schema.schema_io.ontology_util import get_library_name_and_id, remove_prefix, create_empty_dataframes
from hed.schema.schema_io.schema2base import Schema2Base
import pandas as pd
import hed.schema.hed_schema_df_constants as constants
Expand Down Expand Up @@ -56,18 +56,7 @@ def _get_object_id(self, object_name, base_id=0, include_prefix=False):
# Required baseclass function
# =========================================
def _initialize_output(self):
    """Reset self.output to a fresh set of empty per-sheet dataframes.

    Required baseclass hook, called before conversion begins.  Uses the
    shared helper so the sheet layout stays in sync with df2schema's loader.
    """
    self.output = create_empty_dataframes()
    self._tag_rows = []

def _create_and_add_object_row(self, base_object, attributes="", description=""):
Expand Down Expand Up @@ -327,4 +316,3 @@ def _calculate_attribute_type(attribute_entry):
elif any(attribute in object_ranges for attribute in attributes):
return "object"
return "data"

6 changes: 5 additions & 1 deletion hed/scripts/convert_and_update_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ def convert_and_update(filenames, set_ids):
schema_files = sort_base_schemas(filenames)
all_issues = validate_all_schemas(schema_files)

if all_issues or not schema_files:
if not schema_files:
print("No schema file changes found in the file list")
return 0

if all_issues:
print("Did not attempt to update schemas due to validation failures")
return 1

Expand Down
14 changes: 11 additions & 3 deletions hed/scripts/script_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,20 @@ def validate_schema(file_path):
"""
validation_issues = []
try:
_, extension = os.path.splitext(file_path)
if extension.lower() != extension:
error_message = f"Only fully lowercase extensions are allowed for schema files. " \
f"Invalid extension on file: {file_path}"
validation_issues.append(error_message)
return validation_issues

base_schema = load_schema(file_path)
issues = base_schema.check_compliance()
issues = [issue for issue in issues if issue["code"] != SchemaWarnings.SCHEMA_PRERELEASE_VERSION_USED]
if issues:
error_message = get_printable_issue_string(issues, title=file_path)
validation_issues.append(error_message)
return validation_issues

mediawiki_string = base_schema.get_as_mediawiki_string()
reloaded_schema = from_string(mediawiki_string, schema_format=".mediawiki")
Expand Down Expand Up @@ -47,7 +55,7 @@ def validate_schema(file_path):

def add_extension(basename, extension):
    """Generate the final name for a given extension. Only .tsv varies notably.

    Parameters:
        basename (str): Path/name of the schema, without extension.
        extension (str): The extension to append, e.g. ".xml" or ".tsv".

    Returns:
        str: basename + extension, except .tsv schemas are placed in a
        "hedtsv" subfolder named after the schema instead of getting a suffix.

    Raises:
        TypeError: If extension is None (string concatenation fails).
    """
    # Extension case is validated elsewhere (validate_schema rejects uppercase),
    # so an exact match against ".tsv" is intended here.
    if extension == ".tsv":
        parent_path, basename = os.path.split(basename)
        return os.path.join(parent_path, "hedtsv", basename)
    return basename + extension
Expand All @@ -74,10 +82,10 @@ def sort_base_schemas(filenames):
schema_files = defaultdict(set)
for file_path in filenames:
basename, extension = os.path.splitext(file_path)
if extension.lower() == ".xml" or extension.lower() == ".mediawiki":
if extension == ".xml" or extension == ".mediawiki":
schema_files[basename].add(extension)
continue
elif extension.lower() == ".tsv":
elif extension == ".tsv":
tsv_basename = basename.rpartition("_")[0]
full_parent_path, real_basename = os.path.split(tsv_basename)
full_parent_path, real_basename2 = os.path.split(full_parent_path)
Expand Down
18 changes: 16 additions & 2 deletions tests/scripts/test_script_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import shutil
from hed import load_schema_version
from hed.scripts.script_util import add_extension, sort_base_schemas, validate_all_schema_formats
from hed.scripts.script_util import add_extension, sort_base_schemas, validate_all_schema_formats, validate_schema


class TestAddExtension(unittest.TestCase):
Expand All @@ -25,9 +25,10 @@ def test_empty_extension(self):

def test_none_extension(self):
    """Test behavior with None as extension."""
    # "filename" + None raises TypeError (str + NoneType) in Python 3.
    with self.assertRaises(TypeError):
        add_extension("filename", None)


class TestSortBaseSchemas(unittest.TestCase):
def test_mixed_file_types(self):
filenames = [
Expand Down Expand Up @@ -119,3 +120,16 @@ def test_error_no_error(self):
def tearDownClass(cls):
"""Remove the entire directory created for testing to ensure a clean state."""
shutil.rmtree(cls.base_path) # This will delete the directory and all its contents


class TestValidateSchema(unittest.TestCase):
    def test_load_invalid_extension(self):
        # Extensions containing any uppercase letters must be rejected with
        # the lowercase-only error message.
        marker = "Only fully lowercase extensions "
        for bad_name in ("does_not_matter.MEDIAWIKI", "does_not_matter.Mediawiki",
                         "does_not_matter.XML", "does_not_matter.Xml",
                         "does_not_matter.TSV"):
            self.assertIn(marker, validate_schema(bad_name)[0])
        # Fully lowercase extensions pass the extension check (other issues
        # may still be reported, but not this message).
        for ok_name in ("does_not_matter.tsv", "does_not_matter.xml",
                        "does_not_matter.mediawiki"):
            self.assertNotIn(marker, validate_schema(ok_name)[0])

0 comments on commit 317c381

Please sign in to comment.