Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bugs/improve unit and def validation code #805

Merged
merged 2 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions hed/models/hed_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,12 @@ def _calculate_to_canonical_forms(self, hed_schema):

return tag_issues

def get_stripped_unit_value(self):
def get_stripped_unit_value(self, extension_text):
""" Return the extension divided into value and units, if the units are valid.

Parameters:
extension_text (str): The text to split, in case it's a portion of a tag.

Returns:
stripped_unit_value (str): The extension portion with the units removed.
unit (str or None): None if no valid unit found.
Expand All @@ -328,7 +331,7 @@ def get_stripped_unit_value(self):

"""
tag_unit_classes = self.unit_classes
stripped_value, unit, _ = self._get_tag_units_portion(tag_unit_classes)
stripped_value, unit, _ = HedTag._get_tag_units_portion(extension_text, tag_unit_classes)
if stripped_value:
return stripped_value, unit

Expand All @@ -354,7 +357,7 @@ def value_as_default_unit(self):
unit_entry = self.default_unit
unit = unit_entry.name
else:
stripped_value, unit, unit_entry = self._get_tag_units_portion(tag_unit_classes)
stripped_value, unit, unit_entry = HedTag._get_tag_units_portion(self.extension, tag_unit_classes)

if stripped_value:
if unit_entry.get_conversion_factor(unit) is not None:
Expand Down Expand Up @@ -544,7 +547,8 @@ def _get_schema_namespace(org_tag):
return org_tag[:first_colon + 1]
return ""

def _get_tag_units_portion(self, tag_unit_classes):
@staticmethod
def _get_tag_units_portion(extension_text, tag_unit_classes):
""" Check that this string has valid units and remove them.

Parameters:
Expand All @@ -555,19 +559,19 @@ def _get_tag_units_portion(self, tag_unit_classes):
This is filled in if there are no units as well.
unit (UnitEntry or None): The matching unit entry if one is found
"""
value, _, units = self.extension.rpartition(" ")
value, _, units = extension_text.rpartition(" ")
if not units:
return None, None, None

for unit_class_entry in tag_unit_classes.values():
all_valid_unit_permutations = unit_class_entry.derivative_units

possible_match = self._find_modifier_unit_entry(units, all_valid_unit_permutations)
possible_match = HedTag._find_modifier_unit_entry(units, all_valid_unit_permutations)
if possible_match and not possible_match.has_attribute(HedKey.UnitPrefix):
return value, units, possible_match

# Repeat the above, but as a prefix
possible_match = self._find_modifier_unit_entry(value, all_valid_unit_permutations)
possible_match = HedTag._find_modifier_unit_entry(value, all_valid_unit_permutations)
if possible_match and possible_match.has_attribute(HedKey.UnitPrefix):
return units, value, possible_match

Expand Down
11 changes: 0 additions & 11 deletions hed/schema/hed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -753,17 +753,6 @@ def _get_attributes_for_section(self, key_class):
# Semi private function used to create a schema in memory(usually from a source file)
# ===============================================
def _add_tag_to_dict(self, long_tag_name, new_entry, key_class):
# Add the InLibrary attribute to any library schemas as they are loaded
# These are later removed when they are saved out, if saving unmerged
# if self.library and (not self.with_standard or (not self.merged and self.with_standard)):
# # only add it if not already present - This is a rare case
# Todo ian: I think this should be moved up one level for parity with the other loading changes
# .library will be updated to potentially be a list
# Cannot save schema if .library is a list
#
# if not new_entry.has_attribute(HedKey.InLibrary):
# new_entry._set_attribute_value(HedKey.InLibrary, self.library)

section = self._sections[key_class]
return section._add_to_dict(long_tag_name, new_entry)

Expand Down
64 changes: 36 additions & 28 deletions hed/validator/def_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,37 +35,14 @@ def validate_def_tags(self, hed_string_obj, hed_validator=None):
if self._label_tag_name not in hed_string_lower:
return []

# This is needed primarily to validate the contents of a def-expand matches the default.
def_issues = []
# We need to check for labels to expand in ALL groups
for def_tag, def_expand_group, def_group in hed_string_obj.find_def_tags(recursive=True):
def_issues += self._validate_def_contents(def_tag, def_expand_group, hed_validator)

return def_issues

@staticmethod
def _validate_def_units(def_tag, placeholder_tag, hed_validator, is_def_expand_tag):
"""Validate units and value classes on def/def-expand tags

Parameters:
def_tag(HedTag): The source tag
placeholder_tag(HedTag): The placeholder tag this def fills in
hed_validator(HedValidator): Used to validate the units/values
is_def_expand_tag(bool): If the given def_tag is a def-expand tag or not.

Returns:
issues(list): Issues found from validating placeholders.
"""
def_issues = []
error_code = ValidationErrors.DEF_INVALID
if is_def_expand_tag:
error_code = ValidationErrors.DEF_EXPAND_INVALID

def_issues += hed_validator.validate_units(placeholder_tag,
report_as=def_tag,
error_code=error_code)

return def_issues

@staticmethod
def _report_missing_or_invalid_value(def_tag, def_entry, is_def_expand_tag):
"""Returns the correct error for this type of def tag
Expand Down Expand Up @@ -121,15 +98,46 @@ def _validate_def_contents(self, def_tag, def_expand_group, hed_validator):
def_issues += ErrorHandler.format_error(ValidationErrors.HED_DEF_EXPAND_INVALID,
tag=def_tag, actual_def=def_contents,
found_def=def_expand_group)
if def_entry.takes_value and hed_validator:
placeholder_tag = def_contents.get_first_group().find_placeholder_tag()
def_issues += self._validate_def_units(def_tag, placeholder_tag, hed_validator,
is_def_expand_tag)
else:
def_issues += self._report_missing_or_invalid_value(def_tag, def_entry, is_def_expand_tag)

return def_issues

def validate_def_value_units(self, def_tag, hed_validator):
"""Equivalent to HedValidator.validate_units for the special case of a Def or Def-expand tag"""
tag_label, _, placeholder = def_tag.extension.partition('/')
is_def_expand_tag = def_tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY

def_entry = self.defs.get(tag_label.lower())
# These errors will be caught as can't match definition
if def_entry is None:
return []

error_code = ValidationErrors.DEF_INVALID
if is_def_expand_tag:
error_code = ValidationErrors.DEF_EXPAND_INVALID

def_issues = []

# Validate the def name vs the name class
def_issues += hed_validator.validate_units(def_tag,
tag_label,
error_code=error_code)

def_contents = def_entry.get_definition(def_tag, placeholder_value=placeholder, return_copy_of_tag=True)
if def_contents and def_entry.takes_value and hed_validator:
placeholder_tag = def_contents.get_first_group().find_placeholder_tag()
# Handle the case where they're adding a unit as part of a placeholder. eg Speed/# mph
if placeholder_tag:
placeholder = placeholder_tag.extension
def_issues += hed_validator.validate_units(placeholder_tag,
placeholder,
report_as=def_tag,
error_code=error_code,
index_offset=len(tag_label) + 1)

return def_issues

def validate_onset_offset(self, hed_string_obj):
""" Validate onset/offset

Expand Down
25 changes: 19 additions & 6 deletions hed/validator/hed_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,30 +140,40 @@ def check_tag_formatting(self, original_tag):

return validation_issues

def validate_units(self, original_tag, report_as=None, error_code=None):
def validate_units(self, original_tag, validate_text=None, report_as=None, error_code=None,
index_offset=0):
"""Validate units and value classes

Parameters:
original_tag(HedTag): The source tag
validate_text (str): the text we want to validate, if not the full extension.
report_as(HedTag): Report the error tag as coming from a different one.
Mostly for definitions that expand.
error_code(str): The code to override the error as. Again mostly for def/def-expand tags.
index_offset(int): Offset into the extension validate_text starts at

Returns:
issues(list): Issues found from units
"""
if validate_text is None:
validate_text = original_tag.extension
issues = []
if original_tag.is_unit_class_tag():
issues += self._unit_validator.check_tag_unit_class_units_are_valid(original_tag,
validate_text,
report_as=report_as,
error_code=error_code)
error_code=error_code,
index_offset=index_offset)
elif original_tag.is_value_class_tag():
issues += self._unit_validator.check_tag_value_class_valid(original_tag,
validate_text,
report_as=report_as,
error_code=error_code)
# todo: potentially make this one have a report_as
error_code=error_code,
index_offset=index_offset)
elif original_tag.extension:
issues += self._char_validator.check_for_invalid_extension_chars(original_tag)
issues += self._char_validator.check_for_invalid_extension_chars(original_tag,
validate_text,
index_offset=index_offset)

return issues

Expand Down Expand Up @@ -198,6 +208,9 @@ def _validate_individual_tags_in_hed_string(self, hed_string_obj, allow_placehol
run_individual_tag_validators(hed_tag,
allow_placeholders=allow_placeholders,
is_definition=is_definition)
validation_issues += self.validate_units(hed_tag)
if hed_tag.short_base_tag == DefTagNames.DEF_ORG_KEY or hed_tag.short_base_tag == DefTagNames.DEF_EXPAND_ORG_KEY:
validation_issues += self._def_validator.validate_def_value_units(hed_tag, self)
else:
validation_issues += self.validate_units(hed_tag)

return validation_issues
16 changes: 11 additions & 5 deletions hed/validator/tag_util/char_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,23 +54,28 @@ def check_tag_invalid_chars(self, original_tag, allow_placeholders):
validation_issues += self._check_invalid_chars(original_tag.org_base_tag, allowed_chars, original_tag)
return validation_issues

def check_for_invalid_extension_chars(self, original_tag):
def check_for_invalid_extension_chars(self, original_tag, validate_text, error_code=None,
index_offset=0):
"""Report invalid characters in extension/value.

Parameters:
original_tag (HedTag): The original tag that is used to report the error.
validate_text (str): the text we want to validate, if not the full extension.
error_code(str): The code to override the error as. Again mostly for def/def-expand tags.
index_offset(int): Offset into the extension validate_text starts at

Returns:
list: Validation issues. Each issue is a dictionary.
"""
allowed_chars = self.TAG_ALLOWED_CHARS
allowed_chars += self.DEFAULT_ALLOWED_PLACEHOLDER_CHARS
allowed_chars += " "
return self._check_invalid_chars(original_tag.extension, allowed_chars, original_tag,
starting_index=len(original_tag.org_base_tag) + 1)
return self._check_invalid_chars(validate_text, allowed_chars, original_tag,
starting_index=len(original_tag.org_base_tag) + 1 + index_offset,
error_code=error_code)

@staticmethod
def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index=0):
def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index=0, error_code=None):
validation_issues = []
for i, character in enumerate(check_string):
if character.isalnum():
Expand All @@ -82,7 +87,8 @@ def _check_invalid_chars(check_string, allowed_chars, source_tag, starting_index
continue
validation_issues += ErrorHandler.format_error(ValidationErrors.INVALID_TAG_CHARACTER,
tag=source_tag, index_in_tag=starting_index + i,
index_in_tag_end=starting_index + i + 1)
index_in_tag_end=starting_index + i + 1,
actual_error=error_code)
return validation_issues

@staticmethod
Expand Down
33 changes: 21 additions & 12 deletions hed/validator/tag_util/class_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@

from hed.errors.error_reporter import ErrorHandler
from hed.errors.error_types import ValidationErrors
from hed.schema.hed_schema_constants import HedKey
import functools


class UnitValueValidator:
Expand All @@ -18,6 +16,7 @@ class UnitValueValidator:
DIGIT_OR_POUND_EXPRESSION = r'^(-?[\d.]+(?:e-?\d+)?|#)$'

VALUE_CLASS_ALLOWED_CACHE=20

def __init__(self, value_validators=None):
""" Validates the unit and value classes on a given tag.

Expand All @@ -39,28 +38,31 @@ def _get_default_value_class_validators(self):

return validator_dict

def check_tag_unit_class_units_are_valid(self, original_tag, report_as=None, error_code=None):
def check_tag_unit_class_units_are_valid(self, original_tag, validate_text, report_as=None, error_code=None,
index_offset=0):
""" Report incorrect unit class or units.

Parameters:
original_tag (HedTag): The original tag that is used to report the error.
validate_text (str): The text to validate
report_as (HedTag): Report errors as coming from this tag, rather than original_tag.
error_code (str): Override error codes
Returns:
list: Validation issues. Each issue is a dictionary.
"""
validation_issues = []
if original_tag.is_unit_class_tag():
stripped_value, unit = original_tag.get_stripped_unit_value()
stripped_value, unit = original_tag.get_stripped_unit_value(validate_text)
if not unit:
# Todo: in theory this should separately validate the number and the units, for units
# that are prefixes like $. Right now those are marked as unit invalid AND value_invalid.
bad_units = " " in original_tag.extension
bad_units = " " in validate_text

if bad_units:
stripped_value = stripped_value.split(" ")[0]

validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code)
validation_issues += self._check_value_class(original_tag, stripped_value, report_as, error_code,
index_offset)
validation_issues += self._check_units(original_tag, bad_units, report_as)

# We don't want to give this overall error twice
Expand All @@ -71,17 +73,21 @@ def check_tag_unit_class_units_are_valid(self, original_tag, report_as=None, err

return validation_issues

def check_tag_value_class_valid(self, original_tag, report_as=None, error_code=None):
def check_tag_value_class_valid(self, original_tag, validate_text, report_as=None, error_code=None,
index_offset=0):
""" Report an invalid value portion.

Parameters:
original_tag (HedTag): The original tag that is used to report the error.
validate_text (str): The text to validate
report_as (HedTag): Report errors as coming from this tag, rather than original_tag.
error_code (str): Override error codes
index_offset(int): Offset into the extension validate_text starts at

Returns:
list: Validation issues.
"""
return self._check_value_class(original_tag, original_tag.extension, report_as, error_code)
return self._check_value_class(original_tag, validate_text, report_as, error_code, index_offset)

# char_sets = {
# "letters": set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
Expand Down Expand Up @@ -118,17 +124,20 @@ def _get_problem_indexes(self, original_tag, stripped_value):
# indexes = [index for index, char in enumerate(stripped_value) if char not in allowed_characters]
# pass

def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None):
def _check_value_class(self, original_tag, stripped_value, report_as, error_code=None, index_offset=0):
"""Returns any issues found if this is a value tag"""
# todo: This function needs to check for allowed characters, not just {}
validation_issues = []
if original_tag.is_takes_value_tag():
report_as = report_as if report_as else original_tag
problem_indexes = self._get_problem_indexes(original_tag, stripped_value)
for char, index in problem_indexes:
error_code = ValidationErrors.CURLY_BRACE_UNSUPPORTED_HERE \
if char in "{}" else ValidationErrors.INVALID_TAG_CHARACTER
validation_issues += ErrorHandler.format_error(error_code,
tag_code = ValidationErrors.CURLY_BRACE_UNSUPPORTED_HERE if (
char in "{}") else ValidationErrors.INVALID_TAG_CHARACTER

index_adj = len(report_as.org_base_tag) - len(original_tag.org_base_tag)
index += index_adj + index_offset
validation_issues += ErrorHandler.format_error(tag_code,
tag=report_as, index_in_tag=index,
index_in_tag_end=index + 1)
if not self._validate_value_class_portion(original_tag, stripped_value):
Expand Down
Loading