From c5f03860d49753f0fb394cf141e64bce5f789726 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 21 Sep 2023 12:39:20 -0500 Subject: [PATCH 01/21] Add a string based search first pass Update query parser from [[]] to {} notation and clean up some Minor bug fixes in query parser --- hed/models/base_input.py | 2 +- hed/models/basic_search.py | 237 +++++++++++++++++++ hed/models/df_util.py | 26 ++ hed/models/expression_parser.py | 159 +++++++------ tests/models/test_base_input.py | 27 ++- tests/models/test_basic_search.py | 313 +++++++++++++++++++++++++ tests/models/test_expression_parser.py | 118 +++++++--- tests/schema/test_hed_schema_io.py | 4 + 8 files changed, 756 insertions(+), 130 deletions(-) create mode 100644 hed/models/basic_search.py create mode 100644 tests/models/test_basic_search.py diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 12e2d889..9f437102 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -137,7 +137,7 @@ def _indexed_dict_from_onsets(onsets): @staticmethod def _filter_by_index_list(original_series, indexed_dict): - new_series = ["n/a"] * len(original_series) # Initialize new_series with "n/a" + new_series = pd.Series(["n/a"] * len(original_series)) for onset, indices in indexed_dict.items(): if indices: diff --git a/hed/models/basic_search.py b/hed/models/basic_search.py new file mode 100644 index 00000000..ae47b71e --- /dev/null +++ b/hed/models/basic_search.py @@ -0,0 +1,237 @@ +import re +from itertools import combinations, product +from collections import defaultdict +import pandas as pd + + +def find_matching(series, search_string, regex=False): + """ Finds lines in the series that match the search string and returns a mask. + + Syntax Rules: + - '@': Prefixing a term in the search string means the object must appear anywhere within a line. + - Parentheses: Elements within parentheses must appear in the line with the same level of nesting. + eg: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't + start in the same group. + - "LongFormTag*": A * will match any remaining word(anything but a comma or parenthesis) + - An individual term can be arbitrary regex, but it is limited to single continuous words. + + Notes: + - The format of the series should match the format of the search string, whether it's in short or long form. + - To enable support for matching parent tags, ensure that both the series and search string are in long form. + + Args: + series (pd.Series): A Pandas Series object containing the lines to be searched. + search_string (str): The string to search for in each line of the series. + regex (bool): By default, translate any * wildcard characters to .*? regex + If True, do no translation and pass the words as is. Due to how it's setup, you must not include + the following characters: (), + + Returns: + mask (pd.Series): A Boolean mask Series of the same length as the input series. + The mask has `True` for lines that match the search string and `False` otherwise. + """ + if not regex: + # Replace *'s with a reasonable value for people who don't know regex + search_string = re.sub(r'(?<!\.)\*', '.*?', search_string) + anywhere_words, specific_words = find_words(search_string) + delimiter_map = construct_delimiter_map(search_string, specific_words) + source_words = anywhere_words + specific_words + + # Create a set of series of masks to determine which rows contain each individual word + candidate_indexes = set(series.index) + + # Loop through source_words to filter down candidate_indexes + for word in source_words: + matches = series.str.contains(word, regex=True) + current_word_indexes = set(matches[matches].index.tolist()) + + # Update candidate_indexes by taking the intersection with current_word_indexes + candidate_indexes &= current_word_indexes + + if not candidate_indexes: + break + + candidate_indexes = sorted(candidate_indexes) + + full_mask = pd.Series(False, index=series.index) + + candidate_series = series[candidate_indexes] + + mask = candidate_series.apply(verify_search_delimiters, args=(anywhere_words, specific_words, delimiter_map)) + full_mask.loc[candidate_indexes] = mask + + return full_mask + + +def find_words(search_string): + """ Extract all words in the search string. Dividing them into words that must be relative to each other, + and words that can be anywhere. + + Args: + search_string (str): The search query string to parse. + Words prefixed with '@' are 'anywhere' words. + + Returns: + tuple: A tuple containing two lists: + - anywhere_words (list of str): Words that can appear anywhere in the text. + - specific_words (list of str): Words that must appear relative to other terms. + """ + # Match sequences of characters that are not commas, parentheses, or standalone spaces. + pattern = r'[^,()]+' + words = re.findall(pattern, search_string) + + # Remove any extraneous whitespace from each word + words = [word.strip() for word in words if word.strip()] + + anywhere_words = [word[1:] for word in words if word.startswith("@")] + specific_words = [word for word in words if not word.startswith("@")] + + return anywhere_words, specific_words + + +def check_parentheses(text): + """ Checks for balanced parentheses in the given text and returns the unbalanced ones. + + Args: + text (str): The text to be checked for balanced parentheses. + + Returns: + str: A string containing the unbalanced parentheses in their original order. + + Notes: + - The function only considers the characters '(' and ')' for balancing. + - Balanced pairs of parentheses are removed, leaving behind only the unbalanced ones. + + """ + # Extract all parentheses from the text + all_parentheses = ''.join(re.findall('[()]', text)) + + stack = [] + remaining_parentheses = [] + + # Loop through all parentheses and find balanced ones + for p in all_parentheses: + if p == '(': + stack.append(p) + elif p == ')' and stack: + stack.pop() + else: + remaining_parentheses.append(p) + + # Add unbalanced ( back to remaining parentheses + remaining_parentheses.extend(stack) + + return ''.join(remaining_parentheses) + + +def reverse_and_flip_parentheses(s): + """ Reverses a string and flips the parentheses. + + Args: + s (str): The string to be reversed and have its parentheses flipped. + + Returns: + str: The reversed string with flipped parentheses. + + Notes: + - The function takes into account only the '(' and ')' characters for flipping. + """ + # Reverse the string + reversed_s = s[::-1] + + # Flip the parentheses directly in the reversed string + flipped_s = reversed_s.translate(str.maketrans("()", ")(")) + return flipped_s + + +def construct_delimiter_map(text, words): + """ Takes an input search query and list of words, returning the parenthetical delimiters between them. + + Args: delimiter + text (str): The search query + words(list): A list of words we want to map between from the query + + Returns: + dict: The two-way delimiter map + """ + locations = {} + # Find the locations of each word in the text + for word in words: + for match in re.finditer(r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)', text): + start_index = match.start(1) + end_index = match.end(1) + match_length = end_index - start_index + locations[start_index] = (word, match_length) + + sorted_locations = sorted(locations.items()) + + delimiter_map = {} + # Use combinations to get every combination of two words in order + for (start1, (word1, length1)), (start2, (word2, length2)) in combinations(sorted_locations, 2): + end1 = start1 + length1 + delimiter_text = text[end1:start2] + delimiter_map[(word1, word2)] = check_parentheses(delimiter_text) + + # Add the reversed version of the above + reverse_map = {(word2, word1): reverse_and_flip_parentheses(delimiter_text) for ((word1, word2), delimiter_text) in + delimiter_map.items()} + delimiter_map.update(reverse_map) + + return delimiter_map + + +def verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map): + """ Verifies if the text contains specific words with expected delimiters between them. + + Args: + text (str): The text to search in. + anywhere_words (list of str): Words that can appear anywhere in the text. + specific_words (list of str): Words that must appear relative to other words in the text + delimiter_map (dict): A dictionary specifying expected delimiters between pairs of specific words. + + Returns: + bool: True if all conditions are met, otherwise False. + """ + locations = defaultdict(list) + + # Check for anywhere words + for word in anywhere_words: + pattern = r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)' + if not any(re.finditer(pattern, text)): + return False + + # Find all locations for each word in the text + for word in specific_words: + for match in re.finditer(r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)', text): + start_index = match.start(1) + matched_word = match.group(1) + locations[word].append((start_index, len(matched_word), word)) + + if len(locations) != len(specific_words): + return False + + # Generate all possible combinations of word sequences + # this covers cases where the same tag is found twice, and you need to check both + for sequence in product(*locations.values()): + sorted_sequence = sorted(sequence) + + # Check if the delimiters for this sequence match the expected delimiters + valid = True + for i in range(len(sorted_sequence) - 1): + start1, len1, word1 = sorted_sequence[i] + start2, len2, word2 = sorted_sequence[i + 1] + + end1 = start1 + len1 + delimiter_text = text[end1:start2] + + found_delimiter = check_parentheses(delimiter_text) + expected_delimiter = delimiter_map.get((word1, word2), None) + + if found_delimiter != expected_delimiter: + valid = False + break + + if valid: + return True # Return True if any sequence is valid + + return False # Return False if no valid sequence is found diff --git a/hed/models/df_util.py b/hed/models/df_util.py index 0a9373d1..2449e8a7 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -120,6 +120,26 @@ def expand_defs(df, hed_schema, def_dict, columns=None): df.loc[mask, column] = df.loc[mask, column].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict)) +def sort_strings(df, hed_schema, tag_form="short_tag", columns=None): + """ Expands any def tags found in the dataframe. + + Converts in place + + Parameters: + df (pd.Dataframe or pd.Series): The dataframe or series to modify + hed_schema (HedSchema or None): The schema to use to identify defs + columns (list or None): The columns to modify on the dataframe + """ + if isinstance(df, pd.Series): + df[:] = df.apply(partial(_sort, hed_schema=hed_schema, tag_form=tag_form)) + else: + if columns is None: + columns = df.columns + + for column in columns: + df.loc[column] = df.loc[column].apply(partial(_sort, hed_schema=hed_schema, tag_form=tag_form)) + + def _convert_to_form(hed_string, hed_schema, tag_form): return str(HedString(hed_string, hed_schema).get_as_form(tag_form)) @@ -132,6 +152,12 @@ def _expand_defs(hed_string, hed_schema, def_dict): return str(HedString(hed_string, hed_schema, def_dict).expand_defs()) +def _sort(hed_string, hed_schema, tag_form): + sorted_string = HedString(hed_string, hed_schema) + sorted_string.sort() + return sorted_string.get_as_form(tag_form) + + def process_def_expands(hed_strings, hed_schema, known_defs=None, ambiguous_defs=None): """ Gather def-expand tags in the strings/compare with known definitions to find any differences diff --git a/hed/models/expression_parser.py b/hed/models/expression_parser.py index 736ed625..ab56aa76 100644 --- a/hed/models/expression_parser.py +++ b/hed/models/expression_parser.py @@ -1,7 +1,7 @@ import re -class search_result: +class SearchResult: def __init__(self, group, tag): self.group = group # todo: rename tag: children @@ -12,7 +12,7 @@ def __init__(self, group, tag): self.tags = new_tags def __eq__(self, other): - if isinstance(other, search_result): + if isinstance(other, SearchResult): return self.group == other.group return other == self.group @@ -27,7 +27,7 @@ def merge_result(self, other): if self.group != other.group: raise ValueError("Internal error") - return search_result(self.group, new_tags) + return SearchResult(self.group, new_tags) def has_same_tags(self, other): if self.group != other.group: @@ -53,8 +53,6 @@ def get_groups_only(self): class Token: And = 0 Tag = 1 - ContainingGroup = 2 - ContainingGroupEnd = 3 DescendantGroup = 4 DescendantGroupEnd = 5 Or = 6 @@ -72,8 +70,6 @@ def __init__(self, text): ",": Token.And, "and": Token.And, "or": Token.Or, - "[[": Token.ContainingGroup, - "]]": Token.ContainingGroupEnd, "[": Token.DescendantGroup, "]": Token.DescendantGroupEnd, "(": Token.LogicalGroup, @@ -116,6 +112,17 @@ def __init__(self, token, left=None, right=None): self._match_mode = 2 token.text = token.text.replace("*", "") + def _get_parent_groups(self, search_results): + found_parent_groups = [] + if search_results: + for group in search_results: + if not group.group.is_group: + continue + if group.group._parent: + found_parent_groups.append(SearchResult(group.group._parent, group.group)) + + return found_parent_groups + def __str__(self): output_str = "" if self.left: @@ -138,16 +145,16 @@ def handle_expr(self, hed_group, exact=False): if groups_found: groups_found = [] else: - groups_found = [(None, group) for group in hed_group.get_all_groups()] + groups_found = [([], group) for group in hed_group.get_all_groups()] # If we're checking for all groups, also need to add parents. if exact: - all_found_groups = [search_result(group, tag) for tag, group in groups_found] + all_found_groups = [SearchResult(group, tag) for tag, group in groups_found] else: all_found_groups = [] for tag, group in groups_found: while group: - all_found_groups.append(search_result(group, tag)) + all_found_groups.append(SearchResult(group, tag)) # This behavior makes it eat higher level groups at higher levels tag = group group = group._parent @@ -170,7 +177,7 @@ def merge_groups(groups1, groups2): for other_group in groups2: if group.group is other_group.group: # At this point any shared tags between the two groups invalidates it. - if any(tag is tag2 for tag in group.tags for tag2 in other_group.tags): + if any(tag is tag2 and tag is not None for tag in group.tags for tag2 in other_group.tags): continue merged_result = group.merge_result(other_group) @@ -221,7 +228,7 @@ def handle_expr(self, hed_group, exact=False): # Wildcards are only found in containing groups. I believe this is correct. # todo: Is this code still needed for this kind of wildcard? We already are registering every group, just not # every group at every level. - all_found_groups = [search_result(group, tag) for tag, group in groups_found] + all_found_groups = [SearchResult(group, tag) for tag, group in groups_found] return all_found_groups @@ -260,73 +267,51 @@ def handle_expr(self, hed_group, exact=False): # Todo: this may need more thought with respects to wildcards and negation # negated_groups = [group for group in hed_group.get_all_groups() if group not in groups] # This simpler version works on python >= 3.9 - # negated_groups = [search_result(group, []) for group in hed_group.get_all_groups() if group not in groups] + # negated_groups = [SearchResult(group, []) for group in hed_group.get_all_groups() if group not in groups] # Python 3.7/8 compatible version. - negated_groups = [search_result(group, []) for group in hed_group.get_all_groups() + negated_groups = [SearchResult(group, []) for group in hed_group.get_all_groups() if not any(group is found_group.group for found_group in found_groups)] return negated_groups -class ExpressionContainingGroup(Expression): - def handle_expr(self, hed_group, exact=False): - result = self.right.handle_expr(hed_group, exact=True) - found_groups = result - if result: - found_parent_groups = [] - for group in found_groups: - if not group.group.is_group: - continue - if group.group._parent: - found_parent_groups.append(search_result(group.group._parent, group.group)) - - if found_parent_groups: - return found_parent_groups - - return [] - - class ExpressionDescendantGroup(Expression): def handle_expr(self, hed_group, exact=False): found_groups = self.right.handle_expr(hed_group) - found_parent_groups = [] - if found_groups: - for group in found_groups: - if not group.group.is_group: - continue - if group.group._parent: - found_parent_groups.append(search_result(group.group._parent, group.group)) - - if found_parent_groups: - return found_parent_groups - return [] + found_parent_groups = self._get_parent_groups(found_groups) + return found_parent_groups class ExpressionExactMatch(Expression): + def __init__(self, token, left=None, right=None): + super().__init__(token, left, right) + self.optional = "any" + + def _filter_exact_matches(self, search_results): + filtered_list = [] + for group in search_results: + if len(group.group.children) == len(group.tags): + filtered_list.append(group) + + return filtered_list + def handle_expr(self, hed_group, exact=False): found_groups = self.right.handle_expr(hed_group, exact=True) - if found_groups: - return_list = [] - for group in found_groups: - if len(group.group.children) == len(group.tags): - return_list.append(group) + if self.optional == "any": + return self._get_parent_groups(found_groups) - if return_list: - return return_list + filtered_list = self._filter_exact_matches(found_groups) + if filtered_list: + return self._get_parent_groups(filtered_list) # Basically if we don't have an exact match above, do the more complex matching including optional if self.left: optional_groups = self.left.handle_expr(hed_group, exact=True) found_groups = ExpressionAnd.merge_groups(found_groups, optional_groups) - if found_groups: - return_list = [] - for group in found_groups: - if len(group.group.children) == len(group.tags): - return_list.append(group) - - if return_list: - return return_list + filtered_list = self._filter_exact_matches(found_groups) + if filtered_list: + return self._get_parent_groups(filtered_list) return [] @@ -337,7 +322,6 @@ class QueryParser: def __init__(self, expression_string): """Compiles a QueryParser for a particular expression, so it can be used to search hed strings. - Basic Input Examples: 'Event' - Finds any strings with Event, or a descendent tag of Event such as Sensory-event @@ -354,11 +338,15 @@ def __init__(self, expression_string): '[Event and Action]' - Find a group that contains both Event and Action(at any level) - '[[Event and Action]]' - Find a group with Event And Action at the same level. + '{Event and Action}' - Find a group with Event And Action at the same level. + + '{Event and Action:}' - Find a group with Event And Action at the same level, and nothing else + + '{Event and Action:Agent}' - Find a group with Event And Action at the same level, and optionally an Agent tag. Practical Complex Example: - [[{(Onset or Offset), (Def or [[Def-expand]]): ???}]] - A group with an onset tag, + {(Onset or Offset), (Def or {Def-expand}): ???} - A group with an onset tag, a def tag or def-expand group, and an optional wildcard group Parameters: @@ -392,15 +380,22 @@ def current_token(self): def _handle_and_op(self): expr = self._handle_negation() - next_token = self._next_token_is([Token.And, Token.Or]) + next_token = self._next_token_is([Token.And]) while next_token: right = self._handle_negation() if next_token.kind == Token.And: expr = ExpressionAnd(next_token, expr, right) - elif next_token.kind == Token.Or: - expr = ExpressionOr(next_token, expr, right) - next_token = self._next_token_is([Token.And, Token.Or]) + next_token = self._next_token_is([Token.And]) + return expr + def _handle_or_op(self): + expr = self._handle_and_op() # Note: calling _handle_and_op here + next_token = self._next_token_is([Token.Or]) + while next_token: + right = self._handle_and_op() # Note: calling _handle_and_op here + if next_token.kind == Token.Or: + expr = ExpressionOr(next_token, expr, right) + next_token = self._next_token_is([Token.Or]) return expr def _handle_negation(self): @@ -417,33 +412,35 @@ def _handle_negation(self): def _handle_grouping_op(self): next_token = self._next_token_is( - [Token.ContainingGroup, Token.LogicalGroup, Token.DescendantGroup, Token.ExactMatch]) - if next_token == Token.ContainingGroup: - interior = self._handle_and_op() - expr = ExpressionContainingGroup(next_token, right=interior) - next_token = self._next_token_is([Token.ContainingGroupEnd]) - if next_token != Token.ContainingGroupEnd: - raise ValueError("Parse error: Missing closing square brackets") - # Can we move this to the and_or level? or does that break everything...? - elif next_token == Token.LogicalGroup: - expr = self._handle_and_op() + [Token.LogicalGroup, Token.DescendantGroup, Token.ExactMatch]) + if next_token == Token.LogicalGroup: + expr = self._handle_or_op() next_token = self._next_token_is([Token.LogicalGroupEnd]) if next_token != Token.LogicalGroupEnd: raise ValueError("Parse error: Missing closing paren") elif next_token == Token.DescendantGroup: - interior = self._handle_and_op() + interior = self._handle_or_op() expr = ExpressionDescendantGroup(next_token, right=interior) next_token = self._next_token_is([Token.DescendantGroupEnd]) if next_token != Token.DescendantGroupEnd: raise ValueError("Parse error: Missing closing square bracket") elif next_token == Token.ExactMatch: - interior = self._handle_and_op() + interior = self._handle_or_op() expr = ExpressionExactMatch(next_token, right=interior) next_token = self._next_token_is([Token.ExactMatchEnd, Token.ExactMatchOptional]) if next_token == Token.ExactMatchOptional: - optional_portion = self._handle_and_op() - expr.left = optional_portion + # We have an optional portion - this needs to now be an exact match + expr.optional = "none" next_token = self._next_token_is([Token.ExactMatchEnd]) + if next_token != Token.ExactMatchEnd: + optional_portion = self._handle_or_op() + expr.left = optional_portion + next_token = self._next_token_is([Token.ExactMatchEnd]) + if "~" in str(expr): + raise ValueError("Cannot use negation in exact matching groups," + " as it's not clear what is being matched.\n" + "{thing and ~(expression)} is allowed.") + if next_token is None: raise ValueError("Parse error: Missing closing curly bracket") else: @@ -452,13 +449,15 @@ def _handle_grouping_op(self): expr = ExpressionWildcardNew(next_token) elif next_token: expr = Expression(next_token) + else: + expr = None return expr def _parse(self, expression_string): self.tokens = self._tokenize(expression_string) - expr = self._handle_and_op() + expr = self._handle_or_op() if self.at_token + 1 != len(self.tokens): raise ValueError("Parse error in search string") diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py index f5b381eb..71e21386 100644 --- a/tests/models/test_base_input.py +++ b/tests/models/test_base_input.py @@ -304,25 +304,30 @@ def test_complex_onsets(self): {3.5: [0, 1], 4.0: [2], 4.4: [3, 4], -1.0: [5]}) def test_empty_and_single_item_series(self): - self.assertEqual(BaseInput._filter_by_index_list([], {}), []) - self.assertEqual(BaseInput._filter_by_index_list(["apple"], {0: [0]}), ["apple"]) + self.assertTrue(BaseInput._filter_by_index_list(pd.Series([]), {}).equals(pd.Series([]))) + self.assertTrue(BaseInput._filter_by_index_list(pd.Series(["apple"]), {0: [0]}).equals(pd.Series(["apple"]))) def test_two_item_series_with_same_onset(self): - self.assertEqual(BaseInput._filter_by_index_list(["apple", "orange"], {0: [0, 1]}), ["apple,orange", "n/a"]) + input_series = pd.Series(["apple", "orange"]) + expected_series = pd.Series(["apple,orange", "n/a"]) + self.assertTrue(BaseInput._filter_by_index_list(input_series, {0: [0, 1]}).equals(expected_series)) def test_multiple_item_series(self): - original = ["apple", "orange", "banana", "mango"] + input_series = pd.Series(["apple", "orange", "banana", "mango"]) indexed_dict = {0: [0, 1], 1: [2], 2: [3]} - self.assertEqual(BaseInput._filter_by_index_list(original, indexed_dict), ["apple,orange", "n/a", "banana", "mango"]) + expected_series = pd.Series(["apple,orange", "n/a", "banana", "mango"]) + self.assertTrue(BaseInput._filter_by_index_list(input_series, indexed_dict).equals(expected_series)) def test_complex_scenarios(self): # Test with negative, zero and positive onsets - original = ["negative", "zero", "positive"] + original = pd.Series(["negative", "zero", "positive"]) indexed_dict = {-1: [0], 0: [1], 1: [2]} - self.assertEqual(BaseInput._filter_by_index_list(original, indexed_dict), ["negative", "zero", "positive"]) + expected_series1 = pd.Series(["negative", "zero", "positive"]) + self.assertTrue(BaseInput._filter_by_index_list(original, indexed_dict).equals(expected_series1)) # Test with more complex indexed_dict - original = ["apple", "orange", "banana", "mango", "grape"] - indexed_dict = {0: [0, 1], 1: [2], 2: [3, 4]} - self.assertEqual(BaseInput._filter_by_index_list(original, indexed_dict), - ["apple,orange", "n/a", "banana", "mango,grape", "n/a"]) + original2 = ["apple", "orange", "banana", "mango", "grape"] + indexed_dict2= {0: [0, 1], 1: [2], 2: [3, 4]} + expected_series2 = pd.Series(["apple,orange", "n/a", "banana", "mango,grape", "n/a"]) + self.assertTrue(BaseInput._filter_by_index_list(original2, indexed_dict2).equals(expected_series2)) + diff --git a/tests/models/test_basic_search.py b/tests/models/test_basic_search.py new file mode 100644 index 00000000..0a942b93 --- /dev/null +++ b/tests/models/test_basic_search.py @@ -0,0 +1,313 @@ +import unittest +import pandas as pd +from hed import load_schema_version + +import os +from hed import TabularInput +from hed.models import df_util, basic_search +from hed.models.basic_search import find_words, check_parentheses, reverse_and_flip_parentheses, \ + construct_delimiter_map, verify_search_delimiters, find_matching +import numpy as np + + +class TestNewSearch(unittest.TestCase): + @classmethod + def setUpClass(cls): + bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/bids_tests/eeg_ds003645s_hed')) + sidecar1_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) + cls.events_path = os.path.realpath( + os.path.join(bids_root_path, 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) + cls.base_input = TabularInput(cls.events_path, sidecar1_path) + cls.schema = load_schema_version() + cls.df = cls.base_input.series_filtered + + def test_find_matching_results(self): + result1 = basic_search.find_matching(self.df, "(Face, Item-interval/1)") + result2 = basic_search.find_matching(self.df, "(Face, Item-interval/1*)") + + # Add assertions + self.assertTrue(np.sum(result1) > 0, "result1 should have some true values") + self.assertTrue(np.sum(result2) > 0, "result2 should have some true values") + self.assertTrue(np.sum(result1) < np.sum(result2), "result1 should have fewer true values than result2") + + +class TestFindWords(unittest.TestCase): + def test_basic(self): + search_string = "@global (local1, local2)" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, ['global']) + self.assertEqual(specific_words, ['local1', 'local2']) + + def test_no_anywhere_words(self): + search_string = "(local1, local2)" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, []) + self.assertEqual(specific_words, ['local1', 'local2']) + + def test_no_specific_words(self): + search_string = "@global1, @global2" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, ['global1', 'global2']) + self.assertEqual(specific_words, []) + + def test_empty_string(self): + search_string = "" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, []) + self.assertEqual(specific_words, []) + + def test_mixed_words(self): + search_string = "@global (local1, local2), @another_global" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, ['global', 'another_global']) + self.assertEqual(specific_words, ['local1', 'local2']) + + def test_whitespace(self): + search_string = " @Global , ( local1 , local2 ) " + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, ['Global']) + self.assertEqual(specific_words, ['local1', 'local2']) + + +class TestCheckParentheses(unittest.TestCase): + def test_balanced_parentheses(self): + self.assertEqual(check_parentheses("(())"), "") + self.assertEqual(check_parentheses("(someText())"), "") + self.assertEqual(check_parentheses("((some)text())"), "") + self.assertEqual(check_parentheses("()"), "") + + def test_unbalanced_parentheses(self): + self.assertEqual(check_parentheses("(()"), "(") + self.assertEqual(check_parentheses("()someText("), "(") + self.assertEqual(check_parentheses("(text)text)"), ")") + self.assertEqual(check_parentheses("text)"), ")") + + def test_mixed_parentheses(self): + self.assertEqual(check_parentheses("(()(())"), "(") + self.assertEqual(check_parentheses("(someText))((someText)"), ")(") + self.assertEqual(check_parentheses("((someText))someText"), "") + self.assertEqual(check_parentheses("(someText(someText))someText"), "") + + def test_special_cases(self): + self.assertEqual(check_parentheses(""), "") + self.assertEqual(check_parentheses("abc"), "") + self.assertEqual(check_parentheses("((()))("), "(") + self.assertEqual(check_parentheses("text"), "") + + def test_reverse_and_flip_parentheses(self): + self.assertEqual(reverse_and_flip_parentheses("(abc)"), "(cba)") + self.assertEqual(reverse_and_flip_parentheses("Hello()"), "()olleH") + self.assertEqual(reverse_and_flip_parentheses(")("), ")(") + self.assertEqual(reverse_and_flip_parentheses("((()))"), "((()))") + self.assertEqual(reverse_and_flip_parentheses("()()()"), "()()()") + self.assertEqual(reverse_and_flip_parentheses("abc"), "cba") + self.assertEqual(reverse_and_flip_parentheses("123(abc)321"), "123(cba)321") + self.assertEqual(reverse_and_flip_parentheses("a(bc)d"), "d(cb)a") + + +class TestConstructDelimiterMap(unittest.TestCase): + def test_empty_text(self): + self.assertEqual(construct_delimiter_map("", ["word1", "word2"]), {}) + + def test_empty_words(self): + self.assertEqual(construct_delimiter_map("word1,word2", []), {}) + + def test_single_occurrence(self): + text = "word1,word2" + expected_result = { + ("word1", "word2"): "", + ("word2", "word1"): "" + } + self.assertEqual(construct_delimiter_map(text, ["word1", "word2"]), expected_result) + + def test_multiple_words(self): + text = "word0,((word1),word2)" + expected_result = { + ("word0", "word1"): "((", + ("word0", "word2"): "(", + ("word1", "word0"): "))", + ("word1", "word2"): ")", + ("word2", "word1"): "(", + ("word2", "word0"): ")" + } + self.assertEqual(construct_delimiter_map(text, ["word0", "word1", "word2"]), expected_result) + + text = "word0 , ( (word1 ), word2)" + self.assertEqual(construct_delimiter_map(text, ["word0", "word1", "word2"]), expected_result) + + +class TestVerifyDelimiters(unittest.TestCase): + def base_verify_func(self, query_text, text, anywhere_words, specific_words, expected_result): + delimiter_map = construct_delimiter_map(query_text, specific_words) + actual_result = verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map) + self.assertEqual(actual_result, expected_result) + + def test_all_conditions_met(self): + query_text = "word0,((word1),word2)" + specific_words = ["word0", "word1", "word2"] + text = "word0,((word1),word2)" + self.base_verify_func(query_text, text, [], specific_words, True) + text = "((word1),word2), word0" + self.base_verify_func(query_text, text, [], specific_words, True) + text = "word0,(word2, (word1))" + self.base_verify_func(query_text, text, [], specific_words, True) + text = "word0,((word1),(ExtraGroup),word2)" + self.base_verify_func(query_text, text, [], specific_words, True) + text = "word0,((word2),word1)" + self.base_verify_func(query_text, text, [], specific_words, False) + text = "((word1),word0), word2" + self.base_verify_func(query_text, text, [], specific_words, False) + text = "word0,((word1))" + self.base_verify_func(query_text, text, [], specific_words, False) + text = "(word1),(ExtraGroup),word2)" + self.base_verify_func(query_text, text, [], specific_words, False) + + def test_complex_case_with_word_identifiers(self): + query_text = "word0,((word1),@word2,@word3,word4)" + specific_words = ["word0", "word1", "word4"] + anywhere_words = ["word2", "word3"] + text = "word0,((word1),word2,word3,word4)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + text = "word2,word0,((word1),word3,word4)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + text = "word3,((word1),word2,word4),word0" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + text = "word0,((word1),word4),word2,word3" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + text = "word0,word1,word4,word2" # Incorrect delimiters + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + text = "word2,word3" # Missing specific words + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + def test_very_complex_case_with_word_identifiers(self): + query_text = "word0,(((word1,word2),@word3)),((word4,word5)))" + specific_words = ["word0", "word1", "word2", "word4", "word5"] + anywhere_words = ["word3"] + + # Test case where all conditions are met + text = "word0,(((word1,word2),word3)),((word4,word5)))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Test case with anywhere words out of specific context but still in the string + text = "word3,word0,(((word1,word2))),((word4,word5)))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Test case with correct specific words but incorrect delimiters + text = "word0,((word1,word2),word3),(word4,word5)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + # Test case missing one specific word + text = "word0,(((word1,word2),word3)),(word4))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + # Test case missing anywhere word + text = "word0,(((word1,word2))),((word4,word5)))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + def test_incorrect_single_delimiter(self): + query_text = "word0,((word1)),word2" + specific_words = ["word0", "word1", "word2"] + anywhere_words = [] + + # Positive case 1: Exact match + text = "word0,((word1)),word2" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Positive case 2: Additional parentheses around the entire sequence + text = "(word0,((word1)),word2)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Single closing parenthesis missing between word1 and word2 + text = "word0,((word1),word2)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + # Single opening parenthesis missing between word0 and word1 + text = "word0,(word1)),word2" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + def test_mismatched_parentheses(self): + query_text = "word0,((word1)),(word2,word3)" + specific_words = ["word0", "word1", "word2", "word3"] + anywhere_words = [] + + # Positive case 1: Exact match + text = "word0,((word1)),(word2,word3)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Positive case 2: Reordered sequence with the same delimiters + text = "(word2,word3),word0,((word1))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Positive case 3: Additional text in between but the delimiters remain the same + text = "word0,someExtraText,((word1)),someMoreText,(word2,word3)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Extra closing parenthesis between word2 and word3 + text = "word0,((word1),(word2,word3))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + # Extra opening parenthesis between word1 and word2 + text = "word0,((word1),((word2,word3)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + def test_wildcard_matching_verify_delimiters(self): + query_text = "word0, ((word1.*?)), word2.*?" + delimiter_map = construct_delimiter_map(query_text, ["word0", "word1.*?", "word2.*?"]) + + # Positive test cases + text = "((word1)), word0, word2X" + self.assertTrue(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + + text = "word0, ((word1Y)), word2Z" + self.assertTrue(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + + # Negative test cases + text = "word0, (word1), word2" + self.assertFalse(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + +class TestFindMatching(unittest.TestCase): + def base_find_matching(self, series, search_string, expected): + mask = find_matching(series, search_string) + self.assertTrue(all(mask == expected), f"Expected {expected}, got {mask}") + + def test_basic_matching(self): + series = pd.Series([ + "(word1), word0, ((word2))", + "word0, ((word1)), word2", + "(word1), word0, (word2)" + ]) + search_string = "word0, ((word1)), word2" + expected = pd.Series([False, True, False]) + self.base_find_matching(series, search_string, expected) + + def test_anywhere_words(self): + series = pd.Series([ + "(word1), word0, ((word2))", + "word0, ((word1)), word2", + "word0, (word3), ((word1)), word2" + ]) + search_string = "@word3, word0, ((word1)), word2" + expected = pd.Series([False, False, True]) + self.base_find_matching(series, search_string, expected) + + def test_mismatched_parentheses(self): + series = pd.Series([ + "(word1), word0, ((word2))", + "word0, ((word1)), word2", + "word0, (word1)), word2", + "word0, ((word1), word2" + ]) + search_string = "word0, ((word1)), word2" + expected = pd.Series([False, True, False, False]) + self.base_find_matching(series, search_string, expected) + + def test_wildcard_matching(self): + series = pd.Series([ + "word2, word0, ((word1X))", + "word0, ((word1Y)), word2Z", + "word0, (word1), word2" + ]) + search_string = "word0, ((word1*)), word2*" + expected = pd.Series([True, True, False]) + self.base_find_matching(series, search_string, expected) diff --git a/tests/models/test_expression_parser.py b/tests/models/test_expression_parser.py index cca54411..5bdb71b7 100644 --- a/tests/models/test_expression_parser.py +++ b/tests/models/test_expression_parser.py @@ -118,7 +118,7 @@ def test_finding_tags2(self): "Agent, (Event)": True, "(Item), (Event)": True } - self.base_test("(Item or Agent) and [[Action or Event]]", test_strings) + self.base_test("(Item or Agent) and {Action or Event}", test_strings) def test_exact_group(self): test_strings = { @@ -131,7 +131,7 @@ def test_exact_group(self): "(A, B, (C, D))": True, "(A, B, C)": True } - self.base_test("[[a, b]]", test_strings) + self.base_test("{a, b}", test_strings) def test_exact_group_simple_complex(self): test_strings = { @@ -145,7 +145,7 @@ def test_exact_group_simple_complex(self): "(E, F, (A, B, (C, D)))": True, "(A, B, (E, F, (C, D)))": False, # TODO: Should this be True? [[c]] isn't directly inside an a group. } - self.base_test("[[a, [[c]] ]]", test_strings) + self.base_test("{a, {c} }", test_strings) def test_exact_group_complex(self): test_strings = { @@ -155,7 +155,7 @@ def test_exact_group_complex(self): "(A, B, ((C, D)))": False, "(E, F, (A, B, (C, D)))": True, } - self.base_test("[[a, b, [[c, d]] ]]", test_strings) + self.base_test("{a, b, {c, d} }", test_strings) def test_duplicate_search(self): test_strings = { @@ -183,7 +183,7 @@ def test_exact_group_complex_split(self): "(E, F, (A, B, (C, D)))": False, "((A, B), (C, D))": True, } - self.base_test("[[ [[a, b]], [[c, d]] ]]", test_strings) + self.base_test("{ {a, b}, {c, d} }", test_strings) def test_mixed_group_split(self): test_strings = { @@ -192,7 +192,7 @@ def test_mixed_group_split(self): "((Event), ((Clear-throat)))": True, "((Event, Clear-throat))": False, } - self.base_test("[[ [Event], [Action] ]]", test_strings) + self.base_test("{ [Event], [Action] }", test_strings) def test_exact_group_split(self): test_strings = { @@ -201,7 +201,7 @@ def test_exact_group_split(self): "((Event), ((Clear-throat)))": False, "((Event, Clear-throat))": False, } - self.base_test("[[ [[Event]], [[Action]] ]]", test_strings) + self.base_test("{ {Event}, {Action} }", test_strings) def test_exact_group_split_or(self): test_strings = { @@ -210,17 +210,18 @@ def test_exact_group_split_or(self): "((A), ((D)))": True, "((A, D))": True, } - self.base_test("[[ [[a]] or [[d]] ]]", test_strings) + self.base_test("{ {a} or {d} }", test_strings) def test_exact_group_split_or_negation(self): test_strings = { - "(Event, Clear-throat)": False, + # "(Event, Clear-throat)": False, "((Event), (Clear-throat))": True, "((Event))": False, "((Event), ((Clear-throat)))": True, "((Event, Clear-throat))": False, } - self.base_test("[[ [[~Event]] ]]", test_strings) + # Need to think this through more. How do you exact match a negative tag? + self.base_test("{ {~Event} }", test_strings) def test_exact_group_split_or_negation_dual(self): test_strings = { @@ -233,7 +234,7 @@ def test_exact_group_split_or_negation_dual(self): "((A), (B, C))": False, "((A), ((B), C))": True, } - self.base_test("[[ [[~a and ~b]] ]]", test_strings) + self.base_test("{ {~a and ~b} }", test_strings) def test_exact_group_split_or_negation_dual2(self): test_strings = { @@ -246,7 +247,7 @@ def test_exact_group_split_or_negation_dual2(self): "((A), (B, C))": False, "((A), ((B), C))": True, } - self.base_test("[[ [[~(a or b)]] ]]", test_strings) + self.base_test("{ {~(a or b)} }", test_strings) def test_exact_group_split_or_negation_complex(self): test_strings = { @@ -260,7 +261,7 @@ def test_exact_group_split_or_negation_complex(self): "((A), (B, C)), (D)": False, "((A), (B, C)), (H)": False, } - self.base_test("[[ [[~(a or b)]] ]] and [[D or ~F]]", test_strings) + self.base_test("{ {~(a or b)} } and {D or ~F}", test_strings) # TODO: Should this work, and what should it mean? # Right now this is always true, since there is at least one group without ", (a)" in every string. @@ -272,7 +273,7 @@ def test_exact_group_negation(self): "((A), ((D)))": True, "((A, D))": True, } - self.base_test("[[ ~[[a]] ]]", test_strings) + self.base_test("{ ~{a} }", test_strings) def test_exact_group_negation2(self): test_strings = { @@ -282,9 +283,42 @@ def test_exact_group_negation2(self): "((A), ((D, B)))": True, "((A, D))": False, "(B, (D))": True, - "(B)": True + "(B)": True, + "((A), B)": False } - self.base_test("[[ ~[[a]], b]]", test_strings) + self.base_test("{ ~{a}, b}", test_strings) + + def test_exact_group_negation3(self): + test_strings = { + "(A, D, B)": False, + "((A), (D), B)": True, + "((A))": False, + "((A), ((D, B)))": True, + "((A, D))": False, + "(B, (D))": True, + "(B)": True, + "((A), B)": True + } + self.base_test("{ ~a and b}", test_strings) + + def test_exact_group_negation4(self): + test_strings = { + "(A, D, B)": False, + "((A), (D), B)": False, + "((A))": False, + "((A), ((D, B)))": False, + "((A, D))": False, + "(B)": True, + "(B, (D))": True, + "((A), B)": False + } + self.base_test("{ @c and @a and b: ???}", test_strings) + + def test_exact_group_negation5(self): + test_string = "{ ~a and b:}" + with self.assertRaises(ValueError) as context: + QueryParser(test_string) + self.assertTrue(context.exception.args[0]) def test_mixed_group_complex_split(self): test_strings = { @@ -297,7 +331,7 @@ def test_mixed_group_complex_split(self): "((A, B), (C, D))": True, "((A, B, C, D))": False, } - self.base_test("[[ [a, b], [c, d] ]]", test_strings) + self.base_test("{ [a, b], [c, d] }", test_strings) def test_exact_group_complex2(self): test_strings = { @@ -309,7 +343,7 @@ def test_exact_group_complex2(self): "(B, (C)), (A, B, (C))": True, "(A, B, (A, (C)))": False } - self.base_test("[[a, b, [[c]] ]]", test_strings) + self.base_test("{a, b, {c} }", test_strings) def test_containing_group_complex2(self): test_strings = { @@ -362,13 +396,13 @@ def test_mixed_groups(self): test_strings = { "(A, B), (C, D, (E, F))": True } - self.base_test("[[a]], [[ [[e, f]] ]]", test_strings) + self.base_test("{a}, { {e, f} }", test_strings) test_strings = { "(A, B), (C, D, (E, F))": False } # This example works because it finds the group containing (c, d, (e, f)), rather than the ef group - self.base_test("[[a]], [e, [[f]] ]", test_strings) + self.base_test("{a}, [e, {f} ]", test_strings) def test_and(self): test_strings = { @@ -411,18 +445,17 @@ def test_and_wildcard_nothing_else(self): "A": False, "B": False, "C": False, - "A, B": True, + "A, B": False, "A, C": False, "B, C": False, "A, B, C": False, "D, A, B": False, "A, B, (C)": False, "(A, B), C": True, - "(A, B, C)": False, + "(A, B, C)": True, } self.base_test("{a and b}", test_strings) - def test_and_wildcard_nothing_else2(self): test_strings = { "A": False, "B": False, @@ -436,8 +469,7 @@ def test_and_wildcard_nothing_else2(self): "(A, B), C": True, "(A, B, C)": False, } - self.base_test("[{a and b}]", test_strings) - self.base_test("[[{a and b}]]", test_strings) + self.base_test("{a and b:}", test_strings) def test_and_logical_wildcard(self): test_strings = { @@ -450,9 +482,11 @@ def test_and_logical_wildcard(self): self.base_test("A, B and ?", test_strings) test_strings = { - "A": False, + "A": True, "A, C": True, "A, B, C": True, + "B, C": False, + "B, C, D, E": True } self.base_test("(a or (b and c) and ?)", test_strings) @@ -469,7 +503,7 @@ def test_double_wildcard(self): def test_or_wildcard(self): test_strings = { - "A": False, + "A": True, "B": False, "C": False, "A, B": True, @@ -589,10 +623,10 @@ def test_and_or(self): self.base_test("a and (b or c)", test_strings) test_strings = { - "A": False, + "A": True, "B": False, "C": False, - "A, B": False, + "A, B": True, "A, C": True, "B, C": True } @@ -698,35 +732,43 @@ def test_not_in_line3(self): def test_optional_exact_group(self): test_strings = { - "A, C": True, + "(A, C)": True, } self.base_test("{a and (b or c)}", test_strings) test_strings = { - "A, B, C, D": True, + "(A, B, C, D)": True, } self.base_test("{a and b: c and d}", test_strings) test_strings = { - "A, B, C": True, - "A, B, C, D": False, + "(A, B, C)": True, + "(A, B, C, D)": False, } self.base_test("{a and b: c or d}", test_strings) test_strings = { - "A, C": True, - "A, D": True, - "A, B, C": False, - "A, B, C, D": False, + "(A, C)": True, + "(A, D)": True, + "(A, B, C)": False, + "(A, B, C, D)": False, } self.base_test("{a or b: c or d}", test_strings) test_strings = { "(Onset, (Def-expand/taco))": True, + "(Onset, Def-expand/taco)": False, + "(Onset, Def/taco, (Def-expand/taco))": True, # this one validates + "(Onset, (Def/taco))": False, "(Onset, (Def-expand/taco, (Label/DefContents)))": True, "(Onset, (Def-expand/taco), (Label/OnsetContents))": True, "(Onset, (Def-expand/taco), (Label/OnsetContents, Description/MoreContents))": True, "Onset, (Def-expand/taco), (Label/OnsetContents)": False, "(Onset, (Def-expand/taco), Label/OnsetContents)": False, } - self.base_test("[[{(Onset or Offset), (Def or [[Def-expand]]): ???}]]", test_strings) \ No newline at end of file + self.base_test("{(Onset or Offset), (Def or {Def-expand}): ???}", test_strings) + test_strings = { + "(A, B)": True, + "(A, B, C)": True + } + self.base_test("{a or b}", test_strings) \ No newline at end of file diff --git a/tests/schema/test_hed_schema_io.py b/tests/schema/test_hed_schema_io.py index f3591ead..c21d839a 100644 --- a/tests/schema/test_hed_schema_io.py +++ b/tests/schema/test_hed_schema_io.py @@ -170,6 +170,8 @@ def _base_merging_test(self, files): reload1 = load_schema(path1) reload2 = load_schema(path2) self.assertEqual(reload1, reload2) + except Exception: + self.assertTrue(False) finally: os.remove(path1) os.remove(path2) @@ -183,6 +185,8 @@ def _base_merging_test(self, files): reload1 = load_schema(path1) reload2 = load_schema(path2) self.assertEqual(reload1, reload2) + except Exception: + self.assertTrue(False) finally: os.remove(path1) os.remove(path2) From ab2a25cd9b00ff0be4d7f311b707af2ace4ebf99 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 28 Sep 2023 18:16:38 -0500 Subject: [PATCH 02/21] Update schema validation and tests Validate most attribute values now Update error codes to match spec more All sections are now required in mediawiki schema misc minor fixes --- hed/errors/error_types.py | 11 ++- hed/errors/exceptions.py | 17 ++-- hed/errors/known_error_codes.py | 1 + hed/errors/schema_error_messages.py | 39 +++++--- hed/schema/hed_schema.py | 4 +- hed/schema/hed_schema_constants.py | 1 + hed/schema/hed_schema_entry.py | 16 ++-- hed/schema/schema_attribute_validators.py | 64 ++++++++++++- hed/schema/schema_compliance.py | 53 +++++------ hed/schema/schema_io/schema2base.py | 5 - hed/schema/schema_io/wiki2schema.py | 95 +++++++++++++------ hed/schema/schema_validation_util.py | 12 ++- spec_tests/hed-specification | 2 +- spec_tests/test_errors.py | 50 +--------- .../issues_tests/HED_badroot_0.0.1.mediawiki | 10 ++ .../HED_dupesubroot_0.0.1.mediawiki | 10 ++ .../issues_tests/HED_root_invalid1.mediawiki | 10 ++ .../issues_tests/HED_root_invalid2.mediawiki | 10 ++ .../issues_tests/HED_root_invalid3.mediawiki | 10 ++ .../HED_root_wrong_place_0.0.1.mediawiki | 10 ++ .../issues_tests/overlapping_tags1.mediawiki | 7 ++ .../issues_tests/overlapping_tags2.mediawiki | 8 ++ .../issues_tests/overlapping_tags3.mediawiki | 8 ++ .../issues_tests/overlapping_tags4.mediawiki | 8 ++ .../overlapping_unit_classes.mediawiki | 7 ++ .../issues_tests/overlapping_units.mediawiki | 7 ++ .../merge_tests/sorted_root.mediawiki | 10 ++ .../wiki_tests/HED_default.mediawiki | 14 ++- .../wiki_tests/attribute_unknown1.mediawiki | 41 ++++++++ .../validator_tests/bids_schema.mediawiki | 5 + tests/schema/test_hed_schema_io.py | 8 +- tests/schema/test_schema_wiki_fatal_errors.py | 54 ++++++----- tests/schema/util_create_schemas.py | 23 ++++- 33 files changed, 445 insertions(+), 185 deletions(-) create mode 100644 tests/data/schema_tests/wiki_tests/attribute_unknown1.mediawiki diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index 7305e7c6..a866ec32 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -107,7 +107,7 @@ class SidecarErrors: class SchemaErrors: SCHEMA_DUPLICATE_NODE = 'SCHEMA_DUPLICATE_NODE' - SCHEMA_ATTRIBUTE_INVALID = 'SCHEMA_ATTRIBUTE_INVALID' + SCHEMA_DUPLICATE_FROM_LIBRARY = "SCHEMA_LIBRARY_INVALID" @@ -119,19 +119,22 @@ class SchemaWarnings: SCHEMA_CHARACTER_INVALID = "SCHEMA_CHARACTER_INVALID" SCHEMA_INVALID_CAPITALIZATION = 'invalidCaps' SCHEMA_NON_PLACEHOLDER_HAS_CLASS = 'SCHEMA_NON_PLACEHOLDER_HAS_CLASS' - SCHEMA_INVALID_ATTRIBUTE = "SCHEMA_INVALID_ATTRIBUTE" class SchemaAttributeErrors: + SCHEMA_ATTRIBUTE_INVALID = 'SCHEMA_ATTRIBUTE_INVALID' + SCHEMA_ATTRIBUTE_VALUE_INVALID = 'SCHEMA_ATTRIBUTE_VALUE_INVALID' SCHEMA_DEPRECATED_INVALID = "SCHEMA_DEPRECATED_INVALID" SCHEMA_SUGGESTED_TAG_INVALID = "SCHEMA_SUGGESTED_TAG_INVALID" - SCHEMA_RELATED_TAG_INVALID = "SCHEMA_RELATED_TAG_INVALID" SCHEMA_UNIT_CLASS_INVALID = "SCHEMA_UNIT_CLASS_INVALID" SCHEMA_VALUE_CLASS_INVALID = "SCHEMA_VALUE_CLASS_INVALID" + SCHEMA_ALLOWED_CHARACTERS_INVALID = "SCHEMA_ALLOWED_CHARACTERS_INVALID" + SCHEMA_IN_LIBRARY_INVALID = "SCHEMA_IN_LIBRARY_INVALID" SCHEMA_DEFAULT_UNITS_INVALID = "SCHEMA_DEFAULT_UNITS_INVALID" - SCHEMA_CHILD_OF_DEPRECATED = "SCHEMA_CHILD_OF_DEPRECATED" # Reported as SCHEMA_DEPRECATED_INVALID + SCHEMA_CHILD_OF_DEPRECATED = "SCHEMA_CHILD_OF_DEPRECATED" + SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE = "SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE" class DefinitionErrors: diff --git a/hed/errors/exceptions.py b/hed/errors/exceptions.py index e7ee857b..e368ec43 100644 --- a/hed/errors/exceptions.py +++ b/hed/errors/exceptions.py @@ -14,8 +14,9 @@ class HedExceptions: INVALID_DATAFRAME = 'INVALID_DATAFRAME' INVALID_FILE_FORMAT = 'INVALID_FILE_FORMAT' # These are actual schema issues, not that the file cannot be found or parsed - SCHEMA_HEADER_MISSING = 'HED_SCHEMA_HEADER_INVALID' - HED_SCHEMA_HEADER_INVALID = 'HED_SCHEMA_HEADER_INVALID' + SCHEMA_HEADER_MISSING = 'SCHEMA_HEADER_INVALID' + SCHEMA_HEADER_INVALID = 'SCHEMA_HEADER_INVALID' + SCHEMA_UNKNOWN_HEADER_ATTRIBUTE = "SCHEMA_HEADER_INVALID" SCHEMA_LIBRARY_INVALID = "SCHEMA_LIBRARY_INVALID" BAD_HED_LIBRARY_NAME = 'SCHEMA_LIBRARY_INVALID' @@ -26,14 +27,14 @@ class HedExceptions: ROOTED_TAG_DOES_NOT_EXIST = "SCHEMA_LIBRARY_INVALID" IN_LIBRARY_IN_UNMERGED = "SCHEMA_LIBRARY_INVALID" - HED_SCHEMA_VERSION_INVALID = 'HED_SCHEMA_VERSION_INVALID' - SCHEMA_START_MISSING = 'HED_WIKI_SEPARATOR_INVALID' - SCHEMA_END_INVALID = 'HED_WIKI_SEPARATOR_INVALID' - HED_END_INVALID = 'HED_WIKI_SEPARATOR_INVALID' - INVALID_SECTION_SEPARATOR = 'invalidSectionSeparator' + SCHEMA_VERSION_INVALID = 'SCHEMA_VERSION_INVALID' + SCHEMA_SECTION_MISSING = 'SCHEMA_SECTION_MISSING' + + WIKI_SEPARATOR_INVALID = 'invalidSectionSeparator' # This issue will contain a list of lines with issues. - HED_WIKI_DELIMITERS_INVALID = 'HED_WIKI_DELIMITERS_INVALID' + WIKI_DELIMITERS_INVALID = 'WIKI_DELIMITERS_INVALID' + WIKI_LINE_START_INVALID = 'WIKI_LINE_START_INVALID' HED_SCHEMA_NODE_NAME_INVALID = 'HED_SCHEMA_NODE_NAME_INVALID' SCHEMA_DUPLICATE_PREFIX = 'schemaDuplicatePrefix' diff --git a/hed/errors/known_error_codes.py b/hed/errors/known_error_codes.py index b72e8470..b8962682 100644 --- a/hed/errors/known_error_codes.py +++ b/hed/errors/known_error_codes.py @@ -31,6 +31,7 @@ ], "schema_validation_errors": [ "SCHEMA_ATTRIBUTE_INVALID", + "SCHEMA_ATTRIBUTE_VALUE_INVALID", "SCHEMA_CHARACTER_INVALID", "SCHEMA_DUPLICATE_NODE", "SCHEMA_HEADER_INVALID", diff --git a/hed/errors/schema_error_messages.py b/hed/errors/schema_error_messages.py index b7fda9d5..8c196f9e 100644 --- a/hed/errors/schema_error_messages.py +++ b/hed/errors/schema_error_messages.py @@ -16,7 +16,7 @@ def schema_error_hed_duplicate_from_library(tag, duplicate_tag_list, section): f"{tag_join_delimiter}{tag_join_delimiter.join(duplicate_tag_list)}" -@hed_error(SchemaErrors.SCHEMA_ATTRIBUTE_INVALID) +@hed_error(SchemaAttributeErrors.SCHEMA_ATTRIBUTE_INVALID) def schema_error_unknown_attribute(attribute_name, source_tag): return f"Attribute '{attribute_name}' used by '{source_tag}' was not defined in the schema, " \ f"or was used outside of it's defined class." @@ -40,45 +40,58 @@ def schema_warning_SCHEMA_INVALID_CAPITALIZATION(tag_name, problem_char, char_in f"Found character '{problem_char}' in tag '{tag_name}' at position {char_index}." -@hed_error(SchemaWarnings.SCHEMA_NON_PLACEHOLDER_HAS_CLASS, default_severity=ErrorSeverity.WARNING) +@hed_error(SchemaWarnings.SCHEMA_NON_PLACEHOLDER_HAS_CLASS, default_severity=ErrorSeverity.WARNING, + actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) def schema_warning_non_placeholder_class(tag_name, invalid_attribute_name): return "Only placeholder nodes('#') can have a unit class, value class, or takes value." + \ f"Found {invalid_attribute_name} on {tag_name}" -@hed_error(SchemaWarnings.SCHEMA_INVALID_ATTRIBUTE, default_severity=ErrorSeverity.ERROR) -def schema_error_SCHEMA_INVALID_ATTRIBUTE(tag_name, invalid_attribute_name): - return f"'{invalid_attribute_name}' should not be present in a loaded schema, found on '{tag_name}'." \ - f"Something went very wrong." - -@hed_error(SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID) +@hed_error(SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) def schema_error_SCHEMA_DEPRECATED_INVALID(tag_name, invalid_deprecated_version): return f"'{tag_name}' has invalid or unknown value in attribute deprecatedFrom: '{invalid_deprecated_version}'." @hed_error(SchemaAttributeErrors.SCHEMA_CHILD_OF_DEPRECATED, - actual_code=SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID) + actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) def schema_error_SCHEMA_CHILD_OF_DEPRECATED(deprecated_tag, non_deprecated_child): return f"Deprecated tag '{deprecated_tag}' has a child that is not deprecated: '{non_deprecated_child}'." -@hed_error(SchemaAttributeErrors.SCHEMA_SUGGESTED_TAG_INVALID) +@hed_error(SchemaAttributeErrors.SCHEMA_SUGGESTED_TAG_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) def schema_error_SCHEMA_SUGGESTED_TAG_INVALID(suggestedTag, invalidSuggestedTag, attribute_name): return f"Tag '{suggestedTag}' has an invalid {attribute_name}: '{invalidSuggestedTag}'." -@hed_error(SchemaAttributeErrors.SCHEMA_UNIT_CLASS_INVALID) +@hed_error(SchemaAttributeErrors.SCHEMA_UNIT_CLASS_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) def schema_error_SCHEMA_UNIT_CLASS_INVALID(tag, unit_class, attribute_name): return f"Tag '{tag}' has an invalid {attribute_name}: '{unit_class}'." -@hed_error(SchemaAttributeErrors.SCHEMA_VALUE_CLASS_INVALID) +@hed_error(SchemaAttributeErrors.SCHEMA_VALUE_CLASS_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) def schema_error_SCHEMA_VALUE_CLASS_INVALID(tag, unit_class, attribute_name): return f"Tag '{tag}' has an invalid {attribute_name}: '{unit_class}'." -@hed_error(SchemaAttributeErrors.SCHEMA_DEFAULT_UNITS_INVALID) +@hed_error(SchemaAttributeErrors.SCHEMA_DEFAULT_UNITS_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) def schema_error_SCHEMA_DEFAULT_UNITS_INVALID(tag, bad_unit, valid_units): valid_units = ",".join(valid_units) return f"Tag '{tag}' has an invalid defaultUnit '{bad_unit}'. Valid units are: '{valid_units}'." + + +@hed_error(SchemaAttributeErrors.SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) +def schema_error_SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE(tag, conversion_factor): + return f"Tag '{tag}' has an invalid conversionFactor '{conversion_factor}'. Conversion factor must be positive." + + +@hed_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) +def schema_error_SCHEMA_ALLOWED_CHARACTERS_INVALID(tag, invalid_character): + return (f"Tag '{tag}' has an invalid allowedCharacter: '{invalid_character}'. " + f"Allowed characters are: a single character, " + f"or one of the following - letters, blank, digits, alphanumeric.") + + +@hed_error(SchemaAttributeErrors.SCHEMA_IN_LIBRARY_INVALID, actual_code=SchemaAttributeErrors.SCHEMA_ATTRIBUTE_VALUE_INVALID) +def schema_error_SCHEMA_IN_LIBRARY_INVALID(tag, bad_library): + return (f"Tag '{tag}' has an invalid inLibrary: '{bad_library}'. ") diff --git a/hed/schema/hed_schema.py b/hed/schema/hed_schema.py index 4cb3729c..0857abe9 100644 --- a/hed/schema/hed_schema.py +++ b/hed/schema/hed_schema.py @@ -731,7 +731,9 @@ def _add_tag_to_dict(self, long_tag_name, new_entry, key_class): # Add the InLibrary attribute to any library schemas as they are loaded # These are later removed when they are saved out, if saving unmerged if self.library and (not self.with_standard or (not self.merged and self.with_standard)): - new_entry._set_attribute_value(HedKey.InLibrary, self.library) + # only add it if not already present - This is a rare case + if not new_entry.has_attribute(HedKey.InLibrary): + new_entry._set_attribute_value(HedKey.InLibrary, self.library) section = self._sections[key_class] return section._add_to_dict(long_tag_name, new_entry) diff --git a/hed/schema/hed_schema_constants.py b/hed/schema/hed_schema_constants.py index 0cecc4ab..60a1a934 100644 --- a/hed/schema/hed_schema_constants.py +++ b/hed/schema/hed_schema_constants.py @@ -42,6 +42,7 @@ class HedKey: SuggestedTag = "suggestedTag" Rooted = "rooted" DeprecatedFrom = "deprecatedFrom" + ConversionFactor = "conversionFactor" # All known properties BoolProperty = 'boolProperty' diff --git a/hed/schema/hed_schema_entry.py b/hed/schema/hed_schema_entry.py index 102795d8..936943e8 100644 --- a/hed/schema/hed_schema_entry.py +++ b/hed/schema/hed_schema_entry.py @@ -176,7 +176,6 @@ def __eq__(self, other): return False return True - class UnitEntry(HedSchemaEntry): """ A single unit entry with modifiers in the HedSchema. """ def __init__(self, *args, **kwargs): @@ -207,12 +206,13 @@ def finalize_entry(self, schema): self.derivative_units = derivative_units def _get_conversion_factor(self, modifier_entry): - - base_factor = float(self.attributes.get("conversionFactor", "1.0").replace("^", "e")) - if modifier_entry: - modifier_factor = float(modifier_entry.attributes.get("conversionFactor", "1.0").replace("^", "e")) - else: - modifier_factor = 1.0 + base_factor = modifier_factor = 1.0 + try: + base_factor = float(self.attributes.get(HedKey.ConversionFactor, "1.0").replace("^", "e")) + if modifier_entry: + modifier_factor = float(modifier_entry.attributes.get(HedKey.ConversionFactor, "1.0").replace("^", "e")) + except (ValueError, AttributeError) as e: + pass # Just default to 1.0 return base_factor * modifier_factor def get_conversion_factor(self, unit_name): @@ -224,7 +224,7 @@ def get_conversion_factor(self, unit_name): Returns: conversion_factor(float or None): Returns the conversion factor or None """ - if "conversionFactor" in self.attributes: + if HedKey.ConversionFactor in self.attributes: return float(self.derivative_units.get(unit_name)) class HedTagEntry(HedSchemaEntry): diff --git a/hed/schema/schema_attribute_validators.py b/hed/schema/schema_attribute_validators.py index d1d7f5ec..0ccb9c33 100644 --- a/hed/schema/schema_attribute_validators.py +++ b/hed/schema/schema_attribute_validators.py @@ -150,4 +150,66 @@ def tag_is_deprecated_check(hed_schema, tag_entry, attribute_name): issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_CHILD_OF_DEPRECATED, tag_entry.name, child.name) - return issues \ No newline at end of file + return issues + + +def conversion_factor(hed_schema, tag_entry, attribute_name): + issues = [] + conversion_factor = tag_entry.attributes.get(attribute_name, "1.0") + try: + conversion_factor = float(conversion_factor.replace("^", "e")) + except (ValueError, AttributeError) as e: + pass + if not isinstance(conversion_factor, float) or conversion_factor <= 0.0: + issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_CONVERSION_FACTOR_NOT_POSITIVE, + tag_entry.name, + conversion_factor) + + return issues + + +def allowed_characters_check(hed_schema, tag_entry, attribute_name): + """ Check allowed character has a valid value + + Parameters: + hed_schema (HedSchema): The schema to use for validation + tag_entry (HedSchemaEntry): The schema entry for this attribute. + attribute_name (str): The name of this attribute + + Returns: + list: A list of issues. Each issue is a dictionary. + + """ + issues = [] + allowed_strings = {'letters', 'blank', 'digits', 'alphanumeric'} + + char_string = tag_entry.attributes.get(attribute_name, "") + characters = char_string.split(",") + for character in characters: + if character not in allowed_strings and len(character) != 1: + issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID, + tag_entry.name, + character) + return issues + + +def in_library_check(hed_schema, tag_entry, attribute_name): + """ Check allowed character has a valid value + + Parameters: + hed_schema (HedSchema): The schema to use for validation + tag_entry (HedSchemaEntry): The schema entry for this attribute. + attribute_name (str): The name of this attribute + + Returns: + list: A list of issues. Each issue is a dictionary. + + """ + issues = [] + + library = tag_entry.attributes.get(attribute_name, "") + if hed_schema.library != library: + issues += ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_ALLOWED_CHARACTERS_INVALID, + tag_entry.name, + library) + return issues diff --git a/hed/schema/schema_compliance.py b/hed/schema/schema_compliance.py index c75c11de..1a68baf8 100644 --- a/hed/schema/schema_compliance.py +++ b/hed/schema/schema_compliance.py @@ -45,27 +45,20 @@ def check_compliance(hed_schema, check_for_warnings=True, name=None, error_handl class SchemaValidator: """Validator class to wrap some code. In general, just call check_compliance.""" attribute_validators = { - HedKey.SuggestedTag: [(schema_attribute_validators.tag_exists_check, - SchemaAttributeErrors.SCHEMA_SUGGESTED_TAG_INVALID)], - HedKey.RelatedTag: [(schema_attribute_validators.tag_exists_check, - SchemaAttributeErrors.SCHEMA_RELATED_TAG_INVALID)], - HedKey.UnitClass: [(schema_attribute_validators.tag_is_placeholder_check, - SchemaWarnings.SCHEMA_NON_PLACEHOLDER_HAS_CLASS), - (schema_attribute_validators.unit_class_exists, - SchemaAttributeErrors.SCHEMA_UNIT_CLASS_INVALID)], - HedKey.ValueClass: [(schema_attribute_validators.tag_is_placeholder_check, - SchemaWarnings.SCHEMA_NON_PLACEHOLDER_HAS_CLASS), - (schema_attribute_validators.value_class_exists, - SchemaAttributeErrors.SCHEMA_VALUE_CLASS_INVALID)], + HedKey.SuggestedTag: [schema_attribute_validators.tag_exists_check], + HedKey.RelatedTag: [schema_attribute_validators.tag_exists_check], + HedKey.UnitClass: [schema_attribute_validators.tag_is_placeholder_check, + schema_attribute_validators.unit_class_exists], + HedKey.ValueClass: [schema_attribute_validators.tag_is_placeholder_check, + schema_attribute_validators.value_class_exists], # Rooted tag is implicitly verified on loading - # HedKey.Rooted: [(schema_attribute_validators.tag_exists_base_schema_check, - # SchemaAttributeErrors.SCHEMA_ROOTED_TAG_INVALID)], - HedKey.DeprecatedFrom: [(schema_attribute_validators.tag_is_deprecated_check, - SchemaAttributeErrors.SCHEMA_DEPRECATED_INVALID)], - HedKey.TakesValue: [(schema_attribute_validators.tag_is_placeholder_check, - SchemaWarnings.SCHEMA_NON_PLACEHOLDER_HAS_CLASS)], - HedKey.DefaultUnits: [(schema_attribute_validators.unit_exists, - SchemaAttributeErrors.SCHEMA_DEFAULT_UNITS_INVALID)] + # HedKey.Rooted: [schema_attribute_validators.tag_exists_base_schema_check], + HedKey.DeprecatedFrom: [schema_attribute_validators.tag_is_deprecated_check], + HedKey.TakesValue: [schema_attribute_validators.tag_is_placeholder_check], + HedKey.DefaultUnits: [schema_attribute_validators.unit_exists], + HedKey.ConversionFactor: [schema_attribute_validators.conversion_factor], + HedKey.AllowedCharacter: [schema_attribute_validators.allowed_characters_check], + HedKey.InLibrary: [schema_attribute_validators.in_library_check] } def __init__(self, hed_schema, check_for_warnings=True, error_handler=None): @@ -80,7 +73,7 @@ def check_unknown_attributes(self): if unknown_attributes: for attribute_name, source_tags in unknown_attributes.items(): for tag in source_tags: - issues_list += self.error_handler.format_error_with_context(SchemaErrors.SCHEMA_ATTRIBUTE_INVALID, + issues_list += self.error_handler.format_error_with_context(SchemaAttributeErrors.SCHEMA_ATTRIBUTE_INVALID, attribute_name, source_tag=tag) return issues_list @@ -93,16 +86,14 @@ def check_attributes(self): for tag_entry in self.hed_schema[section_key].values(): self.error_handler.push_error_context(ErrorContext.SCHEMA_TAG, tag_entry.name) for attribute_name in tag_entry.attributes: - validators = self.attribute_validators.get(attribute_name, None) - if validators: - for validator, error_code in validators: - self.error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name) - new_issues = validator(self.hed_schema, tag_entry, attribute_name) - for issue in new_issues: - issue['code'] = error_code - issue['severity'] = ErrorSeverity.WARNING - self.error_handler.add_context_and_filter(new_issues) - issues_list += new_issues + validators = self.attribute_validators.get(attribute_name, []) + for validator in validators: + self.error_handler.push_error_context(ErrorContext.SCHEMA_ATTRIBUTE, attribute_name) + new_issues = validator(self.hed_schema, tag_entry, attribute_name) + for issue in new_issues: + issue['severity'] = ErrorSeverity.WARNING + self.error_handler.add_context_and_filter(new_issues) + issues_list += new_issues self.error_handler.pop_error_context() self.error_handler.pop_error_context() self.error_handler.pop_error_context() diff --git a/hed/schema/schema_io/schema2base.py b/hed/schema/schema_io/schema2base.py index e373cf1a..d9d082a1 100644 --- a/hed/schema/schema_io/schema2base.py +++ b/hed/schema/schema_io/schema2base.py @@ -106,9 +106,6 @@ def _output_tags(self, tags): self._end_tag_section() def _output_units(self, unit_classes): - if not unit_classes: - return - section_node = self._start_section(HedSectionKey.UnitClasses) for unit_class_entry in unit_classes.values(): @@ -128,8 +125,6 @@ def _output_units(self, unit_classes): self._write_entry(unit_entry, unit_class_node) def _output_section(self, hed_schema, key_class): - if not hed_schema[key_class]: - return parent_node = self._start_section(key_class) for entry in hed_schema[key_class].values(): if self._should_skip(entry): diff --git a/hed/schema/schema_io/wiki2schema.py b/hed/schema/schema_io/wiki2schema.py index a02f9ed6..de18f9d6 100644 --- a/hed/schema/schema_io/wiki2schema.py +++ b/hed/schema/schema_io/wiki2schema.py @@ -22,12 +22,19 @@ no_wiki_end_tag = '</nowiki>' -ErrorsBySection = { - HedWikiSection.Schema: HedExceptions.SCHEMA_START_MISSING, - HedWikiSection.EndSchema: HedExceptions.SCHEMA_END_INVALID, - HedWikiSection.EndHed: HedExceptions.HED_END_INVALID -} -required_sections = [HedWikiSection.Schema, HedWikiSection.EndSchema, HedWikiSection.EndHed] + +required_sections = [ + HedWikiSection.Prologue, + HedWikiSection.Schema, + HedWikiSection.EndSchema, + HedWikiSection.UnitsClasses, + HedWikiSection.UnitModifiers, + HedWikiSection.ValueClasses, + HedWikiSection.Attributes, + HedWikiSection.Properties, + HedWikiSection.Epilogue, + HedWikiSection.EndHed, +] class SchemaLoaderWiki(SchemaLoader): @@ -79,15 +86,13 @@ def _parse_data(self): # Validate we didn't miss any required sections. for section in required_sections: if section not in wiki_lines_by_section: - error_code = HedExceptions.INVALID_SECTION_SEPARATOR - if section in ErrorsBySection: - error_code = ErrorsBySection[section] + error_code = HedExceptions.SCHEMA_SECTION_MISSING msg = f"Required section separator '{SectionNames[section]}' not found in file" raise HedFileError(error_code, msg, filename=self.filename) if self.fatal_errors: self.fatal_errors = error_reporter.sort_issues(self.fatal_errors) - raise HedFileError(HedExceptions.HED_WIKI_DELIMITERS_INVALID, + raise HedFileError(self.fatal_errors[0]['code'], f"{len(self.fatal_errors)} issues found when parsing schema. See the .issues " f"parameter on this exception for more details.", self.filename, issues=self.fatal_errors) @@ -109,7 +114,7 @@ def _read_header_section(self, lines): for line_number, line in lines: if line.strip(): msg = f"Extra content [{line}] between HED line and other sections" - raise HedFileError(HedExceptions.HED_SCHEMA_HEADER_INVALID, msg, filename=self.filename) + raise HedFileError(HedExceptions.SCHEMA_HEADER_INVALID, msg, filename=self.filename) def _read_text_block(self, lines): text = "" @@ -163,7 +168,8 @@ def _read_schema(self, lines): parent_tags = parent_tags[:level] elif level > len(parent_tags): self._add_fatal_error(line_number, line, - "Line has too many *'s at the front. You cannot skip a level.") + "Line has too many *'s at the front. You cannot skip a level." + , HedExceptions.WIKI_LINE_START_INVALID) continue # Create the entry tag_entry = self._add_tag_line(parent_tags, line_number, line) @@ -261,14 +267,37 @@ def _get_header_attributes_internal(self, version_line): if "=" not in version_line: return self._get_header_attributes_internal_old(version_line) - final_attributes = {} + attributes, malformed = self._parse_attributes_line(version_line) + + for m in malformed: + # todo: May shift this at some point to report all errors + raise HedFileError(code=HedExceptions.SCHEMA_HEADER_INVALID, + message=f"Header line has a malformed attribute {m}", + filename=self.filename) + return attributes + + @staticmethod + def _parse_attributes_line(version_line): + matches = {} + unmatched = [] + last_end = 0 for match in attr_re.finditer(version_line): - attr_name = match.group(1) - attr_value = match.group(2) - final_attributes[attr_name] = attr_value + start, end = match.span() - return final_attributes + # If there's unmatched content between the last match and the current one + if start > last_end: + unmatched.append(version_line[last_end:start]) + + matches[match.group(1)] = match.group(2) + last_end = end + + # If there's unmatched content after the last match + if last_end < len(version_line): + unmatched.append(version_line[last_end:]) + + unmatched = [m.strip() for m in unmatched if m.strip()] + return matches, unmatched def _get_header_attributes_internal_old(self, version_line): """ Extracts all valid attributes like version from the HED line in .mediawiki format. @@ -288,7 +317,7 @@ def _get_header_attributes_internal_old(self, version_line): divider_index = pair.find(':') if divider_index == -1: msg = f"Found poorly matched key:value pair in header: {pair}" - raise HedFileError(HedExceptions.HED_SCHEMA_HEADER_INVALID, msg, filename=self.filename) + raise HedFileError(HedExceptions.SCHEMA_HEADER_INVALID, msg, filename=self.filename) key, value = pair[:divider_index], pair[divider_index + 1:] key = key.strip() value = value.strip() @@ -369,10 +398,17 @@ def _get_tag_name(self, tag_line): return None, 0 @staticmethod - def _get_tag_attributes(tag_line, starting_index): + def _validate_attribute_string(attribute_string): + pattern = r'^[A-Za-z]+(=.+)?$' + match = re.fullmatch(pattern, attribute_string) + if match: + return match.group() + + def _get_tag_attributes(self, line_number, tag_line, starting_index): """ Get the tag attributes from a line. Parameters: + line_number (int): The line number to report errors as tag_line (str): A tag line. starting_index (int): The first index we can check for the brackets. @@ -386,11 +422,14 @@ def _get_tag_attributes(tag_line, starting_index): return None, starting_index if attr_string: attributes_split = [x.strip() for x in attr_string.split(',')] - # Filter out attributes with spaces. - attributes_split = [a for a in attributes_split if " " not in a] final_attributes = {} for attribute in attributes_split: + if self._validate_attribute_string(attribute) is None: + self._add_fatal_error(line_number, tag_line, + f"Malformed attribute found {attribute}. " + f"Valid formatting is: attribute, or attribute=\"value\".") + continue split_attribute = attribute.split("=") if len(split_attribute) == 1: final_attributes[split_attribute[0]] = True @@ -468,7 +507,7 @@ def _create_entry(self, line_number, tag_line, key_class, element_name=None): if element_name: node_name = element_name - node_attributes, index = self._get_tag_attributes(tag_line, index) + node_attributes, index = self._get_tag_attributes(line_number, tag_line, index) if node_attributes is None: self._add_fatal_error(line_number, tag_line, "Attributes has mismatched delimiters") return @@ -489,7 +528,7 @@ def _create_entry(self, line_number, tag_line, key_class, element_name=None): return tag_entry def _add_fatal_error(self, line_number, line, warning_message="Schema term is empty or the line is malformed", - error_code=HedExceptions.HED_WIKI_DELIMITERS_INVALID): + error_code=HedExceptions.WIKI_DELIMITERS_INVALID): self.fatal_errors.append( {'code': error_code, ErrorContext.ROW: line_number, @@ -504,14 +543,12 @@ def _check_for_new_section(self, line, strings_for_section, current_section): if line.startswith(section_string): if key in strings_for_section: msg = f"Found section {SectionNames[key]} twice" - raise HedFileError(HedExceptions.INVALID_SECTION_SEPARATOR, + raise HedFileError(HedExceptions.WIKI_SEPARATOR_INVALID, msg, filename=self.filename) if current_section < key: new_section = key else: - error_code = HedExceptions.INVALID_SECTION_SEPARATOR - if key in ErrorsBySection: - error_code = ErrorsBySection[key] + error_code = HedExceptions.SCHEMA_SECTION_MISSING msg = f"Found section {SectionNames[key]} out of order in file" raise HedFileError(error_code, msg, filename=self.filename) break @@ -520,11 +557,11 @@ def _check_for_new_section(self, line, strings_for_section, current_section): def _handle_bad_section_sep(self, line, current_section): if current_section != HedWikiSection.Schema and line.startswith(wiki_constants.ROOT_TAG): msg = f"Invalid section separator '{line.strip()}'" - raise HedFileError(HedExceptions.INVALID_SECTION_SEPARATOR, msg, filename=self.filename) + raise HedFileError(HedExceptions.SCHEMA_SECTION_MISSING, msg, filename=self.filename) if line.startswith("!#"): msg = f"Invalid section separator '{line.strip()}'" - raise HedFileError(HedExceptions.INVALID_SECTION_SEPARATOR, msg, filename=self.filename) + raise HedFileError(HedExceptions.WIKI_SEPARATOR_INVALID, msg, filename=self.filename) def _split_lines_into_sections(self, wiki_lines): """ Takes a list of lines, and splits it into valid wiki sections. diff --git a/hed/schema/schema_validation_util.py b/hed/schema/schema_validation_util.py index 8404970e..25b27ab8 100644 --- a/hed/schema/schema_validation_util.py +++ b/hed/schema/schema_validation_util.py @@ -4,6 +4,7 @@ from hed.errors import ErrorHandler, SchemaWarnings from hed.schema import hed_schema_constants as constants from hed.errors.exceptions import HedExceptions, HedFileError +from hed.schema.hed_schema_constants import valid_header_attributes ALLOWED_TAG_CHARS = "-" ALLOWED_DESC_CHARS = "-_:;,./()+ ^" @@ -45,9 +46,9 @@ def validate_version_string(version_string): header_attribute_validators = { - constants.VERSION_ATTRIBUTE: (validate_version_string, HedExceptions.HED_SCHEMA_VERSION_INVALID), - constants.LIBRARY_ATTRIBUTE: (validate_library_name, HedExceptions.BAD_HED_LIBRARY_NAME) - } + constants.VERSION_ATTRIBUTE: (validate_version_string, HedExceptions.SCHEMA_VERSION_INVALID), + constants.LIBRARY_ATTRIBUTE: (validate_library_name, HedExceptions.BAD_HED_LIBRARY_NAME) +} def validate_present_attributes(attrib_dict, filename): @@ -92,9 +93,12 @@ def validate_attributes(attrib_dict, filename): had_error = validator(attribute_value) if had_error: raise HedFileError(error_code, had_error, filename) + if attribute_name not in valid_header_attributes: + raise HedFileError(HedExceptions.SCHEMA_UNKNOWN_HEADER_ATTRIBUTE, + f"Unknown attribute {attribute_name} found in header line", filename=filename) if constants.VERSION_ATTRIBUTE not in attrib_dict: - raise HedFileError(HedExceptions.HED_SCHEMA_VERSION_INVALID, + raise HedFileError(HedExceptions.SCHEMA_VERSION_INVALID, "No version attribute found in header", filename=filename) diff --git a/spec_tests/hed-specification b/spec_tests/hed-specification index c47fff94..c1aad366 160000 --- a/spec_tests/hed-specification +++ b/spec_tests/hed-specification @@ -1 +1 @@ -Subproject commit c47fff949db70c9105c875bbdfdf0d11389ffd68 +Subproject commit c1aad366fee6c7f1e68fbd73d2ce6dc369444ad8 diff --git a/spec_tests/test_errors.py b/spec_tests/test_errors.py index 972d53d4..3e87fdbd 100644 --- a/spec_tests/test_errors.py +++ b/spec_tests/test_errors.py @@ -12,55 +12,11 @@ from hed.errors import ErrorHandler, get_printable_issue_string -# To be removed eventually once all errors are being verified. -known_errors = [ - 'SIDECAR_INVALID', - 'CHARACTER_INVALID', - 'COMMA_MISSING', - "DEF_EXPAND_INVALID", - "DEF_INVALID", - "DEFINITION_INVALID", - "NODE_NAME_EMPTY", - "ONSET_OFFSET_INSET_ERROR", - "PARENTHESES_MISMATCH", - "PLACEHOLDER_INVALID", - "REQUIRED_TAG_MISSING", - "SIDECAR_INVALID", - "SIDECAR_KEY_MISSING", - "STYLE_WARNING", - "TAG_EMPTY", - "TAG_EXPRESSION_REPEATED", - "TAG_EXTENDED", - "TAG_EXTENSION_INVALID", - "TAG_GROUP_ERROR", - "TAG_INVALID", - "TAG_NOT_UNIQUE", - "TAG_NAMESPACE_PREFIX_INVALID", - "TAG_REQUIRES_CHILD", - "TILDES_UNSUPPORTED", - "UNITS_INVALID", - "UNITS_MISSING", - "VALUE_INVALID", - - "SIDECAR_BRACES_INVALID", - "SCHEMA_LIBRARY_INVALID", - - "SCHEMA_ATTRIBUTE_INVALID", - "SCHEMA_UNIT_CLASS_INVALID", - "SCHEMA_VALUE_CLASS_INVALID", - "SCHEMA_DEPRECATED_INVALID", - "SCHEMA_SUGGESTED_TAG_INVALID", - "SCHEMA_RELATED_TAG_INVALID", - "SCHEMA_NON_PLACEHOLDER_HAS_CLASS", - "SCHEMA_DEFAULT_UNITS_INVALID" -] - skip_tests = { "VERSION_DEPRECATED": "Not applicable", "tag-extension-invalid-bad-node-name": "Part of character invalid checking/didn't get to it yet", } - class MyTestCase(unittest.TestCase): @classmethod def setUpClass(cls): @@ -80,9 +36,7 @@ def run_single_test(self, test_file): test_info = json.load(fp) for info in test_info: error_code = info['error_code'] - verify_code = False - if error_code in known_errors: - verify_code = True + verify_code = True # To be deprecated once we add this to all tests self._verify_code = verify_code if error_code in skip_tests: @@ -93,6 +47,8 @@ def run_single_test(self, test_file): print(f"Skipping {name} test because: {skip_tests[name]}") continue + # if name != "attribute-invalid-in-library": + # continue description = info['description'] schema = info['schema'] check_for_warnings = info.get("warning", False) diff --git a/tests/data/schema_tests/merge_tests/issues_tests/HED_badroot_0.0.1.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/HED_badroot_0.0.1.mediawiki index a596775c..e2246335 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/HED_badroot_0.0.1.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/HED_badroot_0.0.1.mediawiki @@ -11,6 +11,16 @@ This schema is the first official release that includes an xsd and requires unit !# end schema +'''Unit classes''' + +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' !# end hed \ No newline at end of file diff --git a/tests/data/schema_tests/merge_tests/issues_tests/HED_dupesubroot_0.0.1.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/HED_dupesubroot_0.0.1.mediawiki index 672792aa..2b76a3a4 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/HED_dupesubroot_0.0.1.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/HED_dupesubroot_0.0.1.mediawiki @@ -17,6 +17,16 @@ This schema is the first official release that includes an xsd and requires unit !# end schema +'''Unit classes''' + +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' !# end hed \ No newline at end of file diff --git a/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid1.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid1.mediawiki index d5e6cf44..678a6249 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid1.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid1.mediawiki @@ -13,6 +13,16 @@ This schema is the first official release that includes an xsd and requires unit !# end schema +'''Unit classes''' + +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' !# end hed \ No newline at end of file diff --git a/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid2.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid2.mediawiki index 979f72bd..037c9bc7 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid2.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid2.mediawiki @@ -13,6 +13,16 @@ This schema is the first official release that includes an xsd and requires unit !# end schema +'''Unit classes''' + +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' !# end hed \ No newline at end of file diff --git a/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid3.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid3.mediawiki index 3438be07..f79d8361 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid3.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/HED_root_invalid3.mediawiki @@ -11,6 +11,16 @@ This schema is the first official release that includes an xsd and requires unit !# end schema +'''Unit classes''' + +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' !# end hed \ No newline at end of file diff --git a/tests/data/schema_tests/merge_tests/issues_tests/HED_root_wrong_place_0.0.1.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/HED_root_wrong_place_0.0.1.mediawiki index 267a214e..80454ef4 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/HED_root_wrong_place_0.0.1.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/HED_root_wrong_place_0.0.1.mediawiki @@ -11,6 +11,16 @@ This schema is the first official release that includes an xsd and requires unit !# end schema +'''Unit classes''' + +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' !# end hed \ No newline at end of file diff --git a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags1.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags1.mediawiki index ee20104a..d3368e37 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags1.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags1.mediawiki @@ -33,6 +33,13 @@ For more information see https://hed-schema-library.readthedocs.io/en/latest/ind '''Unit classes''' +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' '''Epilogue''' diff --git a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags2.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags2.mediawiki index 8b3a3a86..64144708 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags2.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags2.mediawiki @@ -32,6 +32,14 @@ For more information see https://hed-schema-library.readthedocs.io/en/latest/ind '''Unit classes''' +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' diff --git a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags3.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags3.mediawiki index 7939dfd9..f8bccd4d 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags3.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags3.mediawiki @@ -32,6 +32,14 @@ For more information see https://hed-schema-library.readthedocs.io/en/latest/ind '''Unit classes''' +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' diff --git a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags4.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags4.mediawiki index 4a084ebd..eb283125 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags4.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_tags4.mediawiki @@ -33,6 +33,14 @@ For more information see https://hed-schema-library.readthedocs.io/en/latest/ind '''Unit classes''' +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' diff --git a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_unit_classes.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_unit_classes.mediawiki index f282aabb..289265f8 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_unit_classes.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_unit_classes.mediawiki @@ -34,6 +34,13 @@ For more information see https://hed-schema-library.readthedocs.io/en/latest/ind * weightUnits <nowiki>{defaultUnits=testUnit}</nowiki> ** testUnit <nowiki>{conversionFactor=100}</nowiki> +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' '''Epilogue''' The Standardized Computer-based Organized Reporting of EEG (SCORE) is a standard terminology for scalp EEG data assessment designed for use in clinical practice that may also be used for research purposes. diff --git a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_units.mediawiki b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_units.mediawiki index b7c4d5aa..ac67b8fe 100644 --- a/tests/data/schema_tests/merge_tests/issues_tests/overlapping_units.mediawiki +++ b/tests/data/schema_tests/merge_tests/issues_tests/overlapping_units.mediawiki @@ -34,6 +34,13 @@ For more information see https://hed-schema-library.readthedocs.io/en/latest/ind * weightUnitsNew <nowiki>{defaultUnits=g}</nowiki> ** g <nowiki>{conversionFactor=100}</nowiki> +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' '''Epilogue''' The Standardized Computer-based Organized Reporting of EEG (SCORE) is a standard terminology for scalp EEG data assessment designed for use in clinical practice that may also be used for research purposes. diff --git a/tests/data/schema_tests/merge_tests/sorted_root.mediawiki b/tests/data/schema_tests/merge_tests/sorted_root.mediawiki index d5e31f3b..6536476c 100644 --- a/tests/data/schema_tests/merge_tests/sorted_root.mediawiki +++ b/tests/data/schema_tests/merge_tests/sorted_root.mediawiki @@ -44,6 +44,16 @@ This schema is the first official release that includes an xsd and requires unit !# end schema +'''Unit classes''' + +'''Unit modifiers''' + +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + '''Epilogue''' !# end hed \ No newline at end of file diff --git a/tests/data/schema_tests/wiki_tests/HED_default.mediawiki b/tests/data/schema_tests/wiki_tests/HED_default.mediawiki index 049260f1..4327c6a4 100644 --- a/tests/data/schema_tests/wiki_tests/HED_default.mediawiki +++ b/tests/data/schema_tests/wiki_tests/HED_default.mediawiki @@ -1,6 +1,6 @@ HED version:8.0.0-alpha.1 - +'''Prologue''' This is a prologue line. This is a second prologue line. @@ -1098,7 +1098,15 @@ This is a second prologue line. * z <nowiki>{SIUnitSymbolModifier} [SI unit submultiple representing 10^-21]</nowiki> * yocto <nowiki>{SIUnitModifier} [SI unit submultiple representing 10^-24]</nowiki> * y <nowiki>{SIUnitSymbolModifier} [SI unit submultiple representing 10^-24]</nowiki> -!# end hed +'''Value classes''' + +'''Schema attributes''' + +'''Properties''' + +'''Epilogue''' This is an epilogue. -This is a second line of an epilogue. \ No newline at end of file +This is a second line of an epilogue. + +!# end hed \ No newline at end of file diff --git a/tests/data/schema_tests/wiki_tests/attribute_unknown1.mediawiki b/tests/data/schema_tests/wiki_tests/attribute_unknown1.mediawiki new file mode 100644 index 00000000..d2c398e3 --- /dev/null +++ b/tests/data/schema_tests/wiki_tests/attribute_unknown1.mediawiki @@ -0,0 +1,41 @@ +HED version="8.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://github.com/hed-standard/hed-specification/raw/master/hedxml/HED8.0.0.xsd" + +'''Prologue''' +This schema tests AppendixB SCHEMA_ATTRIBUTE_INVALID + +!# start schema + +'''Tag1''' <nowiki>{suggestedTag=Tag1}[suggested tag is not registered in the schema]</nowiki> +* Tag2 <nowiki>{valueClassAttribute}[value attribute is the wrong tag class]</nowiki> +* Tag3 <nowiki>{unitAttribute}[unit attribute is the wrong tag class]</nowiki> + +!# end schema +'''Unit classes''' <nowiki></nowiki> +* unitClass1 <nowiki>{unitAttribute}[Wrong attribute type]</nowiki> +** unit1 <nowiki>{tagAttribute}[Wrong attribute type]</nowiki> + +'''Unit modifiers''' <nowiki></nowiki> +* mod1 <nowiki>{tagAttribute}[Wrong attribute type]</nowiki> + +'''Value classes''' <nowiki></nowiki> +* valueClass1 <nowiki>{tagAttribute}[Wrong attribute type]</nowiki> + +'''Schema attributes''' <nowiki></nowiki> +* tagAttribute <nowiki></nowiki> +* unitAttribute <nowiki>{unitProperty}</nowiki> +* unitClassAttribute <nowiki>{unitClassProperty}</nowiki> +* unitModifierAttribute <nowiki>{unitModifierProperty}</nowiki> +* valueClassAttribute <nowiki>{valueClassProperty}</nowiki> +* attribute1 <nowiki>{valueClassProperty}</nowiki> + +'''Properties''' <nowiki></nowiki> +* boolProperty <nowiki></nowiki> +* unitClassProperty <nowiki></nowiki> +* unitModifierProperty <nowiki></nowiki> +* unitProperty <nowiki></nowiki> +* valueClassProperty <nowiki></nowiki> + +'''Epilogue''' +This is an updated version of the schema format. The properties are now part of the schema. The schema attributes are designed to be checked in software rather than hard-coded. The schema attributes, themselves have properties. + +!# end hed \ No newline at end of file diff --git a/tests/data/validator_tests/bids_schema.mediawiki b/tests/data/validator_tests/bids_schema.mediawiki index 971a9723..b306003b 100644 --- a/tests/data/validator_tests/bids_schema.mediawiki +++ b/tests/data/validator_tests/bids_schema.mediawiki @@ -1,5 +1,7 @@ HED version: 8.0.0-alpha.2 +'''Prologue''' + !# start schema '''Event''' @@ -1163,6 +1165,7 @@ HED version: 8.0.0-alpha.2 * yocto <nowiki>{SIUnitModifier} [SI unit submultiple representing 10^-24]</nowiki> * y <nowiki>{SIUnitSymbolModifier} [SI unit submultiple representing 10^-24]</nowiki> +'''Value classes''' '''Schema attributes''' * allowedCharacter <nowiki>{unitClassProperty}[An attribute of unit classes schema value placeholders indicating a special character that is allowed in expressing the value of that placeholder.]</nowiki> @@ -1184,6 +1187,8 @@ HED version: 8.0.0-alpha.2 * unitSymbol <nowiki>{boolProperty, unitProperty}[Abbreviation or symbol representing a type of unit. Unit symbols represent both the singular and the plural and thus cannot be pluralized.]</nowiki> * unitClass <nowiki>[Specifies the type of a unit for a tag.]</nowiki> +'''Properties''' + '''Epilogue''' This is the new format for the mediawiki schema diff --git a/tests/schema/test_hed_schema_io.py b/tests/schema/test_hed_schema_io.py index c21d839a..75f66d17 100644 --- a/tests/schema/test_hed_schema_io.py +++ b/tests/schema/test_hed_schema_io.py @@ -245,10 +245,10 @@ def _base_added_class_tests(self, schema): unit_class_entry = schema.unit_classes["weightUnits"] unit_entry = unit_class_entry.units["testUnit"] - self.assertEqual(unit_entry.attributes["conversionFactor"], str(100)) + self.assertEqual(unit_entry.attributes[HedKey.ConversionFactor], str(100)) unit_modifier_entry = schema.unit_modifiers["huge"] - self.assertEqual(unit_modifier_entry.attributes["conversionFactor"], "10^100") + self.assertEqual(unit_modifier_entry.attributes[HedKey.ConversionFactor], "10^100") self.assertTrue(unit_modifier_entry.attributes["customElementAttribute"]) value_class_entry = schema.value_classes["customValueClass"] @@ -328,9 +328,9 @@ def test_cannot_load_schemas(self): ] for file in files: - with self.assertRaises(HedFileError): - # print(file) + with self.assertRaises(HedFileError) as context: load_schema(file) + self.assertEqual(context.exception.code, HedExceptions.SCHEMA_LIBRARY_INVALID) def test_saving_in_library_wiki(self): old_score_schema = load_schema_version("score_1.0.0") diff --git a/tests/schema/test_schema_wiki_fatal_errors.py b/tests/schema/test_schema_wiki_fatal_errors.py index 583579b1..0759dba4 100644 --- a/tests/schema/test_schema_wiki_fatal_errors.py +++ b/tests/schema/test_schema_wiki_fatal_errors.py @@ -1,7 +1,7 @@ import unittest import os -from hed import schema +from hed import load_schema from hed.errors import HedFileError, HedExceptions @@ -12,25 +12,25 @@ class TestHedSchema(unittest.TestCase): def setUpClass(cls): cls.full_base_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), cls.base_schema_dir) cls.files_and_errors = { - "HED_schema_no_start.mediawiki": HedExceptions.SCHEMA_START_MISSING, - "HED_schema_no_end.mediawiki": HedExceptions.SCHEMA_END_INVALID, - "HED_hed_no_end.mediawiki": HedExceptions.HED_END_INVALID, - "HED_separator_invalid.mediawiki": HedExceptions.INVALID_SECTION_SEPARATOR, + "HED_schema_no_start.mediawiki": HedExceptions.SCHEMA_SECTION_MISSING, + "HED_schema_no_end.mediawiki": HedExceptions.SCHEMA_SECTION_MISSING, + "HED_hed_no_end.mediawiki": HedExceptions.SCHEMA_SECTION_MISSING, + "HED_separator_invalid.mediawiki": HedExceptions.WIKI_SEPARATOR_INVALID, "HED_header_missing.mediawiki": HedExceptions.SCHEMA_HEADER_MISSING, - "HED_header_invalid.mediawiki": HedExceptions.HED_SCHEMA_HEADER_INVALID, - "empty_file.mediawiki": HedExceptions.HED_SCHEMA_HEADER_INVALID, - "HED_header_invalid_version.mediawiki": HedExceptions.HED_SCHEMA_VERSION_INVALID, - "HED_header_missing_version.mediawiki": HedExceptions.HED_SCHEMA_VERSION_INVALID, + "HED_header_invalid.mediawiki": HedExceptions.SCHEMA_HEADER_INVALID, + "empty_file.mediawiki": HedExceptions.SCHEMA_HEADER_INVALID, + "HED_header_invalid_version.mediawiki": HedExceptions.SCHEMA_VERSION_INVALID, + "HED_header_missing_version.mediawiki": HedExceptions.SCHEMA_VERSION_INVALID, "HED_header_bad_library.mediawiki": HedExceptions.BAD_HED_LIBRARY_NAME, - "HED_schema_out_of_order.mediawiki": HedExceptions.SCHEMA_START_MISSING, - "empty_node.mediawiki": HedExceptions.HED_WIKI_DELIMITERS_INVALID, - "malformed_line.mediawiki": HedExceptions.HED_WIKI_DELIMITERS_INVALID, - "malformed_line2.mediawiki": HedExceptions.HED_WIKI_DELIMITERS_INVALID, - "malformed_line3.mediawiki": HedExceptions.HED_WIKI_DELIMITERS_INVALID, - "malformed_line4.mediawiki": HedExceptions.HED_WIKI_DELIMITERS_INVALID, - "malformed_line5.mediawiki": HedExceptions.HED_WIKI_DELIMITERS_INVALID, - "malformed_line6.mediawiki": HedExceptions.HED_WIKI_DELIMITERS_INVALID, - "malformed_line7.mediawiki": HedExceptions.HED_WIKI_DELIMITERS_INVALID, + "HED_schema_out_of_order.mediawiki": HedExceptions.SCHEMA_SECTION_MISSING, + "empty_node.mediawiki": HedExceptions.WIKI_DELIMITERS_INVALID, + "malformed_line.mediawiki": HedExceptions.WIKI_DELIMITERS_INVALID, + "malformed_line2.mediawiki": HedExceptions.WIKI_DELIMITERS_INVALID, + "malformed_line3.mediawiki": HedExceptions.WIKI_DELIMITERS_INVALID, + "malformed_line4.mediawiki": HedExceptions.WIKI_DELIMITERS_INVALID, + "malformed_line5.mediawiki": HedExceptions.WIKI_DELIMITERS_INVALID, + "malformed_line6.mediawiki": HedExceptions.WIKI_DELIMITERS_INVALID, + "malformed_line7.mediawiki": HedExceptions.WIKI_DELIMITERS_INVALID, "empty_node.xml": HedExceptions.HED_SCHEMA_NODE_NAME_INVALID } @@ -60,9 +60,10 @@ def test_invalid_schema(self): for filename, error in self.files_and_errors.items(): full_filename = self.full_base_folder + filename with self.assertRaises(HedFileError) as context: - schema.load_schema(full_filename) + load_schema(full_filename) # all of these should produce exceptions. - from hed.errors import ErrorHandler, ErrorContext, SchemaErrors, get_printable_issue_string + from hed.errors import ErrorHandler, ErrorContext, get_printable_issue_string + # Verify basic properties of exception expected_line_numbers = self.expected_line_numbers.get(filename, []) if expected_line_numbers: @@ -82,9 +83,10 @@ def test_merging_errors_schema(self): for filename, error in self.files_and_errors.items(): full_filename = self.full_base_folder + filename with self.assertRaises(HedFileError) as context: - schema.load_schema(full_filename) + load_schema(full_filename) # all of these should produce exceptions. - from hed.errors import ErrorHandler, ErrorContext, SchemaErrors, get_printable_issue_string + from hed.errors import ErrorHandler, ErrorContext, get_printable_issue_string + from hed.errors.error_types import SchemaAttributeErrors # Verify basic properties of exception expected_line_numbers = self.expected_line_numbers.get(filename, []) if expected_line_numbers: @@ -96,7 +98,7 @@ def test_merging_errors_schema(self): error_handler.push_error_context(ErrorContext.ROW, 1) error_handler.push_error_context(ErrorContext.COLUMN, 2) - issues = error_handler.format_error_with_context(SchemaErrors.SCHEMA_ATTRIBUTE_INVALID, + issues = error_handler.format_error_with_context(SchemaAttributeErrors.SCHEMA_ATTRIBUTE_INVALID, "error_attribute", source_tag="error_tag") error_handler.pop_error_context() error_handler.pop_error_context() @@ -106,3 +108,9 @@ def test_merging_errors_schema(self): self.assertTrue(context.exception.args[0] == error) self.assertTrue(context.exception.filename == full_filename) + + def test_attribute_invalid(self): + path = os.path.join(self.full_base_folder, "attribute_unknown1.mediawiki") + schema = load_schema(path) + issues = schema.check_compliance() + self.assertEqual(len(issues), 7) \ No newline at end of file diff --git a/tests/schema/util_create_schemas.py b/tests/schema/util_create_schemas.py index 850d014e..415b94dc 100644 --- a/tests/schema/util_create_schemas.py +++ b/tests/schema/util_create_schemas.py @@ -10,13 +10,30 @@ """ library_schema_end = """ -!# end schema + !# end hed """ -def _get_test_schema(node_lines): - library_schema_string = library_schema_start + "\n".join(node_lines) + library_schema_end +default_end_lines = """ +!# end schema +""" + +required_non_tag = [ +"'''Unit classes'''", +"'''Unit modifiers'''", +"'''Value classes'''", +"'''Schema attributes'''", +"'''Properties'''", +"'''Epilogue'''" +] +def _get_test_schema(node_lines, other_lines=(default_end_lines,)): + node_section = "\n".join(node_lines) + non_tag_section = "\n".join(other_lines) + for name in required_non_tag: + if name not in other_lines: + non_tag_section += f"\n{name}\n" + library_schema_string = library_schema_start + node_section + non_tag_section + library_schema_end test_schema = from_string(library_schema_string, ".mediawiki") return test_schema From c69b7f6cb0ac9cf2d48237669894bb6cb43b0f24 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 29 Sep 2023 11:22:39 +0000 Subject: [PATCH 03/21] Bump spec_tests/hed-specification from `c47fff9` to `c1aad36` Bumps [spec_tests/hed-specification](https://github.com/hed-standard/hed-specification) from `c47fff9` to `c1aad36`. - [Release notes](https://github.com/hed-standard/hed-specification/releases) - [Commits](https://github.com/hed-standard/hed-specification/compare/c47fff949db70c9105c875bbdfdf0d11389ffd68...c1aad366fee6c7f1e68fbd73d2ce6dc369444ad8) --- updated-dependencies: - dependency-name: spec_tests/hed-specification dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> --- spec_tests/hed-specification | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec_tests/hed-specification b/spec_tests/hed-specification index c47fff94..c1aad366 160000 --- a/spec_tests/hed-specification +++ b/spec_tests/hed-specification @@ -1 +1 @@ -Subproject commit c47fff949db70c9105c875bbdfdf0d11389ffd68 +Subproject commit c1aad366fee6c7f1e68fbd73d2ce6dc369444ad8 From dfbbd64c1ac98ce7b636dfe34f70b7511c22ad9d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 3 Oct 2023 11:57:26 +0000 Subject: [PATCH 04/21] Bump spec_tests/hed-specification from `c1aad36` to `7f3868b` Bumps [spec_tests/hed-specification](https://github.com/hed-standard/hed-specification) from `c1aad36` to `7f3868b`. - [Release notes](https://github.com/hed-standard/hed-specification/releases) - [Commits](https://github.com/hed-standard/hed-specification/compare/c1aad366fee6c7f1e68fbd73d2ce6dc369444ad8...7f3868bdff542db5660e2bd1a3d0d565e3793502) --- updated-dependencies: - dependency-name: spec_tests/hed-specification dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> --- spec_tests/hed-specification | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec_tests/hed-specification b/spec_tests/hed-specification index c1aad366..7f3868bd 160000 --- a/spec_tests/hed-specification +++ b/spec_tests/hed-specification @@ -1 +1 @@ -Subproject commit c1aad366fee6c7f1e68fbd73d2ce6dc369444ad8 +Subproject commit 7f3868bdff542db5660e2bd1a3d0d565e3793502 From 0014ef4fb415999a69f1f7aa3c7e5d0d49b07456 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 5 Oct 2023 16:27:07 -0500 Subject: [PATCH 05/21] Update basic_search to support words taht must not appear --- hed/models/basic_search.py | 88 ++++++----- tests/models/test_basic_search.py | 243 +++++++++++++++++------------- 2 files changed, 189 insertions(+), 142 deletions(-) diff --git a/hed/models/basic_search.py b/hed/models/basic_search.py index ae47b71e..9301a0cc 100644 --- a/hed/models/basic_search.py +++ b/hed/models/basic_search.py @@ -8,14 +8,18 @@ def find_matching(series, search_string, regex=False): """ Finds lines in the series that match the search string and returns a mask. Syntax Rules: - - '@': Prefixing a term in the search string means the object must appear anywhere within a line. + - '@': Prefixing a term in the search string means the term must appear anywhere within a line. + - '~': Prefixing a term in the search string means the term must NOT appear within a line. - Parentheses: Elements within parentheses must appear in the line with the same level of nesting. - eg: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't - start in the same group. + e.g.: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't + start in the same group. - "LongFormTag*": A * will match any remaining word(anything but a comma or parenthesis) - An individual term can be arbitrary regex, but it is limited to single continuous words. Notes: + - Specific words only care about their level relative to other specific words, not overall. + e.g. "(A, B)" will find: "A, B", "(A, B)", (A, (C), B)", or ((A, B))" + - If you have no grouping or anywhere words in the search, it assumes all terms are anywhere words. - The format of the series should match the format of the search string, whether it's in short or long form. - To enable support for matching parent tags, ensure that both the series and search string are in long form. @@ -33,19 +37,19 @@ def find_matching(series, search_string, regex=False): if not regex: # Replace *'s with a reasonable value for people who don't know regex search_string = re.sub(r'(?<!\.)\*', '.*?', search_string) - anywhere_words, specific_words = find_words(search_string) + anywhere_words, negative_words, specific_words = find_words(search_string) + # If we have no nesting or anywhere words, assume they don't care about level + if "(" not in search_string and "@" not in search_string: + anywhere_words += specific_words + specific_words = [] delimiter_map = construct_delimiter_map(search_string, specific_words) - source_words = anywhere_words + specific_words + candidate_indexes = _verify_basic_words(series, anywhere_words, negative_words) - # Create a set of series of masks to determine which rows contain each individual word - candidate_indexes = set(series.index) - - # Loop through source_words to filter down candidate_indexes - for word in source_words: + # do a basic check for all specific words(this doesn't verify word delimiters) + for word in specific_words: matches = series.str.contains(word, regex=True) current_word_indexes = set(matches[matches].index.tolist()) - # Update candidate_indexes by taking the intersection with current_word_indexes candidate_indexes &= current_word_indexes if not candidate_indexes: @@ -53,40 +57,63 @@ def find_matching(series, search_string, regex=False): candidate_indexes = sorted(candidate_indexes) - full_mask = pd.Series(False, index=series.index) - - candidate_series = series[candidate_indexes] + full_mask = pd.Series(False, index=series.index, dtype=bool) - mask = candidate_series.apply(verify_search_delimiters, args=(anywhere_words, specific_words, delimiter_map)) - full_mask.loc[candidate_indexes] = mask + if candidate_indexes: + if specific_words: + candidate_series = series[candidate_indexes] + mask = candidate_series.apply(verify_search_delimiters, args=(specific_words, delimiter_map)) + full_mask.loc[candidate_indexes] = mask + else: + full_mask.loc[candidate_indexes] = True return full_mask +def _get_word_indexes(series, word): + pattern = r'(?:[ ,()]|^)' + word + r'(?:[ ,()]|$)' + matches = series.str.contains(pattern, regex=True) + return set(matches[matches].index.tolist()) + + +def _verify_basic_words(series, anywhere_words, negative_words): + candidate_indexes = set(series.index) + for word in anywhere_words: + current_word_indexes = _get_word_indexes(series, word) + candidate_indexes &= current_word_indexes + + for word in negative_words: + current_word_indexes = _get_word_indexes(series, word) + candidate_indexes -= current_word_indexes + return candidate_indexes + + def find_words(search_string): - """ Extract all words in the search string. Dividing them into words that must be relative to each other, - and words that can be anywhere. + """ + Extract words in the search string based on their prefixes. Args: search_string (str): The search query string to parse. - Words prefixed with '@' are 'anywhere' words. + Words can be prefixed with '@' or '~'. Returns: - tuple: A tuple containing two lists: - - anywhere_words (list of str): Words that can appear anywhere in the text. - - specific_words (list of str): Words that must appear relative to other terms. + list: A list containing three lists: + - Words prefixed with '@' + - Words prefixed with '~' + - Words with no prefix """ - # Match sequences of characters that are not commas, parentheses, or standalone spaces. + # Match sequences of characters that are not commas or parentheses. pattern = r'[^,()]+' words = re.findall(pattern, search_string) # Remove any extraneous whitespace from each word words = [word.strip() for word in words if word.strip()] - anywhere_words = [word[1:] for word in words if word.startswith("@")] - specific_words = [word for word in words if not word.startswith("@")] + at_words = [word[1:] for word in words if word.startswith("@")] + tilde_words = [word[1:] for word in words if word.startswith("~")] + no_prefix_words = [word for word in words if not word.startswith("~") and not word.startswith("@")] - return anywhere_words, specific_words + return [at_words, tilde_words, no_prefix_words] def check_parentheses(text): @@ -180,12 +207,11 @@ def construct_delimiter_map(text, words): return delimiter_map -def verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map): +def verify_search_delimiters(text, specific_words, delimiter_map): """ Verifies if the text contains specific words with expected delimiters between them. Args: text (str): The text to search in. - anywhere_words (list of str): Words that can appear anywhere in the text. specific_words (list of str): Words that must appear relative to other words in the text delimiter_map (dict): A dictionary specifying expected delimiters between pairs of specific words. @@ -194,12 +220,6 @@ def verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map """ locations = defaultdict(list) - # Check for anywhere words - for word in anywhere_words: - pattern = r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)' - if not any(re.finditer(pattern, text)): - return False - # Find all locations for each word in the text for word in specific_words: for match in re.finditer(r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)', text): diff --git a/tests/models/test_basic_search.py b/tests/models/test_basic_search.py index 0a942b93..36fcc168 100644 --- a/tests/models/test_basic_search.py +++ b/tests/models/test_basic_search.py @@ -35,37 +35,37 @@ def test_find_matching_results(self): class TestFindWords(unittest.TestCase): def test_basic(self): search_string = "@global (local1, local2)" - anywhere_words, specific_words = find_words(search_string) + anywhere_words, _, specific_words = find_words(search_string) self.assertEqual(anywhere_words, ['global']) self.assertEqual(specific_words, ['local1', 'local2']) def test_no_anywhere_words(self): search_string = "(local1, local2)" - anywhere_words, specific_words = find_words(search_string) + anywhere_words, _, specific_words = find_words(search_string) self.assertEqual(anywhere_words, []) self.assertEqual(specific_words, ['local1', 'local2']) def test_no_specific_words(self): search_string = "@global1, @global2" - anywhere_words, specific_words = find_words(search_string) + anywhere_words, _, specific_words = find_words(search_string) self.assertEqual(anywhere_words, ['global1', 'global2']) self.assertEqual(specific_words, []) def test_empty_string(self): search_string = "" - anywhere_words, specific_words = find_words(search_string) + anywhere_words, _, specific_words = find_words(search_string) self.assertEqual(anywhere_words, []) self.assertEqual(specific_words, []) def test_mixed_words(self): search_string = "@global (local1, local2), @another_global" - anywhere_words, specific_words = find_words(search_string) + anywhere_words, _, specific_words = find_words(search_string) self.assertEqual(anywhere_words, ['global', 'another_global']) self.assertEqual(specific_words, ['local1', 'local2']) def test_whitespace(self): search_string = " @Global , ( local1 , local2 ) " - anywhere_words, specific_words = find_words(search_string) + anywhere_words, _, specific_words = find_words(search_string) self.assertEqual(anywhere_words, ['Global']) self.assertEqual(specific_words, ['local1', 'local2']) @@ -138,118 +138,30 @@ def test_multiple_words(self): class TestVerifyDelimiters(unittest.TestCase): - def base_verify_func(self, query_text, text, anywhere_words, specific_words, expected_result): + def base_verify_func(self, query_text, text, specific_words, expected_result): delimiter_map = construct_delimiter_map(query_text, specific_words) - actual_result = verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map) + actual_result = verify_search_delimiters(text, specific_words, delimiter_map) self.assertEqual(actual_result, expected_result) def test_all_conditions_met(self): query_text = "word0,((word1),word2)" specific_words = ["word0", "word1", "word2"] text = "word0,((word1),word2)" - self.base_verify_func(query_text, text, [], specific_words, True) + self.base_verify_func(query_text, text, specific_words, True) text = "((word1),word2), word0" - self.base_verify_func(query_text, text, [], specific_words, True) + self.base_verify_func(query_text, text, specific_words, True) text = "word0,(word2, (word1))" - self.base_verify_func(query_text, text, [], specific_words, True) + self.base_verify_func(query_text, text, specific_words, True) text = "word0,((word1),(ExtraGroup),word2)" - self.base_verify_func(query_text, text, [], specific_words, True) + self.base_verify_func(query_text, text, specific_words, True) text = "word0,((word2),word1)" - self.base_verify_func(query_text, text, [], specific_words, False) + self.base_verify_func(query_text, text, specific_words, False) text = "((word1),word0), word2" - self.base_verify_func(query_text, text, [], specific_words, False) + self.base_verify_func(query_text, text, specific_words, False) text = "word0,((word1))" - self.base_verify_func(query_text, text, [], specific_words, False) + self.base_verify_func(query_text, text, specific_words, False) text = "(word1),(ExtraGroup),word2)" - self.base_verify_func(query_text, text, [], specific_words, False) - - def test_complex_case_with_word_identifiers(self): - query_text = "word0,((word1),@word2,@word3,word4)" - specific_words = ["word0", "word1", "word4"] - anywhere_words = ["word2", "word3"] - text = "word0,((word1),word2,word3,word4)" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - text = "word2,word0,((word1),word3,word4)" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - text = "word3,((word1),word2,word4),word0" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - text = "word0,((word1),word4),word2,word3" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - text = "word0,word1,word4,word2" # Incorrect delimiters - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) - text = "word2,word3" # Missing specific words - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) - - def test_very_complex_case_with_word_identifiers(self): - query_text = "word0,(((word1,word2),@word3)),((word4,word5)))" - specific_words = ["word0", "word1", "word2", "word4", "word5"] - anywhere_words = ["word3"] - - # Test case where all conditions are met - text = "word0,(((word1,word2),word3)),((word4,word5)))" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - - # Test case with anywhere words out of specific context but still in the string - text = "word3,word0,(((word1,word2))),((word4,word5)))" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - - # Test case with correct specific words but incorrect delimiters - text = "word0,((word1,word2),word3),(word4,word5)" - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) - - # Test case missing one specific word - text = "word0,(((word1,word2),word3)),(word4))" - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) - - # Test case missing anywhere word - text = "word0,(((word1,word2))),((word4,word5)))" - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) - - def test_incorrect_single_delimiter(self): - query_text = "word0,((word1)),word2" - specific_words = ["word0", "word1", "word2"] - anywhere_words = [] - - # Positive case 1: Exact match - text = "word0,((word1)),word2" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - - # Positive case 2: Additional parentheses around the entire sequence - text = "(word0,((word1)),word2)" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - - # Single closing parenthesis missing between word1 and word2 - text = "word0,((word1),word2)" - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) - - # Single opening parenthesis missing between word0 and word1 - text = "word0,(word1)),word2" - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) - - def test_mismatched_parentheses(self): - query_text = "word0,((word1)),(word2,word3)" - specific_words = ["word0", "word1", "word2", "word3"] - anywhere_words = [] - - # Positive case 1: Exact match - text = "word0,((word1)),(word2,word3)" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - - # Positive case 2: Reordered sequence with the same delimiters - text = "(word2,word3),word0,((word1))" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - - # Positive case 3: Additional text in between but the delimiters remain the same - text = "word0,someExtraText,((word1)),someMoreText,(word2,word3)" - self.base_verify_func(query_text, text, anywhere_words, specific_words, True) - - # Extra closing parenthesis between word2 and word3 - text = "word0,((word1),(word2,word3))" - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) - - # Extra opening parenthesis between word1 and word2 - text = "word0,((word1),((word2,word3)" - self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + self.base_verify_func(query_text, text, specific_words, False) def test_wildcard_matching_verify_delimiters(self): query_text = "word0, ((word1.*?)), word2.*?" @@ -257,14 +169,15 @@ def test_wildcard_matching_verify_delimiters(self): # Positive test cases text = "((word1)), word0, word2X" - self.assertTrue(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + self.assertTrue(verify_search_delimiters(text, ["word0", "word1.*?", "word2.*?"], delimiter_map)) text = "word0, ((word1Y)), word2Z" - self.assertTrue(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + self.assertTrue(verify_search_delimiters(text, ["word0", "word1.*?", "word2.*?"], delimiter_map)) # Negative test cases text = "word0, (word1), word2" - self.assertFalse(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + self.assertFalse(verify_search_delimiters(text, ["word0", "word1.*?", "word2.*?"], delimiter_map)) + class TestFindMatching(unittest.TestCase): def base_find_matching(self, series, search_string, expected): @@ -272,6 +185,18 @@ def base_find_matching(self, series, search_string, expected): self.assertTrue(all(mask == expected), f"Expected {expected}, got {mask}") def test_basic_matching(self): + series = pd.Series([ + "word0, word1, word2", + "word0, (word1, word2)" + ]) + search_string = "word0, word1" + expected = pd.Series([True, True]) + self.base_find_matching(series, search_string, expected) + search_string = "(word0, word1)" + expected = pd.Series([True, False]) + self.base_find_matching(series, search_string, expected) + + def test_group_matching(self): series = pd.Series([ "(word1), word0, ((word2))", "word0, ((word1)), word2", @@ -306,8 +231,110 @@ def test_wildcard_matching(self): series = pd.Series([ "word2, word0, ((word1X))", "word0, ((word1Y)), word2Z", + "word0, ((word1)), word2", "word0, (word1), word2" ]) search_string = "word0, ((word1*)), word2*" - expected = pd.Series([True, True, False]) + expected = pd.Series([True, True, True, False]) + self.base_find_matching(series, search_string, expected) + + def test_complex_case_with_word_identifiers(self): + query_text = "word0, ((word1), @word2, @word3, word4)" + series = pd.Series([ + "word0, ((word1), word2, word3, word4)", + "word2, word0, ((word1), word3, word4)", + "word3, ((word1), word2, word4), word0", + "word0, ((word1), word4), word2, word3", + "word0, word1, word4, word2", + "word2, word3" + ]) + expected = pd.Series([True, True, True, True, False, False]) + + self.base_find_matching(series, query_text, expected) + + def test_very_complex_case_with_word_identifiers(self): + query_text = "word0, (((word1, word2), @word3)), ((word4, word5)))" + series = pd.Series([ + "word0, (((word1, word2), word3)), ((word4, word5)))", + "word3, word0, (((word1, word2))), ((word4, word5)))", + "word0, ((word1, word2), word3), (word4, word5)", + "word0, (((word1, word2), word3)), (word4)", + "word0, (((word1, word2))), ((word4, word5)))" + ]) + expected = pd.Series([True, True, False, False, False]) + + self.base_find_matching(series, query_text, expected) + + def test_incorrect_single_delimiter(self): + query_text = "word0, ((word1)), word2" + series = pd.Series([ + "word0, ((word1)), word2", + "(word0, ((word1)), word2)", + "word0, ((word1), word2)", + "word0, (word1)), word2" + ]) + expected = pd.Series([True, True, False, False]) + self.base_find_matching(series, query_text, expected) + + def test_mismatched_parentheses2(self): + query_text = "word0, ((word1)), (word2, word3)" + series = pd.Series([ + "word0, ((word1)), (word2, word3)", + "(word2, word3), word0, ((word1))", + "word0, someExtraText, ((word1)), someMoreText, (word2, word3)", + "word0, ((word1), (word2, word3))", + "word0, ((word1), ((word2, word3)" + ]) + expected = pd.Series([True, True, True, False, False]) + self.base_find_matching(series, query_text, expected) + + def test_negative_words(self): + series = pd.Series([ + "word0, word1", + "word0, word2", + "word0, word2, word3", + "word0, (word1), word2", + "word0, (word2, word3), word1", + "word0, word1suffix", + ]) + + # 1. Basic Negative Test Case + search_string1 = "~word1, word0" + expected1 = pd.Series([False, True, True, False, False, True]) + + # 2. Two Negative Words + search_string2 = "~word1, ~word3, word0" + expected2 = pd.Series([False, True, False, False, False, True]) + + # 3. Combination of Negative and Mandatory Words + search_string3 = "@word2, ~word1, word0" + expected3 = pd.Series([False, True, True, False, False, False]) + + # 4. Negative Words with Wildcards + search_string4 = "word0, ~word1*" + expected4 = pd.Series([False, True, True, False, False, False]) + + # Running tests + self.base_find_matching(series, search_string1, expected1) + self.base_find_matching(series, search_string2, expected2) + self.base_find_matching(series, search_string3, expected3) + self.base_find_matching(series, search_string4, expected4) + + def test_negative_words_group(self): + series = pd.Series([ + "word0, (word1, (word2))", + "word0, (word1, (word2)), word3", + "word0, (word1, (word2), word3)", + "word0, (word1, (word2, word3))", + ]) + search_string = "word0, (word1, (word2))" + expected = pd.Series([True, True, True, True]) + self.base_find_matching(series, search_string, expected) + + search_string = "word0, (word1, (word2)), ~word3" + expected = pd.Series([True, False, False, False]) + self.base_find_matching(series, search_string, expected) + + search_string = "word0, (word1, (word2), ~word3)" + expected = pd.Series([True, False, False, False]) self.base_find_matching(series, search_string, expected) From f02a1218f2c222901105ee4a5be97057ac2c3e6b Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Wed, 11 Oct 2023 12:09:24 -0500 Subject: [PATCH 06/21] Make validation row numbers 1 based, and account for header --- hed/validator/spreadsheet_validator.py | 18 ++++++++++++------ tests/models/test_tabular_input.py | 3 ++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index ebd64795..751af961 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -45,23 +45,28 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): self._hed_validator = HedValidator(self._schema, def_dicts=def_dicts) self._onset_validator = OnsetValidator() onset_filtered = None + # Adjust to account for 1 based + row_adj = 1 if isinstance(data, BaseInput): - issues += self._validate_column_structure(data, error_handler) + # Adjust to account for column names + if data.has_column_names: + row_adj += 1 + issues += self._validate_column_structure(data, error_handler, row_adj) onset_filtered = data.series_filtered data = data.dataframe_a # Check the rows of the input data - issues += self._run_checks(data, onset_filtered, error_handler=error_handler) + issues += self._run_checks(data, onset_filtered, error_handler=error_handler, row_adj=row_adj) error_handler.pop_error_context() issues = sort_issues(issues) return issues - def _run_checks(self, hed_df, onset_filtered, error_handler): + def _run_checks(self, hed_df, onset_filtered, error_handler, row_adj): issues = [] columns = list(hed_df.columns) for row_number, text_file_row in enumerate(hed_df.itertuples(index=False)): - error_handler.push_error_context(ErrorContext.ROW, row_number) + error_handler.push_error_context(ErrorContext.ROW, row_number + row_adj) row_strings = [] new_column_issues = [] for column_number, cell in enumerate(text_file_row): @@ -100,13 +105,14 @@ def _run_checks(self, hed_df, onset_filtered, error_handler): error_handler.pop_error_context() return issues - def _validate_column_structure(self, base_input, error_handler): + def _validate_column_structure(self, base_input, error_handler, row_adj): """ Validate that each column in the input data has valid values. Parameters: base_input (BaseInput): The input data to be validated. error_handler (ErrorHandler): Holds context + row_adj(int): Number to adjust row by for reporting errors Returns: List of issues associated with each invalid value. Each issue is a dictionary. """ @@ -120,7 +126,7 @@ def _validate_column_structure(self, base_input, error_handler): valid_keys = column.hed_dict.keys() for row_number, value in enumerate(base_input.dataframe[column.column_name]): if value != "n/a" and value not in valid_keys: - error_handler.push_error_context(ErrorContext.ROW, row_number) + error_handler.push_error_context(ErrorContext.ROW, row_number + row_adj) issues += error_handler.format_error_with_context(ValidationErrors.SIDECAR_KEY_MISSING, invalid_key=value, category_keys=list(valid_keys)) diff --git a/tests/models/test_tabular_input.py b/tests/models/test_tabular_input.py index 7f5282e4..02ef32df 100644 --- a/tests/models/test_tabular_input.py +++ b/tests/models/test_tabular_input.py @@ -5,7 +5,7 @@ from hed.models import DefinitionEntry, Sidecar, TabularInput from hed import schema from hed.errors import HedFileError -from hed.errors import ErrorHandler +from hed.errors import ErrorHandler, ErrorContext class Test(unittest.TestCase): @@ -58,6 +58,7 @@ def test_expand_column_issues(self): input_file = TabularInput(events_path, sidecar=sidecar) validation_issues = input_file.validate(hed_schema=self.hed_schema) + self.assertEqual(validation_issues[0][ErrorContext.ROW], 2) self.assertEqual(len(validation_issues), 1) def test_blank_and_duplicate_columns(self): From e9e63e39f6a99487c5e69ae96e653c4580045ec0 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Thu, 12 Oct 2023 10:29:43 -0500 Subject: [PATCH 07/21] Corrected an error in the formatting of text output of validation summary --- hed/tools/remodeling/cli/run_remodel.py | 18 +++++++++-------- .../operations/summarize_hed_tags_op.py | 2 ++ .../operations/summarize_hed_validation_op.py | 20 +++++++++++-------- tests/models/test_hed_group.py | 2 +- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/hed/tools/remodeling/cli/run_remodel.py b/hed/tools/remodeling/cli/run_remodel.py index 45fce71a..8aefdb15 100644 --- a/hed/tools/remodeling/cli/run_remodel.py +++ b/hed/tools/remodeling/cli/run_remodel.py @@ -142,21 +142,21 @@ def run_bids_ops(dispatch, args, tabular_files): dispatch.hed_schema = bids.schema if args.verbose: print(f"Successfully parsed BIDS dataset with HED schema {str(bids.schema.get_schema_versions())}") - events = bids.get_tabular_group(args.file_suffix) + data = bids.get_tabular_group(args.file_suffix) if args.verbose: print(f"Processing {dispatch.data_root}") - filtered_events = [events.datafile_dict[key] for key in tabular_files] - for events_obj in filtered_events: - sidecar_list = events.get_sidecars_from_path(events_obj) + filtered_events = [data.datafile_dict[key] for key in tabular_files] + for data_obj in filtered_events: + sidecar_list = data.get_sidecars_from_path(data_obj) if sidecar_list: - sidecar = events.sidecar_dict[sidecar_list[-1]].contents + sidecar = data.sidecar_dict[sidecar_list[-1]].contents else: sidecar = None if args.verbose: - print(f"Events {events_obj.file_path} sidecar {sidecar}") - df = dispatch.run_operations(events_obj.file_path, sidecar=sidecar, verbose=args.verbose) + print(f"Tabular file {data_obj.file_path} sidecar {sidecar}") + df = dispatch.run_operations(data_obj.file_path, sidecar=sidecar, verbose=args.verbose) if not args.no_update: - df.to_csv(events_obj.file_path, sep='\t', index=False, header=True) + df.to_csv(data_obj.file_path, sep='\t', index=False, header=True) def run_direct_ops(dispatch, args, tabular_files): @@ -176,6 +176,8 @@ def run_direct_ops(dispatch, args, tabular_files): else: sidecar = None for file_path in tabular_files: + if args.verbose: + print(f"Tabular file {file_path} sidecar {sidecar}") df = dispatch.run_operations(file_path, verbose=args.verbose, sidecar=sidecar) if not args.no_update: df.to_csv(file_path, sep='\t', index=False, header=True) diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index ffef53fb..6e98abbe 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -87,6 +87,8 @@ def do_op(self, dispatcher, df, name, sidecar=None): if not summary: summary = HedTagSummary(self) dispatcher.summary_dicts[self.summary_name] = summary + x = {'df': dispatcher.post_proc_data(df_new), 'name': name, + 'schema': dispatcher.hed_schema, 'sidecar': sidecar} summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name, 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) return df_new diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index cd3fc936..a2948eb8 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -1,7 +1,7 @@ """ Validate the HED tags in a dataset and report errors. """ import os -from hed.errors import ErrorSeverity, ErrorHandler +from hed.errors import ErrorSeverity, ErrorHandler, get_printable_issue_string from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput from hed.tools.remodeling.operations.base_op import BaseOp @@ -110,9 +110,9 @@ def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): else: sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], indent=indent*2) if specifics['sidecar_had_issues']: - sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=False, indent=indent*2) + sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], count_only=False, indent=indent*2) else: - sum_list = sum_list + [f"{indent*2}Event file validation was incomplete because of sidecar errors"] + sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=False, indent=indent*2) return "\n".join(sum_list) def update_summary(self, new_info): @@ -134,6 +134,7 @@ def update_summary(self, new_info): issues = input_data.validate(new_info['schema']) if not self.check_for_warnings: issues = ErrorHandler.filter_issues_by_severity(issues, ErrorSeverity.ERROR) + issues = [get_printable_issue_string([issue], skip_filename=True) for issue in issues] results['event_issues'][new_info["name"]] = issues results['total_event_issues'] = len(issues) self.summary_dict[new_info["name"]] = results @@ -197,13 +198,15 @@ def get_error_list(error_dict, count_only=False, indent=BaseSummary.DISPLAY_INDE error_list = [] for key, item in error_dict.items(): if count_only and isinstance(item, list): - error_list.append(f"{indent}{key}: {len(item)} issues") + error_list.append(f"{key}: {len(item)} issues") elif count_only: - error_list.append(f"{indent}{key}: {item} issues") + error_list.append(f"{key}: {item} issues") elif not len(item): - error_list.append(f"{indent}{key} has no issues") + error_list.append(f"{key} has no issues") else: - HedValidationSummary._format_errors(error_list, key, item, indent) + error_list.append(f"{key}:") + error_list = error_list + item + #HedValidationSummary._format_errors(error_list, key, item, indent) return error_list @staticmethod @@ -246,6 +249,7 @@ def _get_sidecar_results(sidecar, new_info, check_for_warnings): results["sidecar_had_issues"] = True if not check_for_warnings: sidecar_issues = filtered_issues - results['sidecar_issues'][sidecar.name] = sidecar_issues + str_issues = [get_printable_issue_string([issue], skip_filename=True) for issue in sidecar_issues] + results['sidecar_issues'][sidecar.name] = str_issues results['total_sidecar_issues'] = len(sidecar_issues) return results diff --git a/tests/models/test_hed_group.py b/tests/models/test_hed_group.py index 117872e8..1a3632bf 100644 --- a/tests/models/test_hed_group.py +++ b/tests/models/test_hed_group.py @@ -62,7 +62,7 @@ def test_find_tags_with_term(self): # located tags now has found all 5 hed tags # This will find no tags - located_tags = basic_hed_string_obj.find_tags_with_term("bject", recursive=True, include_groups=0) + located_tags = basic_hed_string_obj.find_tags_with_term("reject", recursive=True, include_groups=0) self.assertEqual(len(located_tags), 0) # this will also find no tags From 06fd31686b861ceccefb7d0b32122c3ad54cd328 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 12 Oct 2023 15:56:50 -0500 Subject: [PATCH 08/21] Try bumping python version/restricting non master tests --- .github/workflows/ci.yaml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 919e5c8f..8927786a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,11 +7,26 @@ on: branches: ["*"] jobs: + determine_version: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - id: set-matrix + run: | + if [[ "${{ github.ref }}" == 'refs/heads/master' ]]; then + echo "::set-output name=matrix::[3.7, 3.9, 3.10, 3.11]" + else + echo "::set-output name=matrix::[3.9]" + end + build: + needs: determine_version strategy: matrix: platform: [ubuntu-latest] - python-version: [3.7, 3.9] + python-version: ${{fromJson(needs.determine_version.outputs.matrix)}} + runs-on: ${{ matrix.platform }} From 8c844f9b03675d6b56e32584efca8dde875ec31c Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 12 Oct 2023 15:59:27 -0500 Subject: [PATCH 09/21] Use correct loop end word --- .github/workflows/ci.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8927786a..25625de2 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -18,7 +18,7 @@ jobs: echo "::set-output name=matrix::[3.7, 3.9, 3.10, 3.11]" else echo "::set-output name=matrix::[3.9]" - end + fi build: needs: determine_version @@ -27,7 +27,6 @@ jobs: platform: [ubuntu-latest] python-version: ${{fromJson(needs.determine_version.outputs.matrix)}} - runs-on: ${{ matrix.platform }} steps: From 97b834189fff6078a19f05ce7c28d38f1d403291 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 12 Oct 2023 16:03:44 -0500 Subject: [PATCH 10/21] Don't use semi-deprecated style --- .github/workflows/ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 25625de2..01a48a12 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -15,9 +15,9 @@ jobs: - id: set-matrix run: | if [[ "${{ github.ref }}" == 'refs/heads/master' ]]; then - echo "::set-output name=matrix::[3.7, 3.9, 3.10, 3.11]" + echo "matrix=[3.7, 3.9, 3.10, 3.11]" >> $GITHUB_OUTPUT else - echo "::set-output name=matrix::[3.9]" + echo "matrix=[3.9]" >> $GITHUB_OUTPUT fi build: From 3fe606be962145b3cdb477092adf79ffc040b25b Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 12 Oct 2023 16:13:19 -0500 Subject: [PATCH 11/21] Try to detect master differently --- .github/workflows/ci.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 01a48a12..25f09d8b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -14,7 +14,11 @@ jobs: steps: - id: set-matrix run: | - if [[ "${{ github.ref }}" == 'refs/heads/master' ]]; then + if [[ "${{ github.event_name }}" == 'push' && "${{ github.ref }}" == 'refs/heads/master' ]]; then + # Push to master branch + echo "matrix=[3.7, 3.9, 3.10, 3.11]" >> $GITHUB_OUTPUT + elif [[ "${{ github.event_name }}" == 'pull_request' && "${{ github.event.pull_request.base.ref }}" == 'master' ]]; then + # PR to master branch echo "matrix=[3.7, 3.9, 3.10, 3.11]" >> $GITHUB_OUTPUT else echo "matrix=[3.9]" >> $GITHUB_OUTPUT From 39c7487d2c7e5ea3453232daeb52839c69378368 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 12 Oct 2023 16:19:34 -0500 Subject: [PATCH 12/21] don't treat 3.10 as 3.1 --- .github/workflows/ci.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 25f09d8b..ea78321f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -16,12 +16,12 @@ jobs: run: | if [[ "${{ github.event_name }}" == 'push' && "${{ github.ref }}" == 'refs/heads/master' ]]; then # Push to master branch - echo "matrix=[3.7, 3.9, 3.10, 3.11]" >> $GITHUB_OUTPUT + echo 'matrix=["3.7", "3.9", "3.10", "3.11"]' >> $GITHUB_OUTPUT elif [[ "${{ github.event_name }}" == 'pull_request' && "${{ github.event.pull_request.base.ref }}" == 'master' ]]; then # PR to master branch - echo "matrix=[3.7, 3.9, 3.10, 3.11]" >> $GITHUB_OUTPUT + echo 'matrix=["3.7", "3.9", "3.10", "3.11"]' >> $GITHUB_OUTPUT else - echo "matrix=[3.9]" >> $GITHUB_OUTPUT + echo 'matrix=["3.9"]' >> $GITHUB_OUTPUT fi build: From a762ba76a490a17f511cd90c79a7da9267fae912 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 12 Oct 2023 18:35:51 -0500 Subject: [PATCH 13/21] Maybe supress warning in 3.7 code --- hed/models/base_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 9f437102..58bb94c9 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -137,7 +137,7 @@ def _indexed_dict_from_onsets(onsets): @staticmethod def _filter_by_index_list(original_series, indexed_dict): - new_series = pd.Series(["n/a"] * len(original_series)) + new_series = pd.Series(["n/a"] * len(original_series), dtype=str) for onset, indices in indexed_dict.items(): if indices: From 79f2d83f053fa0a6eb28cf62be3c7a50c360d229 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Thu, 12 Oct 2023 18:45:01 -0500 Subject: [PATCH 14/21] fix test case maybe --- tests/models/test_base_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py index 71e21386..02c7f34a 100644 --- a/tests/models/test_base_input.py +++ b/tests/models/test_base_input.py @@ -304,7 +304,7 @@ def test_complex_onsets(self): {3.5: [0, 1], 4.0: [2], 4.4: [3, 4], -1.0: [5]}) def test_empty_and_single_item_series(self): - self.assertTrue(BaseInput._filter_by_index_list(pd.Series([]), {}).equals(pd.Series([]))) + self.assertTrue(BaseInput._filter_by_index_list(pd.Series([], dtype=str), {}).equals(pd.Series([], dtype=str))) self.assertTrue(BaseInput._filter_by_index_list(pd.Series(["apple"]), {0: [0]}).equals(pd.Series(["apple"]))) def test_two_item_series_with_same_onset(self): From cf5cedc597755af62b6bff90cbd7424bc5514480 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Fri, 13 Oct 2023 19:17:14 -0500 Subject: [PATCH 15/21] Initial codespell commit --- .github/workflows/codespell.yaml | 22 ++++++++++++++++++++++ pyproject.toml | 4 ++++ 2 files changed, 26 insertions(+) create mode 100644 .github/workflows/codespell.yaml diff --git a/.github/workflows/codespell.yaml b/.github/workflows/codespell.yaml new file mode 100644 index 00000000..3deac45a --- /dev/null +++ b/.github/workflows/codespell.yaml @@ -0,0 +1,22 @@ +--- +name: Codespell + +on: + push: + branches: [develop] + pull_request: + branches: [develop] + +permissions: + contents: read + +jobs: + codespell: + name: Check for spelling errors + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Codespell + uses: codespell-project/actions-codespell@v2 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 8bc13baf..f5549033 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,7 @@ [build-system] requires = ["setuptools>=42", "versioneer-518"] build-backend = "setuptools.build_meta" + +[tool.codespell] +skip = '*.git,*.pdf,*.xml,*.mediawiki,*.svg,versioneer.py,venv*,*.tsv,*.yaml,*.yml,*.json,*.rdf,*.jsonld' +ignore-words-list = 'te,parms' From 9afb7b66e62bdd6b96ce9f97503f73a2fe811635 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Mon, 16 Oct 2023 14:23:02 -0500 Subject: [PATCH 16/21] Do spelling fixes, excluse spec tests folder --- hed/models/column_mapper.py | 2 +- hed/models/expression_parser.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index 761ab81a..e0948b9b 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -149,7 +149,7 @@ def _set_sidecar(self, sidecar): sidecar (Sidecar or None): the sidecar to use :raises ValueError: - - A sidecar was prevoiusly set + - A sidecar was previously set """ if self._sidecar: raise ValueError("Trying to set a second sidecar on a column mapper.") diff --git a/hed/models/expression_parser.py b/hed/models/expression_parser.py index ab56aa76..736ff562 100644 --- a/hed/models/expression_parser.py +++ b/hed/models/expression_parser.py @@ -235,7 +235,7 @@ def handle_expr(self, hed_group, exact=False): class ExpressionOr(Expression): def handle_expr(self, hed_group, exact=False): groups1 = self.left.handle_expr(hed_group, exact=exact) - # Don't early out as we need to gather all groups incase tags appear more than once etc + # Don't early out as we need to gather all groups in case tags appear more than once etc groups2 = self.right.handle_expr(hed_group, exact=exact) # todo: optimize this eventually # Filter out duplicates diff --git a/pyproject.toml b/pyproject.toml index f5549033..7ba509cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,5 +3,5 @@ requires = ["setuptools>=42", "versioneer-518"] build-backend = "setuptools.build_meta" [tool.codespell] -skip = '*.git,*.pdf,*.xml,*.mediawiki,*.svg,versioneer.py,venv*,*.tsv,*.yaml,*.yml,*.json,*.rdf,*.jsonld' +skip = '*.git,*.pdf,*.xml,*.mediawiki,*.svg,versioneer.py,venv*,*.tsv,*.yaml,*.yml,*.json,*.rdf,*.jsonld,spec_tests' ignore-words-list = 'te,parms' From 05aec79ecc82743ae8f7f7b72b1d93c12063dde6 Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Mon, 16 Oct 2023 16:36:12 -0500 Subject: [PATCH 17/21] Fix Monique noitced in documentation --- hed/tools/remodeling/operations/remove_columns_op.py | 4 ++-- hed/tools/remodeling/operations/rename_columns_op.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hed/tools/remodeling/operations/remove_columns_op.py b/hed/tools/remodeling/operations/remove_columns_op.py index 267a7039..6901b6ce 100644 --- a/hed/tools/remodeling/operations/remove_columns_op.py +++ b/hed/tools/remodeling/operations/remove_columns_op.py @@ -6,8 +6,8 @@ class RemoveColumnsOp(BaseOp): """ Remove columns from a tabular file. Required remodeling parameters: - - **remove_names** (*list*): The names of the columns to be removed. - - **ignore_missing** (*boolean*): If true, names in remove_names that are not columns in df should be ignored. + - **column_names** (*list*): The names of the columns to be removed. + - **ignore_missing** (*boolean*): If true, names in column_names that are not columns in df should be ignored. """ diff --git a/hed/tools/remodeling/operations/rename_columns_op.py b/hed/tools/remodeling/operations/rename_columns_op.py index 2a2f275a..0a3329b0 100644 --- a/hed/tools/remodeling/operations/rename_columns_op.py +++ b/hed/tools/remodeling/operations/rename_columns_op.py @@ -8,7 +8,7 @@ class RenameColumnsOp (BaseOp): Required remodeling parameters: - **column_mapping** (*dict*): The names of the columns to be removed. - - **ignore_missing** (*bool*): If true, the names in remove_names that are not columns and should be ignored. + - **ignore_missing** (*bool*): If true, the names in column_mapping that are not columns and should be ignored. """ From 04acff8e2ebfbe39b91d68485c711f07b77a050d Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Fri, 20 Oct 2023 18:42:56 -0500 Subject: [PATCH 18/21] Improve toctree in documentation --- docs/source/_templates/autosummary/class.rst | 34 ---------- .../_templates/custom-class-template.rst | 68 +++++++++++-------- .../_templates/custom-module-template.rst | 27 +++++--- docs/source/index.rst | 4 +- 4 files changed, 58 insertions(+), 75 deletions(-) delete mode 100644 docs/source/_templates/autosummary/class.rst diff --git a/docs/source/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst deleted file mode 100644 index 50efd957..00000000 --- a/docs/source/_templates/autosummary/class.rst +++ /dev/null @@ -1,34 +0,0 @@ -{{ fullname | escape | underline}} - -.. currentmodule:: {{ module }} - -.. autoclass:: {{ module }}.{{ objname }} - {% block methods %} - .. automethod:: __init__ - - {% if methods %} - .. rubric:: {{ _('Methods') }} - - .. autosummary:: - {% for item in methods %} - ~{{ name }}.{{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block attributes %} - {% if attributes %} - .. rubric:: {{ _('Attributes') }} - - .. autosummary:: - {% for item in attributes %} - ~{{ name }}.{{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - -.. autoclass:: {{ module }}.{{ objname }} - :members: - :undoc-members: - :show-inheritance: - :inherited-members: diff --git a/docs/source/_templates/custom-class-template.rst b/docs/source/_templates/custom-class-template.rst index 16ebb2f3..490f83ee 100644 --- a/docs/source/_templates/custom-class-template.rst +++ b/docs/source/_templates/custom-class-template.rst @@ -1,32 +1,42 @@ -{{ fullname | escape | underline}} +{{ fullname | escape | underline }} .. currentmodule:: {{ module }} -.. autoclass:: {{ objname }} - :members: - :show-inheritance: - :inherited-members: - - {% block methods %} - .. automethod:: __init__ - - {% if methods %} - .. rubric:: {{ _('Methods') }} - - .. autosummary:: - {% for item in methods %} - ~{{ name }}.{{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block attributes %} - {% if attributes %} - .. rubric:: {{ _('Attributes') }} - - .. autosummary:: - {% for item in attributes %} - ~{{ name }}.{{ item }} - {%- endfor %} - {% endif %} - {% endblock %} \ No newline at end of file +.. autoclass:: {{ module }}.{{ objname }} + :noindex: + +.. rubric:: {{ _('Methods') }} + +.. autosummary:: +{% for item in methods %} + {{ module }}.{{ objname }}.{{ item }} +{%- endfor %} + +.. rubric:: {{ _('Attributes') }} + +.. autosummary:: +{% for item in attributes %} + {{ module }}.{{ objname }}.{{ item }} +{%- endfor %} + +.. toctree:: + :hidden: + +{% for item in methods %} + {{ fullname }}#method-{{ item }} +{%- endfor %} +{% for item in attributes %} + {{ fullname }}#attribute-{{ item }} +{%- endfor %} + +{% for item in methods %} +.. _method-{{ item }}: + +.. automethod:: {{ module }}.{{ objname }}.{{ item }} +{%- endfor %} + +{% for item in attributes %} +.. _attribute-{{ item }}: + +.. autoattribute:: {{ module }}.{{ objname }}.{{ item }} +{%- endfor %} diff --git a/docs/source/_templates/custom-module-template.rst b/docs/source/_templates/custom-module-template.rst index 74078355..7fec9f9c 100644 --- a/docs/source/_templates/custom-module-template.rst +++ b/docs/source/_templates/custom-module-template.rst @@ -1,5 +1,7 @@ {{ fullname | escape | underline}} +.. currentmodule:: {{ module }} + .. automodule:: {{ fullname }} {% block attributes %} @@ -14,17 +16,22 @@ {% endif %} {% endblock %} - {% block functions %} - {% if functions %} - .. rubric:: {{ _('Functions') }} + {% block functions %} + {% if functions %} + .. rubric:: {{ _('Functions') }} - .. autosummary:: - :toctree: - {% for item in functions %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} + .. autosummary:: + {% for item in functions %} + {{ item }} + {% endfor %} + + {% for item in functions %} + .. _{{ item }}: + + .. autofunction:: {{ item }} + {% endfor %} + {% endif %} + {% endblock %} {% block classes %} {% if classes %} diff --git a/docs/source/index.rst b/docs/source/index.rst index 924fb2b7..3b82987d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -14,14 +14,14 @@ Hierarchical Event Descriptor (HED) Python Tools Note: this is a work in progress. More information is coming. .. toctree:: - :maxdepth: 4 + :maxdepth: 6 :caption: Contents: introduction.md user_guide.rst .. toctree:: - :maxdepth: 4 + :maxdepth: 6 :caption: HED Python API: api2.rst From 140676040e33242f3284bc6088c11e650709878a Mon Sep 17 00:00:00 2001 From: IanCa <ianrcallanan@gmail.com> Date: Fri, 20 Oct 2023 19:13:53 -0500 Subject: [PATCH 19/21] Don't draw full titles. Full names are still drawn some other places --- docs/source/_templates/custom-class-template.rst | 2 +- docs/source/_templates/custom-module-template.rst | 2 +- docs/source/api2.rst | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/_templates/custom-class-template.rst b/docs/source/_templates/custom-class-template.rst index 490f83ee..cf03326d 100644 --- a/docs/source/_templates/custom-class-template.rst +++ b/docs/source/_templates/custom-class-template.rst @@ -1,4 +1,4 @@ -{{ fullname | escape | underline }} +{{ fullname.split('.')[-1] | escape | underline }} .. currentmodule:: {{ module }} diff --git a/docs/source/_templates/custom-module-template.rst b/docs/source/_templates/custom-module-template.rst index 7fec9f9c..9e9c8a77 100644 --- a/docs/source/_templates/custom-module-template.rst +++ b/docs/source/_templates/custom-module-template.rst @@ -1,4 +1,4 @@ -{{ fullname | escape | underline}} +{{ fullname.split('.')[-1] | escape | underline}} .. currentmodule:: {{ module }} diff --git a/docs/source/api2.rst b/docs/source/api2.rst index 02d3aeae..1f78f469 100644 --- a/docs/source/api2.rst +++ b/docs/source/api2.rst @@ -1,5 +1,5 @@ -HED API reference (Auto style) -============================== +HED API reference +================= .. currentmodule:: hed From 3291b4c2bfb4b9c9b4d76076c14fffbc834e2525 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Oct 2023 11:40:02 +0000 Subject: [PATCH 20/21] Bump actions/checkout from 3 to 4 Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com> --- .github/workflows/codespell.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codespell.yaml b/.github/workflows/codespell.yaml index 3deac45a..9c6511f6 100644 --- a/.github/workflows/codespell.yaml +++ b/.github/workflows/codespell.yaml @@ -17,6 +17,6 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Codespell uses: codespell-project/actions-codespell@v2 \ No newline at end of file From 9814e4ab58f3914a99fc2dc2ca1711ed690c5ee3 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Mon, 23 Oct 2023 11:01:33 -0500 Subject: [PATCH 21/21] Updated the remodeling backup arguments --- hed/tools/remodeling/backup_manager.py | 7 +-- hed/tools/remodeling/cli/run_remodel.py | 8 ++- .../remodeling/cli/run_remodel_backup.py | 18 +++---- .../remodeling/cli/run_remodel_restore.py | 12 ++--- .../tools/remodeling/cli/test_run_remodel.py | 14 ++--- .../remodeling/cli/test_run_remodel_backup.py | 51 ++++++++++--------- .../cli/test_run_remodel_restore.py | 26 +++++----- tests/tools/remodeling/test_backup_manager.py | 2 +- 8 files changed, 67 insertions(+), 71 deletions(-) diff --git a/hed/tools/remodeling/backup_manager.py b/hed/tools/remodeling/backup_manager.py index ac18f2f0..75c6f4f1 100644 --- a/hed/tools/remodeling/backup_manager.py +++ b/hed/tools/remodeling/backup_manager.py @@ -10,7 +10,7 @@ class BackupManager: DEFAULT_BACKUP_NAME = 'default_back' - RELATIVE_BACKUP_LOCATION = 'derivatives/remodel' + RELATIVE_BACKUP_LOCATION = './derivatives/remodel/backups' BACKUP_DICTIONARY = 'backup_lock.json' BACKUP_ROOT = 'backup_root' @@ -24,14 +24,15 @@ def __init__(self, data_root, backups_root=None): :raises HedFileError: - If the data_root does not correspond to a real directory. + Notes: The backup_root will have remodeling/backups appended. """ if not os.path.isdir(data_root): raise HedFileError('NonExistentData', f"{data_root} is not an existing directory", "") self.data_root = data_root if backups_root: - self.backups_path = os.path.join(backups_root, 'backups') + self.backups_path = backups_root else: - self.backups_path = os.path.join(data_root, self.RELATIVE_BACKUP_LOCATION, 'backups') + self.backups_path = os.path.join(data_root, self.RELATIVE_BACKUP_LOCATION) self.backups_path = os.path.realpath(self.backups_path) os.makedirs(self.backups_path, exist_ok=True) self.backups_dict = self._get_backups() diff --git a/hed/tools/remodeling/cli/run_remodel.py b/hed/tools/remodeling/cli/run_remodel.py index 8aefdb15..32af02ea 100644 --- a/hed/tools/remodeling/cli/run_remodel.py +++ b/hed/tools/remodeling/cli/run_remodel.py @@ -20,6 +20,10 @@ def get_parser(): parser = argparse.ArgumentParser(description="Converts event files based on a json file specifying operations.") parser.add_argument("data_dir", help="Full path of dataset root directory.") parser.add_argument("model_path", help="Full path of the file with remodeling instructions.") + parser.add_argument("-bd", "--backup_dir", default="", dest="backup_dir", + help="Directory for the backup that is being created") + parser.add_argument("-bn", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", + help="Name of the default backup for remodeling") parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids", help="If present, the dataset is in BIDS format with sidecars. HED analysis is available.") parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions", @@ -31,8 +35,8 @@ def get_parser(): help="Controls individual file summaries ('none', 'separate', 'consolidated')") parser.add_argument("-j", "--json-sidecar", dest="json_sidecar", nargs="?", help="Optional path to JSON sidecar with HED information") - parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", - help="Name of the default backup for remodeling") +# parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", +# help="Name of the default backup for remodeling") parser.add_argument("-nb", "--no-backup", action='store_true', dest="no_backup", help="If present, the operations are run directly on the files with no backup.") parser.add_argument("-ns", "--no-summaries", action='store_true', dest="no_summaries", diff --git a/hed/tools/remodeling/cli/run_remodel_backup.py b/hed/tools/remodeling/cli/run_remodel_backup.py index 6d78465d..3754a15d 100644 --- a/hed/tools/remodeling/cli/run_remodel_backup.py +++ b/hed/tools/remodeling/cli/run_remodel_backup.py @@ -1,5 +1,6 @@ """ Command-line program for creating a backup. """ +import os import argparse from hed.errors.exceptions import HedFileError from hed.tools.util.io_util import get_file_list, get_filtered_by_element @@ -15,21 +16,18 @@ def get_parser(): """ parser = argparse.ArgumentParser(description="Creates a backup for the remodeling process.") parser.add_argument("data_dir", help="Full path of dataset root directory.") + parser.add_argument("-bd", "--backup_dir", default="", dest="backup_dir", + help="Directory for the backup that is being created") + parser.add_argument("-bn", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", + help="Name of the default backup for remodeling") parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions", help="File extensions to allow in locating files. A * indicates all files allowed.") parser.add_argument("-f", "--file-suffix", dest="file_suffix", nargs="*", default=['events'], help="Filename suffix of files to be backed up. A * indicates all files allowed.") - parser.add_argument("-n", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", - help="Name of the default backup for remodeling") - parser.add_argument("-p", "--path-work", default="", dest="path_work", - help="The root path for remodeling work if given, " + - "otherwise [data_root]/derivatives/remodel is used.") + parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[], help="The name of the task.") parser.add_argument("-v", "--verbose", action='store_true', help="If present, output informative messages as computation progresses.") - parser.add_argument("-w", "--work-dir", default="", dest="work_dir", - help="If given, is the path to directory for saving, " + - "otherwise [data_root]derivatives/remodel is used.") parser.add_argument("-x", "--exclude-dirs", nargs="*", default=['derivatives'], dest="exclude_dirs", help="Directories names to exclude from search for files. " + "If omitted, no directories except the backup directory will be excluded." + @@ -60,8 +58,8 @@ def main(arg_list=None): exclude_dirs=exclude_dirs) if args.task_names: file_list = get_filtered_by_element(file_list, args.task_names) - if args.work_dir: - backups_root = args.work_dir + if args.backup_dir: + backups_root = args.backup_dir else: backups_root = None backup_man = BackupManager(args.data_dir, backups_root=backups_root) diff --git a/hed/tools/remodeling/cli/run_remodel_restore.py b/hed/tools/remodeling/cli/run_remodel_restore.py index 960bd091..72ba0c3c 100644 --- a/hed/tools/remodeling/cli/run_remodel_restore.py +++ b/hed/tools/remodeling/cli/run_remodel_restore.py @@ -14,15 +14,13 @@ def get_parser(): """ parser = argparse.ArgumentParser(description="Restores the backup files for the original data.") parser.add_argument("data_dir", help="Full path of dataset root directory.") - parser.add_argument("-n", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", + parser.add_argument("-bd", "--backup_dir", default="", dest="backup_dir", + help="Directory for the backup that is being created") + parser.add_argument("-bn", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name", help="Name of the default backup for remodeling") - parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[], help="The names of the task.") parser.add_argument("-v", "--verbose", action='store_true', help="If present, output informative messages as computation progresses.") - parser.add_argument("-w", "--work_dir", default="", dest="work_dir", - help="The root path for remodeling work if given, " + - "otherwise [data_root]/derivatives/remodel is used.") return parser @@ -39,8 +37,8 @@ def main(arg_list=None): """ parser = get_parser() args = parser.parse_args(arg_list) - if args.work_dir: - backups_root = args.work_dir + if args.backup_dir: + backups_root = args.backup_dir else: backups_root = None backup_man = BackupManager(args.data_dir, backups_root=backups_root) diff --git a/tests/tools/remodeling/cli/test_run_remodel.py b/tests/tools/remodeling/cli/test_run_remodel.py index a63b5be2..1d2f4b91 100644 --- a/tests/tools/remodeling/cli/test_run_remodel.py +++ b/tests/tools/remodeling/cli/test_run_remodel.py @@ -51,7 +51,7 @@ def tearDownClass(cls): def test_parse_arguments(self): # Test no verbose - arg_list1 = [self.data_root, self.model_path, '-x', 'derivatives', '-n', 'back1'] + arg_list1 = [self.data_root, self.model_path, '-x', 'derivatives', '-bn', 'back1'] with patch('sys.stdout', new=io.StringIO()) as fp1: args1, operations1 = parse_arguments(arg_list1) self.assertFalse(fp1.getvalue()) @@ -60,7 +60,7 @@ def test_parse_arguments(self): self.assertEqual(args1.file_suffix, 'events') # Test * for extensions and suffix as well as verbose - arg_list2 = [self.data_root, self.model_path, '-x', 'derivatives', '-n', 'back1', '-f', '*', '-e', '*', '-v'] + arg_list2 = [self.data_root, self.model_path, '-x', 'derivatives', '-bn', 'back1', '-f', '*', '-e', '*', '-v'] with patch('sys.stdout', new=io.StringIO()) as fp2: args2, operations2 = parse_arguments(arg_list2) self.assertTrue(fp2.getvalue()) @@ -164,13 +164,13 @@ def test_main_bids_no_sidecar_with_hed_task(self): def test_main_errors(self): # Test bad data directory - arg_list = ['junk/junk', self.model_path, '-x', 'derivatives', '-n', 'back1'] + arg_list = ['junk/junk', self.model_path, '-x', 'derivatives', '-bn', 'back1'] with self.assertRaises(HedFileError) as context: main(arg_list=arg_list) self.assertEqual(context.exception.args[0], "DataDirectoryDoesNotExist") # Test no backup - arg_list = [self.data_root, self.model_path, '-x', 'derivatives', '-n', 'back1'] + arg_list = [self.data_root, self.model_path, '-x', 'derivatives', '-bn', 'back1'] with self.assertRaises(HedFileError) as context: main(arg_list=arg_list) self.assertEqual(context.exception.args[0], "BackupDoesNotExist") @@ -193,12 +193,6 @@ def test_run_bids_ops_verbose(self): main(arg_list) self.assertFalse(fp.getvalue()) - # def test_temp(self): - # data_root = "g:/ds002718OpenNeuro" - # model_path = 'G:/wh_excerpt_rmdl.json' - # arg_list = [data_root, model_path, '-x', 'derivatives', 'code', 'stimuli', '-b', '-n', ''] - # main(arg_list) - if __name__ == '__main__': unittest.main() diff --git a/tests/tools/remodeling/cli/test_run_remodel_backup.py b/tests/tools/remodeling/cli/test_run_remodel_backup.py index 65f2391b..2dbf2770 100644 --- a/tests/tools/remodeling/cli/test_run_remodel_backup.py +++ b/tests/tools/remodeling/cli/test_run_remodel_backup.py @@ -15,7 +15,9 @@ class Test(unittest.TestCase): def setUpClass(cls): file_list = ['top_level.tsv', 'sub1/sub1_events.tsv', 'sub2/sub2_events.tsv', 'sub2/sub2_next_events.tsv'] # cls.file_list = file_list - cls.extract_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../data/remodel_tests') + extract_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../data/remodel_tests') + cls.alt_path = os.path.realpath(os.path.join(extract_path, 'temp')) + cls.extract_path = extract_path test_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../data/remodel_tests/test_root') cls.test_root = test_root cls.test_paths = [os.path.join(test_root, file) for file in file_list] @@ -34,8 +36,12 @@ def setUp(self): zip_ref.extractall(self.extract_path) def tearDown(self): - shutil.rmtree(self.test_root) - shutil.rmtree(self.data_root) + if os.path.exists(self.test_root): + shutil.rmtree(self.test_root) + if os.path.exists(self.data_root): + shutil.rmtree(self.data_root) + if os.path.exists(self.alt_path): + shutil.rmtree(self.alt_path) @classmethod def tearDownClass(cls): @@ -43,11 +49,11 @@ def tearDownClass(cls): def test_main_events(self): self.assertFalse(os.path.exists(self.derv_path), 'backup directory does not exist before creation') - arg_list = [self.test_root, '-n', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', + arg_list = [self.test_root, '-bn', BackupManager.DEFAULT_BACKUP_NAME, '-bd', self.derv_path, '-x', 'derivatives', '-f', 'events', '-e', '.tsv'] main(arg_list) self.assertTrue(os.path.exists(self.derv_path), 'backup directory exists before creation') - json_path = os.path.realpath(os.path.join(self.derv_path, 'backups', BackupManager.DEFAULT_BACKUP_NAME, + json_path = os.path.realpath(os.path.join(self.derv_path, BackupManager.DEFAULT_BACKUP_NAME, BackupManager.BACKUP_DICTIONARY)) with open(json_path, 'r') as fp: key_dict = json.load(fp) @@ -56,19 +62,18 @@ def test_main_events(self): self.assertEqual(len(file_list), 3, "The backup of events.tsv has the right number of files") def test_main_all(self): - arg_list = [self.test_root, '-n', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', - '-f', '*', '-e', '*'] + arg_list = [self.test_root, '-bn', BackupManager.DEFAULT_BACKUP_NAME, '-bd', self.derv_path, + '-x', 'derivatives', '-f', '*', '-e', '*'] self.assertFalse(os.path.exists(self.derv_path), 'backup directory does not exist before creation') main(arg_list) self.assertTrue(os.path.exists(self.derv_path), 'backup directory exists before creation') - json_path = os.path.realpath(os.path.join(self.derv_path, 'backups', BackupManager.DEFAULT_BACKUP_NAME, + json_path = os.path.realpath(os.path.join(self.derv_path, BackupManager.DEFAULT_BACKUP_NAME, BackupManager.BACKUP_DICTIONARY)) with open(json_path, 'r') as fp: key_dict = json.load(fp) self.assertEqual(len(key_dict), 4, "The backup of events.tsv does not include top_level.tsv") - back_path = os.path.realpath(os.path.join(self.derv_path, 'backups', - BackupManager.DEFAULT_BACKUP_NAME, 'backup_root')) + back_path = os.path.realpath(os.path.join(self.derv_path, BackupManager.DEFAULT_BACKUP_NAME, 'backup_root')) file_list1 = get_file_list(back_path) self.assertIsInstance(file_list1, list) self.assertEqual(len(file_list1), 4) @@ -78,11 +83,11 @@ def test_main_task(self): self.assertTrue(os.path.exists(der_path)) shutil.rmtree(der_path) self.assertFalse(os.path.exists(der_path)) - arg_list = [self.data_root, '-n', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', + arg_list = [self.data_root, '-bn', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', '-f', 'events', '-e', '.tsv', '-t', 'FacePerception'] main(arg_list) self.assertTrue(os.path.exists(der_path)) - back_path = os.path.realpath(os.path.join(self.data_root, BackupManager.RELATIVE_BACKUP_LOCATION, 'backups', + back_path = os.path.realpath(os.path.join(self.data_root, BackupManager.RELATIVE_BACKUP_LOCATION, BackupManager.DEFAULT_BACKUP_NAME, 'backup_root')) self.assertTrue(os.path.exists(back_path)) backed_files = get_file_list(back_path) @@ -93,37 +98,33 @@ def test_main_bad_task(self): self.assertTrue(os.path.exists(der_path)) shutil.rmtree(der_path) self.assertFalse(os.path.exists(der_path)) - arg_list = [self.data_root, '-n', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', + arg_list = [self.data_root, '-bn', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', '-f', 'events', '-e', '.tsv', '-t', 'Baloney'] main(arg_list) self.assertTrue(os.path.exists(der_path)) - back_path = os.path.realpath(os.path.join(self.data_root, BackupManager.RELATIVE_BACKUP_LOCATION, 'backups', + back_path = os.path.realpath(os.path.join(self.data_root, BackupManager.RELATIVE_BACKUP_LOCATION, BackupManager.DEFAULT_BACKUP_NAME, 'backup_root')) self.assertTrue(os.path.exists(back_path)) backed_files = get_file_list(back_path) self.assertEqual(len(backed_files), 0) def test_alt_loc(self): - alt_path = os.path.realpath(os.path.join(self.extract_path, 'temp')) - if os.path.exists(alt_path): - shutil.rmtree(alt_path) - self.assertFalse(os.path.exists(alt_path)) - arg_list = [self.data_root, '-n', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', '-w', alt_path, + if os.path.exists(self.alt_path): + shutil.rmtree(self.alt_path) + self.assertFalse(os.path.exists(self.alt_path)) + arg_list = [self.data_root, '-bn', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', '-bd', self.alt_path, '-f', 'events', '-e', '.tsv', ] main(arg_list) - self.assertTrue(os.path.exists(alt_path)) - back_path = os.path.realpath(os.path.join(alt_path, 'backups', 'default_back', 'backup_root')) - self.assertTrue(os.path.exists(back_path)) + self.assertTrue(os.path.exists(self.alt_path)) + back_path = os.path.realpath(os.path.join(self.alt_path, 'default_back/backup_root')) self.assertTrue(os.path.exists(back_path)) backed_files = get_file_list(back_path) self.assertEqual(len(backed_files), 6) - if os.path.exists(alt_path): - shutil.rmtree(alt_path) def test_main_backup_exists(self): der_path = os.path.realpath(os.path.join(self.data_root, 'derivatives')) self.assertTrue(os.path.exists(der_path)) - arg_list = [self.data_root, '-n', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', + arg_list = [self.data_root, '-bn', BackupManager.DEFAULT_BACKUP_NAME, '-x', 'derivatives', '-f', 'events', '-e', '.tsv', '-t', 'Baloney'] with self.assertRaises(HedFileError) as context: main(arg_list) diff --git a/tests/tools/remodeling/cli/test_run_remodel_restore.py b/tests/tools/remodeling/cli/test_run_remodel_restore.py index c3645f3a..af75f1d2 100644 --- a/tests/tools/remodeling/cli/test_run_remodel_restore.py +++ b/tests/tools/remodeling/cli/test_run_remodel_restore.py @@ -17,15 +17,19 @@ def setUpClass(cls): '../../../data/remodel_tests/test_root_back1') cls.test_zip_back1 = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../data/remodel_tests/test_root_back1.zip') - - cls.extract_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../data/remodel_tests') + extract_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../../data/remodel_tests') + cls.alt_path = os.path.realpath(os.path.join(extract_path, 'temp')) + cls.extract_path = extract_path def setUp(self): with zipfile.ZipFile(self.test_zip_back1, 'r') as zip_ref: zip_ref.extractall(self.extract_path) def tearDown(self): - shutil.rmtree(self.test_root_back1) + if os.path.exists(self.test_root_back1): + shutil.rmtree(self.test_root_back1) + if os.path.exists(self.alt_path): + shutil.rmtree(self.alt_path) def test_main_restore(self): files1 = get_file_list(self.test_root_back1, exclude_dirs=['derivatives']) @@ -35,7 +39,7 @@ def test_main_restore(self): os.remove(os.path.realpath(os.path.join(self.test_root_back1, 'top_level.tsv'))) files2 = get_file_list(self.test_root_back1, exclude_dirs=['derivatives']) self.assertFalse(files2, "run_restore starts with the right number of files.") - arg_list = [self.test_root_back1, '-n', 'back1'] + arg_list = [self.test_root_back1, '-bn', 'back1'] main(arg_list) files3 = get_file_list(self.test_root_back1, exclude_dirs=['derivatives']) self.assertEqual(len(files3), len(files1), "run_restore restores all the files after") @@ -48,11 +52,10 @@ def test_no_backup(self): self.assertEqual(context.exception.args[0], "BackupDoesNotExist") def test_restore_alt_loc(self): - alt_path = os.path.realpath(os.path.join(self.extract_path, 'temp')) - if os.path.exists(alt_path): - shutil.rmtree(alt_path) - self.assertFalse(os.path.exists(alt_path)) - arg_list = [self.test_root_back1, '-n', 'back1', '-x', 'derivatives', '-w', alt_path, + if os.path.exists(self.alt_path): + shutil.rmtree(self.alt_path) + self.assertFalse(os.path.exists(self.alt_path)) + arg_list = [self.test_root_back1, '-bn', 'back1', '-x', 'derivatives', '-bd', self.alt_path, '-f', 'events', '-e', '.tsv'] back_main(arg_list) files1 = get_file_list(self.test_root_back1, exclude_dirs=['derivatives']) @@ -62,14 +65,11 @@ def test_restore_alt_loc(self): os.remove(os.path.realpath(os.path.join(self.test_root_back1, 'top_level.tsv'))) files2 = get_file_list(self.test_root_back1, exclude_dirs=['derivatives']) self.assertFalse(files2, "run_restore starts with the right number of files.") - arg_list = [self.test_root_back1, '-n', 'back1', '-w', alt_path] + arg_list = [self.test_root_back1, '-bn', 'back1', '-bd', self.alt_path] main(arg_list) files3 = get_file_list(self.test_root_back1, exclude_dirs=['derivatives']) self.assertEqual(len(files3)+1, len(files1), "run_restore restores all the files after") - if os.path.exists(alt_path): - shutil.rmtree(alt_path) - if __name__ == '__main__': unittest.main() diff --git a/tests/tools/remodeling/test_backup_manager.py b/tests/tools/remodeling/test_backup_manager.py index 53c297cf..1d9a50e5 100644 --- a/tests/tools/remodeling/test_backup_manager.py +++ b/tests/tools/remodeling/test_backup_manager.py @@ -32,7 +32,7 @@ def setUpClass(cls): test_root_bad = os.path.realpath(os.path.join(os.path.dirname(__file__), '../../data/remodel_tests/test_root_bad')) cls.test_root_bad = test_root_bad - cls.test_root_bad_backups = os.path.join(test_root_bad, BackupManager.RELATIVE_BACKUP_LOCATION, 'backups') + cls.test_root_bad_backups = os.path.join(test_root_bad, BackupManager.RELATIVE_BACKUP_LOCATION) cls.test_paths_bad = [os.path.join(test_root_bad, file) for file in file_list] cls.test_zip_bad = os.path.realpath(os.path.join(os.path.dirname(__file__), '../../data/remodel_tests/test_root_bad.zip'))