From c5f03860d49753f0fb394cf141e64bce5f789726 Mon Sep 17 00:00:00 2001 From: IanCa Date: Thu, 21 Sep 2023 12:39:20 -0500 Subject: [PATCH] Add a string based search first pass Update query parser from [[]] to {} notation and clean up some Minor bug fixes in query parser --- hed/models/base_input.py | 2 +- hed/models/basic_search.py | 237 +++++++++++++++++++ hed/models/df_util.py | 26 ++ hed/models/expression_parser.py | 159 +++++++------ tests/models/test_base_input.py | 27 ++- tests/models/test_basic_search.py | 313 +++++++++++++++++++++++++ tests/models/test_expression_parser.py | 118 +++++++--- tests/schema/test_hed_schema_io.py | 4 + 8 files changed, 756 insertions(+), 130 deletions(-) create mode 100644 hed/models/basic_search.py create mode 100644 tests/models/test_basic_search.py diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 12e2d889..9f437102 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -137,7 +137,7 @@ def _indexed_dict_from_onsets(onsets): @staticmethod def _filter_by_index_list(original_series, indexed_dict): - new_series = ["n/a"] * len(original_series) # Initialize new_series with "n/a" + new_series = pd.Series(["n/a"] * len(original_series)) for onset, indices in indexed_dict.items(): if indices: diff --git a/hed/models/basic_search.py b/hed/models/basic_search.py new file mode 100644 index 00000000..ae47b71e --- /dev/null +++ b/hed/models/basic_search.py @@ -0,0 +1,237 @@ +import re +from itertools import combinations, product +from collections import defaultdict +import pandas as pd + + +def find_matching(series, search_string, regex=False): + """ Finds lines in the series that match the search string and returns a mask. + + Syntax Rules: + - '@': Prefixing a term in the search string means the object must appear anywhere within a line. + - Parentheses: Elements within parentheses must appear in the line with the same level of nesting. + eg: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't + start in the same group. + - "LongFormTag*": A * will match any remaining word(anything but a comma or parenthesis) + - An individual term can be arbitrary regex, but it is limited to single continuous words. + + Notes: + - The format of the series should match the format of the search string, whether it's in short or long form. + - To enable support for matching parent tags, ensure that both the series and search string are in long form. + + Args: + series (pd.Series): A Pandas Series object containing the lines to be searched. + search_string (str): The string to search for in each line of the series. + regex (bool): By default, translate any * wildcard characters to .*? regex + If True, do no translation and pass the words as is. Due to how it's setup, you must not include + the following characters: (), + + Returns: + mask (pd.Series): A Boolean mask Series of the same length as the input series. + The mask has `True` for lines that match the search string and `False` otherwise. + """ + if not regex: + # Replace *'s with a reasonable value for people who don't know regex + search_string = re.sub(r'(?= 3.9 - # negated_groups = [search_result(group, []) for group in hed_group.get_all_groups() if group not in groups] + # negated_groups = [SearchResult(group, []) for group in hed_group.get_all_groups() if group not in groups] # Python 3.7/8 compatible version. - negated_groups = [search_result(group, []) for group in hed_group.get_all_groups() + negated_groups = [SearchResult(group, []) for group in hed_group.get_all_groups() if not any(group is found_group.group for found_group in found_groups)] return negated_groups -class ExpressionContainingGroup(Expression): - def handle_expr(self, hed_group, exact=False): - result = self.right.handle_expr(hed_group, exact=True) - found_groups = result - if result: - found_parent_groups = [] - for group in found_groups: - if not group.group.is_group: - continue - if group.group._parent: - found_parent_groups.append(search_result(group.group._parent, group.group)) - - if found_parent_groups: - return found_parent_groups - - return [] - - class ExpressionDescendantGroup(Expression): def handle_expr(self, hed_group, exact=False): found_groups = self.right.handle_expr(hed_group) - found_parent_groups = [] - if found_groups: - for group in found_groups: - if not group.group.is_group: - continue - if group.group._parent: - found_parent_groups.append(search_result(group.group._parent, group.group)) - - if found_parent_groups: - return found_parent_groups - return [] + found_parent_groups = self._get_parent_groups(found_groups) + return found_parent_groups class ExpressionExactMatch(Expression): + def __init__(self, token, left=None, right=None): + super().__init__(token, left, right) + self.optional = "any" + + def _filter_exact_matches(self, search_results): + filtered_list = [] + for group in search_results: + if len(group.group.children) == len(group.tags): + filtered_list.append(group) + + return filtered_list + def handle_expr(self, hed_group, exact=False): found_groups = self.right.handle_expr(hed_group, exact=True) - if found_groups: - return_list = [] - for group in found_groups: - if len(group.group.children) == len(group.tags): - return_list.append(group) + if self.optional == "any": + return self._get_parent_groups(found_groups) - if return_list: - return return_list + filtered_list = self._filter_exact_matches(found_groups) + if filtered_list: + return self._get_parent_groups(filtered_list) # Basically if we don't have an exact match above, do the more complex matching including optional if self.left: optional_groups = self.left.handle_expr(hed_group, exact=True) found_groups = ExpressionAnd.merge_groups(found_groups, optional_groups) - if found_groups: - return_list = [] - for group in found_groups: - if len(group.group.children) == len(group.tags): - return_list.append(group) - - if return_list: - return return_list + filtered_list = self._filter_exact_matches(found_groups) + if filtered_list: + return self._get_parent_groups(filtered_list) return [] @@ -337,7 +322,6 @@ class QueryParser: def __init__(self, expression_string): """Compiles a QueryParser for a particular expression, so it can be used to search hed strings. - Basic Input Examples: 'Event' - Finds any strings with Event, or a descendent tag of Event such as Sensory-event @@ -354,11 +338,15 @@ def __init__(self, expression_string): '[Event and Action]' - Find a group that contains both Event and Action(at any level) - '[[Event and Action]]' - Find a group with Event And Action at the same level. + '{Event and Action}' - Find a group with Event And Action at the same level. + + '{Event and Action:}' - Find a group with Event And Action at the same level, and nothing else + + '{Event and Action:Agent}' - Find a group with Event And Action at the same level, and optionally an Agent tag. Practical Complex Example: - [[{(Onset or Offset), (Def or [[Def-expand]]): ???}]] - A group with an onset tag, + {(Onset or Offset), (Def or {Def-expand}): ???} - A group with an onset tag, a def tag or def-expand group, and an optional wildcard group Parameters: @@ -392,15 +380,22 @@ def current_token(self): def _handle_and_op(self): expr = self._handle_negation() - next_token = self._next_token_is([Token.And, Token.Or]) + next_token = self._next_token_is([Token.And]) while next_token: right = self._handle_negation() if next_token.kind == Token.And: expr = ExpressionAnd(next_token, expr, right) - elif next_token.kind == Token.Or: - expr = ExpressionOr(next_token, expr, right) - next_token = self._next_token_is([Token.And, Token.Or]) + next_token = self._next_token_is([Token.And]) + return expr + def _handle_or_op(self): + expr = self._handle_and_op() # Note: calling _handle_and_op here + next_token = self._next_token_is([Token.Or]) + while next_token: + right = self._handle_and_op() # Note: calling _handle_and_op here + if next_token.kind == Token.Or: + expr = ExpressionOr(next_token, expr, right) + next_token = self._next_token_is([Token.Or]) return expr def _handle_negation(self): @@ -417,33 +412,35 @@ def _handle_negation(self): def _handle_grouping_op(self): next_token = self._next_token_is( - [Token.ContainingGroup, Token.LogicalGroup, Token.DescendantGroup, Token.ExactMatch]) - if next_token == Token.ContainingGroup: - interior = self._handle_and_op() - expr = ExpressionContainingGroup(next_token, right=interior) - next_token = self._next_token_is([Token.ContainingGroupEnd]) - if next_token != Token.ContainingGroupEnd: - raise ValueError("Parse error: Missing closing square brackets") - # Can we move this to the and_or level? or does that break everything...? - elif next_token == Token.LogicalGroup: - expr = self._handle_and_op() + [Token.LogicalGroup, Token.DescendantGroup, Token.ExactMatch]) + if next_token == Token.LogicalGroup: + expr = self._handle_or_op() next_token = self._next_token_is([Token.LogicalGroupEnd]) if next_token != Token.LogicalGroupEnd: raise ValueError("Parse error: Missing closing paren") elif next_token == Token.DescendantGroup: - interior = self._handle_and_op() + interior = self._handle_or_op() expr = ExpressionDescendantGroup(next_token, right=interior) next_token = self._next_token_is([Token.DescendantGroupEnd]) if next_token != Token.DescendantGroupEnd: raise ValueError("Parse error: Missing closing square bracket") elif next_token == Token.ExactMatch: - interior = self._handle_and_op() + interior = self._handle_or_op() expr = ExpressionExactMatch(next_token, right=interior) next_token = self._next_token_is([Token.ExactMatchEnd, Token.ExactMatchOptional]) if next_token == Token.ExactMatchOptional: - optional_portion = self._handle_and_op() - expr.left = optional_portion + # We have an optional portion - this needs to now be an exact match + expr.optional = "none" next_token = self._next_token_is([Token.ExactMatchEnd]) + if next_token != Token.ExactMatchEnd: + optional_portion = self._handle_or_op() + expr.left = optional_portion + next_token = self._next_token_is([Token.ExactMatchEnd]) + if "~" in str(expr): + raise ValueError("Cannot use negation in exact matching groups," + " as it's not clear what is being matched.\n" + "{thing and ~(expression)} is allowed.") + if next_token is None: raise ValueError("Parse error: Missing closing curly bracket") else: @@ -452,13 +449,15 @@ def _handle_grouping_op(self): expr = ExpressionWildcardNew(next_token) elif next_token: expr = Expression(next_token) + else: + expr = None return expr def _parse(self, expression_string): self.tokens = self._tokenize(expression_string) - expr = self._handle_and_op() + expr = self._handle_or_op() if self.at_token + 1 != len(self.tokens): raise ValueError("Parse error in search string") diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py index f5b381eb..71e21386 100644 --- a/tests/models/test_base_input.py +++ b/tests/models/test_base_input.py @@ -304,25 +304,30 @@ def test_complex_onsets(self): {3.5: [0, 1], 4.0: [2], 4.4: [3, 4], -1.0: [5]}) def test_empty_and_single_item_series(self): - self.assertEqual(BaseInput._filter_by_index_list([], {}), []) - self.assertEqual(BaseInput._filter_by_index_list(["apple"], {0: [0]}), ["apple"]) + self.assertTrue(BaseInput._filter_by_index_list(pd.Series([]), {}).equals(pd.Series([]))) + self.assertTrue(BaseInput._filter_by_index_list(pd.Series(["apple"]), {0: [0]}).equals(pd.Series(["apple"]))) def test_two_item_series_with_same_onset(self): - self.assertEqual(BaseInput._filter_by_index_list(["apple", "orange"], {0: [0, 1]}), ["apple,orange", "n/a"]) + input_series = pd.Series(["apple", "orange"]) + expected_series = pd.Series(["apple,orange", "n/a"]) + self.assertTrue(BaseInput._filter_by_index_list(input_series, {0: [0, 1]}).equals(expected_series)) def test_multiple_item_series(self): - original = ["apple", "orange", "banana", "mango"] + input_series = pd.Series(["apple", "orange", "banana", "mango"]) indexed_dict = {0: [0, 1], 1: [2], 2: [3]} - self.assertEqual(BaseInput._filter_by_index_list(original, indexed_dict), ["apple,orange", "n/a", "banana", "mango"]) + expected_series = pd.Series(["apple,orange", "n/a", "banana", "mango"]) + self.assertTrue(BaseInput._filter_by_index_list(input_series, indexed_dict).equals(expected_series)) def test_complex_scenarios(self): # Test with negative, zero and positive onsets - original = ["negative", "zero", "positive"] + original = pd.Series(["negative", "zero", "positive"]) indexed_dict = {-1: [0], 0: [1], 1: [2]} - self.assertEqual(BaseInput._filter_by_index_list(original, indexed_dict), ["negative", "zero", "positive"]) + expected_series1 = pd.Series(["negative", "zero", "positive"]) + self.assertTrue(BaseInput._filter_by_index_list(original, indexed_dict).equals(expected_series1)) # Test with more complex indexed_dict - original = ["apple", "orange", "banana", "mango", "grape"] - indexed_dict = {0: [0, 1], 1: [2], 2: [3, 4]} - self.assertEqual(BaseInput._filter_by_index_list(original, indexed_dict), - ["apple,orange", "n/a", "banana", "mango,grape", "n/a"]) + original2 = ["apple", "orange", "banana", "mango", "grape"] + indexed_dict2= {0: [0, 1], 1: [2], 2: [3, 4]} + expected_series2 = pd.Series(["apple,orange", "n/a", "banana", "mango,grape", "n/a"]) + self.assertTrue(BaseInput._filter_by_index_list(original2, indexed_dict2).equals(expected_series2)) + diff --git a/tests/models/test_basic_search.py b/tests/models/test_basic_search.py new file mode 100644 index 00000000..0a942b93 --- /dev/null +++ b/tests/models/test_basic_search.py @@ -0,0 +1,313 @@ +import unittest +import pandas as pd +from hed import load_schema_version + +import os +from hed import TabularInput +from hed.models import df_util, basic_search +from hed.models.basic_search import find_words, check_parentheses, reverse_and_flip_parentheses, \ + construct_delimiter_map, verify_search_delimiters, find_matching +import numpy as np + + +class TestNewSearch(unittest.TestCase): + @classmethod + def setUpClass(cls): + bids_root_path = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), + '../data/bids_tests/eeg_ds003645s_hed')) + sidecar1_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) + cls.events_path = os.path.realpath( + os.path.join(bids_root_path, 'sub-002/eeg/sub-002_task-FacePerception_run-1_events.tsv')) + cls.base_input = TabularInput(cls.events_path, sidecar1_path) + cls.schema = load_schema_version() + cls.df = cls.base_input.series_filtered + + def test_find_matching_results(self): + result1 = basic_search.find_matching(self.df, "(Face, Item-interval/1)") + result2 = basic_search.find_matching(self.df, "(Face, Item-interval/1*)") + + # Add assertions + self.assertTrue(np.sum(result1) > 0, "result1 should have some true values") + self.assertTrue(np.sum(result2) > 0, "result2 should have some true values") + self.assertTrue(np.sum(result1) < np.sum(result2), "result1 should have fewer true values than result2") + + +class TestFindWords(unittest.TestCase): + def test_basic(self): + search_string = "@global (local1, local2)" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, ['global']) + self.assertEqual(specific_words, ['local1', 'local2']) + + def test_no_anywhere_words(self): + search_string = "(local1, local2)" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, []) + self.assertEqual(specific_words, ['local1', 'local2']) + + def test_no_specific_words(self): + search_string = "@global1, @global2" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, ['global1', 'global2']) + self.assertEqual(specific_words, []) + + def test_empty_string(self): + search_string = "" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, []) + self.assertEqual(specific_words, []) + + def test_mixed_words(self): + search_string = "@global (local1, local2), @another_global" + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, ['global', 'another_global']) + self.assertEqual(specific_words, ['local1', 'local2']) + + def test_whitespace(self): + search_string = " @Global , ( local1 , local2 ) " + anywhere_words, specific_words = find_words(search_string) + self.assertEqual(anywhere_words, ['Global']) + self.assertEqual(specific_words, ['local1', 'local2']) + + +class TestCheckParentheses(unittest.TestCase): + def test_balanced_parentheses(self): + self.assertEqual(check_parentheses("(())"), "") + self.assertEqual(check_parentheses("(someText())"), "") + self.assertEqual(check_parentheses("((some)text())"), "") + self.assertEqual(check_parentheses("()"), "") + + def test_unbalanced_parentheses(self): + self.assertEqual(check_parentheses("(()"), "(") + self.assertEqual(check_parentheses("()someText("), "(") + self.assertEqual(check_parentheses("(text)text)"), ")") + self.assertEqual(check_parentheses("text)"), ")") + + def test_mixed_parentheses(self): + self.assertEqual(check_parentheses("(()(())"), "(") + self.assertEqual(check_parentheses("(someText))((someText)"), ")(") + self.assertEqual(check_parentheses("((someText))someText"), "") + self.assertEqual(check_parentheses("(someText(someText))someText"), "") + + def test_special_cases(self): + self.assertEqual(check_parentheses(""), "") + self.assertEqual(check_parentheses("abc"), "") + self.assertEqual(check_parentheses("((()))("), "(") + self.assertEqual(check_parentheses("text"), "") + + def test_reverse_and_flip_parentheses(self): + self.assertEqual(reverse_and_flip_parentheses("(abc)"), "(cba)") + self.assertEqual(reverse_and_flip_parentheses("Hello()"), "()olleH") + self.assertEqual(reverse_and_flip_parentheses(")("), ")(") + self.assertEqual(reverse_and_flip_parentheses("((()))"), "((()))") + self.assertEqual(reverse_and_flip_parentheses("()()()"), "()()()") + self.assertEqual(reverse_and_flip_parentheses("abc"), "cba") + self.assertEqual(reverse_and_flip_parentheses("123(abc)321"), "123(cba)321") + self.assertEqual(reverse_and_flip_parentheses("a(bc)d"), "d(cb)a") + + +class TestConstructDelimiterMap(unittest.TestCase): + def test_empty_text(self): + self.assertEqual(construct_delimiter_map("", ["word1", "word2"]), {}) + + def test_empty_words(self): + self.assertEqual(construct_delimiter_map("word1,word2", []), {}) + + def test_single_occurrence(self): + text = "word1,word2" + expected_result = { + ("word1", "word2"): "", + ("word2", "word1"): "" + } + self.assertEqual(construct_delimiter_map(text, ["word1", "word2"]), expected_result) + + def test_multiple_words(self): + text = "word0,((word1),word2)" + expected_result = { + ("word0", "word1"): "((", + ("word0", "word2"): "(", + ("word1", "word0"): "))", + ("word1", "word2"): ")", + ("word2", "word1"): "(", + ("word2", "word0"): ")" + } + self.assertEqual(construct_delimiter_map(text, ["word0", "word1", "word2"]), expected_result) + + text = "word0 , ( (word1 ), word2)" + self.assertEqual(construct_delimiter_map(text, ["word0", "word1", "word2"]), expected_result) + + +class TestVerifyDelimiters(unittest.TestCase): + def base_verify_func(self, query_text, text, anywhere_words, specific_words, expected_result): + delimiter_map = construct_delimiter_map(query_text, specific_words) + actual_result = verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map) + self.assertEqual(actual_result, expected_result) + + def test_all_conditions_met(self): + query_text = "word0,((word1),word2)" + specific_words = ["word0", "word1", "word2"] + text = "word0,((word1),word2)" + self.base_verify_func(query_text, text, [], specific_words, True) + text = "((word1),word2), word0" + self.base_verify_func(query_text, text, [], specific_words, True) + text = "word0,(word2, (word1))" + self.base_verify_func(query_text, text, [], specific_words, True) + text = "word0,((word1),(ExtraGroup),word2)" + self.base_verify_func(query_text, text, [], specific_words, True) + text = "word0,((word2),word1)" + self.base_verify_func(query_text, text, [], specific_words, False) + text = "((word1),word0), word2" + self.base_verify_func(query_text, text, [], specific_words, False) + text = "word0,((word1))" + self.base_verify_func(query_text, text, [], specific_words, False) + text = "(word1),(ExtraGroup),word2)" + self.base_verify_func(query_text, text, [], specific_words, False) + + def test_complex_case_with_word_identifiers(self): + query_text = "word0,((word1),@word2,@word3,word4)" + specific_words = ["word0", "word1", "word4"] + anywhere_words = ["word2", "word3"] + text = "word0,((word1),word2,word3,word4)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + text = "word2,word0,((word1),word3,word4)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + text = "word3,((word1),word2,word4),word0" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + text = "word0,((word1),word4),word2,word3" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + text = "word0,word1,word4,word2" # Incorrect delimiters + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + text = "word2,word3" # Missing specific words + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + def test_very_complex_case_with_word_identifiers(self): + query_text = "word0,(((word1,word2),@word3)),((word4,word5)))" + specific_words = ["word0", "word1", "word2", "word4", "word5"] + anywhere_words = ["word3"] + + # Test case where all conditions are met + text = "word0,(((word1,word2),word3)),((word4,word5)))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Test case with anywhere words out of specific context but still in the string + text = "word3,word0,(((word1,word2))),((word4,word5)))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Test case with correct specific words but incorrect delimiters + text = "word0,((word1,word2),word3),(word4,word5)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + # Test case missing one specific word + text = "word0,(((word1,word2),word3)),(word4))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + # Test case missing anywhere word + text = "word0,(((word1,word2))),((word4,word5)))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + def test_incorrect_single_delimiter(self): + query_text = "word0,((word1)),word2" + specific_words = ["word0", "word1", "word2"] + anywhere_words = [] + + # Positive case 1: Exact match + text = "word0,((word1)),word2" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Positive case 2: Additional parentheses around the entire sequence + text = "(word0,((word1)),word2)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Single closing parenthesis missing between word1 and word2 + text = "word0,((word1),word2)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + # Single opening parenthesis missing between word0 and word1 + text = "word0,(word1)),word2" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + def test_mismatched_parentheses(self): + query_text = "word0,((word1)),(word2,word3)" + specific_words = ["word0", "word1", "word2", "word3"] + anywhere_words = [] + + # Positive case 1: Exact match + text = "word0,((word1)),(word2,word3)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Positive case 2: Reordered sequence with the same delimiters + text = "(word2,word3),word0,((word1))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Positive case 3: Additional text in between but the delimiters remain the same + text = "word0,someExtraText,((word1)),someMoreText,(word2,word3)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, True) + + # Extra closing parenthesis between word2 and word3 + text = "word0,((word1),(word2,word3))" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + # Extra opening parenthesis between word1 and word2 + text = "word0,((word1),((word2,word3)" + self.base_verify_func(query_text, text, anywhere_words, specific_words, False) + + def test_wildcard_matching_verify_delimiters(self): + query_text = "word0, ((word1.*?)), word2.*?" + delimiter_map = construct_delimiter_map(query_text, ["word0", "word1.*?", "word2.*?"]) + + # Positive test cases + text = "((word1)), word0, word2X" + self.assertTrue(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + + text = "word0, ((word1Y)), word2Z" + self.assertTrue(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + + # Negative test cases + text = "word0, (word1), word2" + self.assertFalse(verify_search_delimiters(text, [], ["word0", "word1.*?", "word2.*?"], delimiter_map)) + +class TestFindMatching(unittest.TestCase): + def base_find_matching(self, series, search_string, expected): + mask = find_matching(series, search_string) + self.assertTrue(all(mask == expected), f"Expected {expected}, got {mask}") + + def test_basic_matching(self): + series = pd.Series([ + "(word1), word0, ((word2))", + "word0, ((word1)), word2", + "(word1), word0, (word2)" + ]) + search_string = "word0, ((word1)), word2" + expected = pd.Series([False, True, False]) + self.base_find_matching(series, search_string, expected) + + def test_anywhere_words(self): + series = pd.Series([ + "(word1), word0, ((word2))", + "word0, ((word1)), word2", + "word0, (word3), ((word1)), word2" + ]) + search_string = "@word3, word0, ((word1)), word2" + expected = pd.Series([False, False, True]) + self.base_find_matching(series, search_string, expected) + + def test_mismatched_parentheses(self): + series = pd.Series([ + "(word1), word0, ((word2))", + "word0, ((word1)), word2", + "word0, (word1)), word2", + "word0, ((word1), word2" + ]) + search_string = "word0, ((word1)), word2" + expected = pd.Series([False, True, False, False]) + self.base_find_matching(series, search_string, expected) + + def test_wildcard_matching(self): + series = pd.Series([ + "word2, word0, ((word1X))", + "word0, ((word1Y)), word2Z", + "word0, (word1), word2" + ]) + search_string = "word0, ((word1*)), word2*" + expected = pd.Series([True, True, False]) + self.base_find_matching(series, search_string, expected) diff --git a/tests/models/test_expression_parser.py b/tests/models/test_expression_parser.py index cca54411..5bdb71b7 100644 --- a/tests/models/test_expression_parser.py +++ b/tests/models/test_expression_parser.py @@ -118,7 +118,7 @@ def test_finding_tags2(self): "Agent, (Event)": True, "(Item), (Event)": True } - self.base_test("(Item or Agent) and [[Action or Event]]", test_strings) + self.base_test("(Item or Agent) and {Action or Event}", test_strings) def test_exact_group(self): test_strings = { @@ -131,7 +131,7 @@ def test_exact_group(self): "(A, B, (C, D))": True, "(A, B, C)": True } - self.base_test("[[a, b]]", test_strings) + self.base_test("{a, b}", test_strings) def test_exact_group_simple_complex(self): test_strings = { @@ -145,7 +145,7 @@ def test_exact_group_simple_complex(self): "(E, F, (A, B, (C, D)))": True, "(A, B, (E, F, (C, D)))": False, # TODO: Should this be True? [[c]] isn't directly inside an a group. } - self.base_test("[[a, [[c]] ]]", test_strings) + self.base_test("{a, {c} }", test_strings) def test_exact_group_complex(self): test_strings = { @@ -155,7 +155,7 @@ def test_exact_group_complex(self): "(A, B, ((C, D)))": False, "(E, F, (A, B, (C, D)))": True, } - self.base_test("[[a, b, [[c, d]] ]]", test_strings) + self.base_test("{a, b, {c, d} }", test_strings) def test_duplicate_search(self): test_strings = { @@ -183,7 +183,7 @@ def test_exact_group_complex_split(self): "(E, F, (A, B, (C, D)))": False, "((A, B), (C, D))": True, } - self.base_test("[[ [[a, b]], [[c, d]] ]]", test_strings) + self.base_test("{ {a, b}, {c, d} }", test_strings) def test_mixed_group_split(self): test_strings = { @@ -192,7 +192,7 @@ def test_mixed_group_split(self): "((Event), ((Clear-throat)))": True, "((Event, Clear-throat))": False, } - self.base_test("[[ [Event], [Action] ]]", test_strings) + self.base_test("{ [Event], [Action] }", test_strings) def test_exact_group_split(self): test_strings = { @@ -201,7 +201,7 @@ def test_exact_group_split(self): "((Event), ((Clear-throat)))": False, "((Event, Clear-throat))": False, } - self.base_test("[[ [[Event]], [[Action]] ]]", test_strings) + self.base_test("{ {Event}, {Action} }", test_strings) def test_exact_group_split_or(self): test_strings = { @@ -210,17 +210,18 @@ def test_exact_group_split_or(self): "((A), ((D)))": True, "((A, D))": True, } - self.base_test("[[ [[a]] or [[d]] ]]", test_strings) + self.base_test("{ {a} or {d} }", test_strings) def test_exact_group_split_or_negation(self): test_strings = { - "(Event, Clear-throat)": False, + # "(Event, Clear-throat)": False, "((Event), (Clear-throat))": True, "((Event))": False, "((Event), ((Clear-throat)))": True, "((Event, Clear-throat))": False, } - self.base_test("[[ [[~Event]] ]]", test_strings) + # Need to think this through more. How do you exact match a negative tag? + self.base_test("{ {~Event} }", test_strings) def test_exact_group_split_or_negation_dual(self): test_strings = { @@ -233,7 +234,7 @@ def test_exact_group_split_or_negation_dual(self): "((A), (B, C))": False, "((A), ((B), C))": True, } - self.base_test("[[ [[~a and ~b]] ]]", test_strings) + self.base_test("{ {~a and ~b} }", test_strings) def test_exact_group_split_or_negation_dual2(self): test_strings = { @@ -246,7 +247,7 @@ def test_exact_group_split_or_negation_dual2(self): "((A), (B, C))": False, "((A), ((B), C))": True, } - self.base_test("[[ [[~(a or b)]] ]]", test_strings) + self.base_test("{ {~(a or b)} }", test_strings) def test_exact_group_split_or_negation_complex(self): test_strings = { @@ -260,7 +261,7 @@ def test_exact_group_split_or_negation_complex(self): "((A), (B, C)), (D)": False, "((A), (B, C)), (H)": False, } - self.base_test("[[ [[~(a or b)]] ]] and [[D or ~F]]", test_strings) + self.base_test("{ {~(a or b)} } and {D or ~F}", test_strings) # TODO: Should this work, and what should it mean? # Right now this is always true, since there is at least one group without ", (a)" in every string. @@ -272,7 +273,7 @@ def test_exact_group_negation(self): "((A), ((D)))": True, "((A, D))": True, } - self.base_test("[[ ~[[a]] ]]", test_strings) + self.base_test("{ ~{a} }", test_strings) def test_exact_group_negation2(self): test_strings = { @@ -282,9 +283,42 @@ def test_exact_group_negation2(self): "((A), ((D, B)))": True, "((A, D))": False, "(B, (D))": True, - "(B)": True + "(B)": True, + "((A), B)": False } - self.base_test("[[ ~[[a]], b]]", test_strings) + self.base_test("{ ~{a}, b}", test_strings) + + def test_exact_group_negation3(self): + test_strings = { + "(A, D, B)": False, + "((A), (D), B)": True, + "((A))": False, + "((A), ((D, B)))": True, + "((A, D))": False, + "(B, (D))": True, + "(B)": True, + "((A), B)": True + } + self.base_test("{ ~a and b}", test_strings) + + def test_exact_group_negation4(self): + test_strings = { + "(A, D, B)": False, + "((A), (D), B)": False, + "((A))": False, + "((A), ((D, B)))": False, + "((A, D))": False, + "(B)": True, + "(B, (D))": True, + "((A), B)": False + } + self.base_test("{ @c and @a and b: ???}", test_strings) + + def test_exact_group_negation5(self): + test_string = "{ ~a and b:}" + with self.assertRaises(ValueError) as context: + QueryParser(test_string) + self.assertTrue(context.exception.args[0]) def test_mixed_group_complex_split(self): test_strings = { @@ -297,7 +331,7 @@ def test_mixed_group_complex_split(self): "((A, B), (C, D))": True, "((A, B, C, D))": False, } - self.base_test("[[ [a, b], [c, d] ]]", test_strings) + self.base_test("{ [a, b], [c, d] }", test_strings) def test_exact_group_complex2(self): test_strings = { @@ -309,7 +343,7 @@ def test_exact_group_complex2(self): "(B, (C)), (A, B, (C))": True, "(A, B, (A, (C)))": False } - self.base_test("[[a, b, [[c]] ]]", test_strings) + self.base_test("{a, b, {c} }", test_strings) def test_containing_group_complex2(self): test_strings = { @@ -362,13 +396,13 @@ def test_mixed_groups(self): test_strings = { "(A, B), (C, D, (E, F))": True } - self.base_test("[[a]], [[ [[e, f]] ]]", test_strings) + self.base_test("{a}, { {e, f} }", test_strings) test_strings = { "(A, B), (C, D, (E, F))": False } # This example works because it finds the group containing (c, d, (e, f)), rather than the ef group - self.base_test("[[a]], [e, [[f]] ]", test_strings) + self.base_test("{a}, [e, {f} ]", test_strings) def test_and(self): test_strings = { @@ -411,18 +445,17 @@ def test_and_wildcard_nothing_else(self): "A": False, "B": False, "C": False, - "A, B": True, + "A, B": False, "A, C": False, "B, C": False, "A, B, C": False, "D, A, B": False, "A, B, (C)": False, "(A, B), C": True, - "(A, B, C)": False, + "(A, B, C)": True, } self.base_test("{a and b}", test_strings) - def test_and_wildcard_nothing_else2(self): test_strings = { "A": False, "B": False, @@ -436,8 +469,7 @@ def test_and_wildcard_nothing_else2(self): "(A, B), C": True, "(A, B, C)": False, } - self.base_test("[{a and b}]", test_strings) - self.base_test("[[{a and b}]]", test_strings) + self.base_test("{a and b:}", test_strings) def test_and_logical_wildcard(self): test_strings = { @@ -450,9 +482,11 @@ def test_and_logical_wildcard(self): self.base_test("A, B and ?", test_strings) test_strings = { - "A": False, + "A": True, "A, C": True, "A, B, C": True, + "B, C": False, + "B, C, D, E": True } self.base_test("(a or (b and c) and ?)", test_strings) @@ -469,7 +503,7 @@ def test_double_wildcard(self): def test_or_wildcard(self): test_strings = { - "A": False, + "A": True, "B": False, "C": False, "A, B": True, @@ -589,10 +623,10 @@ def test_and_or(self): self.base_test("a and (b or c)", test_strings) test_strings = { - "A": False, + "A": True, "B": False, "C": False, - "A, B": False, + "A, B": True, "A, C": True, "B, C": True } @@ -698,35 +732,43 @@ def test_not_in_line3(self): def test_optional_exact_group(self): test_strings = { - "A, C": True, + "(A, C)": True, } self.base_test("{a and (b or c)}", test_strings) test_strings = { - "A, B, C, D": True, + "(A, B, C, D)": True, } self.base_test("{a and b: c and d}", test_strings) test_strings = { - "A, B, C": True, - "A, B, C, D": False, + "(A, B, C)": True, + "(A, B, C, D)": False, } self.base_test("{a and b: c or d}", test_strings) test_strings = { - "A, C": True, - "A, D": True, - "A, B, C": False, - "A, B, C, D": False, + "(A, C)": True, + "(A, D)": True, + "(A, B, C)": False, + "(A, B, C, D)": False, } self.base_test("{a or b: c or d}", test_strings) test_strings = { "(Onset, (Def-expand/taco))": True, + "(Onset, Def-expand/taco)": False, + "(Onset, Def/taco, (Def-expand/taco))": True, # this one validates + "(Onset, (Def/taco))": False, "(Onset, (Def-expand/taco, (Label/DefContents)))": True, "(Onset, (Def-expand/taco), (Label/OnsetContents))": True, "(Onset, (Def-expand/taco), (Label/OnsetContents, Description/MoreContents))": True, "Onset, (Def-expand/taco), (Label/OnsetContents)": False, "(Onset, (Def-expand/taco), Label/OnsetContents)": False, } - self.base_test("[[{(Onset or Offset), (Def or [[Def-expand]]): ???}]]", test_strings) \ No newline at end of file + self.base_test("{(Onset or Offset), (Def or {Def-expand}): ???}", test_strings) + test_strings = { + "(A, B)": True, + "(A, B, C)": True + } + self.base_test("{a or b}", test_strings) \ No newline at end of file diff --git a/tests/schema/test_hed_schema_io.py b/tests/schema/test_hed_schema_io.py index f3591ead..c21d839a 100644 --- a/tests/schema/test_hed_schema_io.py +++ b/tests/schema/test_hed_schema_io.py @@ -170,6 +170,8 @@ def _base_merging_test(self, files): reload1 = load_schema(path1) reload2 = load_schema(path2) self.assertEqual(reload1, reload2) + except Exception: + self.assertTrue(False) finally: os.remove(path1) os.remove(path2) @@ -183,6 +185,8 @@ def _base_merging_test(self, files): reload1 = load_schema(path1) reload2 = load_schema(path2) self.assertEqual(reload1, reload2) + except Exception: + self.assertTrue(False) finally: os.remove(path1) os.remove(path2)