Add a string based search first pass

Update query parser from [[]] to {} notation and clean up. Some minor bug fixes in query parser.
hed-standard · Sep 21, 2023 · c5f0386 · c5f0386
1 parent ccfe535
commit c5f0386
Show file tree

Hide file tree

Showing 8 changed files with 756 additions and 130 deletions.
diff --git a/hed/models/base_input.py b/hed/models/base_input.py
@@ -137,7 +137,7 @@ def _indexed_dict_from_onsets(onsets):
 
     @staticmethod
     def _filter_by_index_list(original_series, indexed_dict):
-        new_series = ["n/a"] * len(original_series)  # Initialize new_series with "n/a"
+        new_series = pd.Series(["n/a"] * len(original_series))
 
         for onset, indices in indexed_dict.items():
             if indices:

diff --git a/hed/models/basic_search.py b/hed/models/basic_search.py
@@ -0,0 +1,237 @@
+import re
+from itertools import combinations, product
+from collections import defaultdict
+import pandas as pd
+
+
+def find_matching(series, search_string, regex=False):
+    """ Finds lines in the series that match the search string and returns a mask.
+
+    Syntax Rules:
+        - '@': Prefixing a term in the search string means the object must appear anywhere within a line.
+        - Parentheses: Elements within parentheses must appear in the line with the same level of nesting.
+                eg: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't
+                    start in the same group.
+        - "LongFormTag*": A * will match any remaining word (anything but a comma or parenthesis)
+        - An individual term can be arbitrary regex, but it is limited to single continuous words.
+
+    Notes:
+        - The format of the series should match the format of the search string, whether it's in short or long form.
+        - To enable support for matching parent tags, ensure that both the series and search string are in long form.
+        - When the search string contains at least one term, rows holding a missing value (NaN/None) never match.
+
+    Args:
+        series (pd.Series): A Pandas Series object containing the lines to be searched.
+        search_string (str): The string to search for in each line of the series.
+        regex (bool): By default, translate any * wildcard characters to .*? regex
+                      If True, do no translation and pass the words as is. Due to how it's setup, you must not include
+                      the following characters: (),
+
+    Returns:
+        mask (pd.Series): A Boolean mask Series of the same length as the input series.
+                          The mask has `True` for lines that match the search string and `False` otherwise.
+    """
+    if not regex:
+        # Replace *'s with a reasonable value for people who don't know regex.
+        # The lookbehind leaves an already written ".*" untouched.
+        search_string = re.sub(r'(?<!\.)\*', '.*?', search_string)
+    anywhere_words, specific_words = find_words(search_string)
+    delimiter_map = construct_delimiter_map(search_string, specific_words)
+
+    # Cheap first pass: keep only the rows that contain every word somewhere.
+    # A positional boolean mask is used (rather than a set of index labels) so that
+    # duplicate or unsortable index labels cannot break the filtering.
+    candidate_mask = pd.Series(True, index=series.index)
+    for word in anywhere_words + specific_words:
+        # na=False: a missing value counts as "no match" instead of producing NaN in the mask.
+        candidate_mask = candidate_mask & series.str.contains(word, regex=True, na=False)
+        if not candidate_mask.any():
+            break
+
+    full_mask = pd.Series(False, index=series.index)
+    if candidate_mask.any():
+        # Expensive second pass: check the parenthetical structure on the surviving rows only.
+        verified = series[candidate_mask].apply(verify_search_delimiters,
+                                                args=(anywhere_words, specific_words, delimiter_map))
+        # Assign by position so no index alignment is involved.
+        full_mask.loc[candidate_mask] = verified.to_numpy(dtype=bool)
+
+    return full_mask
+
+
+def find_words(search_string):
+    """ Split the search string into its individual terms and sort them into two buckets.
+
+    A term is any run of characters that contains no comma or parenthesis, with surrounding
+    whitespace removed.  Terms that are empty after stripping are dropped.
+
+    Args:
+        search_string (str): The search query string to parse.
+                             Words prefixed with '@' are 'anywhere' words.
+
+    Returns:
+        tuple: A tuple containing two lists:
+            - anywhere_words (list of str): '@' terms, returned without the leading '@'.
+            - specific_words (list of str): All other terms, which must appear relative to each other.
+    """
+    anywhere_words = []
+    specific_words = []
+
+    # Commas and parentheses are the only separators; spaces inside a term are kept.
+    for chunk in re.findall(r'[^,()]+', search_string):
+        term = chunk.strip()
+        if not term:
+            continue
+        if term.startswith("@"):
+            anywhere_words.append(term[1:])
+        else:
+            specific_words.append(term)
+
+    return anywhere_words, specific_words
+
+
+def check_parentheses(text):
+    """ Reduce the text to the parentheses that have no partner.
+
+    Args:
+        text (str): The text to be checked for balanced parentheses.
+
+    Returns:
+        str: The unmatched parentheses: every unmatched ')' followed by every unmatched '('.
+
+    Notes:
+        - Only the characters '(' and ')' are looked at; everything else is ignored.
+        - A ')' can only be unmatched while no '(' is open, so unmatched ')' always precede
+          unmatched '(' in the text and the result keeps their original order.
+
+    """
+    open_count = 0       # '(' seen so far that are still waiting for a ')'
+    stray_closers = 0    # ')' seen while nothing was open
+
+    for char in text:
+        if char == '(':
+            open_count += 1
+        elif char == ')':
+            if open_count:
+                open_count -= 1
+            else:
+                stray_closers += 1
+
+    return ')' * stray_closers + '(' * open_count
+
+
+def reverse_and_flip_parentheses(s):
+    """ Mirror a string: reverse the character order and swap '(' with ')'.
+
+        Args:
+            s (str): The string to mirror.
+
+        Returns:
+            str: The reversed string with each parenthesis replaced by its opposite.
+
+        Notes:
+            - Characters other than '(' and ')' are reversed in position but otherwise unchanged.
+    """
+    opposite = {'(': ')', ')': '('}
+    # Walk the string back to front, swapping parentheses as we go.
+    return ''.join(opposite.get(char, char) for char in reversed(s))
+
+
+def construct_delimiter_map(text, words):
+    """ Record, for every ordered pair of words in the query, the unbalanced parentheses between them.
+
+    Args:
+        text (str): The search query
+        words(list): A list of words (regex fragments) we want to map between from the query
+
+    Returns:
+        dict: The two-way delimiter map.  Keys are (word1, word2) tuples; values are the unbalanced
+              parentheses found between the two words.  Each pair is stored in both directions,
+              the backwards direction holding the reversed and flipped delimiter.
+    """
+    # start offset in text -> (word, matched length); a later match at the same offset overwrites.
+    found = {}
+    for word in words:
+        # The word must be bounded by a space, comma, parenthesis, or an end of the text.
+        bounded_word = r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)'
+        for hit in re.finditer(bounded_word, text):
+            first, last = hit.span(1)
+            found[first] = (word, last - first)
+
+    in_text_order = sorted(found.items())
+
+    delimiter_map = {}
+    # combinations() keeps the input order, so `left` always sits before `right` in the text.
+    for left, right in combinations(in_text_order, 2):
+        left_start, (left_word, left_length) = left
+        right_start, (right_word, _) = right
+        between = text[left_start + left_length:right_start]
+        delimiter_map[(left_word, right_word)] = check_parentheses(between)
+
+    # Add the mirrored entry for each pair so lookups work in either direction.
+    mirrored = {(second, first): reverse_and_flip_parentheses(parens)
+                for (first, second), parens in delimiter_map.items()}
+    delimiter_map.update(mirrored)
+
+    return delimiter_map
+
+
+def verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map):
+    """ Decide whether one line of text satisfies the search terms and their grouping.
+
+    Args:
+        text (str): The text to search in.
+        anywhere_words (list of str): Words that can appear anywhere in the text.
+        specific_words (list of str): Words that must appear relative to other words in the text
+        delimiter_map (dict): A dictionary specifying expected delimiters between pairs of specific words.
+
+    Returns:
+        bool: True if every anywhere word is present and at least one choice of positions for the
+              specific words has the expected delimiters between each neighbouring pair, otherwise False.
+    """
+    def bounded(word):
+        # The word must be bounded by a space, comma, parenthesis, or an end of the text.
+        return r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)'
+
+    # Every anywhere word has to occur at least once.
+    for word in anywhere_words:
+        if re.search(bounded(word), text) is None:
+            return False
+
+    # Collect every (start, length, word) occurrence of each specific word.
+    locations = defaultdict(list)
+    for word in specific_words:
+        for hit in re.finditer(bounded(word), text):
+            locations[word].append((hit.start(1), len(hit.group(1)), word))
+
+    # One entry per distinct matched word: fails if any word was not found.
+    if len(locations) != len(specific_words):
+        return False
+
+    # Try one occurrence per word in every possible way;
+    # this covers cases where the same tag is found twice, and you need to check both.
+    for sequence in product(*locations.values()):
+        in_text_order = sorted(sequence)
+        neighbours = zip(in_text_order, in_text_order[1:])
+
+        # all() stops at the first neighbouring pair whose delimiters differ from the expected ones.
+        if all(check_parentheses(text[start1 + len1:start2]) == delimiter_map.get((word1, word2), None)
+               for (start1, len1, word1), (start2, _, word2) in neighbours):
+            return True  # Any valid sequence is enough
+
+    return False  # No valid sequence was found
diff --git a/hed/models/df_util.py b/hed/models/df_util.py
@@ -120,6 +120,26 @@ def expand_defs(df, hed_schema, def_dict, columns=None):
             df.loc[mask, column] = df.loc[mask, column].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict))
 
 
+def sort_strings(df, hed_schema, tag_form="short_tag", columns=None):
+    """ Sorts the HED strings found in the dataframe or series.
+
+        Converts in place
+
+    Parameters:
+        df (pd.Dataframe or pd.Series): The dataframe or series to modify
+        hed_schema (HedSchema or None): The schema used to parse each string
+        tag_form (str): The form each sorted string is returned in, passed to get_as_form
+        columns (list or None): The columns to modify on the dataframe.  None means every column.
+    """
+    sorter = partial(_sort, hed_schema=hed_schema, tag_form=tag_form)
+
+    if isinstance(df, pd.Series):
+        df[:] = df.apply(sorter)
+        return
+
+    if columns is None:
+        columns = df.columns
+
+    for column in columns:
+        # Select by column label; df.loc[column] would address the row labelled `column` instead.
+        df[column] = df[column].apply(sorter)
+
+
 def _convert_to_form(hed_string, hed_schema, tag_form):
     """ Parse hed_string with hed_schema and return it in the requested tag_form as a plain str. """
     return str(HedString(hed_string, hed_schema).get_as_form(tag_form))
 
@@ -132,6 +152,12 @@ def _expand_defs(hed_string, hed_schema, def_dict):
     return str(HedString(hed_string, hed_schema, def_dict).expand_defs())
 
 
+def _sort(hed_string, hed_schema, tag_form):
+    """ Parse hed_string with hed_schema, sort it, and return it in the requested tag_form. """
+    parsed = HedString(hed_string, hed_schema)
+    # The return value of sort() is not used; presumably it reorders `parsed` itself - verify against HedString.
+    parsed.sort()
+    return parsed.get_as_form(tag_form)
+
+
 def process_def_expands(hed_strings, hed_schema, known_defs=None, ambiguous_defs=None):
     """ Gather def-expand tags in the strings/compare with known definitions to find any differences