Add a string based search first pass
Update query parser from [[]] to {} notation and clean up some
Minor bug fixes in query parser
IanCa committed Sep 21, 2023
1 parent ccfe535 commit c5f0386
Showing 8 changed files with 756 additions and 130 deletions.
2 changes: 1 addition & 1 deletion hed/models/base_input.py
@@ -137,7 +137,7 @@ def _indexed_dict_from_onsets(onsets):

@staticmethod
def _filter_by_index_list(original_series, indexed_dict):
new_series = ["n/a"] * len(original_series) # Initialize new_series with "n/a"
new_series = pd.Series(["n/a"] * len(original_series))

for onset, indices in indexed_dict.items():
if indices:
237 changes: 237 additions & 0 deletions hed/models/basic_search.py
@@ -0,0 +1,237 @@
import re
from itertools import combinations, product
from collections import defaultdict
import pandas as pd


def find_matching(series, search_string, regex=False):
""" Finds lines in the series that match the search string and returns a mask.
Syntax Rules:
- '@': Prefixing a term in the search string means the object must appear anywhere within a line.
- Parentheses: Elements within parentheses must appear in the line with the same level of nesting.
eg: Search string: "(A), (B)" will match "(A), (B, C)", but not "(A, B)", since they don't
start in the same group.
- "LongFormTag*": A * will match any remaining word(anything but a comma or parenthesis)
- An individual term can be arbitrary regex, but it is limited to single continuous words.
Notes:
- The format of the series should match the format of the search string, whether it's in short or long form.
- To enable support for matching parent tags, ensure that both the series and search string are in long form.
Args:
series (pd.Series): A Pandas Series object containing the lines to be searched.
search_string (str): The string to search for in each line of the series.
regex (bool): By default, translate any * wildcard characters to .*? regex
If True, do no translation and pass the words as is. Due to how it's setup, you must not include
the following characters: (),
Returns:
mask (pd.Series): A Boolean mask Series of the same length as the input series.
The mask has `True` for lines that match the search string and `False` otherwise.
"""
if not regex:
# Replace *'s with a reasonable value for people who don't know regex
search_string = re.sub(r'(?<!\.)\*', '.*?', search_string)
anywhere_words, specific_words = find_words(search_string)
delimiter_map = construct_delimiter_map(search_string, specific_words)
source_words = anywhere_words + specific_words

# Start with every row index as a candidate, then narrow the set down one word at a time
candidate_indexes = set(series.index)

# Loop through source_words to filter down candidate_indexes
for word in source_words:
matches = series.str.contains(word, regex=True)
current_word_indexes = set(matches[matches].index.tolist())

# Update candidate_indexes by taking the intersection with current_word_indexes
candidate_indexes &= current_word_indexes

if not candidate_indexes:
break

candidate_indexes = sorted(candidate_indexes)

full_mask = pd.Series(False, index=series.index)

candidate_series = series[candidate_indexes]

mask = candidate_series.apply(verify_search_delimiters, args=(anywhere_words, specific_words, delimiter_map))
full_mask.loc[candidate_indexes] = mask

return full_mask


def find_words(search_string):
""" Extract all words in the search string. Dividing them into words that must be relative to each other,
and words that can be anywhere.
Args:
search_string (str): The search query string to parse.
Words prefixed with '@' are 'anywhere' words.
Returns:
tuple: A tuple containing two lists:
- anywhere_words (list of str): Words that can appear anywhere in the text.
- specific_words (list of str): Words that must appear relative to other terms.
"""
# Match sequences of characters that are not commas or parentheses (whitespace-only matches are dropped below).
pattern = r'[^,()]+'
words = re.findall(pattern, search_string)

# Remove any extraneous whitespace from each word
words = [word.strip() for word in words if word.strip()]

anywhere_words = [word[1:] for word in words if word.startswith("@")]
specific_words = [word for word in words if not word.startswith("@")]

return anywhere_words, specific_words
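
# A quick sketch of the expected split (illustrative values, not from the commit):
#   find_words("@A, (B, C)")  ->  (["A"], ["B", "C"])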


def check_parentheses(text):
""" Checks for balanced parentheses in the given text and returns the unbalanced ones.
Args:
text (str): The text to be checked for balanced parentheses.
Returns:
str: A string containing the unbalanced parentheses in their original order.
Notes:
- The function only considers the characters '(' and ')' for balancing.
- Balanced pairs of parentheses are removed, leaving behind only the unbalanced ones.
"""
# Extract all parentheses from the text
all_parentheses = ''.join(re.findall('[()]', text))

stack = []
remaining_parentheses = []

# Loop through all parentheses and find balanced ones
for p in all_parentheses:
if p == '(':
stack.append(p)
elif p == ')' and stack:
stack.pop()
else:
remaining_parentheses.append(p)

# Add unbalanced ( back to remaining parentheses
remaining_parentheses.extend(stack)

return ''.join(remaining_parentheses)
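
# Expected behavior, as a sketch (illustrative values, not from the commit):
#   check_parentheses("(A), (B, C)")  ->  ""      # every pair is balanced
#   check_parentheses("), (")         ->  ")("    # only the unbalanced parentheses remain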


def reverse_and_flip_parentheses(s):
""" Reverses a string and flips the parentheses.
Args:
s (str): The string to be reversed and have its parentheses flipped.
Returns:
str: The reversed string with flipped parentheses.
Notes:
- The function takes into account only the '(' and ')' characters for flipping.
"""
# Reverse the string
reversed_s = s[::-1]

# Flip the parentheses directly in the reversed string
flipped_s = reversed_s.translate(str.maketrans("()", ")("))
return flipped_s
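
# A minimal sketch of the expected output (illustrative values, not from the commit):
#   reverse_and_flip_parentheses("((")   ->  "))"
#   reverse_and_flip_parentheses("a(b")  ->  "b)a"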


def construct_delimiter_map(text, words):
""" Takes an input search query and list of words, returning the parenthetical delimiters between them.
Args: delimiter
text (str): The search query
words(list): A list of words we want to map between from the query
Returns:
dict: The two-way delimiter map
"""
locations = {}
# Find the locations of each word in the text
for word in words:
for match in re.finditer(r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)', text):
start_index = match.start(1)
end_index = match.end(1)
match_length = end_index - start_index
locations[start_index] = (word, match_length)

sorted_locations = sorted(locations.items())

delimiter_map = {}
# Use combinations to get every combination of two words in order
for (start1, (word1, length1)), (start2, (word2, length2)) in combinations(sorted_locations, 2):
end1 = start1 + length1
delimiter_text = text[end1:start2]
delimiter_map[(word1, word2)] = check_parentheses(delimiter_text)

# Add the reversed version of the above
reverse_map = {(word2, word1): reverse_and_flip_parentheses(delimiter_text) for ((word1, word2), delimiter_text) in
delimiter_map.items()}
delimiter_map.update(reverse_map)

return delimiter_map
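
# A small sketch of the resulting map (illustrative values, not from the commit):
#   construct_delimiter_map("A, (B)", ["A", "B"])
#   ->  {("A", "B"): "(", ("B", "A"): ")"}
# Reading left to right, B sits one group deeper than A; the reversed entry is the flipped view.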


def verify_search_delimiters(text, anywhere_words, specific_words, delimiter_map):
""" Verifies if the text contains specific words with expected delimiters between them.
Args:
text (str): The text to search in.
anywhere_words (list of str): Words that can appear anywhere in the text.
specific_words (list of str): Words that must appear relative to other words in the text
delimiter_map (dict): A dictionary specifying expected delimiters between pairs of specific words.
Returns:
bool: True if all conditions are met, otherwise False.
"""
locations = defaultdict(list)

# Check for anywhere words
for word in anywhere_words:
pattern = r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)'
if not any(re.finditer(pattern, text)):
return False

# Find all locations for each word in the text
for word in specific_words:
for match in re.finditer(r'(?:[ ,()]|^)(' + word + r')(?:[ ,()]|$)', text):
start_index = match.start(1)
matched_word = match.group(1)
locations[word].append((start_index, len(matched_word), word))

if len(locations) != len(specific_words):
return False

# Generate all possible combinations of word sequences
# This covers cases where the same tag is found more than once, so every possible pairing is checked
for sequence in product(*locations.values()):
sorted_sequence = sorted(sequence)

# Check if the delimiters for this sequence match the expected delimiters
valid = True
for i in range(len(sorted_sequence) - 1):
start1, len1, word1 = sorted_sequence[i]
start2, len2, word2 = sorted_sequence[i + 1]

end1 = start1 + len1
delimiter_text = text[end1:start2]

found_delimiter = check_parentheses(delimiter_text)
expected_delimiter = delimiter_map.get((word1, word2), None)

if found_delimiter != expected_delimiter:
valid = False
break

if valid:
return True # Return True if any sequence is valid

return False # Return False if no valid sequence is found
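
A minimal usage sketch of the new search entry point (assuming the module above is importable as hed.models.basic_search; the example strings are made up for illustration):

import pandas as pd
from hed.models.basic_search import find_matching

strings = pd.Series(["(A), (B, C)", "(A, B)", "D, (A), (B)"])

# "(A), (B)" requires A and B to start in sibling groups, so rows 0 and 2 match.
print(find_matching(strings, "(A), (B)").tolist())   # [True, False, True]

# A leading @ marks a term that may appear anywhere in the line; only row 2 contains D.
print(find_matching(strings, "@D, (A)").tolist())    # [False, False, True]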
26 changes: 26 additions & 0 deletions hed/models/df_util.py
@@ -120,6 +120,26 @@ def expand_defs(df, hed_schema, def_dict, columns=None):
df.loc[mask, column] = df.loc[mask, column].apply(partial(_expand_defs, hed_schema=hed_schema, def_dict=def_dict))


def sort_strings(df, hed_schema, tag_form="short_tag", columns=None):
""" Expands any def tags found in the dataframe.
Converts in place
Parameters:
df (pd.Dataframe or pd.Series): The dataframe or series to modify
hed_schema (HedSchema or None): The schema to use to identify defs
columns (list or None): The columns to modify on the dataframe
"""
if isinstance(df, pd.Series):
df[:] = df.apply(partial(_sort, hed_schema=hed_schema, tag_form=tag_form))
else:
if columns is None:
columns = df.columns

for column in columns:
df[column] = df[column].apply(partial(_sort, hed_schema=hed_schema, tag_form=tag_form))
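
# Illustrative call (a sketch; the "events" dataframe, loaded "schema", and "HED" column are assumed):
#   sort_strings(events, schema, tag_form="short_tag", columns=["HED"])
#   # each HED string in the "HED" column is re-ordered in place via HedString.sort()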


def _convert_to_form(hed_string, hed_schema, tag_form):
return str(HedString(hed_string, hed_schema).get_as_form(tag_form))

@@ -132,6 +152,12 @@ def _expand_defs(hed_string, hed_schema, def_dict):
return str(HedString(hed_string, hed_schema, def_dict).expand_defs())


def _sort(hed_string, hed_schema, tag_form):
sorted_string = HedString(hed_string, hed_schema)
sorted_string.sort()
return sorted_string.get_as_form(tag_form)


def process_def_expands(hed_strings, hed_schema, known_defs=None, ambiguous_defs=None):
""" Gather def-expand tags in the strings/compare with known definitions to find any differences
