""" A map of the number of times particular sequences of values appear in a column of an event file. """

import os
import unittest

import pandas as pd


class SequenceMap:
    """ A map of the unique two-element value sequences that appear in a column of an event file.

    Attributes:
        codes (list or None): Values to include in the map. If None (or empty), all values are included.
        name (str): An optional name of this map for identification purposes.
        node_counts (dict): Maps each value to the number of times it has been seen.
        edges (dict): Maps a sequence hash key to the sequence itself (a list of two consecutive values).
        edge_counts (dict): Maps a sequence hash key to the number of times that sequence has been seen.

    Notes:
        - All values are converted to strings before they are counted.
        - Other column types are not supported.

    """

    def __init__(self, codes=None, name=''):
        """ Information for setting up the maps.

        Parameters:
            codes (list or None): If None use all codes, otherwise only include listed codes in the map.
            name (str): Name associated with this map (usually a pathname of the events file).

        """
        self.codes = codes
        self.name = name
        self.node_counts = {}
        self.edges = {}  # map of keys to n-element sequences
        self.edge_counts = {}  # Keeps a running count of the number of times a key appears in the data

    def __str__(self):
        """ Return the node counts as a space-separated string of value(count) items. """
        # NOTE: this must be a plain method. Decorating it with @property makes str(obj) raise TypeError.
        return " ".join(f"{value}({count})" for value, count in self.node_counts.items())

    def dot_str(self, group_spec=None):
        """ Return a string describing the map as a directed graph in DOT format.

        Parameters:
            group_spec (dict or None): Optional map of group name to a dict with the keys "color"
                (a color name or a '#'-prefixed hex value) and "nodes" (the values in that group).

        Returns:
            str: The DOT representation of the counted nodes and edges.

        Notes:
            - Codes that were requested but never seen are placed in a grey 'cluster_unused' subgraph.
            - Node names are written unquoted, exactly as they appear in the data.

        """
        parts = ['digraph g { \n']

        # self.codes is None when the map was built without a code filter.
        unused = [f"{node};" for node in (self.codes or []) if node not in self.node_counts]
        if unused:
            parts.append('subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + "\n".join(unused) + "\n}\n")

        for group, spec in (group_spec or {}).items():
            members = [f"{node};" for node in self.node_counts if node in spec["nodes"]]
            if not members:
                continue
            color = spec["color"]
            if color.startswith('#'):  # hex colors have to be quoted in the output
                color = f'"{color}"'
            parts.append('subgraph cluster_' + group + '{\n' + f'bgcolor={color};\n' + '\n'.join(members) + '\n}\n')

        edge_lines = [f"{pair[0]} -> {pair[1]} [label={self.edge_counts[key]}];"
                      for key, pair in self.edges.items()]
        parts.append("\n".join(edge_lines) + "}\n")
        return "".join(parts)

    def update(self, data):
        """ Update the existing map with the values in data.

        Parameters:
            data (Series): Column of values (for example from an events file) to add to the map.

        Notes:
            - Values are cleaned with prep and, if codes were given, restricted to those codes.
            - Every remaining value increments its node count.
            - Every pair of consecutive remaining values increments the count of the corresponding edge.

        """
        # Imported locally so that the project helper is only required when the map is actually updated.
        from hed.tools.util.data_util import get_key_hash

        filtered = self.prep(data)
        if self.codes:
            filtered = filtered[filtered.isin(self.codes)]

        # Work on a plain list: after filtering, the index labels are no longer consecutive positions,
        # so they cannot be used to find the "next" value.
        values = filtered.tolist()
        for value in values:
            self.node_counts[value] = self.node_counts.get(value, 0) + 1
        for pair in zip(values, values[1:]):
            key_list = list(pair)
            key = get_key_hash(key_list)
            if key not in self.edges:
                self.edges[key] = key_list
            self.edge_counts[key] = self.edge_counts.get(key, 0) + 1

    @staticmethod
    def prep(data):
        """ Convert the values to strings, replacing missing values by 'n/a' and removing quotes.

        Parameters:
            data (Series): The values to process.

        Returns:
            Series: A new series of strings. The input is not modified.

        """
        # Missing values must be replaced before the string conversion, otherwise they turn into
        # text such as 'nan' or 'None' and can no longer be detected.
        missing = data.isna()
        filtered = data.astype(object).where(~missing, 'n/a').astype(str)
        filtered = filtered.str.replace('"', '', regex=False)
        filtered = filtered.str.replace("'", "", regex=False)
        return filtered


class Test(unittest.TestCase):
    """ Unit tests for SequenceMap. """

    @classmethod
    def setUpClass(cls):
        # NOTE(review): base_path is empty, so this path does not point into the test data directory.
        # No active test reads it -- set base_path before adding file-based tests.
        base_path = ''
        cls.events_path = os.path.realpath(base_path + '/sub-01/ses-01/eeg/sub-01_ses-01_task-DriveRandomSound_run-1_events.tsv')

    def test_constructor(self):
        codes1 = ['1111', '1112', '1121', '1122', '1131', '1132', '1141',
                  '1142', '1311', '1312', '1321', '1322',
                  '4210', '4220', '4230', '4311', '4312']

        smap1 = SequenceMap(codes=codes1)
        self.assertIsInstance(smap1, SequenceMap)
        self.assertEqual(smap1.codes, codes1)
        self.assertEqual(smap1.node_counts, {})
        self.assertEqual(smap1.edges, {})
        self.assertEqual(smap1.edge_counts, {})

    def test_update(self):
        # NOTE(review): update() itself is not exercised yet -- it still needs a test driven by event data.
        codes1 = ['1111', '1121', '1131', '1141', '1311', '4311']
        smap1 = SequenceMap(codes=codes1)
        self.assertIsInstance(smap1, SequenceMap)

    def test_prep(self):
        cleaned = SequenceMap.prep(pd.Series(['"1111"', "11'21", None]))
        self.assertEqual(cleaned.tolist(), ['1111', '1121', 'n/a'])

    def test_dot_str(self):
        smap1 = SequenceMap()
        smap1.node_counts = {'1111': 2, '1121': 1}
        smap1.edges = {0: ['1111', '1121']}
        smap1.edge_counts = {0: 1}
        self.assertEqual(smap1.dot_str(), 'digraph g { \n1111 -> 1121 [label=1];}\n')

    def test_str(self):
        smap1 = SequenceMap()
        smap1.node_counts = {'1111': 2, '1121': 1}
        self.assertEqual(str(smap1), '1111(2) 1121(1)')


if __name__ == '__main__':
    unittest.main()