diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index fbb72f41..3ff7f3c9 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -42,10 +42,6 @@ Finding help The `HED online tools `_ provide an easy-to-use interface that requires no programming. -:Mailing lists and forums: - - * Don't hesitate to ask questions about the python hedtools on `NeuroStars - `_. :Issues and problems: * If you notice a bug in the python hedtools code or encounter other problems using the tools, please `open an issue`_ in the diff --git a/hed/tools/analysis/sequence_map.py b/hed/tools/analysis/sequence_map.py index e5c81ebf..0ecd0fea 100644 --- a/hed/tools/analysis/sequence_map.py +++ b/hed/tools/analysis/sequence_map.py @@ -44,10 +44,15 @@ def __str__(self): # return "\n".join(temp_list) def dot_str(self, group_spec={}): + """ Produce a DOT string representing this sequence map. + + + """ base = 'digraph g { \n' - node_list = [f"{node};" for node in self.codes if node not in self.node_counts] - if node_list: - base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + ("\n").join(node_list) +"\n}\n" + if self.codes: + node_list = [f"{node};" for node in self.codes if node not in self.node_counts] + if node_list: + base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + ("\n").join(node_list) +"\n}\n" if group_spec: for group, spec in group_spec.items(): group_list = [f"{node};" for node in self.node_counts if node in spec["nodes"]] @@ -57,17 +62,37 @@ def dot_str(self, group_spec={}): spec_color = f'"{spec_color}"' base = base + 'subgraph cluster_' + group + '{\n' + f'bgcolor={spec_color};\n' + \ '\n'.join(group_list) + '\n}\n' - edge_list = [f"{value[0]} -> {value[1]} [label={str(self.edge_counts[key])}];" - for key, value in self.edges.items()] + edge_list = self.get_edge_list(sort=True) + dot_str = base + ("\n").join(edge_list) + "}\n" return dot_str + + def edge_to_str(self, key): + value = self.edges.get(key, []) + if value: + return f"{value[0]} -> {value[1]} " + else: + return "" + def get_edge_list(self, sort=True): + """Produces a DOT format edge list with the option of sorting by edge counts. + + Parameters: + sort (bool): if true the edge list is sorted by edge counts + + Returns: + list: list of DOT strings representing the edges labeled by counts. + + """ + + df = pd.DataFrame(list(self.edge_counts.items()), columns=['Key', 'Counts']) + if sort: + df = df.sort_values(by='Counts', ascending=False) + edge_list = [f"{self.edge_to_str(row['Key'])} [label={str(self.edge_counts[row['Key']])}];" + for index, row in df.iterrows()] + return edge_list - # def resort(self): - # """ Sort the col_map in place by the key columns. """ - # self.col_map.sort_values(by=self.key_cols, inplace=True, ignore_index=True) - # for index, row in self.col_map.iterrows(): - # key_hash = get_row_hash(row, self.key_cols) - # self.map_dict[key_hash] = index + def filter_edges(self): + print("to here") def update(self, data): """ Update the existing map with information from data. @@ -99,6 +124,36 @@ def update(self, data): self.edges[key] = key_list self.edge_counts[key] = 1 + def update(self, data): + """ Update the existing map with information from data. + + Parameters: + data (Series): DataFrame or filename of an events file or event map. + allow_missing (bool): If true allow missing keys and add as n/a columns. + + :raises HedFileError: + - If there are missing keys and allow_missing is False. + + """ + filtered = self.prep(data) + if self.codes: + mask = filtered.isin(self.codes) + filtered = filtered[mask] + for index, value in filtered.items(): + if value not in self.node_counts: + self.node_counts[value] = 1 + else: + self.node_counts[value] = self.node_counts[value] + 1 + if index + 1 >= len(filtered): + break + key_list = filtered[index:index + 2].tolist() + key = get_key_hash(key_list) + if key in self.edges: + self.edge_counts[key] = self.edge_counts[key] + 1 + else: + self.edges[key] = key_list + self.edge_counts[key] = 1 + @staticmethod def prep(data): """ Remove quotes from the specified columns and convert to string. diff --git a/hed/tools/analysis/sequence_map_new.py b/hed/tools/analysis/sequence_map_new.py new file mode 100644 index 00000000..0415f91e --- /dev/null +++ b/hed/tools/analysis/sequence_map_new.py @@ -0,0 +1,160 @@ +""" A map of containing the number of times a particular sequence of values in a column of an event file. """ + +import pandas as pd +from hed.tools.util.data_util import get_key_hash + + +class SequenceMapNew: + """ A map of unique sequences of column values of a particular length appear in an event file. + + Attributes: + + name (str): An optional name of this remap for identification purposes. + + Notes: This mapping converts all columns in the mapping to strings. + The remapping does not support other types of columns. + + """ + + def __init__(self, codes=None, name='', seq=[0, -1]): + """ Information for setting up the maps. + + Parameters: + codes (list or None): If None use all codes, otherwise only include listed codes in the map. + name (str): Name associated with this remap (usually a pathname of the events file). + + """ + + self.codes = codes + self.name = name + self.seq = seq + self.nodes = {} # Node keys to node names + self.node_counts = {} # Node values to count + self.sequences = {} # Sequence keys to sequence + self.seq_counts = {} # Sequence keys to counts + self.edges = {} # map of edge keys to 2-element sequence keys + self.edge_counts = {} # edge keys to edge counts + + @property + def __str__(self): + node_counts = [f"{value}({str(count)})" for value, count in self.node_counts.items()] + node_str = (" ").join(node_counts) + return node_str + # temp_list = [f"{self.name} counts for key [{str(self.key_cols)}]:"] + # for index, row in self.col_map.iterrows(): + # key_hash = get_row_hash(row, self.columns) + # temp_list.append(f"{str(list(row.values))}:\t{self.count_dict[key_hash]}") + # return "\n".join(temp_list) + + def dot_str(self, group_spec={}): + """ Produce a DOT string representing this sequence map. + + + """ + base = 'digraph g { \n' + if self.codes: + node_list = [f"{node};" for node in self.codes if node not in self.node_counts] + if node_list: + base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + ("\n").join(node_list) + "\n}\n" + if group_spec: + for group, spec in group_spec.items(): + group_list = [f"{node};" for node in self.node_counts if node in spec["nodes"]] + if group_list: + spec_color = spec["color"] + if spec_color[0] == '#': + spec_color = f'"{spec_color}"' + base = base + 'subgraph cluster_' + group + '{\n' + f'bgcolor={spec_color};\n' + \ + '\n'.join(group_list) + '\n}\n' + edge_list = self.get_edge_list(sort=True) + + dot_str = base + ("\n").join(edge_list) + "}\n" + return dot_str + + def edge_to_str(self, key): + value = self.edges.get(key, []) + if value: + x = ("+").join(value[0]) + y = ("+").join(value[1]) + return f"{str(self.sequences[value[0]])} -> {str(self.sequences[value[1]])} " + else: + return "" + + def get_edge_list(self, sort=True): + """Produces a DOT format edge list with the option of sorting by edge counts. + + Parameters: + sort (bool): if true the edge list is sorted by edge counts + + Returns: + list: list of DOT strings representing the edges labeled by counts. + + """ + + df = pd.DataFrame(list(self.edge_counts.items()), columns=['Key', 'Counts']) + if sort: + df = df.sort_values(by='Counts', ascending=False) + edge_list = [] + for index, row in df.iterrows(): + edge_list.append(f"{self.edge_to_str(row['Key'])} [label={str(self.edge_counts[row['Key']])}];") + return edge_list + + def filter_edges(self): + print("to here") + + def update(self, data): + filtered = self.get_sequence_data(data) + last_seq_key = None + for index, row in filtered.iterrows(): + # Update node counts + this_node = row['value'] + self.node_counts[this_node] = self.node_counts.get(this_node, 0) + 1 + this_seq = row['seq'] + if not this_seq: + last_seq_key = None + continue; + this_seq_key = get_key_hash(this_seq) + self.sequences[this_seq_key] = this_seq + self.seq_counts[this_seq_key] = self.seq_counts.get(this_seq_key, 0) + 1 + if last_seq_key: + this_edge_key = get_key_hash([last_seq_key, this_seq_key]) + self.edges[this_edge_key] = [last_seq_key, this_seq_key] + self.edge_counts[this_edge_key] = self.edge_counts.get(this_edge_key, 0) + 1 + last_seq_key = this_seq_key + + def get_sequence_data(self, data): + filtered = self.prep(data) + empty_lists = [[] for _ in range(len(filtered))] + + # Create a DataFrame + df = pd.DataFrame({'value': filtered.values, 'seq': empty_lists}) + + for index, row in df.iterrows(): + df.at[index, 'seq'] = self.get_sequence(df, index) + return df + + def get_sequence(self, df, index): + seq_list = [] + for i, val in enumerate(self.seq): + df_ind = val + index + if df_ind < 0 or df_ind >= len(df): + return [] + seq_list.append(df.iloc[df_ind, 0]) + return seq_list + + @staticmethod + def prep(data): + """ Remove quotes from the specified columns and convert to string. + + Parameters: + data (Series): Dataframe to process by removing quotes. + + Returns: Series + Notes: + - Replacement is done in place. + """ + + filtered = data.astype(str) + filtered.fillna('n/a').astype(str) + filtered = filtered.str.replace('"', '') + filtered = filtered.str.replace("'", "") + return filtered diff --git a/hed/tools/remodeling/cli/run_remodel.py b/hed/tools/remodeling/cli/run_remodel.py index 32af02ea..c640ba78 100644 --- a/hed/tools/remodeling/cli/run_remodel.py +++ b/hed/tools/remodeling/cli/run_remodel.py @@ -4,7 +4,7 @@ import json import argparse from hed.errors.exceptions import HedFileError -from hed.tools.util.io_util import get_file_list, get_task_from_file +from hed.tools.util.io_util import get_file_list, get_task_from_file, get_task_dict from hed.tools.bids.bids_dataset import BidsDataset from hed.tools.remodeling.dispatcher import Dispatcher from hed.tools.remodeling.backup_manager import BackupManager @@ -119,14 +119,7 @@ def parse_arguments(arg_list=None): def parse_tasks(files, task_args): if not task_args: return {"": files} - task_dict = {} - for my_file in files: - task = get_task_from_file(my_file) - if not task: - continue - task_entry = task_dict.get(task, []) - task_entry.append(my_file) - task_dict[task] = task_entry + task_dict = get_task_dict(files) if task_args == "*" or isinstance(task_args, list) and task_args[0] == "*": return task_dict task_dict = {key: task_dict[key] for key in task_args if key in task_dict} diff --git a/hed/tools/util/io_util.py b/hed/tools/util/io_util.py index 53fab27a..1a00b34b 100644 --- a/hed/tools/util/io_util.py +++ b/hed/tools/util/io_util.py @@ -328,3 +328,20 @@ def get_task_from_file(file_path): return "" splits = re.split(r'[_.]', basename[position+5:]) return splits[0] + +def get_task_dict(files): + """ Return a dictionary of the tasks that appear in the file names of a list of files. + + Parameters: + files = + + """ + task_dict = {} + for my_file in files: + task = get_task_from_file(my_file) + if not task: + continue + task_entry = task_dict.get(task, []) + task_entry.append(my_file) + task_dict[task] = task_entry + return task_dict