Merge pull request hed-standard#818 from VisLab/develop

Updated broken link in the docs
monique2208 · Jan 3, 2024 · ef91120 · ef91120
2 parents 971b9d7 + 88374f5
commit ef91120
Show file tree

Hide file tree

Showing 5 changed files with 245 additions and 24 deletions.
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
@@ -42,10 +42,6 @@ Finding help
 
     The `HED online tools <https://hedtools.org>`_ provide an easy-to-use interface that requires no programming. 
 
-:Mailing lists and forums:
-
-    * Don't hesitate to ask questions about the python hedtools on `NeuroStars
-      <https://neurostars.org/tags/hedtools>`_.
 
 :Issues and problems:
     * If you notice a bug in the python hedtools code or encounter other problems using the tools, please `open an issue`_ in the

diff --git a/hed/tools/analysis/sequence_map.py b/hed/tools/analysis/sequence_map.py
@@ -44,10 +44,15 @@ def __str__(self):
         # return "\n".join(temp_list)
 
     def dot_str(self, group_spec={}):
+        """ Produce a DOT string representing this sequence map.
+        
+        
+        """
         base = 'digraph g { \n'
-        node_list = [f"{node};" for node in self.codes if node not in self.node_counts]
-        if node_list:
-            base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + ("\n").join(node_list) +"\n}\n"
+        if self.codes:
+            node_list = [f"{node};" for node in self.codes if node not in self.node_counts]
+            if node_list:
+                base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + ("\n").join(node_list) +"\n}\n"
         if group_spec:
             for group, spec in group_spec.items():
                 group_list = [f"{node};" for node in self.node_counts if node in spec["nodes"]]
@@ -57,17 +62,37 @@ def dot_str(self, group_spec={}):
                         spec_color = f'"{spec_color}"'
                     base = base + 'subgraph cluster_' + group + '{\n' + f'bgcolor={spec_color};\n' + \
                            '\n'.join(group_list) + '\n}\n'
-        edge_list = [f"{value[0]} -> {value[1]} [label={str(self.edge_counts[key])}];" 
-                     for key, value in self.edges.items()]
+        edge_list = self.get_edge_list(sort=True)
+
         dot_str = base +  ("\n").join(edge_list) + "}\n"
         return dot_str
+
+    def edge_to_str(self, key):
+        """ Return the DOT arrow text for the edge stored under key.
+
+        Parameters:
+            key:  Key into self.edges.
+
+        Returns:
+            str:  'source -> target ' (note the trailing space), or '' if key has no entry or an empty entry.
+
+        """
+        endpoints = self.edges.get(key)
+        if not endpoints:
+            return ""
+        source, target = endpoints[0], endpoints[1]
+        return f"{source} -> {target} "
+
+    def get_edge_list(self, sort=True):
+        """ Produce a DOT format edge list with the option of sorting by edge counts.
+
+        Parameters:
+            sort (bool):  If True, the edges are listed in order of decreasing edge count.
+
+        Returns:
+            list:  DOT strings representing the edges, each labeled by its count.
+
+        Notes:
+            - Edges with equal counts keep the order in which they were first added to
+              self.edge_counts, so the output is reproducible from run to run.
+
+        """
+        items = list(self.edge_counts.items())
+        if sort:
+            # sorted is stable, including with reverse=True, so ties keep insertion order.
+            items = sorted(items, key=lambda item: item[1], reverse=True)
+        return [f"{self.edge_to_str(key)} [label={str(count)}];" for key, count in items]
 
-    # def resort(self):
-    #     """ Sort the col_map in place by the key columns. """
-    #     self.col_map.sort_values(by=self.key_cols, inplace=True, ignore_index=True)
-    #     for index, row in self.col_map.iterrows():
-    #         key_hash = get_row_hash(row, self.key_cols)
-    #         self.map_dict[key_hash] = index
+    def filter_edges(self):
+        """ Placeholder: currently prints a debug marker and does no filtering. """
+        # NOTE(review): development stub -- the intended filtering is not defined here; implement or remove.
+        print("to here")
 
     def update(self, data):
         """ Update the existing map with information from data.
@@ -99,6 +124,36 @@ def update(self, data):
                 self.edges[key] = key_list
                 self.edge_counts[key] = 1
 
+    def update(self, data):
+        """ Update the node and edge counts of this map with the values in data.
+
+        Parameters:
+            data (Series):  Column of values, in order of occurrence, to be added to the map.
+
+        Notes:
+            - The values are cleaned with prep before they are counted.
+            - If self.codes is set, values not in self.codes are dropped first, so an edge
+              joins two consecutive retained values.
+            - NOTE(review): an earlier update with the same signature appears above in this class body;
+              the later definition is the one in effect, so one of the two should be removed.
+
+        """
+        filtered = self.prep(data)
+        if self.codes:
+            filtered = filtered[filtered.isin(self.codes)]
+        # Pair values by position using a plain list.  After masking, the index labels of the
+        # Series no longer equal positions, so label arithmetic would skip or mispair values.
+        values = filtered.tolist()
+        for value in values:
+            self.node_counts[value] = self.node_counts.get(value, 0) + 1
+        for first, second in zip(values, values[1:]):
+            key_list = [first, second]
+            key = get_key_hash(key_list)
+            if key in self.edges:
+                self.edge_counts[key] = self.edge_counts[key] + 1
+            else:
+                self.edges[key] = key_list
+                self.edge_counts[key] = 1
+
+
     @staticmethod
     def prep(data):
         """ Remove quotes from the specified columns and convert to string.

diff --git a/hed/tools/analysis/sequence_map_new.py b/hed/tools/analysis/sequence_map_new.py
@@ -0,0 +1,160 @@
+""" A map containing the number of times a particular sequence of values appears in a column of an event file. """
+
+import pandas as pd
+from hed.tools.util.data_util import get_key_hash
+
+
+class SequenceMapNew:
+    """ A map of the value sequences found in a column of an event file and the transitions between them.
+
+    Attributes:
+        codes (list or None):  Codes of interest; in this class only dot_str reads it (to list uncounted codes).
+        name (str):            An optional name of this map for identification purposes.
+        seq (list):            Row offsets, relative to the current row, of the values that make up a sequence.
+        nodes (dict):          Node keys to node names (not filled in by the methods of this class).
+        node_counts (dict):    Values to the number of times each value appears.
+        sequences (dict):      Sequence keys to sequences (lists of values).
+        seq_counts (dict):     Sequence keys to the number of times each sequence appears.
+        edges (dict):          Edge keys to [previous sequence key, next sequence key].
+        edge_counts (dict):    Edge keys to the number of times each edge appears.
+
+    Notes: All values are converted to strings before they are counted.
+
+    """
+
+    def __init__(self, codes=None, name='', seq=None):
+        """ Information for setting up the maps.
+
+        Parameters:
+            codes (list or None):  Codes of interest (used by dot_str to show the codes that never appeared).
+            name (str):            Name associated with this map (usually a pathname of the events file).
+            seq (list or None):    Row offsets defining a sequence. If None, [0, -1] is used
+                                   (the current value followed by the previous value).
+
+        """
+
+        self.codes = codes
+        self.name = name
+        # Copy so that neither a shared default nor the caller's list is aliased by this instance.
+        self.seq = [0, -1] if seq is None else list(seq)
+        self.nodes = {}  # Node keys to node names
+        self.node_counts = {}  # Node values to count
+        self.sequences = {}  # Sequence keys to sequence
+        self.seq_counts = {}  # Sequence keys to counts
+        self.edges = {}  # map of edge keys to 2-element sequence keys
+        self.edge_counts = {}  # edge keys to edge counts
+
+    def __str__(self):
+        """ Return the node counts as blank-separated 'value(count)' items. """
+        # __str__ must be a plain method: wrapping it in a property makes str(obj) raise a TypeError.
+        return " ".join(f"{value}({str(count)})" for value, count in self.node_counts.items())
+
+    def dot_str(self, group_spec=None):
+        """ Produce a DOT string representing this sequence map.
+
+        Parameters:
+            group_spec (dict or None):  Group names to dictionaries with a "nodes" entry (the members of
+                                        the group) and a "color" entry (the background color of its cluster).
+
+        Returns:
+            str:  A DOT digraph with optional clusters followed by the edges sorted by decreasing count.
+
+        """
+        base = 'digraph g { \n'
+        if self.codes:
+            # Codes that were requested but never counted are gathered in a gray cluster.
+            node_list = [f"{node};" for node in self.codes if node not in self.node_counts]
+            if node_list:
+                base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + "\n".join(node_list) + "\n}\n"
+        for group, spec in (group_spec or {}).items():
+            group_list = [f"{node};" for node in self.node_counts if node in spec["nodes"]]
+            if not group_list:
+                continue
+            spec_color = spec["color"]
+            if spec_color.startswith('#'):
+                # Hex colors have to be quoted in DOT.
+                spec_color = f'"{spec_color}"'
+            base = base + 'subgraph cluster_' + group + '{\n' + f'bgcolor={spec_color};\n' + \
+                '\n'.join(group_list) + '\n}\n'
+        edge_list = self.get_edge_list(sort=True)
+        return base + "\n".join(edge_list) + "}\n"
+
+    def edge_to_str(self, key):
+        """ Return the DOT arrow text for the edge stored under key.
+
+        Parameters:
+            key:  Key into self.edges.
+
+        Returns:
+            str:  '"source" -> "target" ' (note the trailing space) or '' if key has no edge.
+
+        Notes:
+            - Each endpoint is the values of its sequence joined by '+' and enclosed in double quotes,
+              because a Python list representation is not a valid DOT identifier.
+
+        """
+        value = self.edges.get(key)
+        if not value:
+            return ""
+        source = "+".join(str(item) for item in self.sequences[value[0]])
+        target = "+".join(str(item) for item in self.sequences[value[1]])
+        return f'"{source}" -> "{target}" '
+
+    def get_edge_list(self, sort=True):
+        """ Produce a DOT format edge list with the option of sorting by edge counts.
+
+        Parameters:
+            sort (bool):  If True, the edges are listed in order of decreasing edge count.
+
+        Returns:
+            list:  DOT strings representing the edges, each labeled by its count.
+
+        Notes:
+            - Edges with equal counts keep the order in which they were first added.
+
+        """
+        items = list(self.edge_counts.items())
+        if sort:
+            # sorted is stable, including with reverse=True, so ties keep insertion order.
+            items = sorted(items, key=lambda item: item[1], reverse=True)
+        return [f"{self.edge_to_str(key)} [label={str(count)}];" for key, count in items]
+
+    def filter_edges(self):
+        """ Placeholder: currently prints a debug marker and does no filtering. """
+        # NOTE(review): development stub -- implement or remove.
+        print("to here")
+
+    def update(self, data):
+        """ Update the node, sequence, and edge counts with the values in data.
+
+        Parameters:
+            data (Series):  Column of values, in order of occurrence, to be added to the map.
+
+        Notes:
+            - A row without a complete sequence (an offset falls outside the data) is counted
+              as a node but breaks the chain, so no edge crosses it.
+
+        """
+        filtered = self.get_sequence_data(data)
+        last_seq_key = None
+        for _, row in filtered.iterrows():
+            # Update node counts
+            this_node = row['value']
+            self.node_counts[this_node] = self.node_counts.get(this_node, 0) + 1
+            this_seq = row['seq']
+            if not this_seq:
+                last_seq_key = None
+                continue
+            this_seq_key = get_key_hash(this_seq)
+            self.sequences[this_seq_key] = this_seq
+            self.seq_counts[this_seq_key] = self.seq_counts.get(this_seq_key, 0) + 1
+            # Compare with None explicitly so that a falsy key (such as 0) still produces an edge.
+            if last_seq_key is not None:
+                this_edge_key = get_key_hash([last_seq_key, this_seq_key])
+                self.edges[this_edge_key] = [last_seq_key, this_seq_key]
+                self.edge_counts[this_edge_key] = self.edge_counts.get(this_edge_key, 0) + 1
+            last_seq_key = this_seq_key
+
+    def get_sequence_data(self, data):
+        """ Return a DataFrame pairing each cleaned value with the sequence anchored at its row.
+
+        Parameters:
+            data (Series):  Column of values to process.
+
+        Returns:
+            DataFrame:  Column 'value' has the cleaned values and column 'seq' has, for each row,
+                        the list returned by get_sequence (empty if the sequence is incomplete).
+
+        """
+        filtered = self.prep(data)
+        df = pd.DataFrame({'value': filtered.values})
+        sequences = [self.get_sequence(df, index) for index in range(len(df))]
+        # An explicit object Series keeps each list intact as a single cell.
+        df['seq'] = pd.Series(sequences, index=df.index, dtype=object)
+        return df
+
+    def get_sequence(self, df, index):
+        """ Return the values at the offsets in self.seq relative to row index.
+
+        Parameters:
+            df (DataFrame):  DataFrame whose first column holds the values.
+            index (int):     Position of the row at which the sequence is anchored.
+
+        Returns:
+            list:  The values in the order of self.seq, or [] if any offset falls outside df.
+
+        """
+        positions = [offset + index for offset in self.seq]
+        if any(position < 0 or position >= len(df) for position in positions):
+            return []
+        return [df.iloc[position, 0] for position in positions]
+
+    @staticmethod
+    def prep(data):
+        """ Return a copy of data as strings with missing values marked and quotes removed.
+
+        Parameters:
+            data (Series):  Series to process.
+
+        Returns:
+            Series:  New string Series in which missing values are 'n/a' and all single
+                     and double quotes have been removed.
+
+        Notes:
+            - A new Series is returned; data itself is not modified.
+        """
+
+        # Fill before converting: astype(str) would turn missing values into text that fillna cannot see.
+        filtered = data.fillna('n/a').astype(str)
+        filtered = filtered.str.replace('"', '', regex=False)
+        filtered = filtered.str.replace("'", "", regex=False)
+        return filtered
diff --git a/hed/tools/remodeling/cli/run_remodel.py b/hed/tools/remodeling/cli/run_remodel.py
@@ -4,7 +4,7 @@
 import json
 import argparse
 from hed.errors.exceptions import HedFileError
-from hed.tools.util.io_util import get_file_list, get_task_from_file
+from hed.tools.util.io_util import get_file_list, get_task_from_file, get_task_dict
 from hed.tools.bids.bids_dataset import BidsDataset
 from hed.tools.remodeling.dispatcher import Dispatcher
 from hed.tools.remodeling.backup_manager import BackupManager
@@ -119,14 +119,7 @@ def parse_arguments(arg_list=None):
 def parse_tasks(files, task_args):
     if not task_args:
         return {"": files}
-    task_dict = {}
-    for my_file in files:
-        task = get_task_from_file(my_file)
-        if not task:
-            continue
-        task_entry = task_dict.get(task, [])
-        task_entry.append(my_file)
-        task_dict[task] = task_entry
+    task_dict = get_task_dict(files)
     if task_args == "*" or isinstance(task_args, list) and task_args[0] == "*":
         return task_dict
     task_dict = {key: task_dict[key] for key in task_args if key in task_dict}

diff --git a/hed/tools/util/io_util.py b/hed/tools/util/io_util.py
@@ -328,3 +328,20 @@ def get_task_from_file(file_path):
         return ""
     splits = re.split(r'[_.]', basename[position+5:])
     return splits[0]
+
+def get_task_dict(files):
+    """ Return a dictionary that groups the files in a list by the task found in each file name.
+
+    Parameters:
+        files (list):  List of file paths to be grouped by task.
+
+    Returns:
+        dict:  Keys are task names and values are lists of the files with that task, in the order given.
+
+    Notes:
+        - Files for which get_task_from_file returns an empty task are left out.
+
+    """
+    task_dict = {}
+    for my_file in files:
+        task = get_task_from_file(my_file)
+        if task:
+            task_dict.setdefault(task, []).append(my_file)
+    return task_dict