Skip to content

Commit

Permalink
Merge pull request hed-standard#818 from VisLab/develop
Browse files Browse the repository at this point in the history
Updated broken link in the docs
  • Loading branch information
VisLab authored Jan 3, 2024
2 parents 971b9d7 + 88374f5 commit ef91120
Show file tree
Hide file tree
Showing 5 changed files with 245 additions and 24 deletions.
4 changes: 0 additions & 4 deletions docs/source/introduction.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@ Finding help

The `HED online tools <https://hedtools.org>`_ provide an easy-to-use interface that requires no programming.

:Mailing lists and forums:

* Don't hesitate to ask questions about the python hedtools on `NeuroStars
<https://neurostars.org/tags/hedtools>`_.

:Issues and problems:
* If you notice a bug in the python hedtools code or encounter other problems using the tools, please `open an issue`_ in the
Expand Down
77 changes: 66 additions & 11 deletions hed/tools/analysis/sequence_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,15 @@ def __str__(self):
# return "\n".join(temp_list)

def dot_str(self, group_spec={}):
""" Produce a DOT string representing this sequence map.
"""
base = 'digraph g { \n'
node_list = [f"{node};" for node in self.codes if node not in self.node_counts]
if node_list:
base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + ("\n").join(node_list) +"\n}\n"
if self.codes:
node_list = [f"{node};" for node in self.codes if node not in self.node_counts]
if node_list:
base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + ("\n").join(node_list) +"\n}\n"
if group_spec:
for group, spec in group_spec.items():
group_list = [f"{node};" for node in self.node_counts if node in spec["nodes"]]
Expand All @@ -57,17 +62,37 @@ def dot_str(self, group_spec={}):
spec_color = f'"{spec_color}"'
base = base + 'subgraph cluster_' + group + '{\n' + f'bgcolor={spec_color};\n' + \
'\n'.join(group_list) + '\n}\n'
edge_list = [f"{value[0]} -> {value[1]} [label={str(self.edge_counts[key])}];"
for key, value in self.edges.items()]
edge_list = self.get_edge_list(sort=True)

dot_str = base + ("\n").join(edge_list) + "}\n"
return dot_str

def edge_to_str(self, key):
    """ Return the DOT text for the edge stored under key.

    Parameters:
        key: Key of the edge in self.edges.

    Returns:
        str: "source -> target " (with a trailing space) or an empty string if the key has no edge.

    """
    endpoints = self.edges.get(key, [])
    if not endpoints:
        return ""
    source, target = endpoints[0], endpoints[1]
    return f"{source} -> {target} "
def get_edge_list(self, sort=True):
    """ Build the DOT edge statements for this map, each labeled with its count.

    Parameters:
        sort (bool): If True, edges are listed in order of decreasing count.

    Returns:
        list: DOT strings of the form "<edge> [label=<count>];", one per entry of self.edge_counts.

    """
    count_frame = pd.DataFrame(list(self.edge_counts.items()), columns=['Key', 'Counts'])
    if sort:
        count_frame = count_frame.sort_values(by='Counts', ascending=False)

    edge_list = []
    for _, entry in count_frame.iterrows():
        edge_key = entry['Key']
        edge_text = self.edge_to_str(edge_key)
        label = str(self.edge_counts[edge_key])
        edge_list.append(f"{edge_text} [label={label}];")
    return edge_list

# def resort(self):
# """ Sort the col_map in place by the key columns. """
# self.col_map.sort_values(by=self.key_cols, inplace=True, ignore_index=True)
# for index, row in self.col_map.iterrows():
# key_hash = get_row_hash(row, self.key_cols)
# self.map_dict[key_hash] = index
def filter_edges(self):
    """ Placeholder: currently only prints a debug message and filters nothing. """
    # TODO: no edge filtering is implemented here yet.
    print("to here")

def update(self, data):
""" Update the existing map with information from data.
Expand Down Expand Up @@ -99,6 +124,36 @@ def update(self, data):
self.edges[key] = key_list
self.edge_counts[key] = 1

def update(self, data):
    """ Update the node and edge counts of this map with the values in data.

    Parameters:
        data (Series): Column of values; it is cleaned with self.prep before counting.

    Notes:
        - If self.codes is set, only values listed in self.codes are counted.
        - An edge is recorded for each pair of consecutive values that remain after filtering.

    """
    filtered = self.prep(data)
    if self.codes:
        filtered = filtered[filtered.isin(self.codes)]

    # Work with positions rather than index labels: after filtering (or for data
    # with a non-default index) the labels no longer run from 0 to len - 1.
    values = filtered.tolist()
    last_position = len(values) - 1
    for position, value in enumerate(values):
        self.node_counts[value] = self.node_counts.get(value, 0) + 1
        if position == last_position:
            break  # The final value has no successor, so it starts no edge.
        key_list = values[position:position + 2]
        key = get_key_hash(key_list)
        if key in self.edges:
            self.edge_counts[key] = self.edge_counts[key] + 1
        else:
            self.edges[key] = key_list
            self.edge_counts[key] = 1

@staticmethod
def prep(data):
""" Remove quotes from the specified columns and convert to string.
Expand Down
160 changes: 160 additions & 0 deletions hed/tools/analysis/sequence_map_new.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
""" A map containing the number of times a particular sequence of values appears in a column of an event file. """

import pandas as pd
from hed.tools.util.data_util import get_key_hash


class SequenceMapNew:
    """ A map of the unique sequences of column values of a particular length that appear in an event file.

    Attributes:
        codes (list or None): Codes of interest; used by dot_str to list the codes that were never counted.
        name (str): An optional name of this remap for identification purposes.
        seq (list): Offsets, relative to the current row, of the values that make up a sequence.

    Notes: This mapping converts all columns in the mapping to strings.
        The remapping does not support other types of columns.

    """

    def __init__(self, codes=None, name='', seq=None):
        """ Information for setting up the maps.

        Parameters:
            codes (list or None): Codes of interest (see the class attributes).
            name (str): Name associated with this remap (usually a pathname of the events file).
            seq (list or None): Row offsets defining a sequence. If None, [0, -1] is used
                (the current value followed by the previous value).

        """

        self.codes = codes
        self.name = name
        # A fresh list per instance avoids sharing one mutable default among all instances.
        self.seq = [0, -1] if seq is None else seq
        self.nodes = {}  # Node keys to node names
        self.node_counts = {}  # Node values to count
        self.sequences = {}  # Sequence keys to sequence
        self.seq_counts = {}  # Sequence keys to counts
        self.edges = {}  # map of edge keys to 2-element sequence keys
        self.edge_counts = {}  # edge keys to edge counts

    def __str__(self):
        """ Return the node counts as space-separated "value(count)" items. """
        return " ".join(f"{value}({count})" for value, count in self.node_counts.items())

    def dot_str(self, group_spec=None):
        """ Produce a DOT string representing this sequence map.

        Parameters:
            group_spec (dict or None): Maps a group name to a dict with a "nodes" entry (the nodes
                in the group) and a "color" entry (the background color of the group's cluster).

        Returns:
            str: The DOT representation of the counted nodes and edges.

        """
        parts = ['digraph g { \n']
        if self.codes:
            # Codes that were requested but never seen are gathered in their own gray cluster.
            unused = [f"{node};" for node in self.codes if node not in self.node_counts]
            if unused:
                parts.append('subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + "\n".join(unused) + "\n}\n")
        for group, spec in (group_spec or {}).items():
            members = [f"{node};" for node in self.node_counts if node in spec["nodes"]]
            if not members:
                continue
            color = spec["color"]
            if color[:1] == '#':
                color = f'"{color}"'  # Colors starting with '#' are quoted in the output.
            parts.append('subgraph cluster_' + group + '{\n' + f'bgcolor={color};\n' +
                         '\n'.join(members) + '\n}\n')
        parts.append("\n".join(self.get_edge_list(sort=True)) + "}\n")
        return "".join(parts)

    def edge_to_str(self, key):
        """ Return the DOT text for the edge stored under key.

        Parameters:
            key: Key of the edge in self.edges.

        Returns:
            str: "<source sequence> -> <target sequence> " or an empty string if the key has no edge.

        """
        value = self.edges.get(key, [])
        if not value:
            return ""
        # NOTE(review): the sequences are rendered with str(list), e.g. ['a', 'b'], which is
        # presumably not a valid unquoted DOT identifier -- confirm the intended node naming.
        return f"{self.sequences[value[0]]} -> {self.sequences[value[1]]} "

    def get_edge_list(self, sort=True):
        """ Produce a DOT format edge list with the option of sorting by edge counts.

        Parameters:
            sort (bool): If True, the edge list is sorted by decreasing edge count.

        Returns:
            list: DOT strings representing the edges labeled by counts.

        """
        counted = list(self.edge_counts.items())
        if sort:
            # The sort is stable, so edges with equal counts keep their insertion order.
            counted.sort(key=lambda item: item[1], reverse=True)
        return [f"{self.edge_to_str(key)} [label={count}];" for key, count in counted]

    def filter_edges(self):
        """ Placeholder: currently only prints a debug message and filters nothing. """
        print("to here")

    def update(self, data):
        """ Update the node, sequence and edge counts with the values in data.

        Parameters:
            data (Series): Column of values to be counted.

        Notes:
            - An edge joins each complete sequence to the complete sequence of the preceding row.
            - A row without a complete sequence breaks the chain of edges.

        """
        filtered = self.get_sequence_data(data)
        last_seq_key = None
        for this_node, this_seq in zip(filtered['value'], filtered['seq']):
            self.node_counts[this_node] = self.node_counts.get(this_node, 0) + 1
            if not this_seq:
                last_seq_key = None
                continue
            this_seq_key = get_key_hash(this_seq)
            self.sequences[this_seq_key] = this_seq
            self.seq_counts[this_seq_key] = self.seq_counts.get(this_seq_key, 0) + 1
            # Compare with None explicitly so that a falsy key (such as 0) still counts as a predecessor.
            if last_seq_key is not None:
                this_edge_key = get_key_hash([last_seq_key, this_seq_key])
                self.edges[this_edge_key] = [last_seq_key, this_seq_key]
                self.edge_counts[this_edge_key] = self.edge_counts.get(this_edge_key, 0) + 1
            last_seq_key = this_seq_key

    def get_sequence_data(self, data):
        """ Return a DataFrame pairing each cleaned value with the sequence anchored at its row.

        Parameters:
            data (Series): Column of values to process.

        Returns:
            DataFrame: Has a 'value' column (the cleaned values) and a 'seq' column (the list of
                values in the row's sequence, or an empty list if the sequence is incomplete).

        """
        values = self.prep(data).values
        value_frame = pd.DataFrame({'value': values})
        sequences = [self.get_sequence(value_frame, index) for index in range(len(value_frame))]
        return pd.DataFrame({'value': values, 'seq': sequences})

    def get_sequence(self, df, index):
        """ Return the sequence of values anchored at row position index.

        Parameters:
            df (DataFrame): DataFrame whose first column holds the values.
            index (int): Row position to which the offsets in self.seq are applied.

        Returns:
            list: The values at the offset positions, or an empty list if any position is outside df.

        """
        seq_list = []
        for offset in self.seq:
            df_ind = offset + index
            if df_ind < 0 or df_ind >= len(df):
                return []
            seq_list.append(df.iloc[df_ind, 0])
        return seq_list

    @staticmethod
    def prep(data):
        """ Convert the values to strings and remove single and double quotes.

        Parameters:
            data (Series): Series to process.

        Returns:
            Series: A new Series of strings with the quotes removed.

        Notes:
            - The input is not modified.
            - The values are already strings after astype(str) (for example NaN becomes 'nan'),
              so no separate replacement of missing values takes place.

        """

        filtered = data.astype(str)
        filtered = filtered.str.replace('"', '')
        filtered = filtered.str.replace("'", "")
        return filtered
11 changes: 2 additions & 9 deletions hed/tools/remodeling/cli/run_remodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import json
import argparse
from hed.errors.exceptions import HedFileError
from hed.tools.util.io_util import get_file_list, get_task_from_file
from hed.tools.util.io_util import get_file_list, get_task_from_file, get_task_dict
from hed.tools.bids.bids_dataset import BidsDataset
from hed.tools.remodeling.dispatcher import Dispatcher
from hed.tools.remodeling.backup_manager import BackupManager
Expand Down Expand Up @@ -119,14 +119,7 @@ def parse_arguments(arg_list=None):
def parse_tasks(files, task_args):
if not task_args:
return {"": files}
task_dict = {}
for my_file in files:
task = get_task_from_file(my_file)
if not task:
continue
task_entry = task_dict.get(task, [])
task_entry.append(my_file)
task_dict[task] = task_entry
task_dict = get_task_dict(files)
if task_args == "*" or isinstance(task_args, list) and task_args[0] == "*":
return task_dict
task_dict = {key: task_dict[key] for key in task_args if key in task_dict}
Expand Down
17 changes: 17 additions & 0 deletions hed/tools/util/io_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,3 +328,20 @@ def get_task_from_file(file_path):
return ""
splits = re.split(r'[_.]', basename[position+5:])
return splits[0]

def get_task_dict(files):
    """ Group a list of files by the task that appears in each file name.

    Parameters:
        files (list): File paths to be grouped.

    Returns:
        dict: Maps each task name to the list of files with that task. Files for which
            get_task_from_file returns an empty value are omitted.

    """
    task_dict = {}
    for file_path in files:
        task_name = get_task_from_file(file_path)
        if task_name:
            task_dict.setdefault(task_name, []).append(file_path)
    return task_dict

0 comments on commit ef91120

Please sign in to comment.