Merge pull request #1001 from IanCa/dev_boutiques2
Add not even half baked start on changing dispatcher for boutiques
VisLab authored Aug 1, 2024
2 parents a1a54ba + e73b7a7 commit 892b643
Showing 10 changed files with 383 additions and 67 deletions.
1 change: 1 addition & 0 deletions hed/tools/__init__.py
@@ -52,3 +52,4 @@
from .remodeling.cli import run_remodel
from .remodeling.cli import run_remodel_backup
from .remodeling.cli import run_remodel_restore
from .remodeling.cli import run_summary
20 changes: 19 additions & 1 deletion hed/tools/analysis/column_name_summary.py
@@ -1,5 +1,5 @@
""" Summarize the unique column names in a dataset. """

import copy
import json


@@ -11,6 +11,24 @@ def __init__(self, name=''):
self.file_dict = {}
self.unique_headers = []

@staticmethod
def load_as_json2(json_data):
summary = ColumnNameSummary()
json_data = json_data["File summary"]
summary.name = json_data["Name"]
# summary.total_events = json_data["Total events"]
# summary.total_files = json_data["Total files"]
specifics = json_data["Specifics"]
all_column_data = specifics["Columns"]
for index, column_data in enumerate(all_column_data):
file_list = column_data["Files"]
unique_header = column_data["Column names"]
summary.unique_headers.append(unique_header)
for file in file_list:
summary.file_dict[file] = index

return summary

def update(self, name, columns):
""" Update the summary based on columns associated with a file.
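A minimal round-trip sketch of the new loader above, assuming the JSON layout that load_as_json2 unpacks ("File summary" wrapping "Name" and "Specifics", whose "Columns" entries each carry "Files" and "Column names"); the file name and headers below are hypothetical:

    from hed.tools.analysis.column_name_summary import ColumnNameSummary

    # Hypothetical JSON in the shape load_as_json2 expects.
    json_data = {
        "File summary": {
            "Name": "column_names",
            "Specifics": {
                "Columns": [
                    {"Files": ["sub-01_task-stop_events.tsv"],
                     "Column names": ["onset", "duration", "trial_type"]}
                ]
            }
        }
    }

    summary = ColumnNameSummary.load_as_json2(json_data)
    print(summary.file_dict)       # {'sub-01_task-stop_events.tsv': 0}
    print(summary.unique_headers)  # [['onset', 'duration', 'trial_type']]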
56 changes: 38 additions & 18 deletions hed/tools/analysis/tabular_summary.py
@@ -1,6 +1,5 @@
""" Summarize the contents of columnar files. """


import json
from hed.errors.exceptions import HedFileError
from hed.tools.util import data_util
@@ -74,29 +73,50 @@ def extract_sidecar_template(self):
side_dict[column_name] = annotation_util.generate_sidecar_entry(column_name, [])
return side_dict

@staticmethod
def load_as_json2(json_data):
summary = TabularSummary()
json_data = json_data["File summary"]
summary.name = json_data["Name"]
summary.total_events = json_data["Total events"]
summary.total_files = json_data["Total files"]
specifics = json_data["Specifics"]
# todo ian: this doesn't use value column summaries or categorical counts? What
summary.categorical_info = specifics["Categorical column summaries"]
summary.value_info = specifics["Value column summaries"]
summary.skip_cols = specifics["Skip columns"]
# summary.files = specifics["Files"]

return summary

def _sort_internal(self):
categorical_cols = {}
for key in sorted(self.categorical_info):
cat_dict = self.categorical_info[key]
val_dict = {v_key: cat_dict[v_key] for v_key in sorted(cat_dict.keys())}
categorical_cols[key] = val_dict
value_cols = {key: self.value_info[key] for key in sorted(self.value_info)}
self.categorical_info = categorical_cols
self.value_info = value_cols

def get_summary(self, as_json=False):
""" Return the summary in dictionary format.
Parameters:
as_json (bool): If False, return as a Python dictionary, otherwise convert to a JSON dictionary.
"""
sorted_keys = sorted(self.categorical_info.keys())
categorical_cols = {}
for key in sorted_keys:
cat_dict = self.categorical_info[key]
sorted_v_keys = sorted(list(cat_dict))
val_dict = {}
for v_key in sorted_v_keys:
val_dict[v_key] = cat_dict[v_key]
categorical_cols[key] = val_dict
sorted_cols = sorted(map(str, list(self.value_info)))
value_cols = {}
for key in sorted_cols:
value_cols[key] = self.value_info[key]
summary = {"Name": self.name, "Total events": self.total_events, "Total files": self.total_files,
"Categorical columns": categorical_cols, "Value columns": value_cols,
"Skip columns": self.skip_cols, "Files": self.files}
self._sort_internal()
summary = {"Name": self.name,
"Total events": self.total_events,
"Total files": self.total_files,
"Categorical columns": self.categorical_info,
"Value columns": self.value_info,
"Skip columns": self.skip_cols,
"Files": self.files}

# reloaded_summary = self.load_as_json(summary)

if as_json:
return json.dumps(summary, indent=4)
else:
@@ -198,7 +218,7 @@ def _update_dataframe(self, data, name):
else:
col_values = col_values.astype(str)
values = col_values.value_counts(ascending=True)
self._update_categorical(col_name, values)
self._update_categorical(col_name, values)

def _update_dict_categorical(self, col_dict):
""" Update this summary with the categorical information in the dictionary from another summary.
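A comparable sketch for the TabularSummary loader above, again with hypothetical data; the wrapper keys follow load_as_json2, but the internal format of the column summaries (counts per value, totals per column) is an assumption, since the diff does not show what produces this JSON:

    from hed.tools.analysis.tabular_summary import TabularSummary

    # Hypothetical wrapped summary in the shape load_as_json2 expects.
    wrapped = {
        "File summary": {
            "Name": "events",
            "Total events": 200,
            "Total files": 2,
            "Specifics": {
                "Categorical column summaries": {"trial_type": {"go": [150, 2], "stop": [50, 2]}},
                "Value column summaries": {"response_time": [200, 2]},
                "Skip columns": ["onset", "duration"],
            },
        }
    }

    summary = TabularSummary.load_as_json2(wrapped)
    print(summary.name, summary.total_events, summary.total_files)  # events 200 2
    print(summary.skip_cols)                                        # ['onset', 'duration']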
11 changes: 9 additions & 2 deletions hed/tools/remodeling/cli/run_remodel.py
@@ -1,5 +1,5 @@
""" Main command-line program for running the remodeling tools. """

import copy
import os
import json
import argparse
@@ -62,6 +62,8 @@ def get_parser():
help="If given, is the path to directory for saving, otherwise derivatives/remodel is used.")
parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs",
help="Directories names to exclude from search for files.")
parser.add_argument("-a", "--analysis-level", dest="analysis_level", default="none",
choices=["participant", "group", "none"])
return parser


@@ -224,13 +226,17 @@ def main(arg_list=None):
for task, files in task_dict.items():
dispatch = Dispatcher(operations, data_root=args.data_dir, backup_name=backup_name,
hed_versions=args.hed_versions)

if args.use_bids:
run_bids_ops(dispatch, args, files)
else:
run_direct_ops(dispatch, args, files)

if not args.no_summaries:
# Todo ian: replace dataset_summary variable
dispatch.save_summaries(args.save_formats, individual_summaries=args.individual_summaries,
summary_dir=save_dir, task_name=task)
summary_dir=save_dir, task_name=task,
dataset_summary=args.analysis_level != "participant")
except Exception as ex:
if args.log_dir:
log_name = io_util.get_alphanumeric_path(os.path.realpath(args.data_dir)) + '_' + timestamp + '.txt'
@@ -239,5 +245,6 @@
raise



if __name__ == '__main__':
main()
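A usage sketch for the new -a option, calling main with an argument list; the dataset and model paths are hypothetical, -nb sidesteps the backup check so the example stands alone, and passing dataset_summary through to save_summaries assumes the Dispatcher change (not shown in this listing) accepts that keyword:

    from hed.tools.remodeling.cli.run_remodel import main

    # Hypothetical paths; '-a participant' saves individual summaries but
    # suppresses the dataset-level summary (dataset_summary=False).
    main(["/data/ds_root", "/data/ds_root/derivatives/remodel/models/summarize_rmdl.json",
          "-nb", "-x", "derivatives", "stimuli",
          "-a", "participant"])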
194 changes: 194 additions & 0 deletions hed/tools/remodeling/cli/run_summary.py
@@ -0,0 +1,194 @@
""" Main command-line program for running the remodeling tools. """
import copy
import os
import json
import argparse
import logging
from hed.errors.exceptions import HedFileError

from hed.tools.bids.bids_dataset import BidsDataset
from hed.tools.remodeling.remodeler_validator import RemodelerValidator
from hed.tools.remodeling.dispatcher import Dispatcher
from hed.tools.remodeling.backup_manager import BackupManager
from hed.tools.util import io_util


def get_parser():
""" Create a parser for the run_remodel command-line arguments.
Returns:
argparse.ArgumentParser: A parser for parsing the command line arguments.
"""
parser = argparse.ArgumentParser(description="Converts event files based on a json file specifying operations.")
parser.add_argument("data_dir", help="Full path of dataset root directory.")
parser.add_argument("model_path", help="Full path of the file with remodeling instructions.")
parser.add_argument("-bd", "--backup_dir", default="", dest="backup_dir",
help="Directory for the backup that is being created")
parser.add_argument("-bn", "--backup_name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
help="Name of the default backup for remodeling")
parser.add_argument("-b", "--bids-format", action='store_true', dest="use_bids",
help="If present, the dataset is in BIDS format with sidecars. HED analysis is available.")
parser.add_argument("-e", "--extensions", nargs="*", default=['.tsv'], dest="extensions",
help="File extensions to allow in locating files.")
parser.add_argument("-f", "--file-suffix", dest="file_suffix", default='events',
help="Filename suffix excluding file type of items to be analyzed (events by default).")
parser.add_argument("-i", "--individual-summaries", dest="individual_summaries", default="separate",
choices=["separate", "consolidated", "none"],
help="Controls individual file summaries ('none', 'separate', 'consolidated')")
parser.add_argument("-j", "--json-sidecar", dest="json_sidecar", nargs="?",
help="Optional path to JSON sidecar with HED information")
parser.add_argument("-ld", "--log_dir", dest="log_dir", default="",
help="Directory for storing log entries for errors.")
# parser.add_argument("-n", "--backup-name", default=BackupManager.DEFAULT_BACKUP_NAME, dest="backup_name",
# help="Name of the default backup for remodeling")
parser.add_argument("-nb", "--no-backup", action='store_true', dest="no_backup",
help="If present, the operations are run directly on the files with no backup.")
parser.add_argument("-ns", "--no-summaries", action='store_true', dest="no_summaries",
help="If present, the summaries are not saved, but rather discarded.")
parser.add_argument("-nu", "--no-update", action='store_true', dest="no_update",
help="If present, the files are not saved, but rather discarded.")
parser.add_argument("-r", "--hed-versions", dest="hed_versions", nargs="*", default=[],
help="Optional list of HED schema versions used for annotation, include prefixes.")
parser.add_argument("-s", "--save-formats", nargs="*", default=['.json', '.txt'], dest="save_formats",
help="Format for saving any summaries, if any. If no summaries are to be written," +
"use the -ns option.")
parser.add_argument("-t", "--task-names", dest="task_names", nargs="*", default=[],
help="The names of the task. If an empty list is given, all tasks are lumped together." +
" If * is given, then tasks are found and reported individually.")
parser.add_argument("-v", "--verbose", action='store_true',
help="If present, output informative messages as computation progresses.")
parser.add_argument("-w", "--work-dir", default="", dest="work_dir",
help="If given, is the path to directory for saving, otherwise derivatives/remodel is used.")
parser.add_argument("-x", "--exclude-dirs", nargs="*", default=[], dest="exclude_dirs",
help="Directories names to exclude from search for files.")
parser.add_argument("-a", "--analysis-level", dest="analysis_level", default="group",
choices=["group"])
return parser


def handle_backup(args):
""" Restore the backup if applicable.
Parameters:
args (obj): Parsed arguments as an object.
Returns:
str or None: Backup name if there was a backup done.
"""
if args.no_backup:
backup_name = None
else:
backup_man = BackupManager(args.data_dir)
if not backup_man.get_backup(args.backup_name):
raise HedFileError("BackupDoesNotExist", f"Backup {args.backup_name} does not exist. "
f"Please run_remodel_backup first", "")
backup_man.restore_backup(args.backup_name, args.task_names, verbose=args.verbose)
backup_name = args.backup_name
return backup_name


def parse_arguments(arg_list=None):
""" Parse the command line arguments or arg_list if given.
Parameters:
arg_list (list): List of command line arguments as a list.
Returns:
Object: Argument object.
List: A list of parsed operations (each operation is a dictionary).
:raises ValueError:
- If the operations were unable to be correctly parsed.
"""
parser = get_parser()
args = parser.parse_args(arg_list)
if '*' in args.file_suffix:
args.file_suffix = None
if '*' in args.extensions:
args.extensions = None
args.data_dir = os.path.realpath(args.data_dir)
args.exclude_dirs = args.exclude_dirs + ['remodel']
args.model_path = os.path.realpath(args.model_path)
if args.verbose:
print(f"Data directory: {args.data_dir}\nModel path: {args.model_path}")
with open(args.model_path, 'r') as fp:
operations = json.load(fp)
validator = RemodelerValidator()
errors = validator.validate(operations)
if errors:
raise ValueError("UnableToFullyParseOperations",
f"Fatal operation error, cannot continue:\n{errors}")
return args, operations


def parse_tasks(files, task_args):
""" Parse the tasks argument to get a task list.
Parameters:
files (list): List of full paths of files.
task_args (str or list): The argument values for the task parameter.
"""
if not task_args:
return {"": files}
task_dict = io_util.get_task_dict(files)
if task_args == "*" or isinstance(task_args, list) and task_args[0] == "*":
return task_dict
task_dict = {key: task_dict[key] for key in task_args if key in task_dict}
return task_dict


def main(arg_list=None):
""" The command-line program.
Parameters:
arg_list (list or None): Called with value None when called from the command line.
Otherwise, called with the command-line parameters as an argument list.
:raises HedFileError:
- if the data root directory does not exist.
- if the specified backup does not exist.
"""
args, operations = parse_arguments(arg_list)

if args.log_dir:
os.makedirs(args.log_dir, exist_ok=True)
timestamp = io_util.get_timestamp()
try:
if not os.path.isdir(args.data_dir):
raise HedFileError("DataDirectoryDoesNotExist",
f"The root data directory {args.data_dir} does not exist", "")
backup_name = handle_backup(args)
save_dir = None
if args.work_dir:
save_dir = os.path.realpath(os.path.join(args.work_dir, Dispatcher.REMODELING_SUMMARY_PATH))
files = io_util.get_file_list(args.data_dir, name_suffix=args.file_suffix, extensions=args.extensions,
exclude_dirs=args.exclude_dirs)
task_dict = parse_tasks(files, args.task_names)
for task, files in task_dict.items():
dispatch = Dispatcher(operations, data_root=args.data_dir, backup_name=backup_name,
hed_versions=args.hed_versions)

# next task: add a makeshift "analysis level" parameter. participant = generate individual, group = reload individual on load
# Need a way to determine WHICH run to reload options from

dispatch.load_existing_summaries(save_dir)

if not args.no_summaries:
dispatch.save_summaries(args.save_formats, individual_summaries=args.individual_summaries,
summary_dir=save_dir, task_name=task)
except Exception as ex:
if args.log_dir:
log_name = io_util.get_alphanumeric_path(os.path.realpath(args.data_dir)) + '_' + timestamp + '.txt'
logging.basicConfig(filename=os.path.join(args.log_dir, log_name), level=logging.ERROR)
logging.exception(f"{args.data_dir}: {args.model_path}")
raise



if __name__ == '__main__':
main()
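And the matching group-level sketch for the new entry point, with hypothetical paths; run_summary pins -a to group and reloads previously written summaries through dispatch.load_existing_summaries, a Dispatcher method whose diff is not shown in this listing:

    from hed.tools.remodeling.cli.run_summary import main

    # Hypothetical paths; '-w' points at the work directory whose
    # derivatives/remodel summaries are reloaded before re-saving.
    main(["/data/ds_root", "/data/ds_root/derivatives/remodel/models/summarize_rmdl.json",
          "-nb", "-w", "/data/work", "-a", "group"])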