diff --git a/hed/tools/remodeling/cli/run_remodel.py b/hed/tools/remodeling/cli/run_remodel.py
index c640ba782..0761eca5d 100644
--- a/hed/tools/remodeling/cli/run_remodel.py
+++ b/hed/tools/remodeling/cli/run_remodel.py
@@ -6,6 +6,7 @@
 from hed.errors.exceptions import HedFileError
 from hed.tools.util.io_util import get_file_list, get_task_from_file, get_task_dict
 from hed.tools.bids.bids_dataset import BidsDataset
+from hed.tools.remodeling.validator import RemodelerValidator
 from hed.tools.remodeling.dispatcher import Dispatcher
 from hed.tools.remodeling.backup_manager import BackupManager

@@ -109,10 +110,11 @@ def parse_arguments(arg_list=None):
     print(f"Data directory: {args.data_dir}\nModel path: {args.model_path}")
     with open(args.model_path, 'r') as fp:
         operations = json.load(fp)
-    parsed_operations, errors = Dispatcher.parse_operations(operations)
+    validator = RemodelerValidator()
+    errors = validator.validate(operations)
     if errors:
         raise ValueError("UnableToFullyParseOperations",
-                         f"Fatal operation error, cannot continue:\n{Dispatcher.errors_to_str(errors)}")
+                         f"Fatal operation error, cannot continue:\n{errors}")
     return args, operations

diff --git a/hed/tools/remodeling/dispatcher.py b/hed/tools/remodeling/dispatcher.py
index 2bfb90b3f..039b05f27 100644
--- a/hed/tools/remodeling/dispatcher.py
+++ b/hed/tools/remodeling/dispatcher.py
@@ -22,7 +22,7 @@ def __init__(self, operation_list, data_root=None,
         """ Constructor for the dispatcher.

         Parameters:
-            operation_list (list): List of unparsed operations.
+            operation_list (list): List of valid unparsed operations.
             data_root (str or None): Root directory for the dataset. If none, then backups are not made.
             hed_versions (str, list, HedSchema, or HedSchemaGroup): The HED schema.

@@ -42,11 +42,7 @@ def __init__(self, operation_list, data_root=None,
             raise HedFileError("BackupDoesNotExist",
                                f"Remodeler cannot be run with a dataset without first creating the "
                                f"{self.backup_name} backup for {self.data_root}", "")
-        op_list, errors = self.parse_operations(operation_list)
-        if errors:
-            these_errors = self.errors_to_str(errors, 'Dispatcher failed due to invalid operations')
-            raise ValueError("InvalidOperationList", f"{these_errors}")
-        self.parsed_ops = op_list
+        self.parsed_ops = self.parse_operations(operation_list)
         self.hed_schema = self.get_schema(hed_versions)
         self.summary_dicts = {}

@@ -183,31 +179,11 @@ def save_summaries(self, save_formats=['.json', '.txt'], individual_summaries="s

     @staticmethod
     def parse_operations(operation_list):
-        errors = []
         operations = []
         for index, item in enumerate(operation_list):
-            try:
-                if not isinstance(item, dict):
-                    raise TypeError("InvalidOperationFormat",
-                                    f"Each operation must be a dictionary but operation {str(item)} is {type(item)}")
-                if "operation" not in item:
-                    raise KeyError("MissingOperation",
-                                   f"operation {str(item)} does not have an operation key")
-                if "parameters" not in item:
-                    raise KeyError("MissingParameters",
-                                   f"Operation {str(item)} does not have a parameters key")
-                if item["operation"] not in valid_operations:
-                    raise KeyError("OperationNotListedAsValid",
-                                   f"Operation {item['operation']} must be added to operations_list "
-                                   f"before it can be executed.")
-                new_operation = valid_operations[item["operation"]](item["parameters"])
-                operations.append(new_operation)
-            except Exception as ex:
-                errors.append({"index": index, "item": f"{item}", "error_type": type(ex),
-                               "error_code": ex.args[0], "error_msg": ex.args[1]})
-        if errors:
-            return [], errors
-        return operations, []
+            new_operation = valid_operations[item["operation"]](item["parameters"])
+            operations.append(new_operation)
+        return operations

     @staticmethod
     def prep_data(df):
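For orientation (not part of the diff): the new flow validates the JSON remodel file up front, and the Dispatcher now assumes it receives an already-valid operation list. A minimal sketch, assuming the RemodelerValidator API used in run_remodel.py above (validate returning an empty/falsy result when the operations are valid); the file name and HED version are illustrative:

```python
# Sketch of the new two-step flow: validate first, then dispatch.
import json

from hed.tools.remodeling.validator import RemodelerValidator
from hed.tools.remodeling.dispatcher import Dispatcher

with open("my_remodel.json", "r") as fp:   # hypothetical remodel file
    operations = json.load(fp)

errors = RemodelerValidator().validate(operations)  # empty when valid
if errors:
    raise ValueError("UnableToFullyParseOperations",
                     f"Fatal operation error, cannot continue:\n{errors}")

# parse_operations now assumes validity and simply instantiates each op.
dispatcher = Dispatcher(operations, data_root=None, hed_versions="8.1.0")
```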
diff --git a/hed/tools/remodeling/operations/base_op.py b/hed/tools/remodeling/operations/base_op.py
index bc3e906c6..ffcdc4be4 100644
--- a/hed/tools/remodeling/operations/base_op.py
+++ b/hed/tools/remodeling/operations/base_op.py
@@ -1,71 +1,29 @@
 """ Base class for remodeling operations. """
+from abc import ABC, abstractmethod

-class BaseOp:
-    """ Base class for operations. All remodeling operations should extend this class.
-
-    The base class holds the parameters and does basic parameter checking against the operation's specification.
-
-    """
-
-    def __init__(self, op_spec, parameters):
-        """ Base class constructor for operations.
+class BaseOp(ABC):
+    """ Base class for operations. All remodeling operations should extend this class."""

+    def __init__(self, parameters):
+        """ Constructor for the BaseOp class. Should be extended by operations.
+
         Parameters:
-            op_spec (dict): Specification for required and optional parameters.
-            parameters (dict): Actual values of the parameters for the operation.
-
-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
-
-        :raises TypeError:
-            - If a parameter has the wrong type.
-
-        :raises ValueError:
-            - If the specification is missing a valid operation.
-
+            parameters (dict): A dictionary specifying the appropriate parameters for the operation.
         """
-        self.operation = op_spec.get("operation", "")
-        if not self.operation:
-            raise ValueError("OpMustHaveOperation", "Op must have operation is empty")
-        self.required_params = op_spec.get("required_parameters", {})
-        self.optional_params = op_spec.get("optional_parameters", {})
-        self.check_parameters(parameters)
-
-    def check_parameters(self, parameters):
-        """ Verify that the parameters meet the operation specification.
-
-        Parameters:
-            parameters (dict): Dictionary of parameters for this operation.
+        self.parameters = parameters

-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
+    @property
+    @abstractmethod
+    def NAME(self):
+        pass

-        :raises TypeError:
-            - If a parameter has the wrong type.
-
-        """
-
-        required = set(self.required_params.keys())
-        required_missing = required.difference(set(parameters.keys()))
-        if required_missing:
-            raise KeyError("MissingRequiredParameters",
-                           f"{self.operation} requires parameters {list(required_missing)}")
-        for param_name, param_value in parameters.items():
-            if param_name in self.required_params:
-                param_type = self.required_params[param_name]
-            elif param_name in self.optional_params:
-                param_type = self.optional_params[param_name]
-            else:
-                raise KeyError("BadParameter",
-                               f"{param_name} not a required or optional parameter for {self.operation}")
-            if isinstance(param_type, list):
-                self._check_list_type(param_value, param_type)
-            elif not isinstance(param_value, param_type):
-                raise TypeError("BadType", f"{param_value} has type {type(param_value)} not {param_type}")
+    @property
+    @abstractmethod
+    def PARAMS(self):
+        pass

+    @abstractmethod
     def do_op(self, dispatcher, df, name, sidecar=None):
         """ Base class method to be overridden by each operation.

@@ -78,21 +36,13 @@ def do_op(self, dispatcher, df, name, sidecar=None):

         """
         return df.copy()

     @staticmethod
-    def _check_list_type(param_value, param_type):
-        """ Check a parameter value against its specified type.
-
-        Parameters:
-            param_value (any): The value to be checked.
-            param_type (any): Class to check the param_value against.
-
-        :raises TypeError:
-            - If param_value is not an instance of param_type.
-
-        """
-
-        for this_type in param_type:
-            if isinstance(param_value, this_type):
-                return
-        raise TypeError("BadType", f"{param_value} has type {type(param_value)} which is not in {str(param_type)}")
+    @abstractmethod
+    def validate_input_data(parameters):
+        """ Validate whether the operation's parameter values meet criteria that the JSON schema cannot capture,
+        for example that two input arrays have the same length. The minimum implementation should return an
+        empty list to indicate that no errors were found. If additional validation is necessary, the method
+        should perform it and return a list of user-friendly error strings.
+        """
+        return []
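To make the new contract concrete, here is a minimal sketch of an operation written against the revised BaseOp. The IdentityOp class is hypothetical, but the four members it supplies (NAME, PARAMS, do_op, validate_input_data) are exactly the ones the ABC now requires:

```python
from hed.tools.remodeling.operations.base_op import BaseOp


class IdentityOp(BaseOp):
    """ Hypothetical no-op illustrating the new abstract interface. """
    NAME = "identity"           # class attribute satisfies the abstract NAME property

    PARAMS = {                  # JSON schema describing the op's parameters
        "type": "object",
        "properties": {},
        "additionalProperties": False
    }

    def __init__(self, parameters):
        super().__init__(parameters)

    def do_op(self, dispatcher, df, name, sidecar=None):
        return df.copy()        # leave the tabular data unchanged

    @staticmethod
    def validate_input_data(parameters):
        return []               # no cross-parameter constraints to check
```

A subclass that omits any of these members raises TypeError at instantiation, which is what lets the old hand-rolled parameter checking move into the JSON schema and the validator.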
diff --git a/hed/tools/remodeling/operations/convert_columns_op.py b/hed/tools/remodeling/operations/convert_columns_op.py
index e98a8cce5..3768f9feb 100644
--- a/hed/tools/remodeling/operations/convert_columns_op.py
+++ b/hed/tools/remodeling/operations/convert_columns_op.py
@@ -1,27 +1,53 @@
 """ Convert the type of the specified columns of a tabular file. """
+# TODO: Finish implementation.
 from hed.tools.remodeling.operations.base_op import BaseOp


 class ConvertColumnsOp(BaseOp):
-    """ Convert.
+    """ Convert the data type in the specified columns.

     Required remodeling parameters:
         - **column_names** (*list*): The list of columns to convert.
-        - **convert_to_** (*str*): Name of type to convert to. (One of 'str', 'int', 'float', 'fixed'.)
+        - **convert_to** (*str*): Name of type to convert to. (One of 'str', 'int', 'float', 'fixed'.)
+
+    Optional remodeling parameters:
         - **decimal_places** (*int*): Number decimal places to keep (for fixed only).
-
-
+
     """
+    NAME = "convert_columns"

     PARAMS = {
-        "operation": "convert_columns",
-        "required_parameters": {
-            "column_names": list,
-            "convert_to": str
+        "type": "object",
+        "properties": {
+            "column_names": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            },
+            "convert_to": {
+                "type": "string",
+                "enum": ['str', 'int', 'float', 'fixed'],
+            },
+            "decimal_places": {
+                "type": "integer"
+            }
+        },
+        "required": [
+            "column_names",
+            "convert_to"
+        ],
+        "additionalProperties": False,
+        "if": {
+            "properties": {
+                "convert_to": {"const": "fixed"}
+            }
         },
-        "optional_parameters": {
-            "decimal_places": int
+        "then": {
+            "required": ["decimal_places"]
         }
     }

@@ -31,25 +57,11 @@ def __init__(self, parameters):

         Parameters:
             parameters (dict): Parameter values for required and optional parameters.

-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
-
-        :raises TypeError:
-            - If a parameter has the wrong type.
-
-        :raises ValueError:
-            - If convert_to is not one of the allowed values.
-
         """
-        super().__init__(self.PARAMS, parameters)
+        super().__init__(parameters)
         self.column_names = parameters['column_names']
         self.convert_to = parameters['convert_to']
         self.decimal_places = parameters.get('decimal_places', None)
-        self.allowed_types = ['str', 'int', 'float', 'fixed']
-        if self.convert_to not in self.allowed_types:
-            raise ValueError("CannotConvertToSpecifiedType",
-                             f"The convert_to value {self.convert_to} must be one of {str(self.allowed_types)}")

     def do_op(self, dispatcher, df, name, sidecar=None):
         """ Convert the specified column to a specified type.

@@ -67,3 +79,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):

         df_new = df.copy()
         return df_new
+
+    @staticmethod
+    def validate_input_data(parameters):
+        return []
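The if/then block above takes over the work of the deleted CannotConvertToSpecifiedType check. A quick sketch of how it behaves under the standard jsonschema package (which the new RemodelerValidator presumably builds on); the "onset" column name is illustrative:

```python
from jsonschema import validate, ValidationError

from hed.tools.remodeling.operations.convert_columns_op import ConvertColumnsOp

# Passes: 'float' has no extra requirements.
validate(instance={"column_names": ["onset"], "convert_to": "float"},
         schema=ConvertColumnsOp.PARAMS)

# Fails: 'fixed' triggers the "then" clause, which requires decimal_places.
try:
    validate(instance={"column_names": ["onset"], "convert_to": "fixed"},
             schema=ConvertColumnsOp.PARAMS)
except ValidationError as err:
    print(err.message)   # 'decimal_places' is a required property
```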
diff --git a/hed/tools/remodeling/operations/factor_column_op.py b/hed/tools/remodeling/operations/factor_column_op.py
index e01a81d8b..992b8e8ba 100644
--- a/hed/tools/remodeling/operations/factor_column_op.py
+++ b/hed/tools/remodeling/operations/factor_column_op.py
@@ -4,27 +4,52 @@

 # TODO: Does not handle empty factor names.
 # TODO: Does not handle optional return columns.
+# TODO: Same length factor_names and factor_values.


 class FactorColumnOp(BaseOp):
     """ Create tabular file factor columns from column values.

     Required remodeling parameters:
-        - **column_name** (*str*): The name of a column in the DataFrame.
-        - **factor_values** (*list*): Values in the column column_name to create factors for.
-        - **factor_names** (*list*): Names to use as the factor columns.
+        - **column_name** (*str*): The name of a column in the DataFrame.

+    Optional remodeling parameters:
+        - **factor_names** (*list*): Names to use as the factor columns.
+        - **factor_values** (*list*): Values in the column column_name to create factors for.

     """
+    NAME = "factor_column"

     PARAMS = {
-        "operation": "factor_column",
-        "required_parameters": {
-            "column_name": str,
-            "factor_values": list,
-            "factor_names": list
+        "type": "object",
+        "properties": {
+            "column_name": {
+                "type": "string"
+            },
+            "factor_names": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            },
+            "factor_values": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            }
         },
-        "optional_parameters": {}
+        "required": [
+            "column_name"
+        ],
+        "dependentRequired": {
+            "factor_names": ["factor_values"]
+        },
+        "additionalProperties": False
     }

     def __init__(self, parameters):
@@ -33,25 +58,11 @@ def __init__(self, parameters):

         Parameters:
             parameters (dict): Parameter values for required and optional parameters.

-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
-
-        :raises TypeError:
-            - If a parameter has the wrong type.
-
-        :raises ValueError:
-            - If factor_names is not empty and is not the same length as factor_values.
-
         """
-        super().__init__(self.PARAMS, parameters)
+        super().__init__(parameters)
         self.column_name = parameters['column_name']
         self.factor_values = parameters['factor_values']
         self.factor_names = parameters['factor_names']
-        if self.factor_names and len(self.factor_values) != len(self.factor_names):
-            raise ValueError("FactorNamesLenBad",
-                             f"The factor_names length {len(self.factor_names)} must be empty or equal" +
-                             f"to the factor_values length {len(self.factor_values)} .")

     def do_op(self, dispatcher, df, name, sidecar=None):
         """ Create factor columns based on values in a specified column.

@@ -71,11 +82,23 @@ def do_op(self, dispatcher, df, name, sidecar=None):
         factor_names = self.factor_names
         if len(factor_values) == 0:
             factor_values = df[self.column_name].unique()
-            factor_names = [self.column_name + '.' + str(column_value) for column_value in factor_values]
+            factor_names = [self.column_name + '.' +
+                            str(column_value) for column_value in factor_values]

         df_new = df.copy()
         for index, factor_value in enumerate(factor_values):
-            factor_index = df_new[self.column_name].map(str).isin([str(factor_value)])
+            factor_index = df_new[self.column_name].map(
+                str).isin([str(factor_value)])
             column = factor_names[index]
             df_new[column] = factor_index.astype(int)
         return df_new
+
+    @staticmethod
+    def validate_input_data(parameters):
+        factor_names = parameters.get("factor_names", [])
+        if factor_names and len(factor_names) != len(parameters.get("factor_values", [])):
+            return ["The list in factor_names, in the factor_column operation, "
+                    "should have the same number of items as factor_values."]
+        return []
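factor_column is the pattern case for the new split between schema validation and validate_input_data: a JSON schema cannot compare the lengths of two arrays, so that check moves into the static method. A sketch with hypothetical column and factor values:

```python
from hed.tools.remodeling.operations.factor_column_op import FactorColumnOp

params = {
    "column_name": "trial_type",      # illustrative column name
    "factor_values": ["go", "stop"],
    "factor_names": ["go_factor"]     # one name for two values
}
print(FactorColumnOp.validate_input_data(params))
# ['The list in factor_names, in the factor_column operation, should have
#   the same number of items as factor_values.']
```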
diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py
index c5b2ca08f..d28fa5e83 100644
--- a/hed/tools/remodeling/operations/factor_hed_tags_op.py
+++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py
@@ -15,27 +15,54 @@ class FactorHedTagsOp(BaseOp):
     """ Create tabular file factors from tag queries.

     Required remodeling parameters:
-        - **queries** (*list*): Queries to be applied successively as filters.
-        - **query_names** (*list*): Column names for the query factors.
-        - **remove_types** (*list*): Structural HED tags to be removed.
-        - **expand_context** (*bool*): Expand the context if True.
+        - **queries** (*list*): Queries to be applied successively as filters.
+
+    Optional remodeling parameters:
+        - **expand_context** (*bool*): Expand the context if True.
+        - **query_names** (*list*): Column names for the query factors.
+        - **remove_types** (*list*): Structural HED tags to be removed.

     Notes:
-        - If factor column names are not provided, *query1*, *query2*, ... are used.
+        - If query names are not provided, *query1*, *query2*, ... are used.
         - When the context is expanded, the effect of events for temporal extent is accounted for.
-        - Context expansion is not implemented in the current version.

     """
+    NAME = "factor_hed_tags"

     PARAMS = {
-        "operation": "factor_hed_tags",
-        "required_parameters": {
-            "queries": list,
-            "query_names": list,
-            "remove_types": list
+        "type": "object",
+        "properties": {
+            "queries": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            },
+            "expand_context": {
+                "type": "boolean"
+            },
+            "query_names": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            },
+            "remove_types": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            }
         },
-        "optional_parameters": {
-            "expand_context": bool
-        }
+        "required": [
+            "queries"
+        ],
+        "additionalProperties": False
     }

     def __init__(self, parameters):
@@ -44,20 +71,8 @@ def __init__(self, parameters):

         Parameters:
             parameters (dict): Actual values of the parameters for the operation.

-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
-
-        :raises TypeError:
-            - If a parameter has the wrong type.
-
-        :raises ValueError:
-            - If the specification is missing a valid operation.
-            - If the length of query names is not empty and not same length as queries.
-            - If there are duplicate query names.
-
         """
-        super().__init__(self.PARAMS, parameters)
+        super().__init__(parameters)
         self.queries = parameters['queries']
         self.query_names = parameters['query_names']
         self.remove_types = parameters['remove_types']
@@ -93,9 +108,18 @@ def do_op(self, dispatcher, df, name, sidecar=None):
         event_man = EventManager(input_data, dispatcher.hed_schema)
         hed_strings, _ = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None,
                                        join_columns=True, shrink_defs=False, expand_defs=True)
-        df_factors = search_strings(hed_strings, self.expression_parsers, query_names=self.query_names)
+        df_factors = search_strings(
+            hed_strings, self.expression_parsers, query_names=self.query_names)
         if len(df_factors.columns) > 0:
             df_list.append(df_factors)
         df_new = pd.concat(df_list, axis=1)
         df_new.replace('n/a', np.NaN, inplace=True)
         return df_new
+
+    @staticmethod
+    def validate_input_data(parameters):
+        errors = []
+        if parameters.get("query_names", False):
+            if len(parameters.get("query_names")) != len(parameters.get("queries")):
+                errors.append("The list in query_names, in the factor_hed_tags operation, "
+                              "should have the same number of items as queries.")
+        return errors
- """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.queries = parameters['queries'] self.query_names = parameters['query_names'] self.remove_types = parameters['remove_types'] @@ -93,9 +108,18 @@ def do_op(self, dispatcher, df, name, sidecar=None): event_man = EventManager(input_data, dispatcher.hed_schema) hed_strings, _ = get_assembled(input_data, sidecar, dispatcher.hed_schema, extra_def_dicts=None, join_columns=True, shrink_defs=False, expand_defs=True) - df_factors = search_strings(hed_strings, self.expression_parsers, query_names=self.query_names) + df_factors = search_strings( + hed_strings, self.expression_parsers, query_names=self.query_names) if len(df_factors.columns) > 0: df_list.append(df_factors) df_new = pd.concat(df_list, axis=1) df_new.replace('n/a', np.NaN, inplace=True) return df_new + + @staticmethod + def validate_input_data(parameters): + errors = [] + if parameters.get("query_names", False): + if len(parameters.get("query_names")) != len(parameters.get("queries")): + errors.append("The list in query_names, in the factor_hed_tags operation, should have the same number of items as queries.") + return errors diff --git a/hed/tools/remodeling/operations/factor_hed_type_op.py b/hed/tools/remodeling/operations/factor_hed_type_op.py index df13e3631..5c7f8885e 100644 --- a/hed/tools/remodeling/operations/factor_hed_type_op.py +++ b/hed/tools/remodeling/operations/factor_hed_type_op.py @@ -14,18 +14,33 @@ class FactorHedTypeOp(BaseOp): """ Create tabular file factors from type variables and append to tabular data. Required remodeling parameters: - - **type_tag** (*str*): HED tag used to find the factors (most commonly `condition-variable`). - - **type_values** (*list*): Factor values to include. If empty all values of that type_tag are used. + - **type_tag** (*str*): HED tag used to find the factors (most commonly `condition-variable`). - """ + Optional remodeling parameters: + - **type_values** (*list*): If provided, specifies which factor values to include. + """ + NAME = "factor_hed_type" + PARAMS = { - "operation": "factor_hed_type", - "required_parameters": { - "type_tag": str, - "type_values": list + "type": "object", + "properties": { + "type_tag": { + "type": "string" + }, + "type_values": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": True + } }, - "optional_parameters": {} + "required": [ + "type_tag" + ], + "additionalProperties": False } def __init__(self, parameters): @@ -34,18 +49,8 @@ def __init__(self, parameters): Parameters: parameters (dict): Actual values of the parameters for the operation. - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. - - :raises ValueError: - - If the specification is missing a valid operation. 
- """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.type_tag = parameters["type_tag"] self.type_values = parameters["type_values"] @@ -68,12 +73,18 @@ def do_op(self, dispatcher, df, name, sidecar=None): input_data = TabularInput(df, sidecar=sidecar, name=name) df_list = [input_data.dataframe.copy()] - var_manager = HedTypeManager(EventManager(input_data, dispatcher.hed_schema)) + var_manager = HedTypeManager( + EventManager(input_data, dispatcher.hed_schema)) var_manager.add_type(self.type_tag.lower()) - df_factors = var_manager.get_factor_vectors(self.type_tag, self.type_values, factor_encoding="one-hot") + df_factors = var_manager.get_factor_vectors( + self.type_tag, self.type_values, factor_encoding="one-hot") if len(df_factors.columns) > 0: df_list.append(df_factors) df_new = pd.concat(df_list, axis=1) df_new.replace('n/a', np.NaN, inplace=True) return df_new + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/merge_consecutive_op.py b/hed/tools/remodeling/operations/merge_consecutive_op.py index 9ce7a16d7..89459b302 100644 --- a/hed/tools/remodeling/operations/merge_consecutive_op.py +++ b/hed/tools/remodeling/operations/merge_consecutive_op.py @@ -10,21 +10,47 @@ class MergeConsecutiveOp(BaseOp): Required remodeling parameters: - **column_name** (*str*): name of column whose consecutive values are to be compared (the merge column). - **event_code** (*str* or *int* or *float*): the particular value in the match column to be merged. - - **match_columns** (*list*): A list of columns whose values have to be matched for two events to be the same. - **set_durations** (*bool*): If true, set the duration of the merged event to the extent of the merged events. - - **ignore_missing** (*bool*): If true, missing match_columns are ignored. + - **ignore_missing** (*bool*): If true, missing match_columns are ignored. + + Optional remodeling parameters: + - **match_columns** (*list*): A list of columns whose values have to be matched for two events to be the same. """ + NAME = "merge_consecutive" + PARAMS = { - "operation": "merge_consecutive", - "required_parameters": { - "column_name": str, - "event_code": [str, int, float], - "match_columns": list, - "set_durations": bool, - "ignore_missing": bool + "type": "object", + "properties": { + "column_name": { + "type": "string" + }, + "event_code": { + "type": [ + "string", + "number" + ] + }, + "match_columns": { + "type": "array", + "items": { + "type": "string" + } + }, + "set_durations": { + "type": "boolean" + }, + "ignore_missing": { + "type": "boolean" + } }, - "optional_parameters": {} + "required": [ + "column_name", + "event_code", + "set_durations", + "ignore_missing" + ], + "additionalProperties": False } def __init__(self, parameters): @@ -33,25 +59,11 @@ def __init__(self, parameters): Parameters: parameters (dict): Actual values of the parameters for the operation. - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. - - :raises ValueError: - - If the specification is missing a valid operation. - - If one of the match column is the merge column. 
- """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.column_name = parameters["column_name"] self.event_code = parameters["event_code"] self.match_columns = parameters["match_columns"] - if self.column_name in self.match_columns: - raise ValueError("MergeColumnCannotBeMatchColumn", - f"Column {self.column_name} cannot be one of the match columns: {str(self.match_columns)}") self.set_durations = parameters["set_durations"] self.ignore_missing = parameters["ignore_missing"] @@ -90,7 +102,8 @@ def do_op(self, dispatcher, df, name, sidecar=None): raise ValueError("MissingMatchColumns", f"{name}: {str(missing)} columns are unmatched by data columns" f"[{str(df.columns)}] and not ignored") - match_columns = list(set(self.match_columns).intersection(set(df.columns))) + match_columns = list( + set(self.match_columns).intersection(set(df.columns))) df_new = df.copy() code_mask = df_new[self.column_name] == self.event_code @@ -140,8 +153,19 @@ def _update_durations(df_new, remove_groups): remove_df = pd.DataFrame(remove_groups, columns=["remove"]) max_groups = max(remove_groups) for index in range(max_groups): - df_group = df_new.loc[remove_df["remove"] == index + 1, ["onset", "duration"]] + df_group = df_new.loc[remove_df["remove"] + == index + 1, ["onset", "duration"]] max_group = df_group.sum(axis=1, skipna=True).max() anchor = df_group.index[0] - 1 - max_anchor = df_new.loc[anchor, ["onset", "duration"]].sum(skipna=True).max() - df_new.loc[anchor, "duration"] = max(max_group, max_anchor) - df_new.loc[anchor, "onset"] + max_anchor = df_new.loc[anchor, [ + "onset", "duration"]].sum(skipna=True).max() + df_new.loc[anchor, "duration"] = max( + max_group, max_anchor) - df_new.loc[anchor, "onset"] + + @staticmethod + def validate_input_data(parameters): + errors = [] + if parameters.get("match_columns", False): + if parameters.get("column_name") in parameters.get("match_columns"): + errors.append("The column_name in the merge_consecutive operation cannot be specified as a match_column.") + return errors diff --git a/hed/tools/remodeling/operations/number_groups_op.py b/hed/tools/remodeling/operations/number_groups_op.py index d3a5467db..1a2bd1fa3 100644 --- a/hed/tools/remodeling/operations/number_groups_op.py +++ b/hed/tools/remodeling/operations/number_groups_op.py @@ -8,47 +8,78 @@ class NumberGroupsOp(BaseOp): """ Implementation in progress. 
""" - + NAME = "number_groups" + PARAMS = { - "operation": "number_groups", - "required_parameters": { - "number_column_name": str, - "source_column": str, - "start": dict, - "stop": dict + "type": "object", + "properties": { + "number_column_name": { + "type": "string" + }, + "source_column": { + "type": "string" + }, + "start": { + "type": "object", + "properties": { + "values": { + "type": "array" + }, + "inclusion": { + "type": "string", + "enum": [ + "include", + "exclude" + ] + } + }, + "required": [ + "values", + "inclusion" + ], + "additionalProperties": False + }, + "stop": { + "type": "object", + "properties": { + "values": { + "type": "array" + }, + "inclusion": { + "type": "string", + "enum": [ + "include", + "exclude" + ] + } + }, + "required": [ + "values", + "inclusion" + ], + "additionalProperties": False + }, + "overwrite": { + "type": "boolean" + } }, - "optional_parameters": {"overwrite": bool} + "required": [ + "number_column_name", + "source_column", + "start", + "stop" + ], + "additionalProperties": False } def __init__(self, parameters): - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.number_column_name = parameters['number_column_name'] self.source_column = parameters['source_column'] self.start = parameters['start'] self.stop = parameters['stop'] self.start_stop_test = {"values": list, "inclusion": str} self.inclusion_test = ["include", "exclude"] - - required = set(self.start_stop_test.keys()) - for param_to_test in [self.start, self.stop]: - required_missing = required.difference(set(param_to_test.keys())) - if required_missing: - raise KeyError("MissingRequiredParameters", - f"Specified {param_to_test} for number_rows requires parameters" - f"{list(required_missing)}") - for param_name, param_value in param_to_test.items(): - param_type = str - if param_name in required: - param_type = self.start_stop_test[param_name] - else: - raise KeyError("BadParameter", - f"{param_name} not a required or optional parameter for {self.operation}") - # TODO: This has a syntax error - # if not isinstance(param_value, param_type): - # raise TypeError("BadType" f"{param_value} has type {type(param_value)} not {param_type}") - if (param_name == 'inclusion') & (param_value not in self.inclusion_test): - raise ValueError("BadValue" f" {param_name} must be one of {self.inclusion_test} not {param_value}") - self.overwrite = parameters.get('overwrite', False) def do_op(self, dispatcher, df, name, sidecar=None): @@ -104,3 +135,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): # df_new.loc[group, self.number_column_name] = i + 1 return df_new + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/number_rows_op.py b/hed/tools/remodeling/operations/number_rows_op.py index e37b180fb..c2b38a08a 100644 --- a/hed/tools/remodeling/operations/number_rows_op.py +++ b/hed/tools/remodeling/operations/number_rows_op.py @@ -7,36 +7,49 @@ class NumberRowsOp(BaseOp): """ Implementation in progress. 
""" + NAME = "number_rows" + PARAMS = { - "operation": "number_rows", - "required_parameters": { - "number_column_name": str + "type": "object", + "properties": { + "number_column_name": { + "type": "string" + }, + "overwrite": { + "type": "boolean" + }, + "match_value": { + "type": "object", + "properties": { + "column": { + "type": "string" + }, + "value": { + "type": [ + "string", + "number" + ] + } + }, + "required": [ + "column", + "value" + ], + "additionalProperties": False + } }, - "optional_parameters": {"overwrite": bool, "match_value": dict} + "required": [ + "number_column_name" + ], + "additionalProperties": False } def __init__(self, parameters): - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.number_column_name = parameters['number_column_name'] self.overwrite = parameters.get('overwrite', False) self.match_value = parameters.get('match_value', False) - if self.match_value: - self.match_value_params = {"column": str, "value": str} - required = set(self.match_value_params.keys()) - required_missing = required.difference(set(self.match_value.keys())) - if required_missing: - raise KeyError("MissingRequiredParameters", - f"Specified match_value for number_rows requires parameters {list(required_missing)}") - for param_name, param_value in self.match_value.items(): - if param_name in required: - param_type = self.match_value_params[param_name] - else: - raise KeyError("BadParameter", - f"{param_name} not a required or optional parameter for {self.operation}") - # TODO: this has a syntax error - # if not isinstance(param_value, param_type): - # raise TypeError("BadType" f"{param_value} has type {type(param_value)} not {param_type}") - + def do_op(self, dispatcher, df, name, sidecar=None): """ Add numbers events dataframe. @@ -74,3 +87,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): # df_new[self.number_column_name] = df_new.index + 1 return df_new + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/remap_columns_op.py b/hed/tools/remodeling/operations/remap_columns_op.py index c83315795..176218be7 100644 --- a/hed/tools/remodeling/operations/remap_columns_op.py +++ b/hed/tools/remodeling/operations/remap_columns_op.py @@ -24,18 +24,59 @@ class RemapColumnsOp(BaseOp): TODO: Allow wildcards """ + NAME = "remap_columns" PARAMS = { - "operation": "remap_columns", - "required_parameters": { - "source_columns": list, - "destination_columns": list, - "map_list": list, - "ignore_missing": bool + "type": "object", + "properties": { + "source_columns": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + }, + "destination_columns": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1 + }, + "map_list": { + "type": "array", + "items": { + "type": "array", + "items": { + "type": [ + "string", + "number" + ] + }, + "minItems" : 1 + }, + "minItems": 1, + "uniqueItems": True + }, + "ignore_missing": { + "type": "boolean" + }, + "integer_sources": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": True + } }, - "optional_parameters": { - "integer_sources": list - } + "required": [ + "source_columns", + "destination_columns", + "map_list", + "ignore_missing" + ], + "additionalProperties": False } def __init__(self, parameters): @@ -44,52 +85,32 @@ def __init__(self, parameters): Parameters: parameters (dict): Parameter values for required and optional parameters. 
- :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. - - :raises ValueError: - - If an integer column is not a key column. - - If a column designated as an integer source does not have valid integers. - - If no source columns are specified. - - If no destination columns are specified. - - If a map_list entry has the wrong number of items (source columns + destination columns). - """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.source_columns = parameters['source_columns'] self.integer_sources = [] self.string_sources = self.source_columns if "integer_sources" in parameters: - self.integer_sources = parameters['integer_sources'] - if not set(self.integer_sources).issubset(set(self.source_columns)): - raise ValueError("IntegerSourceColumnsInvalid", - f"Integer courses {str(self.integer_sources)} must be in {str(self.source_columns)}") - self.string_sources = list(set(self.source_columns).difference(set(self.integer_sources))) + self.string_sources = list( + set(self.source_columns).difference(set(self.integer_sources))) self.destination_columns = parameters['destination_columns'] self.map_list = parameters['map_list'] self.ignore_missing = parameters['ignore_missing'] - if len(self.source_columns) < 1: - raise ValueError("EmptySourceColumns", - f"The source column list {str(self.source_columns)} must be non-empty") - - if len(self.destination_columns) < 1: - raise ValueError("EmptyDestinationColumns", - f"The destination column list {str(self.destination_columns)} must be non-empty") - entry_len = len(self.source_columns) + len(self.destination_columns) - for index, item in enumerate(self.map_list): - if len(item) != entry_len: - raise ValueError("BadColumnMapEntry", - f"Map list entry {index} has {len(item)} elements, but must have {entry_len} elements") self.key_map = self._make_key_map() def _make_key_map(self): - key_df = pd.DataFrame(self.map_list, columns=self.source_columns+self.destination_columns) - key_map = KeyMap(self.source_columns, target_cols=self.destination_columns, name="remap") + """ + + :raises ValueError: + - If a column designated as an integer source does not have valid integers. 
+ + """ + + key_df = pd.DataFrame( + self.map_list, columns=self.source_columns+self.destination_columns) + key_map = KeyMap(self.source_columns, + target_cols=self.destination_columns, name="remap") key_map.update(key_df) return key_map @@ -110,7 +131,8 @@ def do_op(self, dispatcher, df, name, sidecar=None): """ df1 = df.copy() - df1[self.source_columns] = df1[self.source_columns].replace(np.NaN, 'n/a') + df1[self.source_columns] = df1[self.source_columns].replace( + np.NaN, 'n/a') for column in self.integer_sources: int_mask = df1[column] != 'n/a' df1.loc[int_mask, column] = df1.loc[int_mask, column].astype(int) @@ -120,3 +142,16 @@ def do_op(self, dispatcher, df, name, sidecar=None): raise ValueError("MapSourceValueMissing", f"{name}: Ignore missing is false, but source values [{missing}] are in data but not map") return df_new + + @staticmethod + def validate_input_data(parameters): + errors = [] + if len(set([len(x) for x in parameters.get("map_list")])) != 1: + errors.append("The lists specified in the map_list parameter in the remap_columns operation should all have the same length.") + else: + if (len(parameters.get('source_columns')) + len(parameters.get("destination_columns"))) != len(parameters.get("map_list")[0]): + errors.append("The lists specified in the map_list parameter in the remap_columns operation should have a length equal to the number of source columns + the number of destination columns.") + if parameters.get("integer_sources", False): + if not all([(x in parameters.get("source_columns")) for x in parameters.get("integer_sources")]): + errors.append("All integer_sources in the remap_columns operation should be source_columns.") + return errors diff --git a/hed/tools/remodeling/operations/remove_columns_op.py b/hed/tools/remodeling/operations/remove_columns_op.py index 6901b6ce5..a20015d48 100644 --- a/hed/tools/remodeling/operations/remove_columns_op.py +++ b/hed/tools/remodeling/operations/remove_columns_op.py @@ -10,14 +10,28 @@ class RemoveColumnsOp(BaseOp): - **ignore_missing** (*boolean*): If true, names in column_names that are not columns in df should be ignored. """ - + NAME = "remove_columns" + PARAMS = { - "operation": "remove_columns", - "required_parameters": { - "column_names": list, - "ignore_missing": bool + "type": "object", + "properties": { + "column_names": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": True + }, + "ignore_missing": { + "type": "boolean" + } }, - "optional_parameters": {} + "required": [ + "column_names", + "ignore_missing" + ], + "additionalProperties": False } def __init__(self, parameters): @@ -26,15 +40,8 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. 
- """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.column_names = parameters['column_names'] ignore_missing = parameters['ignore_missing'] if ignore_missing: @@ -65,3 +72,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): raise KeyError("MissingColumnCannotBeRemoved", f"{name}: Ignore missing is False but a column in {str(self.column_names)} is " f"not in the data columns [{str(df_new.columns)}]") + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/remove_rows_op.py b/hed/tools/remodeling/operations/remove_rows_op.py index 217fb7934..181f70d15 100644 --- a/hed/tools/remodeling/operations/remove_rows_op.py +++ b/hed/tools/remodeling/operations/remove_rows_op.py @@ -11,14 +11,31 @@ class RemoveRowsOp(BaseOp): - **remove_values** (*list*): The values to test for row removal. """ - + NAME = "remove_rows" + PARAMS = { - "operation": "remove_rows", - "required_parameters": { - "column_name": str, - "remove_values": list + "type": "object", + "properties": { + "column_name": { + "type": "string" + }, + "remove_values": { + "type": "array", + "items": { + "type": [ + "string", + "number" + ] + }, + "minItems": 1, + "uniqueItems": True + } }, - "optional_parameters": {} + "required": [ + "column_name", + "remove_values" + ], + "additionalProperties": False } def __init__(self, parameters): @@ -27,15 +44,8 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters. - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. - """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.column_name = parameters["column_name"] self.remove_values = parameters["remove_values"] @@ -58,3 +68,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): for value in self.remove_values: df_new = df_new.loc[df_new[self.column_name] != value, :] return df_new + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/rename_columns_op.py b/hed/tools/remodeling/operations/rename_columns_op.py index 0a3329b0c..160427b81 100644 --- a/hed/tools/remodeling/operations/rename_columns_op.py +++ b/hed/tools/remodeling/operations/rename_columns_op.py @@ -7,18 +7,33 @@ class RenameColumnsOp (BaseOp): """ Rename columns in a tabular file. Required remodeling parameters: - - **column_mapping** (*dict*): The names of the columns to be removed. + - **column_mapping** (*dict*): The names of the columns to be renamed. - **ignore_missing** (*bool*): If true, the names in column_mapping that are not columns and should be ignored. """ - + NAME = "rename_columns" + PARAMS = { - "operation": "rename_columns", - "required_parameters": { - "column_mapping": dict, - "ignore_missing": bool + "type": "object", + "properties": { + "column_mapping": { + "type": "object", + "patternProperties": { + ".*": { + "type": "string" + } + }, + "minProperties": 1 + }, + "ignore_missing": { + "type": "boolean" + } }, - "optional_parameters": {} + "required": [ + "column_mapping", + "ignore_missing" + ], + "additionalProperties": False } def __init__(self, parameters): @@ -27,15 +42,8 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters - :raises KeyError: - - If a required parameter is missing. 
-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
-
-        :raises TypeError:
-            - If a parameter has the wrong type.
-
         """
-        super().__init__(self.PARAMS, parameters)
+        super().__init__(parameters)
         self.column_mapping = parameters['column_mapping']
         if parameters['ignore_missing']:
             self.error_handling = 'ignore'
@@ -65,3 +73,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             raise KeyError("MappedColumnsMissingFromData",
                            f"{name}: ignore_missing is False, mapping columns [{self.column_mapping}]"
                            f" but df columns are [{str(df.columns)}")
+
+    @staticmethod
+    def validate_input_data(parameters):
+        return []

diff --git a/hed/tools/remodeling/operations/reorder_columns_op.py b/hed/tools/remodeling/operations/reorder_columns_op.py
index 91fcfcc30..becf66e04 100644
--- a/hed/tools/remodeling/operations/reorder_columns_op.py
+++ b/hed/tools/remodeling/operations/reorder_columns_op.py
@@ -11,15 +11,32 @@ class ReorderColumnsOp(BaseOp):
     - keep_others (*bool*): If true, columns not in column_order are placed at end.

     """
+    NAME = "reorder_columns"

     PARAMS = {
-        "operation": "reorder_columns",
-        "required_parameters": {
-            "column_order": list,
-            "ignore_missing": bool,
-            "keep_others": bool
+        "type": "object",
+        "properties": {
+            "column_order": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            },
+            "ignore_missing": {
+                "type": "boolean"
+            },
+            "keep_others": {
+                "type": "boolean"
+            }
         },
-        "optional_parameters": {}
+        "required": [
+            "column_order",
+            "ignore_missing",
+            "keep_others"
+        ],
+        "additionalProperties": False
     }

     def __init__(self, parameters):
@@ -28,15 +45,8 @@ def __init__(self, parameters):

         Parameters:
             parameters (dict): Dictionary with the parameter values for required and optional parameters.

-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
-
-        :raises TypeError:
-            - If a parameter has the wrong type.
-
         """
-        super().__init__(self.PARAMS, parameters)
+        super().__init__(parameters)
         self.column_order = parameters['column_order']
         self.ignore_missing = parameters['ignore_missing']
         self.keep_others = parameters['keep_others']
@@ -59,15 +69,21 @@ def do_op(self, dispatcher, df, name, sidecar=None):
         """
         df_new = df.copy()
         current_columns = list(df_new.columns)
-        missing_columns = set(self.column_order).difference(set(df_new.columns))
+        missing_columns = set(self.column_order).difference(
+            set(df_new.columns))
         ordered = self.column_order
         if missing_columns and not self.ignore_missing:
             raise ValueError("MissingReorderedColumns",
                              f"{str(missing_columns)} are not in dataframe columns "
                              f" [{str(df_new.columns)}] and not ignored.")
         elif missing_columns:
-            ordered = [elem for elem in self.column_order if elem not in list(missing_columns)]
+            ordered = [
+                elem for elem in self.column_order if elem not in list(missing_columns)]
         if self.keep_others:
             ordered += [elem for elem in current_columns if elem not in ordered]
         df_new = df_new.loc[:, ordered]
         return df_new
+
+    @staticmethod
+    def validate_input_data(parameters):
+        return []

diff --git a/hed/tools/remodeling/operations/split_rows_op.py b/hed/tools/remodeling/operations/split_rows_op.py
index ea0b5dc13..04dbb65df 100644
--- a/hed/tools/remodeling/operations/split_rows_op.py
+++ b/hed/tools/remodeling/operations/split_rows_op.py
@@ -14,15 +14,68 @@ class SplitRowsOp(BaseOp):
     - **remove_parent_row** (*bool*): If true, the original row that was split is removed.

     """
+    NAME = "split_rows"

     PARAMS = {
-        "operation": "split_rows",
-        "required_parameters": {
-            "anchor_column": str,
-            "new_events": dict,
-            "remove_parent_row": bool
+        "type": "object",
+        "properties": {
+            "anchor_column": {
+                "type": "string"
+            },
+            "new_events": {
+                "type": "object",
+                "patternProperties": {
+                    ".*": {
+                        "type": "object",
+                        "properties": {
+                            "onset_source": {
+                                "type": "array",
+                                "items": {
+                                    "type": [
+                                        "string",
+                                        "number"
+                                    ]
+                                },
+                                "minItems": 1
+                            },
+                            "duration": {
+                                "type": "array",
+                                "items": {
+                                    "type": [
+                                        "string",
+                                        "number"
+                                    ]
+                                },
+                                "minItems": 1
+                            },
+                            "copy_columns": {
+                                "type": "array",
+                                "items": {
+                                    "type": "string"
+                                },
+                                "minItems": 1,
+                                "uniqueItems": True
+                            }
+                        },
+                        "required": [
+                            "onset_source",
+                            "duration"
+                        ],
+                        "additionalProperties": False
+                    }
+                },
+                "minProperties": 1
+            },
+            "remove_parent_row": {
+                "type": "boolean"
+            }
         },
-        "optional_parameters": {}
+        "required": [
+            "anchor_column",
+            "new_events",
+            "remove_parent_row"
+        ],
+        "additionalProperties": False
     }

     def __init__(self, parameters):
@@ -31,15 +84,8 @@ def __init__(self, parameters):

         Parameters:
             parameters (dict): Dictionary with the parameter values for required and optional parameters.

-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
-
-        :raises TypeError:
-            - If a parameter has the wrong type.
-
         """
-        super().__init__(self.PARAMS, parameters)
+        super().__init__(parameters)
         self.anchor_column = parameters['anchor_column']
         self.new_events = parameters['new_events']
         self.remove_parent_row = parameters['remove_parent_row']
@@ -85,7 +131,8 @@ def _split_rows(self, df, df_list):
         """
         for event, event_parms in self.new_events.items():
             add_events = pd.DataFrame([], columns=df.columns)
-            add_events['onset'] = self._create_onsets(df, event_parms['onset_source'])
+            add_events['onset'] = self._create_onsets(
+                df, event_parms['onset_source'])
             add_events[self.anchor_column] = event
             self._add_durations(df, add_events, event_parms['duration'])
             if len(event_parms['copy_columns']) > 0:
@@ -103,7 +150,8 @@ def _add_durations(df, add_events, duration_sources):
             if isinstance(duration, float) or isinstance(duration, int):
                 add_events['duration'] = add_events['duration'].add(duration)
             elif isinstance(duration, str) and duration in list(df.columns):
-                add_events['duration'] = add_events['duration'].add(pd.to_numeric(df[duration], errors='coerce'))
+                add_events['duration'] = add_events['duration'].add(
+                    pd.to_numeric(df[duration], errors='coerce'))
             else:
                 raise TypeError("BadDurationInModel",
                                 f"Remodeling duration {str(duration)} must either be numeric or a column name", "")
@@ -134,3 +182,7 @@ def _create_onsets(df, onset_source):
                 raise TypeError("BadOnsetInModel",
                                 f"Remodeling onset {str(onset)} must either be numeric or a column name.", "")
         return onsets
+
+    @staticmethod
+    def validate_input_data(parameters):
+        return []
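For reference, a hypothetical split_rows operation that satisfies the schema above; all column names are illustrative:

```python
split_rows_operation = {
    "operation": "split_rows",
    "parameters": {
        "anchor_column": "trial_type",
        "new_events": {
            "response": {
                "onset_source": ["response_time"],   # column name or number
                "duration": [0],
                "copy_columns": ["response_accuracy"]
            }
        },
        "remove_parent_row": False
    }
}
```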
""" + NAME = "split_rows" PARAMS = { - "operation": "split_rows", - "required_parameters": { - "anchor_column": str, - "new_events": dict, - "remove_parent_row": bool + "type": "object", + "properties": { + "anchor_column": { + "type": "string" + }, + "new_events": { + "type": "object", + "patternProperties": { + ".*": { + "type": "object", + "properties": { + "onset_source": { + "type": "array", + "items": { + "type": [ + "string", + "number" + ] + }, + "minItems": 1 + }, + "duration": { + "type": "array", + "items": { + "type": [ + "string", + "number" + ] + }, + "minItems": 1 + }, + "copy_columns": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": True + } + }, + "required": [ + "onset_source", + "duration" + ], + "additionalProperties": False + } + }, + "minProperties": 1 + }, + "remove_parent_row": { + "type": "boolean" + } }, - "optional_parameters": {} + "required": [ + "anchor_column", + "new_events", + "remove_parent_row" + ], + "additionalProperties": False } def __init__(self, parameters): @@ -31,15 +84,8 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters. - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. - """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.anchor_column = parameters['anchor_column'] self.new_events = parameters['new_events'] self.remove_parent_row = parameters['remove_parent_row'] @@ -85,7 +131,8 @@ def _split_rows(self, df, df_list): """ for event, event_parms in self.new_events.items(): add_events = pd.DataFrame([], columns=df.columns) - add_events['onset'] = self._create_onsets(df, event_parms['onset_source']) + add_events['onset'] = self._create_onsets( + df, event_parms['onset_source']) add_events[self.anchor_column] = event self._add_durations(df, add_events, event_parms['duration']) if len(event_parms['copy_columns']) > 0: @@ -103,7 +150,8 @@ def _add_durations(df, add_events, duration_sources): if isinstance(duration, float) or isinstance(duration, int): add_events['duration'] = add_events['duration'].add(duration) elif isinstance(duration, str) and duration in list(df.columns): - add_events['duration'] = add_events['duration'].add(pd.to_numeric(df[duration], errors='coerce')) + add_events['duration'] = add_events['duration'].add( + pd.to_numeric(df[duration], errors='coerce')) else: raise TypeError("BadDurationInModel", f"Remodeling duration {str(duration)} must either be numeric or a column name", "") @@ -134,3 +182,7 @@ def _create_onsets(df, onset_source): raise TypeError("BadOnsetInModel", f"Remodeling onset {str(onset)} must either be numeric or a column name.", "") return onsets + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py index 5770a6185..f267eb439 100644 --- a/hed/tools/remodeling/operations/summarize_column_names_op.py +++ b/hed/tools/remodeling/operations/summarize_column_names_op.py @@ -9,22 +9,35 @@ class SummarizeColumnNamesOp(BaseOp): """ Summarize the column names in a collection of tabular files. Required remodeling parameters: - - **summary_name** (*str*) The name of the summary. - - **summary_filename** (*str*) Base filename of the summary. + - **summary_name** (*str*): The name of the summary. 
diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py
index 4c4401881..40518c414 100644
--- a/hed/tools/remodeling/operations/summarize_column_values_op.py
+++ b/hed/tools/remodeling/operations/summarize_column_values_op.py
@@ -10,30 +10,60 @@ class SummarizeColumnValuesOp(BaseOp):

     Required remodeling parameters:
         - **summary_name** (*str*): The name of the summary.
-        - **summary_filename** (*str*): Base filename of the summary.
-        - **skip_columns** (*list*): Names of columns to skip in the summary.
-        - **value_columns** (*list*): Names of columns to treat as value columns rather than categorical columns.
+        - **summary_filename** (*str*): Base filename of the summary.

-    Optional remodeling parameters:
-        - **max_categorical** (*int*): Maximum number of unique values to include in summary for a categorical column.
+    Optional remodeling parameters:
+        - **append_timecode** (*bool*): If true, the timecode is appended to the base filename when the summary is saved (default false).
+        - **max_categorical** (*int*): Maximum number of unique values to include in summary for a categorical column.
+        - **skip_columns** (*list*): Names of columns to skip in the summary.
+        - **value_columns** (*list*): Names of columns to treat as value columns rather than categorical columns.
+        - **values_per_line** (*int*): The number of values output per line in the summary.

     The purpose is to produce a summary of the values in a tabular file.

     """
+    NAME = "summarize_column_values"

     PARAMS = {
-        "operation": "summarize_column_values",
-        "required_parameters": {
-            "summary_name": str,
-            "summary_filename": str,
-            "skip_columns": list,
-            "value_columns": list
+        "type": "object",
+        "properties": {
+            "summary_name": {
+                "type": "string"
+            },
+            "summary_filename": {
+                "type": "string"
+            },
+            "append_timecode": {
+                "type": "boolean"
+            },
+            "max_categorical": {
+                "type": "integer"
+            },
+            "skip_columns": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            },
+            "value_columns": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                },
+                "minItems": 1,
+                "uniqueItems": True
+            },
+            "values_per_line": {
+                "type": "integer"
+            }
         },
-        "optional_parameters": {
-            "append_timecode": bool,
-            "max_categorical": int,
-            "values_per_line": int
-        }
+        "required": [
+            "summary_name",
+            "summary_filename"
+        ],
+        "additionalProperties": False
     }

     SUMMARY_TYPE = 'column_values'
@@ -46,23 +76,17 @@ def __init__(self, parameters):

         Parameters:
             parameters (dict): Dictionary with the parameter values for required and optional parameters.

-        :raises KeyError:
-            - If a required parameter is missing.
-            - If an unexpected parameter is provided.
-
-        :raises TypeError:
-            - If a parameter has the wrong type.
-
         """
-        super().__init__(self.PARAMS, parameters)
+        super().__init__(parameters)
         self.summary_name = parameters['summary_name']
         self.summary_filename = parameters['summary_filename']
         self.skip_columns = parameters['skip_columns']
         self.value_columns = parameters['value_columns']
         self.append_timecode = parameters.get('append_timecode', False)
         self.max_categorical = parameters.get('max_categorical', float('inf'))
-        self.values_per_line = parameters.get('values_per_line', self.VALUES_PER_LINE)
+        self.values_per_line = parameters.get(
+            'values_per_line', self.VALUES_PER_LINE)

     def do_op(self, dispatcher, df, name, sidecar=None):
         """ Create a summary of the column values in df.

@@ -86,9 +110,14 @@ def do_op(self, dispatcher, df, name, sidecar=None):
         if not summary:
             summary = ColumnValueSummary(self)
             dispatcher.summary_dicts[self.summary_name] = summary
-        summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name})
+        summary.update_summary(
+            {'df': dispatcher.post_proc_data(df_new), 'name': name})
         return df_new

+    @staticmethod
+    def validate_input_data(parameters):
+        return []
+

 class ColumnValueSummary(BaseSummary):

@@ -109,7 +138,8 @@ def update_summary(self, new_info):
         name = new_info['name']
         if name not in self.summary_dict:
             self.summary_dict[name] = \
-                TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name=name)
+                TabularSummary(value_cols=self.op.value_columns,
+                               skip_cols=self.op.skip_columns, name=name)
         self.summary_dict[name].update(new_info['df'])

     def get_details_dict(self, summary):
@@ -123,11 +153,14 @@ def get_details_dict(self, summary):

         """
         this_summary = summary.get_summary(as_json=False)
-        unique_counts = [(key, len(count_dict)) for key, count_dict in this_summary['Categorical columns'].items()]
+        unique_counts = [(key, len(count_dict)) for key,
+                         count_dict in this_summary['Categorical columns'].items()]
         this_summary['Categorical counts'] = dict(unique_counts)
         for key, dict_entry in this_summary['Categorical columns'].items():
-            num_disp, sorted_tuples = ColumnValueSummary.sort_dict(dict_entry, reverse=True)
-            this_summary['Categorical columns'][key] = dict(sorted_tuples[:min(num_disp, self.op.max_categorical)])
+            num_disp, sorted_tuples = ColumnValueSummary.sort_dict(
+                dict_entry, reverse=True)
+            this_summary['Categorical columns'][key] = dict(
+                sorted_tuples[:min(num_disp, self.op.max_categorical)])
         return {"Name": this_summary['Name'], "Total events": this_summary["Total events"],
                 "Total files": this_summary['Total files'], "Files": list(this_summary['Files'].keys()),

@@ -144,7 +177,8 @@ def merge_all_info(self):
             TabularSummary - the summary object for column values.

         """
-        all_sum = TabularSummary(value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset')
+        all_sum = TabularSummary(
+            value_cols=self.op.value_columns, skip_cols=self.op.skip_columns, name='Dataset')
         for counts in self.summary_dict.values():
             all_sum.update_summary(counts)
         return all_sum

@@ -190,10 +224,13 @@ def _get_categorical_string(self, result, offset="", indent="   "):
         if not cat_dict:
             return ""
         count_dict = result['Categorical counts']
-        sum_list = [f"{offset}{indent}Categorical column values[Events, Files]:"]
+        sum_list = [
+            f"{offset}{indent}Categorical column values[Events, Files]:"]
         sorted_tuples = sorted(cat_dict.items(), key=lambda x: x[0])
         for entry in sorted_tuples:
-            sum_list = sum_list + self._get_categorical_col(entry, count_dict, offset="", indent="   ")
+            sum_list = sum_list + \
+                self._get_categorical_col(
+                    entry, count_dict, offset="", indent="   ")
         return "\n".join(sum_list)

     def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT):
@@ -209,12 +246,14 @@ def _get_detail_list(self, result, indent=BaseSummary.DISPLAY_INDENT):
         """
         sum_list = []
         specifics = result["Specifics"]
-        cat_string = self._get_categorical_string(specifics, offset="", indent=indent)
+        cat_string = self._get_categorical_string(
+            specifics, offset="", indent=indent)
         if cat_string:
             sum_list.append(cat_string)
         val_dict = specifics.get("Value column summaries", {})
         if val_dict:
-            sum_list.append(ColumnValueSummary._get_value_string(val_dict, offset="", indent=indent))
+            sum_list.append(ColumnValueSummary._get_value_string(
+                val_dict, offset="", indent=indent))
         return sum_list

     def _get_categorical_col(self, entry, count_dict, offset="", indent="   "):
@@ -236,7 +275,8 @@ def _get_categorical_col(self, entry, count_dict, offset="", indent="   "):
         # Create and partition the list of individual entries
         value_list = [f"{item[0]}{str(item[1])}" for item in entry[1].items()]
         value_list = value_list[:num_disp]
-        part_list = ColumnValueSummary.partition_list(value_list, self.op.values_per_line)
+        part_list = ColumnValueSummary.partition_list(
+            value_list, self.op.values_per_line)
         return col_list + [f"{offset}{indent * 3}{ColumnValueSummary.get_list_str(item)}" for item in part_list]

@@ -266,5 +306,6 @@ def _get_value_string(val_dict, offset="", indent=""):

     @staticmethod
     def sort_dict(count_dict, reverse=False):
-        sorted_tuples = sorted(count_dict.items(), key=lambda x: x[1][0], reverse=reverse)
+        sorted_tuples = sorted(
+            count_dict.items(), key=lambda x: x[1][0], reverse=reverse)
         return len(sorted_tuples), sorted_tuples
""" - + NAME = "summarize_definitions" + PARAMS = { - "operation": "summarize_definitions", - "required_parameters": { - "summary_name": str, - "summary_filename": str + "type": "object", + "properties": { + "summary_name": { + "type": "string" + }, + "summary_filename": { + "type": "string" + }, + "append_timecode": { + "type": "boolean" + } }, - "optional_parameters": { - "append_timecode": bool - } + "required": [ + "summary_name", + "summary_filename" + ], + "additionalProperties": False } SUMMARY_TYPE = 'type_defs' @@ -36,14 +49,8 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters. - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.summary_name = parameters['summary_name'] self.summary_filename = parameters['summary_filename'] self.append_timecode = parameters.get('append_timecode', False) @@ -71,11 +78,16 @@ def do_op(self, dispatcher, df, name, sidecar=None): 'schema': dispatcher.hed_schema}) return df_new + @staticmethod + def validate_input_data(parameters): + return [] + class DefinitionSummary(BaseSummary): def __init__(self, sum_op, hed_schema, known_defs=None): super().__init__(sum_op) - self.def_gatherer = DefExpandGatherer(hed_schema, known_defs=known_defs) + self.def_gatherer = DefExpandGatherer( + hed_schema, known_defs=known_defs) def update_summary(self, new_info): """ Update the summary for a given tabular input file. @@ -87,8 +99,10 @@ def update_summary(self, new_info): - The summary needs a "name" str, a "schema" and a "Sidecar". """ - data_input = TabularInput(new_info['df'], sidecar=new_info['sidecar'], name=new_info['name']) - series, def_dict = data_input.series_a, data_input.get_def_dict(new_info['schema']) + data_input = TabularInput( + new_info['df'], sidecar=new_info['sidecar'], name=new_info['name']) + series, def_dict = data_input.series_a, data_input.get_def_dict( + new_info['schema']) self.def_gatherer.process_def_expands(series, def_dict) @staticmethod @@ -101,8 +115,10 @@ def _build_summary_dict(items_dict, title, process_func, display_description=Fal if "#" in str(value): key = key + "/#" if display_description: - description, value = DefinitionSummary._remove_description(value) - items[key] = {"description": description, "contents": str(value)} + description, value = DefinitionSummary._remove_description( + value) + items[key] = {"description": description, + "contents": str(value)} elif isinstance(value, list): items[key] = [str(x) for x in value] else: @@ -124,7 +140,8 @@ def get_details_dict(self, def_gatherer): display_description=True) ambiguous_defs_summary = self._build_summary_dict(def_gatherer.ambiguous_defs, "Ambiguous Definitions", def_gatherer.get_ambiguous_group) - errors_summary = self._build_summary_dict(def_gatherer.errors, "Errors", None) + errors_summary = self._build_summary_dict( + def_gatherer.errors, "Errors", None) known_defs_summary.update(ambiguous_defs_summary) known_defs_summary.update(errors_summary) @@ -166,7 +183,8 @@ def _nested_dict_to_string(data, indent, level=1): for key, value in data.items(): if isinstance(value, dict): result.append(f"{indent * level}{key}: {len(value)} items") - result.append(DefinitionSummary._nested_dict_to_string(value, indent, level + 1)) + result.append(DefinitionSummary._nested_dict_to_string( + value, 
indent, level + 1)) elif isinstance(value, list): result.append(f"{indent * level}{key}:") for item in value: diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 6e98abbed..7682f4e88 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -18,27 +18,66 @@ class SummarizeHedTagsOp(BaseOp): - **tags** (*dict*): Specifies how to organize the tag output. Optional remodeling parameters: - - **expand_context** (*bool*): If True, include counts from expanded context (not supported). + - **append_timecode** (*bool*): If True, the timecode is appended to the base filename when summary is saved. + - **include_context** (*bool*): If True, context of events is included in summary. + - **remove_types** (*list*): A list of type tags, such as Condition-variable or Task, to be excluded from the summary. + - **replace_defs** (*bool*): If True, the def tag is replaced by the contents of the definitions. The purpose of this op is to produce a summary of the occurrences of hed tags organized in a specified manner. The """ - + NAME = "summarize_hed_tags" + PARAMS = { - "operation": "summarize_hed_tags", - "required_parameters": { - "summary_name": str, - "summary_filename": str, - "tags": dict + "type": "object", + "properties": { + "summary_name": { + "type": "string" + }, + "summary_filename": { + "type": "string" + }, + "tags": { + "type": "object", + "patternProperties": { + ".*": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": True + }, + "minProperties": 1, + "additionalProperties": False + } + }, + "append_timecode": { + "type": "boolean" + }, + "include_context": { + "type": "boolean" + }, + "remove_types": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": True + }, + "replace_defs": { + "type": "boolean" + } }, - "optional_parameters": { - "append_timecode": bool, - "include_context": bool, - "replace_defs": bool, - "remove_types": list - } + "required": [ + "summary_name", + "summary_filename", + "tags" + ], + "additionalProperties": False } SUMMARY_TYPE = "hed_tag_summary" @@ -48,23 +87,17 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters. - - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. - + """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.summary_name = parameters['summary_name'] self.summary_filename = parameters['summary_filename'] self.tags = parameters['tags'] self.append_timecode = parameters.get('append_timecode', False) self.include_context = parameters.get('include_context', True) self.replace_defs = parameters.get("replace_defs", True) - self.remove_types = parameters.get("remove_types", ["Condition-variable", "Task"]) + self.remove_types = parameters.get( + "remove_types", ["Condition-variable", "Task"]) def do_op(self, dispatcher, df, name, sidecar=None): """ Summarize the HED tags present in the dataset. 
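The PARAMS blocks introduced throughout this change are plain JSON Schema (Draft 2020-12), so they can be exercised directly with jsonschema, independently of the remodeler machinery. A minimal sketch, assuming the jsonschema version pinned later in this diff in requirements.txt; the parameter values below are hypothetical:

from jsonschema import Draft202012Validator

from hed.tools.remodeling.operations.summarize_hed_tags_op import SummarizeHedTagsOp

# Hypothetical parameters intended to satisfy SummarizeHedTagsOp.PARAMS above.
params = {
    "summary_name": "my_hed_tag_summary",
    "summary_filename": "my_hed_tag_summary",
    "tags": {"Sensory events": ["Sensory-event", "Sensory-presentation"]}
}

validator = Draft202012Validator(SummarizeHedTagsOp.PARAMS)
print([err.message for err in validator.iter_errors(params)])  # [] when well-formed

Dropping the required "tags" key, or adding a key not listed under "properties", would instead yield schema errors, which RemodelerValidator (defined later in this diff) turns into the user-friendly messages of its MESSAGE_STRINGS table.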
@@ -93,6 +126,10 @@ def do_op(self, dispatcher, df, name, sidecar=None): 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) return df_new + @staticmethod + def validate_input_data(parameters): + return [] + class HedTagSummary(BaseSummary): @@ -110,11 +147,13 @@ def update_summary(self, new_info): - The summary needs a "name" str, a "schema", a "df", and a "Sidecar". """ - counts = HedTagCounts(new_info['name'], total_events=len(new_info['df'])) - input_data = TabularInput(new_info['df'], sidecar=new_info['sidecar'], name=new_info['name']) + counts = HedTagCounts( + new_info['name'], total_events=len(new_info['df'])) + input_data = TabularInput( + new_info['df'], sidecar=new_info['sidecar'], name=new_info['name']) - tag_man = HedTagManager(EventManager(input_data, new_info['schema']), + tag_man = HedTagManager(EventManager(input_data, new_info['schema']), remove_types=self.sum_op.remove_types) - hed_objs = tag_man.get_hed_objs(include_context=self.sum_op.include_context, + hed_objs = tag_man.get_hed_objs(include_context=self.sum_op.include_context, replace_defs=self.sum_op.replace_defs) for hed in hed_objs: counts.update_event_counts(hed, new_info['name']) @@ -190,7 +229,8 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Dataset: Total events={result.get('Total events', 0)} " f"Total files={len(result.get('Files', []))}"] - sum_list = sum_list + HedTagSummary._get_tag_list(result, indent=indent) + sum_list = sum_list + \ + HedTagSummary._get_tag_list(result, indent=indent) return "\n".join(sum_list) @staticmethod @@ -206,14 +246,16 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Total events={result.get('Total events', 0)}"] - sum_list = sum_list + HedTagSummary._get_tag_list(result, indent=indent) + sum_list = sum_list + \ + HedTagSummary._get_tag_list(result, indent=indent) return "\n".join(sum_list) @staticmethod def _tag_details(tags): tag_list = [] for tag in tags: - tag_list.append(f"{tag['tag']}[{tag['events']},{len(tag['files'])}]") + tag_list.append( + f"{tag['tag']}[{tag['events']},{len(tag['files'])}]") return tag_list @staticmethod @@ -223,10 +265,12 @@ def _get_tag_list(result, indent=BaseSummary.DISPLAY_INDENT): for category, tags in tag_info['Main tags'].items(): sum_list.append(f"{indent}{indent}{category}:") if tags: - sum_list.append(f"{indent}{indent}{indent}{' '.join(HedTagSummary._tag_details(tags))}") + sum_list.append( + f"{indent}{indent}{indent}{' '.join(HedTagSummary._tag_details(tags))}") if tag_info['Other tags']: sum_list.append(f"{indent}Other tags[events,files]:") - sum_list.append(f"{indent}{indent}{' '.join(HedTagSummary._tag_details(tag_info['Other tags']))}") + sum_list.append( + f"{indent}{indent}{' '.join(HedTagSummary._tag_details(tag_info['Other tags']))}") return sum_list @staticmethod @@ -236,3 +280,7 @@ def _get_details(key_list, template, verbose=False): for tag_cnt in template[item.lower()]: key_details.append(tag_cnt.get_info(verbose=verbose)) return key_details + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index 6aaa4c7ea..364c3d91f 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -15,23 +15,40 @@ class SummarizeHedTypeOp(BaseOp): Required remodeling parameters: - **summary_name** (*str*): The name of the summary.
- **summary_filename** (*str*): Base filename of the summary. - - **type_tag** (*str*):Type tag to get_summary (e.g. `condition-variable` or `task` tags). + - **type_tag** (*str*): Type tag to summarize (e.g. `condition-variable` or `task` tags). + + Optional remodeling parameters: + - **append_timecode** (*bool*): If true, the timecode is appended to the base filename when the summary is saved. The purpose of this op is to produce a summary of the occurrences of the specified type tag. This summary is often used with `condition-variable` to produce a summary of the experimental design. """ + NAME = "summarize_hed_type" + PARAMS = { - "operation": "summarize_hed_type", - "required_parameters": { - "summary_name": str, - "summary_filename": str, - "type_tag": str + "type": "object", + "properties": { + "summary_name": { + "type": "string" + }, + "summary_filename": { + "type": "string" + }, + "type_tag": { + "type": "string" + }, + "append_timecode": { + "type": "boolean" + } }, - "optional_parameters": { - "append_timecode": bool - } + "required": [ + "summary_name", + "summary_filename", + "type_tag" + ], + "additionalProperties": False } SUMMARY_TYPE = 'hed_type_summary' @@ -42,15 +59,8 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters. - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. - """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.summary_name = parameters['summary_name'] self.summary_filename = parameters['summary_filename'] self.type_tag = parameters['type_tag'].lower() @@ -81,6 +91,10 @@ def do_op(self, dispatcher, df, name, sidecar=None): 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) return df_new + @staticmethod + def validate_input_data(parameters): + return [] + class HedTypeSummary(BaseSummary): @@ -102,10 +116,13 @@ def update_summary(self, new_info): sidecar = new_info['sidecar'] if sidecar and not isinstance(sidecar, Sidecar): sidecar = Sidecar(sidecar) - input_data = TabularInput(new_info['df'], sidecar=sidecar, name=new_info['name']) - type_values = HedType(EventManager(input_data, new_info['schema']), new_info['name'], type_tag=self.type_tag) + input_data = TabularInput( + new_info['df'], sidecar=sidecar, name=new_info['name']) + type_values = HedType(EventManager( + input_data, new_info['schema']), new_info['name'], type_tag=self.type_tag) counts = HedTypeCounts(new_info['name'], self.type_tag) - counts.update_summary(type_values.get_summary(), type_values.total_events, new_info['name']) + counts.update_summary(type_values.get_summary(), + type_values.total_events, new_info['name']) counts.add_descriptions(type_values.type_defs) self.summary_dict[new_info["name"]] = counts @@ -183,10 +200,13 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): if item['direct_references']: str1 = str1 + f" Direct references:{item['direct_references']}" if item['events_with_multiple_refs']: - str1 = str1 + f" Multiple references:{item['events_with_multiple_refs']})" + str1 = str1 + \ + f" (Multiple references:{item['events_with_multiple_refs']})" sum_list.append(f"{indent}{key}: {str1}") if item['level_counts']: - sum_list = sum_list + HedTypeSummary._level_details(item['level_counts'], indent=indent) + sum_list = sum_list + \ + HedTypeSummary._level_details( + item['level_counts'], indent=indent) return "\n".join(sum_list) @staticmethod
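The validate_input_data stubs being added to each operation are the hook for constraints that JSON Schema cannot express; RemodelerValidator (added later in this diff) calls the hook after schema validation passes and prefixes each returned message with its operation number. A sketch of a non-trivial override: the operation and the equal-length rule here are illustrative, mirroring the factor_names/factor_values constraint that factor_column's removed constructor check used to enforce:

def validate_input_data(parameters):  # would be a @staticmethod on the operation class
    # RemodelerValidator calls this only after schema validation of the
    # parameters has passed, so required keys and types can be assumed.
    errors = []
    names = parameters.get("factor_names", [])
    values = parameters.get("factor_values", [])
    if names and len(names) != len(values):
        errors.append("The factor_names list must have the same number of items as factor_values.")
    return errors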
@@ -207,12 +227,14 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): f"Total events={result.get('Total events', 0)}"] for key, item in type_info.items(): - sum_list.append(f"{indent*2}{key}: {item['levels']} levels in {item['events']} events") + sum_list.append( + f"{indent*2}{key}: {item['levels']} levels in {item['events']} events") str1 = "" if item['direct_references']: str1 = str1 + f" Direct references:{item['direct_references']}" if item['events_with_multiple_refs']: - str1 = str1 + f" (Multiple references:{item['events_with_multiple_refs']})" + str1 = str1 + \ + f" (Multiple references:{item['events_with_multiple_refs']})" if str1: sum_list.append(f"{indent*3}{str1}") if item['level_counts']: @@ -227,7 +249,13 @@ def _level_details(level_counts, offset="", indent=""): str1 = f"[{details['events']} events, {details['files']} files]:" level_list.append(f"{offset}{indent*2}{key} {str1}") if details['tags']: - level_list.append(f"{offset}{indent*3}Tags: {str(details['tags'])}") + level_list.append( + f"{offset}{indent*3}Tags: {str(details['tags'])}") if details['description']: - level_list.append(f"{offset}{indent*3}Description: {details['description']}") + level_list.append( + f"{offset}{indent*3}Description: {details['description']}") return level_list + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index a2948eb8b..6d43d9cfa 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -16,20 +16,39 @@ class SummarizeHedValidationOp(BaseOp): - **summary_filename** (*str*): Base filename of the summary. - **check_for_warnings** (*bool*): If true, include warnings as well as errors. + Optional remodeling parameters: + - **append_timecode** (*bool*): If true, the timecode is appended to the base filename when the summary is saved. + The purpose of this op is to produce a summary of the HED validation errors in a file. """ - + NAME = "summarize_hed_validation" + PARAMS = { - "operation": "summarize_hed_validation", - "required_parameters": { - "summary_name": str, - "summary_filename": str + "type": "object", + "properties": { + "summary_name": { + "type": "string" + }, + "summary_filename": { + "type": "string" + }, + "append_timecode": { + "type": "boolean" + }, + "check_for_warnings": { + "type": "boolean" + } }, - "optional_parameters": { - "append_timecode": bool, - "check_for_warnings": bool - } + "required": [ + "summary_name", + "summary_filename", + "check_for_warnings" + ], + "additionalProperties": False } SUMMARY_TYPE = 'hed_validation' @@ -40,15 +59,8 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters. - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type.
- """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.summary_name = parameters['summary_name'] self.summary_filename = parameters['summary_filename'] self.append_timecode = parameters.get('append_timecode', False) @@ -79,6 +91,10 @@ def do_op(self, dispatcher, df, name, sidecar=None): 'schema': dispatcher.hed_schema, 'sidecar': sidecar}) return df_new + @staticmethod + def validate_input_data(parameters): + return [] + class HedValidationSummary(BaseSummary): @@ -105,10 +121,16 @@ def _get_result_string(self, name, result, indent=BaseSummary.DISPLAY_INDENT): sum_list = [f"{name}: [{len(specifics['sidecar_files'])} sidecar files, " f"{len(specifics['event_files'])} event files]"] if specifics.get('is_merged'): - sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], count_only=True, indent=indent) - sum_list = sum_list + self.get_error_list(specifics['event_issues'], count_only=True, indent=indent) + sum_list = sum_list + \ + self.get_error_list( + specifics['sidecar_issues'], count_only=True, indent=indent) + sum_list = sum_list + \ + self.get_error_list( + specifics['event_issues'], count_only=True, indent=indent) else: - sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], indent=indent*2) + sum_list = sum_list + \ + self.get_error_list( + specifics['sidecar_issues'], indent=indent*2) if specifics['sidecar_had_issues']: sum_list = sum_list + self.get_error_list(specifics['sidecar_issues'], count_only=False, indent=indent*2) else: @@ -127,8 +149,10 @@ def update_summary(self, new_info): sidecar = new_info.get('sidecar', None) if sidecar and not isinstance(sidecar, Sidecar): - sidecar = Sidecar(files=new_info['sidecar'], name=os.path.basename(sidecar)) - results = self._get_sidecar_results(sidecar, new_info, self.check_for_warnings) + sidecar = Sidecar( + files=new_info['sidecar'], name=os.path.basename(sidecar)) + results = self._get_sidecar_results( + sidecar, new_info, self.check_for_warnings) if not results['sidecar_had_issues']: input_data = TabularInput(new_info['df'], sidecar=sidecar) issues = input_data.validate(new_info['schema']) @@ -183,7 +207,8 @@ def _update_events_results(results, ind_results): @staticmethod def _update_sidecar_results(results, ind_results): results["total_sidecar_issues"] += ind_results["total_sidecar_issues"] - results["sidecar_files"] = results["sidecar_files"] + ind_results["sidecar_files"] + results["sidecar_files"] = results["sidecar_files"] + \ + ind_results["sidecar_files"] for ikey, errors in ind_results["sidecar_issues"].items(): results["sidecar_issues"][ikey] = errors @@ -213,17 +238,21 @@ def get_error_list(error_dict, count_only=False, indent=BaseSummary.DISPLAY_INDE def _format_errors(error_list, name, errors, indent): error_list.append(f"{indent}{name} issues:") for this_item in errors: - error_list.append(f"{indent * 2}{HedValidationSummary._format_error(this_item)}") + error_list.append( + f"{indent * 2}{HedValidationSummary._format_error(this_item)}") @staticmethod def _format_error(error): error_str = error['code'] error_locations = [] - HedValidationSummary.update_error_location(error_locations, "row", "ec_row", error) - HedValidationSummary.update_error_location(error_locations, "column", "ec_column", error) + HedValidationSummary.update_error_location( + error_locations, "row", "ec_row", error) + HedValidationSummary.update_error_location( + error_locations, "column", "ec_column", error) HedValidationSummary.update_error_location(error_locations, "sidecar column", 
"ec_sidecarColumnName", error) - HedValidationSummary.update_error_location(error_locations, "sidecar key", "ec_sidecarKeyName", error) + HedValidationSummary.update_error_location( + error_locations, "sidecar key", "ec_sidecarKeyName", error) location_str = ",".join(error_locations) if location_str: error_str = error_str + f"[{location_str}]" @@ -244,7 +273,8 @@ def _get_sidecar_results(sidecar, new_info, check_for_warnings): results["sidecar_files"].append(sidecar.name) results["sidecar_issues"][sidecar.name] = [] sidecar_issues = sidecar.validate(new_info['schema']) - filtered_issues = ErrorHandler.filter_issues_by_severity(sidecar_issues, ErrorSeverity.ERROR) + filtered_issues = ErrorHandler.filter_issues_by_severity( + sidecar_issues, ErrorSeverity.ERROR) if filtered_issues: results["sidecar_had_issues"] = True if not check_for_warnings: @@ -253,3 +283,7 @@ def _get_sidecar_results(sidecar, new_info, check_for_warnings): results['sidecar_issues'][sidecar.name] = str_issues results['total_sidecar_issues'] = len(sidecar_issues) return results + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py index f584ee1d3..e06765325 100644 --- a/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py +++ b/hed/tools/remodeling/operations/summarize_sidecar_from_events_op.py @@ -11,25 +11,52 @@ class SummarizeSidecarFromEventsOp(BaseOp): Required remodeling parameters: - **summary_name** (*str*): The name of the summary. - - **summary_filename** (*str*): Base filename of the summary. + - **summary_filename** (*str*): Base filename of the summary. + + Optional remodeling parameters: + - **append_timecode** (*bool*): - **skip_columns** (*list*): Names of columns to skip in the summary. - **value_columns** (*list*): Names of columns to treat as value columns rather than categorical columns. The purpose is to produce a JSON sidecar template for annotating a dataset with HED tags. """ - + NAME = "summarize_sidecar_from_events" + PARAMS = { - "operation": "summarize_sidecar_from_events", - "required_parameters": { - "summary_name": str, - "summary_filename": str, - "skip_columns": list, - "value_columns": list, + "type": "object", + "properties": { + "summary_name": { + "type": "string" + }, + "summary_filename": { + "type": "string" + }, + "skip_columns": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": True + }, + "value_columns": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "uniqueItems": True + }, + "append_timecode": { + "type": "boolean" + } }, - "optional_parameters": { - "append_timecode": bool - } + "required": [ + "summary_name", + "summary_filename" + ], + "additionalProperties": False } SUMMARY_TYPE = "events_to_sidecar" @@ -40,16 +67,9 @@ def __init__(self, parameters): Parameters: parameters (dict): Dictionary with the parameter values for required and optional parameters. - :raises KeyError: - - If a required parameter is missing. - - If an unexpected parameter is provided. - - :raises TypeError: - - If a parameter has the wrong type. 
- """ - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.summary_name = parameters['summary_name'] self.summary_filename = parameters['summary_filename'] self.skip_columns = parameters['skip_columns'] @@ -78,9 +98,14 @@ def do_op(self, dispatcher, df, name, sidecar=None): if not summary: summary = EventsToSidecarSummary(self) dispatcher.summary_dicts[self.summary_name] = summary - summary.update_summary({'df': dispatcher.post_proc_data(df_new), 'name': name}) + summary.update_summary( + {'df': dispatcher.post_proc_data(df_new), 'name': name}) return df_new + @staticmethod + def validate_input_data(parameters): + return [] + class EventsToSidecarSummary(BaseSummary): @@ -100,7 +125,8 @@ def update_summary(self, new_info): """ - tab_sum = TabularSummary(value_cols=self.value_cols, skip_cols=self.skip_cols, name=new_info["name"]) + tab_sum = TabularSummary( + value_cols=self.value_cols, skip_cols=self.skip_cols, name=new_info["name"]) tab_sum.update(new_info['df'], new_info['name']) self.summary_dict[new_info["name"]] = tab_sum @@ -195,3 +221,7 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): f"Value columns: {str(specifics.get('Value info', {}).keys())}", f"Sidecar:\n{json.dumps(specifics['Sidecar'], indent=indent)}"] return "\n".join(sum_list) + + @staticmethod + def validate_input_data(parameters): + return [] diff --git a/hed/tools/remodeling/operations/valid_operations.py b/hed/tools/remodeling/operations/valid_operations.py index 8753ed1d6..52cf41162 100644 --- a/hed/tools/remodeling/operations/valid_operations.py +++ b/hed/tools/remodeling/operations/valid_operations.py @@ -1,6 +1,6 @@ """ The valid operations for the remodeling tools. """ -# from hed.tools.remodeling.operations.convert_columns_op import ConvertColumnsOp +from hed.tools.remodeling.operations.convert_columns_op import ConvertColumnsOp from hed.tools.remodeling.operations.factor_column_op import FactorColumnOp from hed.tools.remodeling.operations.factor_hed_tags_op import FactorHedTagsOp from hed.tools.remodeling.operations.factor_hed_type_op import FactorHedTypeOp @@ -27,8 +27,8 @@ 'factor_hed_tags': FactorHedTagsOp, 'factor_hed_type': FactorHedTypeOp, 'merge_consecutive': MergeConsecutiveOp, - 'number_groups_op': NumberGroupsOp, - 'number_rows_op': NumberRowsOp, + 'number_groups': NumberGroupsOp, + 'number_rows': NumberRowsOp, 'remap_columns': RemapColumnsOp, 'remove_columns': RemoveColumnsOp, 'remove_rows': RemoveRowsOp, @@ -38,8 +38,8 @@ 'summarize_column_names': SummarizeColumnNamesOp, 'summarize_column_values': SummarizeColumnValuesOp, 'summarize_definitions': SummarizeDefinitionsOp, - 'summarize_sidecar_from_events': SummarizeSidecarFromEventsOp, - 'summarize_hed_type': SummarizeHedTypeOp, 'summarize_hed_tags': SummarizeHedTagsOp, - 'summarize_hed_validation': SummarizeHedValidationOp + 'summarize_hed_type': SummarizeHedTypeOp, + 'summarize_hed_validation': SummarizeHedValidationOp, + 'summarize_sidecar_from_events': SummarizeSidecarFromEventsOp } diff --git a/hed/tools/remodeling/validator.py b/hed/tools/remodeling/validator.py new file mode 100644 index 000000000..60ce68bec --- /dev/null +++ b/hed/tools/remodeling/validator.py @@ -0,0 +1,196 @@ +import os +import json +from copy import deepcopy +from jsonschema import Draft202012Validator +from jsonschema.exceptions import ErrorTree +from hed.tools.remodeling.operations.valid_operations import valid_operations + + +class RemodelerValidator(): + """Validator for remodeler input files.""" + + MESSAGE_STRINGS = 
{ + "0": { + "minItems": "There are no operations defined. Specify at least 1 operation for the remodeler to execute.", + "type": "Operations must be contained in a list or array. This is also true when you run a single operation." + }, + "1": { + "type": "Each operation must be defined in a dictionary. {instance} is not a dictionary object.", + "required": "Operation dictionary {operation_index} is missing '{missing_value}'. Every operation dictionary must specify the type of operation, a description, and the operation parameters.", + "additionalProperties": "Operation dictionary {operation_index} contains an unexpected field '{added_property}'. Every operation dictionary must specify the type of operation, a description, and the operation parameters." + }, + "2": { + "type": "Operation {operation_index}: {instance} is not a {validator_value}. {operation_field} should be of type {validator_value}.", + "enum": "{instance} is not a known remodeler operation. Accepted remodeler operations can be found in the documentation.", + "required": "Operation {operation_index}: The parameter {missing_value} is missing. {missing_value} is a required parameter of {operation_name}.", + "additionalProperties": "Operation {operation_index}: Operation parameters for {operation_name} contain an unexpected field '{added_property}'.", + "dependentRequired": "Operation {operation_index}: The parameter {missing_value} is missing. {missing_value} is a required parameter of {operation_name} when {dependent_on} is specified." + }, + "more": { + "type": "Operation {operation_index}: The value of {parameter_path}, in the {operation_name} operation, should be a {validator_value}. {instance} is not a {validator_value}.", + "minItems": "Operation {operation_index}: The list in {parameter_path}, in the {operation_name} operation, should have at least {validator_value} item(s).", + "required": "Operation {operation_index}: The field {missing_value} is missing in {parameter_path}. {missing_value} is a required parameter of {parameter_path}.", + "additionalProperties": "Operation {operation_index}: Operation parameters for {parameter_path} contain an unexpected field '{added_property}'.", + "enum": "Operation {operation_index}: Operation parameter {parameter_path}, in the {operation_name} operation, contains and unexpected value. Value should be one of {validator_value}.", + "uniqueItems": "Operation {operation_index}: The list in {parameter_path}, in the {operation_name} operation, should only contain unique items.", + "minProperties": "Operation {operation_index}: The dictionary in {parameter_path}, in the {operation_name} operation, should have at least {validator_value} key(s)." 
+ } + } + + BASE_ARRAY = { + "type": "array", + "items": {}, + "minItems": 1 + } + + OPERATION_DICT = { + "type": "object", + "required": [ + "operation", + "description", + "parameters" + ], + "additionalProperties": False, + "properties": { + "operation": { + "type": "string", + "enum": [], + "default": "convert_columns" + }, + "description": { + "type": "string" + }, + "parameters": { + "type": "object", + "properties": {} + } + }, + "allOf": [] + } + + PARAMETER_SPECIFICATION_TEMPLATE = { + "if": { + "properties": { + "operation": { + "const": "" + } + }, + "required": [ + "operation" + ] + }, + "then": { + "properties": { + "parameters": {} + } + } + } + + def __init__(self): + """ Constructor for the remodeler validator. + + Attributes: + - **schema** (*dict*): The compiled JSON schema against which remodeler files are validated. + - **validator** (*Draft202012Validator*): The instantiated JSON schema validator. + """ + self.schema = self._construct_schema() + self.validator = Draft202012Validator(self.schema) + + def validate(self, operations): + """ Validate a list of operations against the JSON schema specification for the remodeler file, run any + additional data validation that is necessary, and return a list of user-friendly error messages. + + Parameters: + **operations** (*list*): List of input operations to run through the remodeler. + + Returns: + **list_of_error_strings** (*list*): List with a message for every error identified by the validator. + """ + + list_of_error_strings = [] + for error in sorted(self.validator.iter_errors(operations), key=lambda e: e.path): + list_of_error_strings.append( + self._parse_message(error, operations)) + if list_of_error_strings: + return list_of_error_strings + + operation_by_parameters = [(operation["operation"], operation["parameters"]) for operation in operations] + + for index, operation in enumerate(operation_by_parameters): + error_strings = valid_operations[operation[0]].validate_input_data(operation[1]) + for error_string in error_strings: + list_of_error_strings.append(f"Operation {index + 1}: {error_string}") + + return list_of_error_strings + + def _parse_message(self, error, operations): + """ Return a user-friendly error message based on the jsonschema validation error. + + Parameters: + - **error** (*ValidationError*): A validation error from the jsonschema validator. + - **operations** (*list*): The operations that were validated. + + Note: + - A jsonschema error does not contain all the information necessary to return a proper error + message, so some information is also taken directly from the operations that led to the error. + - All necessary information is gathered into an error dictionary, which is used to format the + predefined message strings. + """ + error_dict = vars(error) + + level = len(error_dict["path"]) + if level > 2: + level = "more" + # Some information is in the validation error but not directly available as a field, so it is + # extracted here before the message is formatted. Fields that do not exist for a given error + # are not needed for its message. + try: + error_dict["operation_index"] = error_dict["path"][0] + 1 + error_dict["operation_field"] = error_dict["path"][1].capitalize() + error_dict["operation_name"] = operations[int( + error_dict["path"][0])]['operation'] + # everything except the first two values, reversed + parameter_path = [*error_dict['path']][:1:-1] + for ind, value in enumerate(parameter_path): + if isinstance(value, int): + parameter_path[ind] = f"item {value+1}" + error_dict["parameter_path"] = ", ".join(parameter_path) + except (IndexError, TypeError, KeyError): + pass + + error_type = str(error_dict["validator"]) + + # The missing value for 'required' and the unexpected value for 'additionalProperties' are not + # exposed by the validation error object. + # This is a known issue of jsonschema: https://github.com/python-jsonschema/jsonschema/issues/119 + # For now the simplest approach is to extract them from the error message. + if error_type == 'required': + error_dict["missing_value"] = error_dict["message"].split("'")[ + 1::2][0] + if error_type == 'additionalProperties': + error_dict["added_property"] = error_dict["message"].split("'")[ + 1::2][0] + + # dependentRequired provides both the missing value and the field it depends on in a single + # dictionary; it is split in two for the error message. + if error_type == 'dependentRequired': + error_dict["missing_value"] = list(error_dict["validator_value"].keys())[0] + error_dict["dependent_on"] = list(error_dict["validator_value"].values())[0] + + return self.MESSAGE_STRINGS[str(level)][error_type].format(**error_dict) + + def _construct_schema(self): + """ Build the JSON schema for remodeler files from the PARAMS of the valid operations. """ + schema = deepcopy(self.BASE_ARRAY) + schema["items"] = deepcopy(self.OPERATION_DICT) + + for name, operation_class in valid_operations.items(): + schema["items"]["properties"]["operation"]["enum"].append(name) + + parameter_specification = deepcopy(self.PARAMETER_SPECIFICATION_TEMPLATE) + parameter_specification["if"]["properties"]["operation"]["const"] = name + parameter_specification["then"]["properties"]["parameters"] = operation_class.PARAMS + + schema["items"]["allOf"].append(deepcopy(parameter_specification)) + + return schema diff --git a/requirements.txt b/requirements.txt index ea546d348..587a5823d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ pandas>=1.3.5 portalocker>=2.7.0 semantic_version>=2.10.0 wordcloud==1.9.3 +jsonschema==4.18.4 rdflib>=6 diff --git a/tests/data/remodel_tests/all_remodel_operations.json b/tests/data/remodel_tests/all_remodel_operations.json new file mode 100644 index 000000000..34e929f95 --- /dev/null +++ b/tests/data/remodel_tests/all_remodel_operations.json @@ -0,0 +1,278 @@ +[ + { + "operation": "remove_columns", + "description": "Remove unwanted columns prior to analysis", + "parameters": { + "column_names": [ + "value", + "sample" + ], + "ignore_missing": true + } + }, + { + "operation": "factor_column", + "description": "Create factors for the succesful_stop and unsuccesful_stop values.", + "parameters": { + "column_name": "trial_type", + "factor_values": [ + "succesful_stop", + "unsuccesful_stop" + ], + "factor_names": [ + "stopped", + "stop_failed" + ] + } + }, + { + "operation": "factor_hed_tags", + "description": "Create factors based on whether the event represented a correct or incorrect action.", + "parameters": { + "queries": [ + "correct-action", + "incorrect-action" + ], + "query_names": [ + "correct", + "incorrect" + ], + "expand_context": false + } + }, + { + "operation": "factor_hed_type", + "description": "Factor based on the sex of the images being presented.", + "parameters": { + "type_tag": "Condition-variable" + } + }, + { + "operation": "merge_consecutive", + "description": "Merge consecutive *succesful_stop* events that match the *match_columns*.", + "parameters": { + "column_name": "trial_type", + "event_code": "succesful_stop", + "match_columns": [ + "stop_signal_delay", + "response_hand", + "sex" + ], + "set_durations": true, + "ignore_missing": true + } + }, + { + "operation":
"remap_columns", + "description": "Map response_accuracy and response hand into a single column.", + "parameters": { + "source_columns": [ + "response_accuracy", + "response_hand" + ], + "destination_columns": [ + "response_type" + ], + "map_list": [ + [ + "correct", + "left", + "correct_left" + ], + [ + "correct", + "right", + "correct_right" + ], + [ + "incorrect", + "left", + "incorrect_left" + ], + [ + "incorrect", + "right", + "incorrect_left" + ], + [ + "n/a", + "n/a", + "n/a" + ] + ], + "ignore_missing": true + } + }, + { + "operation": "remove_columns", + "description": "Remove extra columns before the next step.", + "parameters": { + "column_names": [ + "stop_signal_delay", + "response_accuracy", + "face" + ], + "ignore_missing": true + } + }, + { + "operation": "remove_rows", + "description": "Remove rows where trial_type is either succesful_stop or unsuccesful_stop.", + "parameters": { + "column_name": "trial_type", + "remove_values": [ + "succesful_stop", + "unsuccesful_stop" + ] + } + }, + { + "operation": "rename_columns", + "description": "Rename columns to be more descriptive.", + "parameters": { + "column_mapping": { + "stop_signal_delay": "stop_delay", + "response_hand": "hand_used" + }, + "ignore_missing": true + } + }, + { + "operation": "reorder_columns", + "description": "Reorder columns.", + "parameters": { + "column_order": [ + "onset", + "duration", + "response_time", + "trial_type" + ], + "ignore_missing": true, + "keep_others": false + } + }, + { + "operation": "split_rows", + "description": "add response events to the trials.", + "parameters": { + "anchor_column": "trial_type", + "new_events": { + "response": { + "onset_source": [ + "response_time" + ], + "duration": [ + 0 + ], + "copy_columns": [ + "response_accuracy", + "response_hand", + "sex", + "trial_number" + ] + }, + "stop_signal": { + "onset_source": [ + "stop_signal_delay" + ], + "duration": [ + 0.5 + ], + "copy_columns": [ + "trial_number" + ] + } + }, + "remove_parent_row": false + } + }, + { + "operation": "summarize_column_names", + "description": "Summarize column names.", + "parameters": { + "summary_name": "AOMIC_column_names", + "summary_filename": "AOMIC_column_names" + } + }, + { + "operation": "summarize_column_values", + "description": "Summarize the column values in an excerpt.", + "parameters": { + "summary_name": "AOMIC_column_values", + "summary_filename": "AOMIC_column_values", + "skip_columns": [ + "onset", + "duration" + ], + "value_columns": [ + "response_time", + "stop_signal_delay" + ] + } + }, + { + "operation": "summarize_definitions", + "description": "Summarize the definitions used in this dataset.", + "parameters": { + "summary_name": "HED_column_definition_summary", + "summary_filename": "HED_column_definition_summary" + } + }, + { + "operation": "summarize_hed_tags", + "description": "Summarize the HED tags in the dataset.", + "parameters": { + "summary_name": "summarize_hed_tags", + "summary_filename": "summarize_hed_tags", + "tags": { + "Sensory events": [ + "Sensory-event", + "Sensory-presentation", + "Task-stimulus-role", + "Experimental-stimulus" + ], + "Agent actions": [ + "Agent-action", + "Agent", + "Action", + "Agent-task-role", + "Task-action-type", + "Participant-response" + ], + "Objects": [ + "Item" + ] + } + } + }, + { + "operation": "summarize_hed_type", + "description": "Summarize column names.", + "parameters": { + "summary_name": "AOMIC_condition_variables", + "summary_filename": "AOMIC_condition_variables", + "type_tag": "condition-variable" + } + }, + { 
+ "operation": "summarize_hed_validation", + "description": "Summarize validation errors in the sample dataset.", + "parameters": { + "summary_name": "AOMIC_sample_validation", + "summary_filename": "AOMIC_sample_validation", + "check_for_warnings": true + } + }, + { + "operation": "summarize_sidecar_from_events", + "description": "Generate a sidecar from the excerpted events file.", + "parameters": { + "summary_name": "AOMIC_generate_sidecar", + "summary_filename": "AOMIC_generate_sidecar", + "value_columns": [ + "response_time", + "stop_signal_delay" + ] + } + } +] \ No newline at end of file diff --git a/tests/tools/remodeling/operations/test_base_op.py b/tests/tools/remodeling/operations/test_base_op.py index 096a0ba12..e581cbdb7 100644 --- a/tests/tools/remodeling/operations/test_base_op.py +++ b/tests/tools/remodeling/operations/test_base_op.py @@ -4,44 +4,34 @@ class TestOp(BaseOp): + NAME = "test" PARAMS = { - "operation": "test_op", - "required_parameters": { - "op_name": str, - "skip_columns": list, - "keep_all": bool, + "type": "object", + "properties": { + "column_name": { + "type": "string" + } }, - "optional_parameters": { - "keep_columns": list - } + "required": [ + "column_name" + ], + "additionalProperties": False } - def __init__(self, parameters): - super().__init__(self.PARAMS, parameters) - def do_op(self, dispatcher, df, name, sidecar=None): return df + + @staticmethod + def validate_input_data(parameters): + return [] class Test(unittest.TestCase): @classmethod def setUpClass(cls): - cls.params = { - "operation": "test_op", - "required_parameters": { - "is_string": str, - "is_multiple": [str, int, float, list], - "is_bool": bool, - "is_list": list - }, - "optional_parameters": {} - } base_parameters = { - "is_string": "Condition-variable", - "is_multiple": ["a", "b", "c"], - "is_bool": False, - "is_list": [3, 4, 5] + "column_name": "a_descriptive_name" } cls.json_parameters = json.dumps(base_parameters) @@ -51,44 +41,30 @@ def tearDownClass(cls): def test_constructor(self): parameters = json.loads(self.json_parameters) - op1 = BaseOp(self.params, parameters) - self.assertIsInstance(op1, BaseOp, "constructor should create a BaseOp") - self.assertIn("is_string", op1.required_params, - "constructor required_params should contain an expected value") - - def test_constructor_no_operation(self): - with self.assertRaises(ValueError) as context: - BaseOp({}, {}) - self.assertEqual(context.exception.args[0], 'OpMustHaveOperation') - - def test_constructor_no_parameters(self): - op1 = BaseOp({"operation": "no_parameter_operation"}, {}) - self.assertIsInstance(op1, BaseOp, "constructor allows a operation with no parameters") - - def test_check_parameters_bad_element(self): - parameters = json.loads(self.json_parameters) - parameters["is_multiple"] = {"a": 1, "b": 2} - with self.assertRaises(TypeError) as context: - BaseOp(self.params, parameters) - self.assertEqual(context.exception.args[0], 'BadType') - - def test_parse_operations_missing_required(self): - parameters = json.loads(self.json_parameters) - parameters.pop("is_string") - with self.assertRaises(KeyError) as context: - BaseOp(TestOp.PARAMS, parameters) - self.assertEqual(context.exception.args[0], 'MissingRequiredParameters') - - def test_check_parameters_test(self): - parameters1 = {"op_name": "test", "skip_columns": ["onset", "duration"], "keep_all": True, "junk": "bad_parm"} - with self.assertRaises(KeyError) as context1: - BaseOp(TestOp.PARAMS, parameters1) - self.assertEqual(context1.exception.args[0], 
'BadParameter') - parameters2 = {"op_name": "test", "skip_columns": ["onset", "duration"], "keep_all": "true"} - with self.assertRaises(TypeError) as context2: - TestOp(parameters2) - self.assertEqual(context2.exception.args[0], 'BadType') - + test_instantiate = TestOp(parameters) + self.assertDictEqual(test_instantiate.parameters, parameters) + + + def test_constructor_no_name(self): + class TestOpNoName(BaseOp): + PARAMS = { + "type": "object", + "properties": { + "column_name": { + "type": "string" + } + }, + "required": [ + "column_name" + ], + "additionalProperties": False + } + + def do_op(self, dispatcher, df, name, sidecar=None): + return df + + with self.assertRaises(TypeError): + instantiate = TestOpNoName({}) if __name__ == '__main__': unittest.main() diff --git a/tests/tools/remodeling/operations/test_base_summary.py b/tests/tools/remodeling/operations/test_base_summary.py index 5faed6467..e45d620a8 100644 --- a/tests/tools/remodeling/operations/test_base_summary.py +++ b/tests/tools/remodeling/operations/test_base_summary.py @@ -6,6 +6,7 @@ class TestOp(BaseOp): + NAME = "test_op" PARAMS = { "operation": "test_summary_op", "required_parameters": { @@ -20,11 +21,18 @@ class TestOp(BaseOp): SUMMARY_TYPE = "test_sum" def __init__(self, parameters): - super().__init__(self.PARAMS, parameters) + super().__init__(parameters) self.summary_name = parameters['summary_name'] self.summary_filename = parameters['summary_filename'] self.append_timecode = parameters.get('append_timecode', False) + def do_op(self, dispatcher, df, name, sidecar=None): + return df.copy() + + @staticmethod + def validate_input_data(parameters): + return [] + class TestSummary(BaseSummary): diff --git a/tests/tools/remodeling/operations/test_convert_columns_op.py b/tests/tools/remodeling/operations/test_convert_columns_op.py index 48d177b0f..d988f616b 100644 --- a/tests/tools/remodeling/operations/test_convert_columns_op.py +++ b/tests/tools/remodeling/operations/test_convert_columns_op.py @@ -36,12 +36,5 @@ def setUp(self): def tearDownClass(cls): pass - def test_constructor_bad_convert_to(self): - self.base_parameters["convert_to"] = "blech" - with self.assertRaises(ValueError) as context: - ConvertColumnsOp(self.base_parameters) - self.assertEqual(context.exception.args[0], "CannotConvertToSpecifiedType") - - if __name__ == '__main__': unittest.main() diff --git a/tests/tools/remodeling/operations/test_factor_column_op.py b/tests/tools/remodeling/operations/test_factor_column_op.py index 402353d5e..454f91d81 100644 --- a/tests/tools/remodeling/operations/test_factor_column_op.py +++ b/tests/tools/remodeling/operations/test_factor_column_op.py @@ -40,13 +40,6 @@ def setUp(self): def tearDownClass(cls): pass - def test_bad_constructor(self): - self.base_parameters["factor_names"] = ["stopped"] - with self.assertRaises(ValueError) as context: - FactorColumnOp(self.base_parameters) - self.assertEqual(context.exception.args[0], "FactorNamesLenBad", - "factor_names and factor_values must be same length") - def test_no_names(self): self.base_parameters["factor_names"] = [] self.base_parameters["factor_values"] = [] diff --git a/tests/tools/remodeling/operations/test_merge_consecutive_op.py b/tests/tools/remodeling/operations/test_merge_consecutive_op.py index 9b504b221..5dcbf720f 100644 --- a/tests/tools/remodeling/operations/test_merge_consecutive_op.py +++ b/tests/tools/remodeling/operations/test_merge_consecutive_op.py @@ -50,18 +50,6 @@ def get_dfs(self, op): df_new = op.do_op(self.dispatch, 
self.dispatch.prep_data(df), 'run-01') return df, self.dispatch.post_proc_data(df_new) - def test_valid(self): - parms = json.loads(self.json_parms) - op = MergeConsecutiveOp(parms) - self.assertIsInstance(op, MergeConsecutiveOp) - - def test_invalid(self): - parms = json.loads(self.json_parms) - parms["column_name"] = "sex" - with self.assertRaises(ValueError) as context: - MergeConsecutiveOp(parms) - self.assertEqual(context.exception.args[0], "MergeColumnCannotBeMatchColumn") - def test_do_op_valid(self): # Test when no extras but ignored. parms = json.loads(self.json_parms) diff --git a/tests/tools/remodeling/operations/test_number_groups.py b/tests/tools/remodeling/operations/test_number_groups.py index 1bae16d80..fc3f056f7 100644 --- a/tests/tools/remodeling/operations/test_number_groups.py +++ b/tests/tools/remodeling/operations/test_number_groups.py @@ -203,73 +203,3 @@ def test_number_groups_new_column(self): # "split_rows should not change the input df columns") # self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()), # "split_rows should not change the input df values") - - # test expected breaks parameters - def test_missing_startstop_param(self): - # test when missing parameter - parms = json.loads(self.json_missing_startstop_parms) - - # with self.assertRaisesRegex(KeyError, "MissingRequiredParameters"): - # op = NumberGroupsOp(parms) - - def test_wrong_startstop_param(self): - # test when a start stop parameter is missing - parms = json.loads(self.json_wrong_startstop_parms) - - # with self.assertRaisesRegex(KeyError, "BadParameter"): - # op = NumberGroupsOp(parms) - - def test_wrong_startstop_type_param(self): - # Test when wrong type in start stop parameters - parms = json.loads(self.json_wrong_startstop_type_parms) - # TODO fix code and put back in - # with self.assertRaisesRegex(TypeError, "BadType"): - # op = NumberGroupsOp(parms) - - def test_wrong_value_inclusion(self): - # test when a wrong value is given for inclusion (only accept include and exclude string) - parms = json.loads(self.json_wrong_inclusion_parms) - - # with self.assertRaisesRegex(ValueError, "BadValue"): - # op = NumberGroupsOp(parms) - - # test expected breaks event file - parameters - def test_existing_column_overwrite_unspecified(self): - # Test when existing column name is given with overwrite unspecified (=False) - parms = json.loads(self.json_parms) - op = NumberGroupsOp(parms) - # df = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) - # df_test = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) - # - # with self.assertRaisesRegex(ValueError, "ExistingNumberColumn"): - # df_new = op.do_op(self.dispatcher, df_test, self.file_name) - - def test_existing_column_overwrite_false(self): - # Test when existing column name is given with overwrite specified False - parms = json.loads(self.json_overwrite_false_parms) - op = NumberGroupsOp(parms) - # df = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) - # df_test = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) - # - # with self.assertRaisesRegex(ValueError, "ExistingNumberColumn"): - # df_new = op.do_op(self.dispatcher, df_test, self.file_name) - - def test_missing_source_column(self): - # Test when source column does not exist in event file - parms = json.loads(self.json_parms) - op = NumberGroupsOp(parms) - # df = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns) - # df_test = pd.DataFrame(self.sample_data, 
columns=self.existing_sample_columns)
-        #
-        # with self.assertRaisesRegex(ValueError, "ExistingNumberColumn"):
-        #     df_new = op.do_op(self.dispatcher, df_test, self.file_name)
-
-    def test_missing_startstop_value(self):
-        # Test when one of startstop values does not exist in source column
-        parms = json.loads(self.json_missing_startstop_value_parms)
-        op = NumberGroupsOp(parms)
-        # df_test = pd.DataFrame(self.sample_data, columns=self.sample_columns)
-        #
-        # with self.assertRaisesRegex(ValueError, "MissingValue"):
-        #     op.do_op(self.dispatcher, df_test, self.file_name)
-
diff --git a/tests/tools/remodeling/operations/test_number_rows_op.py b/tests/tools/remodeling/operations/test_number_rows_op.py
index 9c60a63aa..78fdc6bcb 100644
--- a/tests/tools/remodeling/operations/test_number_rows_op.py
+++ b/tests/tools/remodeling/operations/test_number_rows_op.py
@@ -166,65 +166,8 @@ def setUpClass(cls):
             "number_column_name": "number"
         }
 
-        overwrite_false_parameters = {
-            "number_column_name": "number",
-            "overwrite": False
-        }
-
-        overwrite_true_parameters = {
-            "number_column_name": "number",
-            "overwrite": True
-        }
-
-        filter_complete_parameters = {
-            "number_column_name": "number",
-            "match_value": {"column": "code", "value": "40"}
-        }
-
-        filter_incomplete_parameters = {
-            "number_column_name": "number",
-            "match_value": {"column": "code"}
-        }
-
-        filter_invalid_parameters = {
-            "number_column_name": "number",
-            "match_value": {"column": "code", "value": "40", "label": "code"}
-        }
-
-        filter_wrong_type_parameters = {
-            "number_column_name": "number",
-            "match_value": {"column": 246, "value": []}
-        }
-
-        filter_missing_column_parameters = {
-            "number_column_name": "number",
-            "match_value": {"column": "trial_type", "value": "40"}
-        }
-
-        filter_missing_value_parameters = {
-            "number_column_name": "number",
-            "match_value": {"column": "code", "value": "stop_trial"}
-        }
-
-        filter_overwrite_parameters = {
-            "number_column_name": "number",
-            "match_value": {"column": "number", "value": "40"},
-            "overwrite": True
-        }
-
         cls.json_parms = json.dumps(base_parameters)
-        cls.json_overwrite_false_parms = json.dumps(overwrite_false_parameters)
-        cls.json_overwrite_true_parms = json.dumps(overwrite_true_parameters)
-
-        cls.json_filter_complete_parameters = json.dumps(filter_complete_parameters)
-        cls.json_filter_incomplete_parameters = json.dumps(filter_incomplete_parameters)
-        cls.json_filter_invalid_parameters = json.dumps(filter_invalid_parameters)
-        cls.json_filter_wrong_type_parameters = json.dumps(filter_wrong_type_parameters)
-        cls.json_filter_missing_column_parameters = json.dumps(filter_missing_column_parameters)
-        cls.json_filter_missing_value_parameters = json.dumps(filter_missing_value_parameters)
-        cls.json_filter_overwrite_parameters = json.dumps(filter_overwrite_parameters)
-
         cls.dispatcher = None
         cls.file_name = None
 
@@ -257,136 +200,5 @@ def test_number_rows_new_column(self):
         # self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()),
         #                 "number_rows should not change the input df values")
 
-    def test_existing_column_overwrite_false(self):
-        # Test when existing column name is given with overwrite specified False
-        parms = json.loads(self.json_overwrite_false_parms)
-        op = NumberRowsOp(parms)
-        # df_test = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns)
-        # with self.assertRaisesRegex(ValueError, "ExistingNumberColumn") as context:
-        #     df_new = op.do_op(self.dispatcher, df_test, self.file_name)
-
-    def test_existing_column_overwrite_unspecified(self):
-        # Test when existing column name is given with overwrite unspecified (=False)
-        parms = json.loads(self.json_parms)
-        op = NumberRowsOp(parms)
-        # df = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns)
-        # df_test = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns)
-
-        # with self.assertRaisesRegex(ValueError, "ExistingNumberColumn"):
-        #     df_new = op.do_op(self.dispatcher, df_test, self.file_name)
-
-    def test_existing_column_overwrite_true(self):
-        # Test when existing column name is given with overwrite True
-        parms = json.loads(self.json_overwrite_true_parms)
-        op = NumberRowsOp(parms)
-        # df = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns)
-        # df_test = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns)
-        # df_check = pd.DataFrame(self.overwritten_data, columns=self.existing_sample_columns)
-        # df_new = op.do_op(self.dispatcher, df_test, self.file_name)
-        # df_new = df_new.fillna('n/a')
-
-        # self.assertTrue(list(df_new.columns) == list(self.existing_sample_columns),
-        #                 "numbered_events should have the same columns as original dataframe in case of overwrite")
-        # self.assertTrue(len(df_new) == len(df_test),
-        #                 "numbered_events should have same length as original dataframe")
-        # self.assertTrue(all([i + 1 == value for (i, value) in enumerate(df_new[parms['number_column_name']])]),
-        #                 "event should be numbered consecutively from 1 to length of the dataframe")
-        # self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()),
-        #                 "numbered_events should not differ from check")
-
-        # Test that df has not been changed by the op
-        # self.assertTrue(list(df.columns) == list(df_test.columns),
-        #                 "split_rows should not change the input df columns")
-        # self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()),
-        #                 "split_rows should not change the input df values")
-
-    def test_filter_complete_parameters(self):
-        # Test when valid complete filter/match_value parameters are given
-        parms = json.loads(self.json_filter_complete_parameters)
-        op = NumberRowsOp(parms)
-        # df = pd.DataFrame(self.sample_data, columns=self.sample_columns)
-        # df_test = pd.DataFrame(self.sample_data, columns=self.sample_columns)
-        # df_check = pd.DataFrame(self.filter_numbered_data, columns=self.numbered_columns)
-        # df_new = op.do_op(self.dispatcher, df_test, self.file_name)
-        # df_new = df_new.fillna('n/a')
-
-        # self.assertTrue(list(df_new.columns) == list(self.numbered_columns),
-        #                 "numbered_events should have expected columns")
-        # self.assertTrue(len(df_new) == len(df_test),
-        #                 "numbered_events should have same length as original dataframe")
-        # self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()),
-        #                 "numbered_events should not differ from check")
-
-        # Test that df has not been changed by the op
-        # self.assertTrue(list(df.columns) == list(df_test.columns),
-        #                 "split_rows should not change the input df columns")
-        # self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()),
-        #                 "split_rows should not change the input df values")
-
-    def test_filter_incomplete_parameters(self):
-        # Test when filter/match_value parameters are not complete
-        parms = json.loads(self.json_filter_incomplete_parameters)
-
-        # with self.assertRaisesRegex(KeyError, "MissingRequiredParameters"):
-        #     op = NumberRowsOp(parms)
-
-    def test_filter_invalid_parameters(self):
-        # Test when invalid filter/match_value parameters are given
-        parms = json.loads(self.json_filter_invalid_parameters)
-
-        # with self.assertRaisesRegex(KeyError, "BadParameter"):
-        #     op = NumberRowsOp(parms)
-
-    def test_filter_wrong_type_parameters(self):
-        # Test when invalid filter/match_value parameters are given
-        parms = json.loads(self.json_filter_wrong_type_parameters)
-        # TODO: need to fix type
-        # with self.assertRaisesRegex(TypeError, "BadType"):
-        #     op = NumberRowsOp(parms)
-
-    def test_filter_missing_column_parameters(self):
-        # Test when specified filter column is not in event file
-        parms = json.loads(self.json_filter_missing_column_parameters)
-        op = NumberRowsOp(parms)
-        # df = pd.DataFrame(self.sample_data, columns=self.sample_columns)
-        # df_test = pd.DataFrame(self.sample_data, columns=self.sample_columns)
-
-        # with self.assertRaisesRegex(ValueError, "MissingMatchColumn"):
-        #     df_new = op.do_op(self.dispatcher, df_test, self.file_name)
-
-    def test_filter_missing_value_parameters(self):
-        # Test when specified filter value is not in event file
-        parms = json.loads(self.json_filter_missing_value_parameters)
-        op = NumberRowsOp(parms)
-        # df = pd.DataFrame(self.sample_data, columns=self.sample_columns)
-        # df_test = pd.DataFrame(self.sample_data, columns=self.sample_columns)
-
-        # with self.assertRaisesRegex(ValueError, "MissingMatchValue"):
-        #     df_new = op.do_op(self.dispatcher, df_test, self.file_name)
-
-    def test_filter_overwrite(self):
-        # Test when specified filter value is not in event file
-        parms = json.loads(self.json_filter_overwrite_parameters)
-        op = NumberRowsOp(parms)
-        # df = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns)
-        # df_test = pd.DataFrame(self.sample_data, columns=self.existing_sample_columns)
-        # df_check = pd.DataFrame(self.filter_overwritten_numbered_data, columns=self.existing_sample_columns)
-        # df_new = op.do_op(self.dispatcher, df_test, self.file_name)
-        # df_new = df_new.fillna('n/a')
-
-        # self.assertTrue(list(df_new.columns) == list(self.existing_sample_columns),
-        #                 "numbered_events should have expected columns")
-        # self.assertTrue(len(df_new) == len(df_test),
-        #                 "numbered_events should have same length as original dataframe")
-        # self.assertTrue(np.array_equal(df_new.to_numpy(), df_check.to_numpy()),
-        #                 "numbered_events should not differ from check")
-
-        # Test that df has not been changed by the op
-        # self.assertTrue(list(df.columns) == list(df_test.columns),
-        #                 "split_rows should not change the input df columns")
-        # self.assertTrue(np.array_equal(df.to_numpy(), df_test.to_numpy()),
-        #                 "split_rows should not change the input df values")
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/tools/remodeling/operations/test_remap_columns_op.py b/tests/tools/remodeling/operations/test_remap_columns_op.py
index be4637d73..cd05c7ae3 100644
--- a/tests/tools/remodeling/operations/test_remap_columns_op.py
+++ b/tests/tools/remodeling/operations/test_remap_columns_op.py
@@ -64,47 +64,6 @@ def get_dfs(self, op, df=None):
         df_new = op.do_op(self.dispatch, self.dispatch.prep_data(df), 'run-01')
         return df, self.dispatch.post_proc_data(df_new)
 
-    def test_valid(self):
-        # Test when no extras but ignored.
-        parms = json.loads(self.json_parms)
-        op = RemapColumnsOp(parms)
-        df, df_test = self.get_dfs(op)
-        self.assertNotIn("response_type", df.columns, "remap_columns before does not have response_type column")
-        self.assertIn("response_type", df_test.columns, "remap_columns after has response_type column")
-
-    def test_invalid_params(self):
-        parms1 = json.loads(self.json_parms)
-        parms1["source_columns"] = []
-        with self.assertRaises(ValueError) as context1:
-            RemapColumnsOp(parms1)
-        self.assertEqual(context1.exception.args[0], "EmptySourceColumns")
-
-        parms2 = json.loads(self.json_parms)
-        parms2["destination_columns"] = []
-        with self.assertRaises(ValueError) as context2:
-            RemapColumnsOp(parms2)
-        self.assertEqual(context2.exception.args[0], "EmptyDestinationColumns")
-
-        parms3 = json.loads(self.json_parms)
-        parms3["map_list"][1] = ["right", "correct_right"],
-        with self.assertRaises(ValueError) as context3:
-            RemapColumnsOp(parms3)
-        self.assertEqual(context3.exception.args[0], "BadColumnMapEntry")
-
-        parms4 = json.loads(self.json_parms1)
-        parms4["integer_sources"] = ["test", "baloney"]
-        with self.assertRaises(ValueError) as context4:
-            RemapColumnsOp(parms4)
-        self.assertEqual(context4.exception.args[0], "IntegerSourceColumnsInvalid")
-
-    def test_integer_sources(self):
-        parms1 = json.loads(self.json_parms1)
-        op1 = RemapColumnsOp(parms1)
-        self.assertIn('test', op1.integer_sources)
-        parms2 = json.loads(self.json_parms2)
-        op2 = RemapColumnsOp(parms2)
-        self.assertIn('test', op2.integer_sources)
-
     def test_valid_missing(self):
         # Test when no extras but ignored.
         parms = json.loads(self.json_parms)
diff --git a/tests/tools/remodeling/operations/test_split_rows_op.py b/tests/tools/remodeling/operations/test_split_rows_op.py
index 9710e1b61..df9e4ec3a 100644
--- a/tests/tools/remodeling/operations/test_split_rows_op.py
+++ b/tests/tools/remodeling/operations/test_split_rows_op.py
@@ -155,8 +155,7 @@ def test_split_rows_from_files(self):
         df = pd.read_csv(self.events_path, delimiter='\t', header=0, dtype=str, keep_default_na=False, na_values=None)
         with open(self.model1_path) as fp:
             operation_list = json.load(fp)
-        operations, errors = Dispatcher.parse_operations(operation_list)
-        self.assertFalse(errors, 'split_rows should not give errors if operation is correct')
+        operations = Dispatcher.parse_operations(operation_list)
         dispatch = Dispatcher(operation_list)
         df = dispatch.prep_data(df)
         df_new = operations[0].do_op(dispatch, df, "Name")
diff --git a/tests/tools/remodeling/operations/test_summarize_column_names_op.py b/tests/tools/remodeling/operations/test_summarize_column_names_op.py
index 2aadd8e72..c0afbf1dc 100644
--- a/tests/tools/remodeling/operations/test_summarize_column_names_op.py
+++ b/tests/tools/remodeling/operations/test_summarize_column_names_op.py
@@ -56,7 +56,7 @@ def test_constructor(self):
     def test_summary_op(self):
         with open(self.model_path, 'r') as fp:
             parms = json.load(fp)
-        parsed_commands, errors = Dispatcher.parse_operations(parms)
+        parsed_commands = Dispatcher.parse_operations(parms)
         dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions='8.1.0')
         df = dispatch.get_data_file(self.events_path)
         df = dispatch.prep_data(df)
diff --git a/tests/tools/remodeling/operations/test_summarize_column_values_op.py b/tests/tools/remodeling/operations/test_summarize_column_values_op.py
index 5fe53c4ab..9e838d5d5 100644
--- a/tests/tools/remodeling/operations/test_summarize_column_values_op.py
+++ b/tests/tools/remodeling/operations/test_summarize_column_values_op.py
@@ -93,7 +93,7 @@ def test_summary_op(self):
                                                            '../../../data/remodel_tests/aomic_sub-0013_summary_all_rmdl.json'))
         with open(column_summary_path, 'r') as fp:
             parms = json.load(fp)
-        parsed_commands, errors = Dispatcher.parse_operations(parms)
+        parsed_commands = Dispatcher.parse_operations(parms)
         dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0'])
         df = dispatch.get_data_file(events)
         old_len = len(df)
diff --git a/tests/tools/remodeling/operations/test_summarize_definitions_op.py b/tests/tools/remodeling/operations/test_summarize_definitions_op.py
index 3e6843b25..76a469d2e 100644
--- a/tests/tools/remodeling/operations/test_summarize_definitions_op.py
+++ b/tests/tools/remodeling/operations/test_summarize_definitions_op.py
@@ -24,18 +24,6 @@ def setUpClass(cls):
     def tearDownClass(cls):
         pass
 
-    def test_constructor(self):
-        parms = json.loads(self.json_parms)
-        parms["expand_context"] = ""
-        with self.assertRaises(KeyError) as context:
-            SummarizeDefinitionsOp(parms)
-        self.assertEqual(context.exception.args[0], "BadParameter")
-        parms2 = json.loads(self.json_parms)
-        parms2["mystery"] = True
-        with self.assertRaises(KeyError) as context:
-            SummarizeDefinitionsOp(parms2)
-        self.assertEqual(context.exception.args[0], "BadParameter")
-
     def test_do_op(self):
         dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0'])
         parms = json.loads(self.json_parms)
diff --git a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py
index 3e1c1d128..196a95758 100644
--- a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py
+++ b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py
@@ -44,23 +44,6 @@ def setUpClass(cls):
     def tearDownClass(cls):
         pass
 
-    def test_constructor(self):
-        parms = json.loads(self.json_parms)
-        sum_op1 = SummarizeHedTagsOp(parms)
-        self.assertIsInstance(sum_op1, SummarizeHedTagsOp, "constructor creates an object of the correct type")
-
-    def test_constructor_bad_params(self):
-        parms = json.loads(self.json_parms)
-        parms["include_context"] = ""
-        with self.assertRaises(TypeError) as context:
-            SummarizeHedTagsOp(parms)
-        self.assertEqual(context.exception.args[0], "BadType")
-        parms2 = json.loads(self.json_parms)
-        parms2["mystery"] = True
-        with self.assertRaises(KeyError) as context:
-            SummarizeHedTagsOp(parms2)
-        self.assertEqual(context.exception.args[0], "BadParameter")
-
     def test_do_op_no_replace_no_context_remove_on(self):
         dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0'])
         parms = json.loads(self.json_parms)
diff --git a/tests/tools/remodeling/operations/test_summarize_hed_type_op.py b/tests/tools/remodeling/operations/test_summarize_hed_type_op.py
index 642539967..b4cedafdc 100644
--- a/tests/tools/remodeling/operations/test_summarize_hed_type_op.py
+++ b/tests/tools/remodeling/operations/test_summarize_hed_type_op.py
@@ -63,7 +63,7 @@ def test_summary(self):
             parms = json.load(fp)
         dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0'])
         df = dispatch.get_data_file(self.events)
-        parsed_commands, errors = Dispatcher.parse_operations(parms)
+        parsed_commands = Dispatcher.parse_operations(parms)
         sum_op = parsed_commands[2]
         sum_op.do_op(dispatch, dispatch.prep_data(df), 'run-01', sidecar=self.sidecar_path)
         context1 = dispatch.summary_dicts['AOMIC_condition_variables']
@@ -85,7 +85,7 @@ def test_text_summary_with_levels(self):
             parms = json.load(fp)
         dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0'])
         df = dispatch.get_data_file(self.events_wh)
-        parsed_commands, errors = Dispatcher.parse_operations(parms)
+        parsed_commands = Dispatcher.parse_operations(parms)
         sum_op = parsed_commands[2]
         sum_op.do_op(dispatch, dispatch.prep_data(df), 'run-01', sidecar=self.sidecar_path_wh)
         context1 = dispatch.summary_dicts['AOMIC_condition_variables']
@@ -97,8 +97,7 @@ def test_text_summary(self):
         with open(self.summary_path, 'r') as fp:
             parms = json.load(fp)
 
-        parsed_commands, errors = Dispatcher.parse_operations(parms)
-        self.assertFalse(errors)
+        parsed_commands = Dispatcher.parse_operations(parms)
         dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0'])
         df = dispatch.get_data_file(self.events)
         old_len = len(df)
diff --git a/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py b/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py
index 97b87df83..cf86665cc 100644
--- a/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py
+++ b/tests/tools/remodeling/operations/test_summarize_hed_validation_op.py
@@ -42,12 +42,6 @@ def test_constructor(self):
         sum_op1 = SummarizeHedValidationOp(parms)
         self.assertIsInstance(sum_op1, SummarizeHedValidationOp, "constructor creates an object of the correct type")
 
-        parms2 = json.loads(self.json_parms)
-        parms2["mystery"] = True
-        with self.assertRaises(KeyError) as context:
-            SummarizeHedValidationOp(parms2)
-        self.assertEqual(context.exception.args[0], "BadParameter")
-
     def test_do_op(self):
         dispatch = Dispatcher([], data_root=None, backup_name=None, hed_versions=['8.1.0'])
         parms = json.loads(self.json_parms)
diff --git a/tests/tools/remodeling/test_dispatcher.py b/tests/tools/remodeling/test_dispatcher.py
index b91a3c6f8..ad6a01888 100644
--- a/tests/tools/remodeling/test_dispatcher.py
+++ b/tests/tools/remodeling/test_dispatcher.py
@@ -57,17 +57,6 @@ def test_constructor_empty_operations(self):
         self.assertIsInstance(disp, Dispatcher, "")
         self.assertFalse(disp.parsed_ops, "constructor empty operations list has empty parsed ops")
 
-    def test_constructor_bad_no_operation(self):
-        test = [{"operation": "remove_rows", "parameters": {"column_name": "response_time", "remove_values": ["n/a"]}},
-                {"parameters": {"column_name": "trial_type", "remove_values": ["succesful_stop", "unsuccesful_stop"]}}]
-        operations, errors = Dispatcher.parse_operations(test)
-        self.assertFalse(operations, "parse_operations returns empty if no operation")
-        self.assertEqual(len(errors), 1,
-                         "parse_operation returns a list of one error if one operation with no operation")
-        self.assertEqual(errors[0]['index'], 1, "parse_operation error has the correct index for missing operation")
-        self.assertEqual(errors[0]['error_type'], KeyError,
-                         "parse_operation error has the correct type for missing operation")
-
     def test_get_data_file(self):
         model_path1 = os.path.join(self.data_path, 'simple_reorder_rmdl.json')
         with open(model_path1) as fp:
@@ -92,46 +81,6 @@ def test_get_summary_save_dir(self):
             dispatch2.get_summary_save_dir()
         self.assertEqual(context.exception.code, 'NoDataRoot')
 
-    def test_parse_operations_errors(self):
-        test = [{"operation": "remove_rows", "parameters": {"column_name": "response_time", "remove_values": ["n/a"]}},
-                {"operation": "remove_rows"}]
-        operations, errors = Dispatcher.parse_operations(test)
-        self.assertFalse(operations, "parse_operations returns empty if no parameters")
-        self.assertEqual(len(errors), 1,
-                         "parse_operation returns a list of one error if one operation with no parameters")
-        self.assertEqual(errors[0]['index'], 1, "parse_operation error has the correct index for missing parameters")
-        self.assertEqual(errors[0]['error_type'], KeyError,
-                         "parse_operation error has the correct type for missing parameters")
-
-        test = [{"operation": "remove_rows",
-                 "parameters": {"column_name": "trial_type", "remove_values": ["succesful_stop", "unsuccesful_stop"]}},
-                {"operation": "remove_rows", "parameters": {"column_name": "response_time"}}]
-        operations, errors = Dispatcher.parse_operations(test)
-        self.assertFalse(operations, "parse_operations returns empty if missing required")
-        self.assertEqual(len(errors), 1,
-                         "parse_operation returns a list of one error if one operation with missing required")
-        self.assertEqual(errors[0]['index'], 1, "parse_operation error has the correct index for missing parameters")
-        self.assertEqual(errors[0]['error_type'], KeyError,
-                         "parse_operation error has the correct type for missing required")
-        with self.assertRaises(ValueError) as context:
-            Dispatcher(test)
-        self.assertEqual(context.exception.args[0], 'InvalidOperationList')
-
-        test2 = [{"operation": "blimey",
-                  "parameters": {"column_name": "trial_type", "remove_values": ["succesful_stop", "unsuccesful_stop"]}}]
-        operations, errors = Dispatcher.parse_operations(test2)
-        self.assertFalse(operations, "parse_operations returns empty if missing required")
-        self.assertEqual(len(errors), 1,
-                         "parse_operation returns a list of one error if bad operation")
-        self.assertEqual(errors[0]['index'], 0, "parse_operation error has the correct index for bad operation")
-        self.assertEqual(errors[0]['error_type'], KeyError,
-                         "parse_operation error has the correct type for bad operation")
-        self.assertEqual(errors[0]['error_code'], 'OperationNotListedAsValid''',
-                         "parse_operation error has has correct code for bad operation")
-        with self.assertRaises(ValueError) as context:
-            Dispatcher(test2)
-        self.assertEqual(context.exception.args[0], 'InvalidOperationList')
-
     def test_parse_operation_list(self):
         test = [{"operation": "remove_rows",
                  "parameters": {"column_name": "trial_type", "remove_values": ["succesful_stop", "unsuccesful_stop"]}},
@@ -176,8 +125,6 @@ def test_run_operations_hed(self):
                 }
             }
         ]
-        operations, errors = Dispatcher.parse_operations(op_list)
-        self.assertFalse(errors)
         dispatch = Dispatcher(op_list, hed_versions=['8.1.0'])
         df = dispatch.run_operations(events_path, sidecar=sidecar_path, verbose=False)
         self.assertIsInstance(df, pd.DataFrame)
diff --git a/tests/tools/remodeling/test_validator.py b/tests/tools/remodeling/test_validator.py
new file mode 100644
index 000000000..15447edea
--- /dev/null
+++ b/tests/tools/remodeling/test_validator.py
@@ -0,0 +1,152 @@
+import os
+import json
+import unittest
+from copy import deepcopy
+from hed.tools.remodeling.validator import RemodelerValidator
+
+class Test(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        with open(os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '../data/remodel_tests/all_remodel_operations.json'))) as f:
+            cls.remodel_file = json.load(f)
+
+    @classmethod
+    def tearDownClass(cls):
+        pass
+
+    def test_validator_build(self):
+        validator = RemodelerValidator()
+
+    def test_validate_valid(self):
+        validator = RemodelerValidator()
+        error_strings = validator.validate(self.remodel_file)
+        self.assertFalse(error_strings)
+
+    def test_validate_array(self):
+        validator = RemodelerValidator()
+        wrong_input_type = {"operation": "remove_columns"}
+        error_strings = validator.validate(wrong_input_type)
+        self.assertEqual(error_strings[0], "Operations must be contained in a list or array. This is also true when you run a single operation.")
+
+        no_operations = []
+        error_strings = validator.validate(no_operations)
+        self.assertEqual(error_strings[0], "There are no operations defined. Specify at least 1 operation for the remodeler to execute.")
+
+    def test_validate_operations(self):
+        validator = RemodelerValidator()
+
+        invalid_operation_type = ["string"]
+        error_strings = validator.validate(invalid_operation_type)
+        self.assertEqual(error_strings[0], "Each operation must be defined in a dictionary. string is not a dictionary object.")
+
+        invalid_operation_missing = [self.remodel_file[0].copy()]
+        del invalid_operation_missing[0]["description"]
+        error_strings = validator.validate(invalid_operation_missing)
+        self.assertEqual(error_strings[0], "Operation dictionary 1 is missing 'description'. Every operation dictionary must specify the type of operation, a description, and the operation parameters.")
+
+        invalid_operation_name = [self.remodel_file[0].copy()]
+        invalid_operation_name[0]["operation"] = "unlisted_operation"
+        error_strings = validator.validate(invalid_operation_name)
+        self.assertEqual(error_strings[0], "unlisted_operation is not a known remodeler operation. Accepted remodeler operations can be found in the documentation.")
+
+    def test_validate_parameters(self):
+        validator = RemodelerValidator()
+
+        missing_parameter = [deepcopy(self.remodel_file[0])]
+        del missing_parameter[0]["parameters"]["column_names"]
+        error_strings = validator.validate(missing_parameter)
+        self.assertEqual(error_strings[0], "Operation 1: The parameter column_names is missing. column_names is a required parameter of remove_columns.")
+
+        missing_parameter_nested = [deepcopy(self.remodel_file[10])]
+        del missing_parameter_nested[0]["parameters"]["new_events"]["response"]["onset_source"]
+        error_strings = validator.validate(missing_parameter_nested)
+        self.assertEqual(error_strings[0], "Operation 1: The field onset_source is missing in response, new_events. onset_source is a required parameter of response, new_events.")
+
+        invalid_parameter = [deepcopy(self.remodel_file[0])]
+        invalid_parameter[0]["parameters"]["invalid"] = "invalid_value"
+        error_strings = validator.validate(invalid_parameter)
+        self.assertEqual(error_strings[0], "Operation 1: Operation parameters for remove_columns contain an unexpected field 'invalid'.")
+
+        invalid_parameter_nested = [deepcopy(self.remodel_file[10])]
+        invalid_parameter_nested[0]["parameters"]["new_events"]["response"]["invalid"] = "invalid_value"
+        error_strings = validator.validate(invalid_parameter_nested)
+        self.assertEqual(error_strings[0], "Operation 1: Operation parameters for response, new_events contain an unexpected field 'invalid'.")
+
+        invalid_type = [deepcopy(self.remodel_file[0])]
+        invalid_type[0]["parameters"]["column_names"] = 0
+        error_strings = validator.validate(invalid_type)
+        self.assertEqual(error_strings[0], "Operation 1: The value of column_names, in the remove_columns operation, should be a array. 0 is not a array.")
+
+        invalid_type_nested = [deepcopy(self.remodel_file[10])]
+        invalid_type_nested[0]["parameters"]["new_events"]["response"]["onset_source"] = {"key": "value"}
+        error_strings = validator.validate(invalid_type_nested)
+        self.assertEqual(error_strings[0], "Operation 1: The value of onset_source, response, new_events, in the split_rows operation, should be a array. {'key': 'value'} is not a array.")
+
+        empty_array = [deepcopy(self.remodel_file[0])]
+        empty_array[0]["parameters"]["column_names"] = []
+        error_strings = validator.validate(empty_array)
+        self.assertEqual(error_strings[0], "Operation 1: The list in column_names, in the remove_columns operation, should have at least 1 item(s).")
+
+        empty_array_nested = [deepcopy(self.remodel_file[5])]
+        empty_array_nested[0]["parameters"]["map_list"][0] = []
+        error_strings = validator.validate(empty_array_nested)
+        self.assertEqual(error_strings[0], "Operation 1: The list in item 1, map_list, in the remap_columns operation, should have at least 1 item(s).")
+
+        # invalid_value = [deepcopy(self.remodel_file[18])]
+        # invalid_value[0]["parameters"]["convert_to"] = "invalid_value"
+        # error_strings = validator.validate(invalid_value)
+        # self.assertEqual(error_strings[0], "Operation 1: Operation parameter convert_to, in the convert_columns operation, contains and unexpected value. Value should be one of ['str', 'int', 'float', 'fixed'].")
+
+        # value_dependency = [deepcopy(self.remodel_file[18])]
+        # value_dependency[0]["parameters"]["convert_to"] = "fixed"
+        # error_strings = validator.validate(value_dependency)
+        # self.assertEqual(error_strings[0], "Operation 1: The parameter decimal_places is missing. decimal_places is a required parameter of convert_columns.")
+
+        property_dependency = [deepcopy(self.remodel_file[1])]
+        del property_dependency[0]["parameters"]["factor_values"]
+        error_strings = validator.validate(property_dependency)
+        self.assertEqual(error_strings[0], "Operation 1: The parameter factor_names is missing. factor_names is a required parameter of factor_column when ['factor_values'] is specified.")
+
+        double_item_in_array = [deepcopy(self.remodel_file[0])]
+        double_item_in_array[0]["parameters"]["column_names"] = ['response', 'response']
+        error_strings = validator.validate(double_item_in_array)
+        self.assertEqual(error_strings[0], "Operation 1: The list in column_names, in the remove_columns operation, should only contain unique items.")
+
+        double_item_in_array_nested = [deepcopy(self.remodel_file[10])]
+        double_item_in_array_nested[0]["parameters"]["new_events"]["response"]["copy_columns"] = ['response', 'response']
+        error_strings = validator.validate(double_item_in_array_nested)
+        self.assertEqual(error_strings[0], "Operation 1: The list in copy_columns, response, new_events, in the split_rows operation, should only contain unique items.")
+
+    def test_validate_parameter_data(self):
+        validator = RemodelerValidator()
+
+        factor_column_validate = [deepcopy(self.remodel_file[1])]
+        factor_column_validate[0]["parameters"]["factor_names"] = ["stopped"]
+        error_strings = validator.validate(factor_column_validate)
+        self.assertEqual(error_strings[0], "Operation 1: The list in factor_names, in the factor_column operation, should have the same number of items as factor_values.")
+
+        factor_hed_tags_validate = [deepcopy(self.remodel_file[2])]
+        factor_hed_tags_validate[0]["parameters"]["query_names"] = ["correct"]
+        error_strings = validator.validate(factor_hed_tags_validate)
+        self.assertEqual(error_strings[0], "Operation 1: The list in query_names, in the factor_hed_tags operation, should have the same number of items as queries.")
+
+        merge_consecutive_validate = [deepcopy(self.remodel_file[4])]
+        merge_consecutive_validate[0]["parameters"]["match_columns"].append("trial_type")
+        error_strings = validator.validate(merge_consecutive_validate)
+        self.assertEqual(error_strings[0], "Operation 1: The column_name in the merge_consecutive operation cannot be specified as a match_column.")
+
+        remap_columns_validate_same_length = [deepcopy(self.remodel_file[5])]
+        remap_columns_validate_same_length[0]["parameters"]["map_list"][0] = [""]
+        error_strings = validator.validate(remap_columns_validate_same_length)
+        self.assertEqual(error_strings[0], "Operation 1: The lists specified in the map_list parameter in the remap_columns operation should all have the same length.")
+
+        remap_columns_validate_right_length = [deepcopy(self.remodel_file[5])]
+        remap_columns_validate_right_length[0]["parameters"]["map_list"] = [["string1", "string2"], ["string3", "string4"]]
+        error_strings = validator.validate(remap_columns_validate_right_length)
+        self.assertEqual(error_strings[0], "Operation 1: The lists specified in the map_list parameter in the remap_columns operation should have a length equal to the number of source columns + the number of destination columns.")
+
+        remap_columns_integer_sources = [deepcopy(self.remodel_file[5])]
+        remap_columns_integer_sources[0]["parameters"]["integer_sources"] = ["unknown_column"]
+        error_strings = validator.validate(remap_columns_integer_sources)
+        self.assertEqual(error_strings[0], "Operation 1: All integer_sources in the remap_columns operation should be source_columns.")
\ No newline at end of file
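
Taken together, these changes move operation checking out of Dispatcher.parse_operations and into an up-front RemodelerValidator pass: validate() returns a list of human-readable error strings (empty when the operation list is valid), and parse_operations now returns only the parsed operations. A minimal sketch of the resulting validate-then-dispatch flow follows; the input paths are hypothetical placeholders, while the class names, return types, and keyword arguments are taken from the tests in this diff:

import json

from hed.tools.remodeling.validator import RemodelerValidator
from hed.tools.remodeling.dispatcher import Dispatcher

# Hypothetical inputs -- substitute real files.
model_path = 'my_remodel.json'           # JSON list of remodeler operations
events_path = 'my_task_events.tsv'       # tabular events file

with open(model_path, 'r') as fp:
    operations = json.load(fp)

# Validate first: validate() returns [] when the operation list conforms,
# otherwise a list of error strings like those asserted in test_validator.py.
validator = RemodelerValidator()
errors = validator.validate(operations)
if errors:
    raise ValueError(f"Invalid operation list, cannot continue:\n{errors}")

# Only validated operation lists reach the Dispatcher, so the constructor and
# parse_operations no longer deal in (operations, errors) pairs.
dispatch = Dispatcher(operations, data_root=None, backup_name=None, hed_versions=['8.1.0'])
# sidecar=None assumes an events file with no accompanying JSON sidecar; the
# tests above pass a sidecar path here when HED annotations are needed.
df = dispatch.run_operations(events_path, sidecar=None, verbose=False)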