diff --git a/README.md b/README.md index 0a83496..ff8005f 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ The pipeline will load two dummy approaches (which can be accessed on ```pipe.ap parameters (which can be accessed on ```pipe.approaches_pars```). For each fold, these approaches will be fitted to the train set and predict the 'color' of the examples on the dev sets. -The metrics used to evaluate the performance of the approaches are listed in ```pipe.metrics```. +The metrics used to evaluate the performance of the approaches are listed in ```pipe.evaluation_pars['metrics']```. An exhaustive grid search is performed, to get all possible combinations of the parameters of each of the approaches. The performance of each of these combinations on each fold can be accessed on: @@ -63,10 +63,11 @@ The performance of each of these combinations on each fold can be accessed on: pipe.get_results() ``` -To plot these results per fold: +To plot these results per fold for each of the metrics: ``` pipe.plot_results() ``` +To plot only a certain list of metrics, this list can be given as an argument of this function. To get the final ranking of best approaches (after combining the results of different folds): ``` diff --git a/modev/__init__.py b/modev/__init__.py index b7ee448..efbee7f 100644 --- a/modev/__init__.py +++ b/modev/__init__.py @@ -1,4 +1,5 @@ import modev.approaches +import modev.common import modev.default_pars import modev.etl import modev.evaluation diff --git a/modev/approaches.py b/modev/approaches.py index deafdcd..6c34611 100644 --- a/modev/approaches.py +++ b/modev/approaches.py @@ -69,11 +69,6 @@ def __init__(self, random_state=default_pars.random_state): random_state : int Random state to use when picking elements from the train set. - Attributes - ---------- - possible_choices - Possible values (taken from target column in the train set) to choose values from, when making predictions. - Methods ------- fit @@ -100,6 +95,7 @@ def fit(self, train_x, train_y): None """ + _ = train_x self.possible_choices = train_y def predict(self, test_x): diff --git a/modev/common.py b/modev/common.py new file mode 100644 index 0000000..486c051 --- /dev/null +++ b/modev/common.py @@ -0,0 +1,15 @@ +"""Common functions that are designed for modev. 
+ +""" +from modev import default_pars + +approach_key = default_pars.approach_key +id_key = default_pars.id_key +fold_key = default_pars.fold_key +pars_key = default_pars.pars_key + + +def get_metrics_from_results(results): + non_metrics_columns = [pars_key, approach_key, id_key, fold_key] + metrics = [col for col in results.columns if col not in non_metrics_columns] + return metrics diff --git a/modev/default_pars.py b/modev/default_pars.py index 3364f7d..142dc0c 100644 --- a/modev/default_pars.py +++ b/modev/default_pars.py @@ -39,9 +39,13 @@ selection_pars_combined_results_condition = None selection_pars_results_condition = None +validation_pars_test_fraction = 0.5 +validation_pars_test_n_sets = 6 +# Default validation pars for k-fold cross-validation: validation_pars_labels = None validation_pars_playground_n_folds = 4 validation_pars_return_original_indexes = True validation_pars_shuffle = True -validation_pars_test_fraction = 0.5 -validation_pars_test_n_sets = 6 +# Default validation pars for temporal-fold cross-validation: +validation_min_n_train_examples = 10 +validation_dev_n_sets = 4 diff --git a/modev/etl.py b/modev/etl.py index 40770bd..988b880 100644 --- a/modev/etl.py +++ b/modev/etl.py @@ -21,7 +21,7 @@ def apply_selection_to_data(data, selection): def count_rows(data_file): with open(data_file) as f: - num_rows = sum(1 for line in f) + num_rows = sum(1 for _ in f) return num_rows diff --git a/modev/evaluation.py b/modev/evaluation.py index 2375883..3d30aea 100644 --- a/modev/evaluation.py +++ b/modev/evaluation.py @@ -50,7 +50,6 @@ def evaluate_predictions(raw_true, raw_pred, metrics, **kwargs): * 'threshold_at_*': threshold at k (e.g. 'threshold_at_10') or at k percent (e.g. 'threshold_at_5_pct'). Note: For the time being, all metrics have to return only one number; In the case of a multi-class classification, a micro-average precision is returned. - # TODO: Allow saving metrics like precision and recall as lists (for different labels). Returns ------- @@ -80,6 +79,11 @@ def evaluate_predictions(raw_true, raw_pred, metrics, **kwargs): type_of_metric = metric.split('_at_')[0] # Extract only the type of metric at k needed. results[metric] = metrics_at_k(true, pred, k)[type_of_metric] + # TODO: Allow saving metrics like precision and recall as lists (for different labels). Maybe the easiest is to + # create metrics *_per_label, that repeat that metric for each of the labels. But for that ensure that 'metrics' + # doesn't need to be redefined in pipeline. + + # TODO: Allow saving file with individual predictions in test examples. return results diff --git a/modev/execution.py b/modev/execution.py index 0f63308..14c4a61 100644 --- a/modev/execution.py +++ b/modev/execution.py @@ -1,22 +1,28 @@ """Functions related to the execution of the pipeline. 
""" +import numpy as np from tqdm.auto import tqdm from modev import default_pars +approach_key = default_pars.approach_key +dev_key = default_pars.dev_key +fold_key = default_pars.fold_key +function_key = default_pars.function_key +pars_key = default_pars.pars_key +playground_key = default_pars.playground_key +test_key = default_pars.test_key +train_key = default_pars.train_key -def _get_train_and_test_sets(data, indexes, fold, test_mode=default_pars.execution_pars_test_mode, - train_name=default_pars.train_key, - dev_name=default_pars.dev_key, - test_name=default_pars.test_key, - playground_name=default_pars.playground_key): + +def _get_train_and_test_sets(data, indexes, fold, test_mode=default_pars.execution_pars_test_mode): if test_mode: - train_set = data.loc[indexes[playground_name]] - test_set = data.loc[indexes[f'{test_name}_{fold}']] + train_set = data.loc[indexes[playground_key]] + test_set = data.loc[indexes[f'{test_key}_{fold}']] else: - train_set = data.loc[indexes[f'{train_name}_{fold}']] - test_set = data.loc[indexes[f'{dev_name}_{fold}']] + train_set = data.loc[indexes[f'{train_key}_{fold}']] + test_set = data.loc[indexes[f'{dev_key}_{fold}']] return train_set, test_set @@ -26,29 +32,37 @@ def separate_predictors_and_target(data_set, target_col): return data_set_x, data_set_y -def _get_approaches_functions_from_grid(approaches_grid, function_key=default_pars.function_key): +def _get_approaches_functions_from_grid(approaches_grid): approaches_functions = {app_name: approaches_grid[app_name][function_key] for app_name in approaches_grid} return approaches_functions -def run_experiment(data, indexes, validation_pars, execution_function, execution_pars, evaluation_function, - evaluation_pars, exploration_function, approaches_function, approaches_pars, - fold_key=default_pars.fold_key, - pars_key=default_pars.pars_key, - approach_key=default_pars.approach_key): +def _add_metrics_to_pars_folds(i, pars_folds, results): + for metric in results: + if metric not in pars_folds.columns: + pars_folds[metric] = np.nan + pars_folds.loc[i, metric] = results[metric] + + +def _get_list_of_sets_from_indexes(indexes, set_name): + list_of_sets = [int(part.split('_')[-1]) for part in indexes if part.startswith(set_name)] + return list_of_sets + + +def run_experiment(data, indexes, execution_function, execution_pars, evaluation_function, evaluation_pars, + exploration_function, approaches_function, approaches_pars): # Extract all necessary info from experiment. - metrics = evaluation_pars['metrics'] target = execution_pars['target'] test_mode = execution_pars['test_mode'] # Get list of folds to execute. if test_mode: - folds = list(range(validation_pars['test_n_sets'])) + folds = _get_list_of_sets_from_indexes(indexes, test_key) else: - folds = list(range(validation_pars['playground_n_folds'])) + folds = _get_list_of_sets_from_indexes(indexes, dev_key) # Initialise parameter space explorer. - explorer = exploration_function(approaches_pars, folds, metrics) + explorer = exploration_function(approaches_pars, folds) pars_folds = explorer.initialise_results() n_iterations = explorer.select_executions_left() @@ -71,13 +85,12 @@ def run_experiment(data, indexes, validation_pars, execution_function, execution # Evaluate predictions. results = evaluation_function(list(test_y), list(predictions), **evaluation_pars) - # Write results for these parameters and fold. 
- for metric in results: - pars_folds.loc[i, metric] = results[metric] + # Ensure metrics columns exist in pars_folds and write results for these parameters and fold. + _add_metrics_to_pars_folds(i, pars_folds, results) return pars_folds -def execute_model(approach_function, approach_pars, train_x, train_y, test_x, **kwargs): +def execute_model(approach_function, approach_pars, train_x, train_y, test_x, **_kwargs): """Execution method (including training and prediction) for an approach. This function takes an approach 'approach_function' with parameters 'approach_pars', a train set (with predictors diff --git a/modev/exploration.py b/modev/exploration.py index b2db460..c4b721a 100644 --- a/modev/exploration.py +++ b/modev/exploration.py @@ -7,6 +7,8 @@ from modev import default_pars +fixed_pars_key = default_pars.fixed_pars_key + def expand_parameter_grid(grid, fixed_pars=default_pars.exploration_pars_fixed_pars): pars_names = list(grid) @@ -27,7 +29,7 @@ def expand_parameter_grid(grid, fixed_pars=default_pars.exploration_pars_fixed_p return pars -def _split_approaches_name_and_pars(approaches_pars, fixed_pars_key=default_pars.fixed_pars_key): +def expand_name_and_parameter_grids(approaches_pars): app_names = [] app_pars = [] for name in approaches_pars: @@ -43,7 +45,7 @@ def _split_approaches_name_and_pars(approaches_pars, fixed_pars_key=default_pars class GridSearch: - def __init__(self, approaches_pars, folds, metrics): + def __init__(self, approaches_pars: dict, folds: list): """Grid search exploration of the parameter space. Parameters @@ -52,22 +54,19 @@ def __init__(self, approaches_pars, folds, metrics): Dictionaries of approaches. Each key corresponds to one approach name, and the value is a dictionary. This inner dictionary of an individual approach has one key per parameter, and the value is a list of parameter values to explore. - folds : list of ints + folds : list List of folds (e.g. [0, 1, 2, 3]). - metrics : list - List of metrics. """ self.approaches_pars = approaches_pars self.folds = folds - self.metrics = metrics self.pars_folds = None self.selection_to_execute = None self.next_point_generator = None def initialise_results(self): # TODO: Generalise this to save to/load from file. - app_names, app_pars = _split_approaches_name_and_pars(self.approaches_pars) + app_names, app_pars = expand_name_and_parameter_grids(self.approaches_pars) app_ids = np.arange(len(app_pars)) # Repeat each pars combination for each fold. pars_folds = pd.DataFrame(np.repeat(app_pars, len(self.folds)), columns=[default_pars.pars_key]) @@ -76,9 +75,6 @@ def initialise_results(self): pars_folds[default_pars.fold_key] = np.tile(self.folds, len(app_pars)) # Add a column for each of the evaluation metrics. self.pars_folds = pars_folds - # Add metrics to results dataframe. - for metric in self.metrics: - pars_folds[metric] = np.nan return self.pars_folds def select_executions_left(self): @@ -99,3 +95,7 @@ def get_next_point(self): # TODO: Create RandomSearch with similar structure. # TODO: Create AutoSearch with similar structure. # In this case, _next_point_finder can use self.pars_folds at any time to see explored points and decide next. +# The structure of approaches_inputs is the same for grid, random and auto searches. +# In the cases of random and auto, only first and last element will be taken, and the rest ignored. +# In these cases, the type of the first element will determine whether it is int or float. +# If more than one approach are given, split the iterations among them. 
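
A minimal standalone sketch (outside the patch) of the lazy metric columns introduced above: the explorer no longer pre-creates one column per metric, and execution adds each column the first time that metric appears in the evaluation results, mirroring `_add_metrics_to_pars_folds`. The column names and example rows below are illustrative, not taken from `default_pars`.

```python
import numpy as np
import pandas as pd

def add_metrics_to_pars_folds(i, pars_folds, results):
    # Same logic as the private helper added in modev/execution.py: create a metric
    # column on first use, then write the result for row i.
    for metric in results:
        if metric not in pars_folds.columns:
            pars_folds[metric] = np.nan
        pars_folds.loc[i, metric] = results[metric]

# Illustrative results dataframe with one row per (parameters, fold) combination.
pars_folds = pd.DataFrame({'pars': [{'p': 1}, {'p': 1}], 'approach': ['dummy', 'dummy'],
                           'id': [0, 0], 'fold': [0, 1]})
add_metrics_to_pars_folds(0, pars_folds, {'precision': 0.9, 'recall': 0.8})
print(list(pars_folds.columns))  # ['pars', 'approach', 'id', 'fold', 'precision', 'recall']
```
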
diff --git a/modev/pipeline.py b/modev/pipeline.py index 8405d54..9ab1125 100644 --- a/modev/pipeline.py +++ b/modev/pipeline.py @@ -3,13 +3,18 @@ """ import logging +from modev import common from modev import default_pars from modev import execution from modev import plotting +from modev import templates from modev import validation from modev.templates import default -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.WARNING) +app_name_key = default_pars.approach_name_key +function_key = default_pars.function_key + +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.WARNING) def _check_requirements(previous_requirements, error_message): @@ -17,21 +22,20 @@ def _check_requirements(previous_requirements, error_message): logging.error(error_message) -def _split_function_and_pars(inputs, function_key=default_pars.function_key): +def _split_function_and_pars(inputs): function = inputs[function_key] pars = {par: inputs[par] for par in inputs if par != function_key} return function, pars -def _split_approaches_function_and_pars(approaches, function_key=default_pars.function_key, - app_name_key=default_pars.approach_name_key): +def _split_approaches_function_and_pars(approaches): function = {app[app_name_key]: app[function_key] for app in approaches} pars = {app[app_name_key]: {par: app[par] for par in app if par not in [function_key, app_name_key]} for app in approaches} return function, pars -def _override_default_inputs(given_inputs, default_inputs, function_key=default_pars.function_key): +def _override_default_inputs(given_inputs, default_inputs): # If function_key is not given in pars, default function will be used. # Therefore, ensure all required parameters are taken from default, except the ones explicitly given in pars. if given_inputs is None: @@ -136,8 +140,6 @@ def __init__(self, *, self.selection_function, self.selection_pars = _split_function_and_pars(selection_inputs) self.approaches_function, self.approaches_pars = _split_approaches_function_and_pars(approaches_inputs) # Initialise other attributes. 
- self.metrics = self.evaluation_pars['metrics'] - self.main_metric = self.selection_pars['main_metric'] self.data = None self.indexes = None self.results = None @@ -166,7 +168,7 @@ def get_indexes(self, reload=False): def get_results(self, reload=False): _check_requirements([self.data, self.indexes], self.requirements_error_message) if self.results is None or reload: - self.results = execution.run_experiment(self.data, self.indexes, self.validation_pars, + self.results = execution.run_experiment(self.data, self.indexes, self.execution_function, self.execution_pars, self.evaluation_function, self.evaluation_pars, self.exploration_function, self.approaches_function, @@ -174,9 +176,9 @@ def get_results(self, reload=False): return self.results def get_selected_models(self, reload=False): - _check_requirements([self.data, self.results], self.requirements_error_message) + _check_requirements([self.results], self.requirements_error_message) if self.ranking is None or reload: - self.ranking = self.selection_function(self.results, self.metrics, **self.selection_pars) + self.ranking = self.selection_function(self.results, **self.selection_pars) return self.ranking def run(self, reload=False): @@ -192,5 +194,9 @@ def run(self, reload=False): return self.ranking - def plot_results(self): - plotting.metric_vs_folds(self.results, self.main_metric) + def plot_results(self, metrics=None): + _check_requirements([self.results], self.requirements_error_message) + if metrics is None: + metrics = common.get_metrics_from_results(self.results) + for metric in metrics: + plotting.metric_vs_folds(self.results, metric) diff --git a/modev/plotting.py b/modev/plotting.py index 446ffea..2e9cbc5 100644 --- a/modev/plotting.py +++ b/modev/plotting.py @@ -7,23 +7,26 @@ from modev import default_pars -def metric_vs_folds(results, main_metric, plot_file=default_pars.plotting_pars_plot_file, +approach_key = default_pars.approach_key +id_key = default_pars.id_key +fold_key = default_pars.fold_key + + +def metric_vs_folds(results, metric, plot_file=default_pars.plotting_pars_plot_file, added_cols_hover=default_pars.plotting_pars_added_cols_hover, title=default_pars.plotting_pars_title, show=default_pars.plotting_pars_show, - fold_col=default_pars.fold_key, model_col=default_pars.id_key, - approach_col=default_pars.approach_key, width=default_pars.plotting_pars_width, - height=default_pars.plotting_pars_height): + width=default_pars.plotting_pars_width, height=default_pars.plotting_pars_height): data_plot = results.copy() - cols_hover = [model_col, approach_col] + cols_hover = [id_key, approach_key] if added_cols_hover is not None: cols_hover += added_cols_hover - fig1 = px.line(data_plot, x=fold_col, y=main_metric, width=width, height=height, hover_data=data_plot[cols_hover], - color=model_col) + fig1 = px.line(data_plot, x=fold_key, y=metric, width=width, height=height, hover_data=data_plot[cols_hover], + color=id_key) fig1.layout.coloraxis.showscale = False - fig1.layout.xaxis = dict(tickvals=data_plot[fold_col].unique()) + fig1.layout.xaxis = dict(tickvals=data_plot[fold_key].unique()) fig1.update_traces(mode='lines+markers') - fig1.update_layout(title=title, xaxis_title="Fold", yaxis_title=main_metric.title(), legend_title="Model ID") + fig1.update_layout(title=title, xaxis_title="Fold", yaxis_title=metric.title(), legend_title="Model ID") if plot_file is not None: plotly.offline.plot(fig1, filename=plot_file, auto_open=False) if show: diff --git a/modev/selection.py b/modev/selection.py index c80165b..c5b7fd4 100644 --- 
a/modev/selection.py +++ b/modev/selection.py @@ -3,14 +3,21 @@ """ import numpy as np +from modev import common from modev import default_pars +approach_key = default_pars.approach_key +id_key = default_pars.id_key +pars_key = default_pars.pars_key + + +def combine_fold_results(results, aggregation_method=default_pars.selection_pars_aggregation_method): + # Get metric names from results' columns. + metrics = common.get_metrics_from_results(results) -def combine_fold_results(results, metrics, aggregation_method=default_pars.selection_pars_aggregation_method, - approach_key=default_pars.approach_key, pars_key=default_pars.pars_key, - id_key=default_pars.id_key): # Combine results for all folds using a certain aggregation method (e.g. mean). metrics_agg = {col: aggregation_method for col in metrics} + # For columns that do not need to be combined, simply take first (since they are identical for all folds). other_columns = [approach_key, pars_key] other_columns_agg = {col: 'first' for col in other_columns} @@ -32,7 +39,7 @@ def apply_condition_to_dataframe(df, condition=default_pars.selection_pars_condi return df_selected -def model_selection(results, metrics, main_metric, aggregation_method=default_pars.selection_pars_aggregation_method, +def model_selection(results, main_metric, aggregation_method=default_pars.selection_pars_aggregation_method, results_condition=default_pars.selection_pars_results_condition, combined_results_condition=default_pars.selection_pars_combined_results_condition): """Model selection. @@ -43,8 +50,6 @@ def model_selection(results, metrics, main_metric, aggregation_method=default_pa ---------- results : pd.DataFrame Evaluations of the performance of approaches on different data folds. - metrics : list - Name of columns corresponding to metrics in 'results' dataframe. main_metric : str Name of the main metric (the one that has to be maximized). aggregation_method : str @@ -64,7 +69,7 @@ def model_selection(results, metrics, main_metric, aggregation_method=default_pa # Apply conditions to results of individual folds. results_selected = apply_condition_to_dataframe(results, results_condition) # Combine results of different folds. - combined_results = combine_fold_results(results_selected, metrics, aggregation_method=aggregation_method) + combined_results = combine_fold_results(results_selected, aggregation_method=aggregation_method) # Apply conditions to combined results. combined_results_selected = apply_condition_to_dataframe(combined_results, combined_results_condition) # Create ranking. diff --git a/modev/templates/__init__.py b/modev/templates/__init__.py index 52d306b..38296fc 100644 --- a/modev/templates/__init__.py +++ b/modev/templates/__init__.py @@ -1,2 +1,3 @@ from modev.templates import default from modev.templates import experiment_01 +from modev.templates import experiment_02 diff --git a/modev/templates/experiment_02.py b/modev/templates/experiment_02.py new file mode 100644 index 0000000..88495f2 --- /dev/null +++ b/modev/templates/experiment_02.py @@ -0,0 +1,17 @@ +"""Template experiment using temporal-fold cross validation instead of k-fold cross-validation. 
+ +""" +from modev.validation import temporal_fold_playground_n_tests_split + +experiment = {'validation_inputs': {'function': temporal_fold_playground_n_tests_split, + 'min_n_train_examples': 20, + 'dev_n_sets': 4, + 'test_fraction': 0.5, + 'test_n_sets': 2, + }, + 'evaluation_inputs': {'metrics': ['precision', 'recall', 'f1'], + 'average': 'micro', + }, + 'selection_inputs': {'main_metric': 'f1', + }, + } diff --git a/modev/validation.py b/modev/validation.py index 61635a5..d4e8632 100644 --- a/modev/validation.py +++ b/modev/validation.py @@ -6,51 +6,159 @@ from modev import default_pars +dev_key = default_pars.dev_key +playground_key = default_pars.playground_key +test_key = default_pars.test_key +train_key = default_pars.train_key + def k_folds_split(raw_indexes, n_splits, labels=default_pars.validation_pars_labels, shuffle=default_pars.validation_pars_shuffle, random_state=default_pars.random_state, return_original_indexes=default_pars.validation_pars_return_original_indexes): - raw_indexes = np.array(raw_indexes) + """Splits a raw set of indexes into k train and k dev subsets using k-folding. + + There are k (given by 'n_splits') folds. Each of the folds uses the entire raw set of indexes (either for train or + for dev). The k dev sets do not overlap, and together they cover the entire raw set. For each fold, the train set is + made by all examples that are not in the dev set. Hence all train sets of different folds do overlap. + + Parameters + ---------- + raw_indexes : array_like + Indexes of data (e.g. data.index, assuming data is a pandas dataframe). + n_splits : int + Number of folds. + labels : list or None + If not None, the k-folding is stratified; if None, labels are ignored. + shuffle : bool + True to shuffle indexes before splitting; False to keep original order. + random_state : int or None + Random state for shuffling; Ignored if 'shuffle' is False (in which case, 'random_state' can be set to None). + return_original_indexes : bool + True to return original indexes (as given by 'raw_indexes'); False to return new integer indexes (that go from 0 + to the number of elements in raw_indexes). + + Returns + ------- + parts : list + K different parts (folds). Each part contains a tuple with: + (array of indexes in train set for this part, array of indexes in dev set for this part) + + """ + raw_indexes_array = np.array(raw_indexes) + # To avoid warnings, impose random_state None if there is no shuffling. + if not shuffle: + random_state = None # Split a data set into n parts without overlap, and optionally stratified. if labels is None: split_method = KFold else: split_method = StratifiedKFold parts = list(split_method(n_splits=n_splits, random_state=random_state, shuffle=shuffle). - split(raw_indexes, labels)) + split(raw_indexes_array, labels)) if return_original_indexes: - parts = [(raw_indexes[part[0]], raw_indexes[part[1]]) for part in parts] + parts = [(raw_indexes_array[part[0]], raw_indexes_array[part[1]]) for part in parts] + return parts + + +def temporal_folds_split(raw_indexes, min_n_train_examples, dev_n_sets): + """Splits a raw set of indexes into k train and k dev subsets using temporal-folding. + + We assume a simplistic temporal validation: separate the first 'min_n_train_examples' examples for the first train + set. Then split the remaining examples homogeneously in 'dev_n_sets' sets; they will be the dev sets. The + corresponding train set of each of these sets will be made of all previous examples. + Therefore, dev sets do not overlap. 
But each of the train sets fully contains the previous train set. + + Parameters + ---------- + raw_indexes : array_like + Indexes of data (e.g. data.index, assuming data is a pandas dataframe). + min_n_train_examples : int + Minimum number of examples in any train set; This will be the number of examples in the first train set. All + subsequent train sets will be larger than this. + dev_n_sets : int + Number of parts (folds). + + Returns + ------- + parts : list + K different parts (folds). Each part contains a tuple with: + (array of indexes in train set for this part, array of indexes in dev set for this part) + + """ + raw_indexes_array = np.array(raw_indexes) + first_train = raw_indexes_array[0: min_n_train_examples] + + dev_splits = np.array_split(raw_indexes_array[min_n_train_examples:], dev_n_sets) + + parts = [(first_train, dev_splits[0])] + for fold in range(1, dev_n_sets): + dev_fold = dev_splits[fold] + train_fold = raw_indexes_array[raw_indexes_array < dev_fold[0]] + parts.append((train_fold, dev_fold)) return parts -def train_n_tests_split(raw_indexes, test_fraction, test_n_sets=default_pars.validation_pars_test_n_sets, - labels=default_pars.validation_pars_labels, shuffle=default_pars.validation_pars_shuffle, - random_state=default_pars.random_state, - train_name=default_pars.train_key, - test_name=default_pars.test_key): +def one_set_n_sets_split(raw_indexes, test_fraction, test_n_sets, first_set_name, second_set_name, + labels=default_pars.validation_pars_labels, shuffle=default_pars.validation_pars_shuffle, + random_state=default_pars.random_state): + """Splits a raw set of indexes into one set (e.g. a playground) and n sets (e.g. test sets). + + The raw indexes are split so that a 'test_fraction' is for test sets (as many as 'test_n_sets'). The rest of the raw + indexes will be for the first set (e.g. the playground). + Therefore, there is no overlap between the first part (playground) and the second part (test sets), and there is no + overlap between the different test sets. + + Parameters + ---------- + raw_indexes : array_like + Indexes of data (e.g. data.index, assuming data is a pandas dataframe). + test_fraction : float + Fraction of data to use for test sets. + test_n_sets : int + Number of test sets. + first_set_name : str + Name to assign to first part (the one that is not a test set), e.g. 'playground'. + second_set_name : str + Name to assign to second part (the test sets), e.g. 'test'. + labels : list or None + If not None, splits are stratified; if None, labels are ignored. + shuffle : bool + True to shuffle original indexes; False to keep order of raw indexes. + random_state : int or None + Random state for shuffling; Ignored if 'shuffle' is False (in which case, 'random_state' can be set to None). + + Returns + ------- + indexes : dict + Indexes. It contains the first part (e.g. the 'playground') and some test sets (e.g. named 'test_0', ..., + 'test_n'). + """ # To begin with, the raw dataset is train, and there is only one test set (named 'test_0'), which is empty. - indexes = {train_name: np.array(raw_indexes), - f'{test_name}_0': np.array([], dtype=int)} + indexes = {first_set_name: np.array(raw_indexes), + f'{second_set_name}_0': np.array([], dtype=int)} + # To avoid warnings, impose random_state None if there is no shuffling. + if not shuffle: + random_state = None # If 'test_fraction' is not zero, take that fraction from train. # The new train will be a fraction 1 - 'test_fraction' of the raw dataset. 
# If labels are given, the splitting will be stratified (otherwise random). if test_fraction > 0: - primary_split = train_test_split(indexes[train_name], test_size=test_fraction, stratify=labels, + primary_split = train_test_split(indexes[first_set_name], test_size=test_fraction, stratify=labels, random_state=random_state, shuffle=shuffle) - indexes[train_name] = primary_split[0] - indexes[f'{test_name}_0'] = primary_split[1] + indexes[first_set_name] = primary_split[0] + indexes[f'{second_set_name}_0'] = primary_split[1] # If 'test_n_sets' > 1, we split the test set into 'test_n_sets' sets (named 'test_0', 'test_1', etc.) of # approximately equal size. # Again, if labels are given, the splitting will be stratified (otherwise random). # For convenience, use 'k_folds_split' for this task (and then ignore train parts). if test_n_sets > 1: - test_split = k_folds_split(indexes[f'{test_name}_0'], test_n_sets, labels=labels, shuffle=shuffle, + test_split = k_folds_split(indexes[f'{second_set_name}_0'], test_n_sets, labels=labels, shuffle=shuffle, random_state=random_state) # Disregard the zeroth part (which is meant for training), and keep the non-overlapping part. - indexes.update({f'{test_name}_{i}': fold[1] for i, fold in enumerate(test_split)}) + indexes.update({f'{second_set_name}_{i}': fold[1] for i, fold in enumerate(test_split)}) return indexes @@ -59,11 +167,7 @@ def k_fold_playground_n_tests_split(raw_indexes, playground_n_folds=default_pars test_n_sets=default_pars.validation_pars_test_n_sets, labels=default_pars.validation_pars_labels, shuffle=default_pars.validation_pars_shuffle, - random_state=default_pars.random_state, - train_name=default_pars.train_key, - dev_name=default_pars.dev_key, - playground_name=default_pars.playground_key, - test_name=default_pars.test_key): + random_state=default_pars.random_state): """Generate indexes that split data into a playground (with k folds) and n test sets. There is only one playground, which contains train and dev sets, and has no overlap with test sets. @@ -85,16 +189,8 @@ def k_fold_playground_n_tests_split(raw_indexes, playground_n_folds=default_pars Labels to stratify data according to their distribution; None to not stratify data. shuffle : bool True to shuffle data before splitting; False to keep them sorted as they are before splitting. - random_state : int - Random state to use on the splittings. - train_name : str - Name given to the train set (usually 'train'). - dev_name : str - Name given to the dev set (usually 'dev'). - playground_name : str - Name given to the playground (usually 'playground'). - test_name : str - Name given to the test set (usually 'test'). + random_state : int or None + Random state for shuffling; Ignored if 'shuffle' is False (in which case, 'random_state' can be set to None). Returns ------- @@ -104,56 +200,118 @@ def k_fold_playground_n_tests_split(raw_indexes, playground_n_folds=default_pars ..., 'train_k' and 'dev_0', ..., 'dev_k', respectively). """ + # To avoid warnings, impose random_state None if there is no shuffling. + if not shuffle: + random_state = None # Split data set into playground and test set(s). 
- indexes = train_n_tests_split(raw_indexes=raw_indexes, test_fraction=test_fraction, test_n_sets=test_n_sets, - labels=labels, shuffle=shuffle, random_state=random_state, train_name=playground_name, - test_name=test_name) + indexes = one_set_n_sets_split(raw_indexes=raw_indexes, test_fraction=test_fraction, test_n_sets=test_n_sets, + first_set_name=playground_key, second_set_name=test_key, + labels=labels, shuffle=shuffle, random_state=random_state) # Split playground into k train and k dev sets. - playground_split = k_folds_split(indexes[playground_name], playground_n_folds, labels=None, shuffle=True, + playground_split = k_folds_split(indexes[playground_key], playground_n_folds, labels=None, shuffle=True, random_state=random_state) - indexes.update({f'{train_name}_{i}': part[0] for i, part in enumerate(playground_split)}) - indexes.update({f'{dev_name}_{i}': part[1] for i, part in enumerate(playground_split)}) + indexes.update({f'{train_key}_{i}': part[0] for i, part in enumerate(playground_split)}) + indexes.update({f'{dev_key}_{i}': part[1] for i, part in enumerate(playground_split)}) + return indexes + + +def temporal_fold_playground_n_tests_split(raw_indexes, + min_n_train_examples=default_pars.validation_min_n_train_examples, + dev_n_sets=default_pars.validation_dev_n_sets, + test_fraction=default_pars.validation_pars_test_fraction, + test_n_sets=default_pars.validation_pars_test_n_sets): + """Generate indexes that split data into a playground (with temporal folds) and n test sets. + + There is only one playground, which contains train and dev sets, and has no overlap with test sets. + Playground is split using temporal validation: + The first 'min_n_train_examples' examples are the first train set. + The remaining examples in the playground are split homogeneously in 'dev_n_sets' sets (the dev sets). + The corresponding train set of each of these sets will be made of all previous examples. + There are hence 'dev_n_sets' non-overlapping dev sets, and the same number of (overlapping) train sets. + Train sets have different lengths (the first one is the shortest, with 'min_n_train_examples'), and subsequent train + sets have more and more examples. + + Parameters + ---------- + raw_indexes : array_like + All indexes of data. This could simply be the output of 'data.index' (assuming data is a pandas dataframe). + min_n_train_examples : int + Minimum number of examples in a train set. It will be the exact number of examples of the first train set. + All subsequent train sets will be larger than the first one. + dev_n_sets : int + Number of temporal folds. + test_fraction : float + Fraction of data to use for test sets. + test_n_sets : int + Number of test sets. + + Returns + ------- + indexes : dict + Indexes to use for validation. It contains one playground (named 'playground') and test sets (named 'test_0', + ..., 'test_n'). The indexes of train and dev sets (contained in the playground) are also given (named 'train_0', + ..., 'train_k' and 'dev_0', ..., 'dev_k', respectively). + + """ + # Split data set into playground and test set(s) without shuffling or stratifying (so they keep their order). + indexes = one_set_n_sets_split(raw_indexes=raw_indexes, test_fraction=test_fraction, test_n_sets=test_n_sets, + first_set_name=playground_key, second_set_name=test_key, + labels=None, shuffle=False, random_state=None) + # Split playground into k train and k dev temporal folds. 
+ playground_split = temporal_folds_split(indexes[playground_key], min_n_train_examples=min_n_train_examples, + dev_n_sets=dev_n_sets) + indexes.update({f'{train_key}_{i}': part[0] for i, part in enumerate(playground_split)}) + indexes.update({f'{dev_key}_{i}': part[1] for i, part in enumerate(playground_split)}) return indexes -def validate_indexes(indexes, train_name=default_pars.train_key, - dev_name=default_pars.dev_key, - playground_name=default_pars.playground_key, - test_name=default_pars.test_key): +def validate_indexes(indexes): + """Check that indexes fulfil some criteria (e.g. that playground and test set do not overlap). + + Parameters + ---------- + indexes : dict + Indexes. + + Returns + ------- + checks : bool + True if all checks are fulfilled; False otherwise. + + """ # For convenience, collect all indexes in lists. train_indexes = [] dev_indexes = [] test_indexes = [] for key in indexes: - if key.startswith(f'{train_name}_'): + if key.startswith(f'{train_key}_'): train_indexes.extend(indexes[key]) - elif key.startswith(f'{dev_name}_'): + elif key.startswith(f'{dev_key}_'): dev_indexes.extend(indexes[key]) - elif key.startswith(f'{test_name}_'): + elif key.startswith(f'{test_key}_'): test_indexes.extend(indexes[key]) # Since there can be repetition among indexes in train sets, take unique. train_indexes = np.unique(train_indexes) # Validations that need to be passed: - validations = [] + checks = True - # Playground and train indexes coincide. - validations.append(set(indexes[playground_name]) == set(train_indexes)) + # The set of playground examples coincides with the union of all train and dev sets. + checks &= set(indexes[playground_key]) == (set(train_indexes) | set(dev_indexes)) # Train indexes and test indexes do not overlap. - validations.append((set(train_indexes) & set(test_indexes)) == set()) + checks &= (set(train_indexes) & set(test_indexes)) == set() # For each of the folds in playground, train and dev do not overlap. - folds = [int(key.split('_')[-1]) for key in indexes if key.startswith(f'{dev_name}_')] + folds = [int(key.split('_')[-1]) for key in indexes if key.startswith(f'{dev_key}_')] for fold in folds: - validations.append(((set(indexes[f'{train_name}_{fold}']) & set(indexes[f'{dev_name}_{fold}'])) == set())) + checks &= ((set(indexes[f'{train_key}_{fold}']) & set(indexes[f'{dev_key}_{fold}'])) == set()) # There is no overlap among dev sets. - validations.append(len(dev_indexes) == len(np.unique(dev_indexes))) + checks &= len(dev_indexes) == len(np.unique(dev_indexes)) # There is no overlap among test sets. - validations.append(len(test_indexes) == len(np.unique(test_indexes))) + checks &= len(test_indexes) == len(np.unique(test_indexes)) - check_validations = np.sum(validations) == len(validations) - return check_validations + return checks diff --git a/setup.py b/setup.py index b2d16a2..b488738 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="modev", - version="0.0.1", + version="0.1.1", author="Pablo Rosado", author_email="pabloarosado@gmail.com", description="Model Development for Data Science Projects.",
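
As a closing illustration, here is a small, self-contained sketch of the new temporal-fold validation added in `modev/validation.py`; the index range and parameter values are made up for the example.

```python
import numpy as np
from modev.validation import temporal_fold_playground_n_tests_split, validate_indexes

raw_indexes = np.arange(100)
indexes = temporal_fold_playground_n_tests_split(raw_indexes, min_n_train_examples=10,
                                                 dev_n_sets=4, test_fraction=0.5, test_n_sets=2)
# Data keep their temporal order: the playground is the first half of the indexes, 'train_0'
# holds the first 10 playground examples, the dev sets partition the remaining playground
# examples, and each later train set contains all examples that precede its dev set.
print(len(indexes['train_0']), len(indexes['dev_0']))  # 10 10
assert validate_indexes(indexes)
```

The new `modev.templates.experiment_02` template wires this splitting function into the pipeline, and `plot_results(metrics=...)` can then restrict plotting to a chosen subset of the evaluated metrics.
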