Merge pull request #28 from pabloarosado/dev
Release v0.1.1
pabloarosado authored Aug 25, 2020
2 parents c016769 + 3cf4b45 commit 1521974
Showing 16 changed files with 350 additions and 126 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -55,18 +55,19 @@ The pipeline will load two dummy approaches (which can be accessed on ```pipe.ap
parameters (which can be accessed on ```pipe.approaches_pars```).
For each fold, these approaches will be fitted to the train set and predict the 'color' of the examples on the dev sets.

The metrics used to evaluate the performance of the approaches are listed in ```pipe.metrics```.
The metrics used to evaluate the performance of the approaches are listed in ```pipe.evaluation_pars['metrics']```.

An exhaustive grid search is performed, to get all possible combinations of the parameters of each of the approaches.
The performance of each of these combinations on each fold can be accessed on:
```
pipe.get_results()
```

To plot these results per fold:
To plot these results per fold for each of the metrics:
```
pipe.plot_results()
```
To plot only a certain list of metrics, this list can be given as an argument of this function.
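
As an editor's illustration (not part of the diff), here is a minimal end-to-end sketch consistent with the README excerpt above; the exact constructor call and the metric name 'precision' are assumptions, since they are defined elsewhere in the README:
```
import modev

pipe = modev.Pipeline()                   # assumed entry point for the default experiment
pipe.run()                                # load data, split folds, fit approaches, evaluate
print(pipe.evaluation_pars['metrics'])    # metrics used for evaluation
results = pipe.get_results()              # one row per (approach parameters, fold)
pipe.plot_results()                       # one plot per metric
pipe.plot_results(metrics=['precision'])  # restrict plots to a chosen list of metrics
```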

To get the final ranking of best approaches (after combining the results of different folds):
```
1 change: 1 addition & 0 deletions modev/__init__.py
@@ -1,4 +1,5 @@
import modev.approaches
import modev.common
import modev.default_pars
import modev.etl
import modev.evaluation
6 changes: 1 addition & 5 deletions modev/approaches.py
@@ -69,11 +69,6 @@ def __init__(self, random_state=default_pars.random_state):
random_state : int
Random state to use when picking elements from the train set.
Attributes
----------
possible_choices
Possible values (taken from target column in the train set) to choose values from, when making predictions.
Methods
-------
fit
@@ -100,6 +95,7 @@ def fit(self, train_x, train_y):
None
"""
_ = train_x
self.possible_choices = train_y

def predict(self, test_x):
15 changes: 15 additions & 0 deletions modev/common.py
@@ -0,0 +1,15 @@
"""Common functions that are designed for modev.
"""
from modev import default_pars

approach_key = default_pars.approach_key
id_key = default_pars.id_key
fold_key = default_pars.fold_key
pars_key = default_pars.pars_key


def get_metrics_from_results(results):
non_metrics_columns = [pars_key, approach_key, id_key, fold_key]
metrics = [col for col in results.columns if col not in non_metrics_columns]
return metrics
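
For context, a small editor's sketch (not part of the commit) of what the new helper computes; the literal column names 'pars', 'approach', 'id' and 'fold' are assumptions for the values of the corresponding `default_pars` keys:
```
import pandas as pd

# Toy results frame: the assumed bookkeeping columns plus two metric columns.
results = pd.DataFrame(columns=['pars', 'approach', 'id', 'fold', 'precision', 'recall'])

non_metrics_columns = ['pars', 'approach', 'id', 'fold']
metrics = [col for col in results.columns if col not in non_metrics_columns]
print(metrics)  # ['precision', 'recall']
```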
8 changes: 6 additions & 2 deletions modev/default_pars.py
@@ -39,9 +39,13 @@
selection_pars_combined_results_condition = None
selection_pars_results_condition = None

validation_pars_test_fraction = 0.5
validation_pars_test_n_sets = 6
# Default validation pars for k-fold cross-validation:
validation_pars_labels = None
validation_pars_playground_n_folds = 4
validation_pars_return_original_indexes = True
validation_pars_shuffle = True
validation_pars_test_fraction = 0.5
validation_pars_test_n_sets = 6
# Default validation pars for temporal-fold cross-validation:
validation_min_n_train_examples = 10
validation_dev_n_sets = 4
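
To make the defaults above concrete, an editor's sketch of the split sizes they would imply on 1,000 examples, assuming `test_fraction` is the fraction held out for testing and the playground is what remains for cross-validation:
```
n_examples = 1000
test_fraction, test_n_sets, playground_n_folds = 0.5, 6, 4

n_test_total = int(n_examples * test_fraction)  # 500 examples held out, shared by the test sets
n_per_test_set = n_test_total // test_n_sets    # ~83 examples per test set
n_playground = n_examples - n_test_total        # 500 examples for 4-fold cross-validation
```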
2 changes: 1 addition & 1 deletion modev/etl.py
@@ -21,7 +21,7 @@ def apply_selection_to_data(data, selection):

def count_rows(data_file):
with open(data_file) as f:
num_rows = sum(1 for line in f)
num_rows = sum(1 for _ in f)
return num_rows


6 changes: 5 additions & 1 deletion modev/evaluation.py
@@ -50,7 +50,6 @@ def evaluate_predictions(raw_true, raw_pred, metrics, **kwargs):
* 'threshold_at_*': threshold at k (e.g. 'threshold_at_10') or at k percent (e.g. 'threshold_at_5_pct').
Note: For the time being, all metrics have to return only one number; In the case of a multi-class
classification, a micro-average precision is returned.
# TODO: Allow saving metrics like precision and recall as lists (for different labels).
Returns
-------
@@ -80,6 +79,11 @@ def evaluate_predictions(raw_true, raw_pred, metrics, **kwargs):
type_of_metric = metric.split('_at_')[0]
# Extract only the type of metric at k needed.
results[metric] = metrics_at_k(true, pred, k)[type_of_metric]
# TODO: Allow saving metrics like precision and recall as lists (for different labels). Maybe the easiest is to
# create metrics *_per_label, that repeat that metric for each of the labels. But for that ensure that 'metrics'
# doesn't need to be redefined in pipeline.

# TODO: Allow saving file with individual predictions in test examples.
return results
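
Because the '*_at_*' naming convention is easy to misread, here is an editor's sketch of how a metric name such as 'precision_at_10' or 'precision_at_5_pct' decomposes; this is an assumption about the convention, not the library's parsing code:
```
def parse_metric_at_k(metric):
    # 'precision_at_10'    -> ('precision', 10, False)   (k as an absolute count)
    # 'precision_at_5_pct' -> ('precision', 5.0, True)   (k as a percentage of the test set)
    type_of_metric, k_part = metric.split('_at_')
    if k_part.endswith('_pct'):
        return type_of_metric, float(k_part[:-len('_pct')]), True
    return type_of_metric, int(k_part), False
```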


59 changes: 36 additions & 23 deletions modev/execution.py
@@ -1,22 +1,28 @@
"""Functions related to the execution of the pipeline.
"""
import numpy as np
from tqdm.auto import tqdm

from modev import default_pars

approach_key = default_pars.approach_key
dev_key = default_pars.dev_key
fold_key = default_pars.fold_key
function_key = default_pars.function_key
pars_key = default_pars.pars_key
playground_key = default_pars.playground_key
test_key = default_pars.test_key
train_key = default_pars.train_key

def _get_train_and_test_sets(data, indexes, fold, test_mode=default_pars.execution_pars_test_mode,
train_name=default_pars.train_key,
dev_name=default_pars.dev_key,
test_name=default_pars.test_key,
playground_name=default_pars.playground_key):

def _get_train_and_test_sets(data, indexes, fold, test_mode=default_pars.execution_pars_test_mode):
if test_mode:
train_set = data.loc[indexes[playground_name]]
test_set = data.loc[indexes[f'{test_name}_{fold}']]
train_set = data.loc[indexes[playground_key]]
test_set = data.loc[indexes[f'{test_key}_{fold}']]
else:
train_set = data.loc[indexes[f'{train_name}_{fold}']]
test_set = data.loc[indexes[f'{dev_name}_{fold}']]
train_set = data.loc[indexes[f'{train_key}_{fold}']]
test_set = data.loc[indexes[f'{dev_key}_{fold}']]
return train_set, test_set


@@ -26,29 +32,37 @@ def separate_predictors_and_target(data_set, target_col):
return data_set_x, data_set_y


def _get_approaches_functions_from_grid(approaches_grid, function_key=default_pars.function_key):
def _get_approaches_functions_from_grid(approaches_grid):
approaches_functions = {app_name: approaches_grid[app_name][function_key] for app_name in approaches_grid}
return approaches_functions


def run_experiment(data, indexes, validation_pars, execution_function, execution_pars, evaluation_function,
evaluation_pars, exploration_function, approaches_function, approaches_pars,
fold_key=default_pars.fold_key,
pars_key=default_pars.pars_key,
approach_key=default_pars.approach_key):
def _add_metrics_to_pars_folds(i, pars_folds, results):
for metric in results:
if metric not in pars_folds.columns:
pars_folds[metric] = np.nan
pars_folds.loc[i, metric] = results[metric]


def _get_list_of_sets_from_indexes(indexes, set_name):
list_of_sets = [int(part.split('_')[-1]) for part in indexes if part.startswith(set_name)]
return list_of_sets
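
For orientation, an editor's sketch of what this helper returns for a typical `indexes` dictionary; the literal key names ('playground', 'train_0', 'dev_0', ...) are assumptions for the default key values:
```
indexes = {'playground': [], 'train_0': [], 'dev_0': [], 'train_1': [], 'dev_1': [], 'test_0': []}

# Same logic as _get_list_of_sets_from_indexes(indexes, 'dev'):
folds = [int(part.split('_')[-1]) for part in indexes if part.startswith('dev')]
print(folds)  # [0, 1]
```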


def run_experiment(data, indexes, execution_function, execution_pars, evaluation_function, evaluation_pars,
exploration_function, approaches_function, approaches_pars):
# Extract all necessary info from experiment.
metrics = evaluation_pars['metrics']
target = execution_pars['target']
test_mode = execution_pars['test_mode']

# Get list of folds to execute.
if test_mode:
folds = list(range(validation_pars['test_n_sets']))
folds = _get_list_of_sets_from_indexes(indexes, test_key)
else:
folds = list(range(validation_pars['playground_n_folds']))
folds = _get_list_of_sets_from_indexes(indexes, dev_key)

# Initialise parameter space explorer.
explorer = exploration_function(approaches_pars, folds, metrics)
explorer = exploration_function(approaches_pars, folds)
pars_folds = explorer.initialise_results()
n_iterations = explorer.select_executions_left()

@@ -71,13 +85,12 @@ def run_experiment(data, indexes, validation_pars, execution_function, execution
# Evaluate predictions.
results = evaluation_function(list(test_y), list(predictions), **evaluation_pars)

# Write results for these parameters and fold.
for metric in results:
pars_folds.loc[i, metric] = results[metric]
# Ensure metrics columns exist in pars_folds and write results for these parameters and fold.
_add_metrics_to_pars_folds(i, pars_folds, results)
return pars_folds


def execute_model(approach_function, approach_pars, train_x, train_y, test_x, **kwargs):
def execute_model(approach_function, approach_pars, train_x, train_y, test_x, **_kwargs):
"""Execution method (including training and prediction) for an approach.
This function takes an approach 'approach_function' with parameters 'approach_pars', a train set (with predictors
20 changes: 10 additions & 10 deletions modev/exploration.py
@@ -7,6 +7,8 @@

from modev import default_pars

fixed_pars_key = default_pars.fixed_pars_key


def expand_parameter_grid(grid, fixed_pars=default_pars.exploration_pars_fixed_pars):
pars_names = list(grid)
@@ -27,7 +29,7 @@ def expand_parameter_grid(grid, fixed_pars=default_pars.exploration_pars_fixed_p
return pars
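
As a reminder of what an exhaustive expansion of a parameter grid produces, an editor's sketch with made-up parameter names (not the commit's code):
```
import itertools

grid = {'n_neighbors': [1, 5], 'weights': ['uniform', 'distance']}  # hypothetical parameters
expanded = [dict(zip(grid, values)) for values in itertools.product(*grid.values())]
# [{'n_neighbors': 1, 'weights': 'uniform'}, {'n_neighbors': 1, 'weights': 'distance'},
#  {'n_neighbors': 5, 'weights': 'uniform'}, {'n_neighbors': 5, 'weights': 'distance'}]
```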


def _split_approaches_name_and_pars(approaches_pars, fixed_pars_key=default_pars.fixed_pars_key):
def expand_name_and_parameter_grids(approaches_pars):
app_names = []
app_pars = []
for name in approaches_pars:
@@ -43,7 +45,7 @@ def _split_approaches_name_and_pars(approaches_pars, fixed_pars_key=default_pars


class GridSearch:
def __init__(self, approaches_pars, folds, metrics):
def __init__(self, approaches_pars: dict, folds: list):
"""Grid search exploration of the parameter space.
Parameters
@@ -52,22 +54,19 @@ def __init__(self, approaches_pars, folds, metrics):
Dictionaries of approaches. Each key corresponds to one approach name, and the value is a dictionary.
This inner dictionary of an individual approach has one key per parameter, and the value is a list of
parameter values to explore.
folds : list of ints
folds : list
List of folds (e.g. [0, 1, 2, 3]).
metrics : list
List of metrics.
"""
self.approaches_pars = approaches_pars
self.folds = folds
self.metrics = metrics
self.pars_folds = None
self.selection_to_execute = None
self.next_point_generator = None

def initialise_results(self):
# TODO: Generalise this to save to/load from file.
app_names, app_pars = _split_approaches_name_and_pars(self.approaches_pars)
app_names, app_pars = expand_name_and_parameter_grids(self.approaches_pars)
app_ids = np.arange(len(app_pars))
# Repeat each pars combination for each fold.
pars_folds = pd.DataFrame(np.repeat(app_pars, len(self.folds)), columns=[default_pars.pars_key])
@@ -76,9 +75,6 @@ def initialise_results(self):
pars_folds[default_pars.fold_key] = np.tile(self.folds, len(app_pars))
# Add a column for each of the evaluation metrics.
self.pars_folds = pars_folds
# Add metrics to results dataframe.
for metric in self.metrics:
pars_folds[metric] = np.nan
return self.pars_folds

def select_executions_left(self):
Expand All @@ -99,3 +95,7 @@ def get_next_point(self):
# TODO: Create RandomSearch with similar structure.
# TODO: Create AutoSearch with similar structure.
# In this case, _next_point_finder can use self.pars_folds at any time to see explored points and decide next.
# The structure of approaches_inputs is the same for grid, random and auto searches.
# In the cases of random and auto, only first and last element will be taken, and the rest ignored.
# In these cases, the type of the first element will determine whether it is int or float.
# If more than one approach are given, split the iterations among them.
30 changes: 18 additions & 12 deletions modev/pipeline.py
@@ -3,35 +3,39 @@
"""
import logging

from modev import common
from modev import default_pars
from modev import execution
from modev import plotting
from modev import templates
from modev import validation
from modev.templates import default

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.WARNING)
app_name_key = default_pars.approach_name_key
function_key = default_pars.function_key

logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.WARNING)


def _check_requirements(previous_requirements, error_message):
if any([requirement is None for requirement in previous_requirements]):
logging.error(error_message)


def _split_function_and_pars(inputs, function_key=default_pars.function_key):
def _split_function_and_pars(inputs):
function = inputs[function_key]
pars = {par: inputs[par] for par in inputs if par != function_key}
return function, pars
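
To illustrate the `*_inputs` convention this helper relies on, an editor's sketch; the literal 'function' key name is an assumption for the value of `default_pars.function_key`:
```
def dummy_validation(data):
    # Hypothetical stand-in for a validation function.
    return data

inputs = {'function': dummy_validation, 'test_fraction': 0.5}

# Same logic as _split_function_and_pars(inputs):
function = inputs['function']
pars = {par: inputs[par] for par in inputs if par != 'function'}
# function -> dummy_validation, pars -> {'test_fraction': 0.5}
```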


def _split_approaches_function_and_pars(approaches, function_key=default_pars.function_key,
app_name_key=default_pars.approach_name_key):
def _split_approaches_function_and_pars(approaches):
function = {app[app_name_key]: app[function_key] for app in approaches}
pars = {app[app_name_key]: {par: app[par] for par in app if par not in [function_key, app_name_key]}
for app in approaches}
return function, pars


def _override_default_inputs(given_inputs, default_inputs, function_key=default_pars.function_key):
def _override_default_inputs(given_inputs, default_inputs):
# If function_key is not given in pars, default function will be used.
# Therefore, ensure all required parameters are taken from default, except the ones explicitly given in pars.
if given_inputs is None:
@@ -136,8 +140,6 @@ def __init__(self, *,
self.selection_function, self.selection_pars = _split_function_and_pars(selection_inputs)
self.approaches_function, self.approaches_pars = _split_approaches_function_and_pars(approaches_inputs)
# Initialise other attributes.
self.metrics = self.evaluation_pars['metrics']
self.main_metric = self.selection_pars['main_metric']
self.data = None
self.indexes = None
self.results = None
@@ -166,17 +168,17 @@ def get_indexes(self, reload=False):
def get_results(self, reload=False):
_check_requirements([self.data, self.indexes], self.requirements_error_message)
if self.results is None or reload:
self.results = execution.run_experiment(self.data, self.indexes, self.validation_pars,
self.results = execution.run_experiment(self.data, self.indexes,
self.execution_function, self.execution_pars,
self.evaluation_function, self.evaluation_pars,
self.exploration_function, self.approaches_function,
self.approaches_pars)
return self.results

def get_selected_models(self, reload=False):
_check_requirements([self.data, self.results], self.requirements_error_message)
_check_requirements([self.results], self.requirements_error_message)
if self.ranking is None or reload:
self.ranking = self.selection_function(self.results, self.metrics, **self.selection_pars)
self.ranking = self.selection_function(self.results, **self.selection_pars)
return self.ranking

def run(self, reload=False):
@@ -192,5 +194,9 @@ def run(self, reload=False):

return self.ranking

def plot_results(self):
plotting.metric_vs_folds(self.results, self.main_metric)
def plot_results(self, metrics=None):
_check_requirements([self.results], self.requirements_error_message)
if metrics is None:
metrics = common.get_metrics_from_results(self.results)
for metric in metrics:
plotting.metric_vs_folds(self.results, metric)