generated from JacksonBurns/blank-python-project
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add function to obtain values for a table (#122)
This PR adds a new function that computes (and optionally prints as a table) regression accuracy metrics (MAE, RMSE, R2) for different samplers on a user-provided scikit-learn model.
- Loading branch information
Showing
4 changed files
with
309 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
import sklearn | ||
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score | ||
from tabulate import tabulate | ||
|
||
from astartes import train_val_test_split | ||
from astartes.utils.exceptions import InvalidModelTypeError | ||
|
||
|
||
def _select_rows(data, indices):
    """Return the rows of ``data`` located at the positional ``indices``."""
    # pandas objects must be indexed positionally through ``.iloc``: plain
    # ``[]`` with a list of integers selects columns on a DataFrame and looks
    # up index labels on a Series.
    if hasattr(data, "iloc"):
        return data.iloc[indices]
    return data[indices]


def _root_mean_squared_error(y_true, y_pred):
    """Return the RMSE, averaged uniformly over outputs for multi-output targets."""
    # Derived from the per-output MSE rather than ``squared=False``, which
    # scikit-learn deprecated in 1.4 and later removed.
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    return (per_output_mse**0.5).mean()


def generate_regression_results_dict(
    sklearn_model,
    X,
    y,
    samplers=None,
    random_state=0,
    samplers_hopts=None,
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    print_results=False,
):
    """
    Helper function to train a sklearn model using the provided data
    and provided sampler types.

    For every requested sampler the data is split into train/val/test sets,
    the model is (re)fit on the training set, and MAE, RMSE and R2 are
    computed on all three sets.

    Args:
        sklearn_model (sklearn.base.BaseEstimator): Model to fit; it is refit
            in place once per sampler.
        X (np.array, pd.DataFrame): Numpy array or pandas DataFrame of feature vectors.
        y (np.array, pd.Series): Targets corresponding to X, must be of same size.
        samplers (list of str, optional): Names of the samplers to evaluate.
            Defaults to None, which means ["random"].
        random_state (int, optional): The random seed used throughout astartes.
        samplers_hopts (dict, optional): Should be a dictionary of dictionaries with the keys specifying
            the sampler and the values being another dictionary with the
            corresponding hyperparameters. Defaults to None, which means {}.
        train_size (float, optional): Fraction of dataset to use in training set. Defaults to 0.8.
        val_size (float, optional): Fraction of dataset to use in validation set. Defaults to 0.1.
        test_size (float, optional): Fraction of dataset to use in test set. Defaults to 0.1.
        print_results (bool, optional): whether to print the resulting dictionary as a neat table

    Returns:
        dict: nested dictionary with the format of
            {
                sampler: {
                    'mae':{
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                    'rmse':{
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                    'R2':{
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                },
            }

    Raises:
        InvalidModelTypeError: If ``sklearn_model`` is not an instance of
            ``sklearn.base.BaseEstimator``.
    """
    if not isinstance(sklearn_model, sklearn.base.BaseEstimator):
        raise InvalidModelTypeError("Model must be an sklearn model")

    # Resolve the defaults here instead of using mutable default arguments.
    samplers = ["random"] if samplers is None else samplers
    samplers_hopts = {} if samplers_hopts is None else samplers_hopts

    # Insertion order of both mappings defines the row/column order of the
    # printed table, so keep mae/rmse/R2 and train/val/test in this order.
    metric_functions = {
        "mae": mean_absolute_error,
        "rmse": _root_mean_squared_error,
        "R2": r2_score,
    }
    split_names = ("train", "val", "test")

    final_dict = {}
    for sampler in samplers:
        # obtain indices
        _, _, _, train_indices, val_indices, test_indices = train_val_test_split(
            X,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size,
            sampler=sampler,
            random_state=random_state,
            hopts=samplers_hopts.get(sampler, dict()),
            return_indices=True,
        )
        split_indices = dict(
            zip(split_names, (train_indices, val_indices, test_indices))
        )

        # create data splits
        X_splits = {
            name: _select_rows(X, indices) for name, indices in split_indices.items()
        }
        y_splits = {
            name: _select_rows(y, indices) for name, indices in split_indices.items()
        }

        # fit the model to the training data
        sklearn_model.fit(X_splits["train"], y_splits["train"])

        # each metric/split pair is stored as a one-element list
        error_dict = {
            metric: {name: [] for name in split_names} for metric in metric_functions
        }
        for name in split_names:
            y_pred = sklearn_model.predict(X_splits[name])
            for metric, metric_function in metric_functions.items():
                error_dict[metric][name].append(
                    metric_function(y_splits[name], y_pred)
                )

        final_dict[sampler] = error_dict

        if print_results:
            print(f"\nDisplaying results for {sampler} sampler")
            display_results_as_table(error_dict)

    return final_dict
|
||
|
||
def display_results_as_table(error_dict):
    """Print one sampler's results dictionary as a plain-text table.

    Every key of ``error_dict`` becomes one row: the upper-cased key followed
    by the first element of each list stored under that key, in the mapping's
    iteration order. The column headers are fixed to Train/Val/Test.
    """
    rows = [
        [metric.upper(), *(scores[0] for scores in per_split.values())]
        for metric, per_split in error_dict.items()
    ]
    print(tabulate(rows, headers=["Train", "Val", "Test"]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import unittest | ||
|
||
import numpy as np | ||
from sklearn.svm import LinearSVR | ||
|
||
from astartes.samplers.interpolation import Random | ||
from astartes.utils.exceptions import InvalidModelTypeError | ||
from astartes.utils.utils import generate_regression_results_dict | ||
|
||
|
||
class Test_utils(unittest.TestCase):
    """
    Test functions within utils.py.
    """

    @classmethod
    def setUpClass(cls):
        """Save re-used arrays as class attributes."""
        # X and y come from sklearn's make_regression function
        cls.X = np.array(
            [
                [-0.86, -0.98],
                [-0.42, -0.87],
                [1.33, 0.20],
                [-0.25, 2.43],
                [-0.59, -0.91],
                [-0.33, 0.19],
                [-0.10, -0.01],
                [1.86, 1.15],
                [0.64, -1.51],
                [-0.36, 0.06],
                [0.6, -0.36],
                [1.56, -0.09],
                [-0.70, -1.66],
                [-0.33, 0.44],
                [1.58, 0.11],
                [0.25, -0.05],
                [-0.63, 0.79],
                [-0.11, 0.00],
                [-0.20, -1.19],
                [0.71, 1.00],
            ]
        )
        cls.y = np.array(
            [
                -10.27,
                -6.19,
                12.13,
                4.90,
                -7.77,
                -2.31,
                -0.89,
                19.42,
                1.18,
                -2.97,
                4.18,
                13.26,
                -10.90,
                -1.58,
                14.01,
                2.00,
                -3.16,
                -0.91,
                -5.25,
                9.07,
            ]
        )

    def test_generate_regression_results_dict(self):
        """Generate results dictionary for simple regression task."""

        # test that error is raised if not using sklearn model
        # (Random is an astartes sampler class, not an sklearn estimator)
        with self.assertRaises(InvalidModelTypeError):
            generate_regression_results_dict(
                Random,
                self.X,
                self.y,
                train_size=0.6,
                val_size=0.2,
                test_size=0.2,
            )

        # use default hyperparameters
        sklearn_model = LinearSVR()

        # test function call and also that a table can be printed without error
        results_dict = generate_regression_results_dict(
            sklearn_model,
            self.X,
            self.y,
            train_size=0.6,
            val_size=0.2,
            test_size=0.2,
            print_results=True,
        )

        # test that only results for the default random sampler are included
        self.assertEqual(
            len(results_dict),
            1,
            msg=f"results_dict contained {results_dict.keys()}. Expected just random sampler.",
        )
        # test that results for mae, rmse, and r2 are included
        self.assertIn(
            "mae",
            results_dict["random"],
            msg="results_dict did not contain MAE results.",
        )
        self.assertIn(
            "rmse",
            results_dict["random"],
            msg="results_dict did not contain RMSE results.",
        )
        self.assertIn(
            "R2",
            results_dict["random"],
            msg="results_dict did not contain R2 results.",
        )
|
||
|
||
# Run the unittest test runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()