Add function to obtain values for a table (#122)
This PR adds a new function to compute (and optionally print as a formatted table) regression error metrics (MAE, RMSE, and R2) for different samplers on a user-provided model.
  • Loading branch information
2 parents 170fbfc + f54892c commit 13f5ba5
Showing 4 changed files with 309 additions and 0 deletions.
24 changes: 24 additions & 0 deletions README.md
@@ -128,6 +128,30 @@ Running `astartes` with the default settings will always produce the exact same
We have verified this behavior on Debian Ubuntu, Windows, and Intel Macs from Python versions 3.7 through 3.11 (with appropriate dependencies for each version).
We are limited in our ability to test on M1 Macs, but from our manual testing we achieve perfect reproducibility in all cases _except occasionally_ with `KMeans` on Apple silicon, which has produced _slightly_ different results between platforms regardless of `random_state`, with up to two clusters assigned differently, resulting in data splits that are >99% identical. `astartes` is still consistent between runs on the same platform in all cases.

## Evaluate the impact of splitting algorithms
The `generate_regression_results_dict` function allows users to quickly evaluate the impact of different splitting techniques on the performance of any `sklearn`-compatible regression model. Results are returned as a nested dictionary and can also be printed as a neatly formatted table via the optional `print_results` argument.

```
from sklearn.svm import LinearSVR
from astartes.utils.utils import generate_regression_results_dict

# X and y hold your feature vectors and regression targets
sklearn_model = LinearSVR()
results_dict = generate_regression_results_dict(
    sklearn_model,
    X,
    y,
    print_results=True,
)

# printed output:
        Train      Val     Test
----  -------  -------  -------
MAE   1.41522  3.13435  2.17091
RMSE  2.03062  3.73721  2.40041
R2    0.90745  0.80787  0.78412
```
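
Because `samplers` accepts a list, several splitting techniques can be compared in a single call. A minimal sketch is shown below; the `"kmeans"` sampler name and its `n_clusters` hyperparameter are illustrative assumptions here, so consult the `astartes` documentation for the samplers and hyperparameters actually available.

```
# sketch only: "kmeans" and its "n_clusters" hyperparameter are assumed for
# illustration; check the astartes docs for the supported samplers and hopts
results_dict = generate_regression_results_dict(
    sklearn_model,
    X,
    y,
    samplers=["random", "kmeans"],
    samplers_hopts={"kmeans": {"n_clusters": 4}},
    print_results=True,
)
```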

## Online Documentation
[The online documentation](https://JacksonBurns.github.io/astartes/) contains everything you see in this README with an additional tutorial for [moving from `train_test_split` in `sklearn` to `astartes`](https://jacksonburns.github.io/astartes/sklearn_to_astartes.html).

8 changes: 8 additions & 0 deletions astartes/utils/exceptions.py
@@ -9,6 +9,14 @@ def __init__(self, message=None):
        super().__init__(message)


class InvalidModelTypeError(RuntimeError):
    """Used when user-provided model is invalid."""

    def __init__(self, message=None):
        self.message = message
        super().__init__(message)


class InvalidConfigurationError(RuntimeError):
    """Used when user-requested split/data would not work."""

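For context on how the new exception surfaces to callers, here is a minimal hedged sketch; it assumes `X` and `y` are defined as in the README example above.

```
# minimal sketch: passing anything that is not an sklearn BaseEstimator
# (here, a plain string) triggers the new exception
from astartes.utils.exceptions import InvalidModelTypeError
from astartes.utils.utils import generate_regression_results_dict

try:
    generate_regression_results_dict("not a model", X, y)
except InvalidModelTypeError as error:
    print(error)  # Model must be an sklearn model
```
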
158 changes: 158 additions & 0 deletions astartes/utils/utils.py
@@ -0,0 +1,158 @@
import sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tabulate import tabulate

from astartes import train_val_test_split
from astartes.utils.exceptions import InvalidModelTypeError


def generate_regression_results_dict(
    sklearn_model,
    X,
    y,
    samplers=["random"],
    random_state=0,
    samplers_hopts={},
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    print_results=False,
):
    """Helper function to train an sklearn model using the provided data and sampler types.

    Args:
        sklearn_model (sklearn.base.BaseEstimator): The regression model to train and evaluate.
        X (np.array, pd.DataFrame): Numpy array or pandas DataFrame of feature vectors.
        y (np.array, pd.Series): Targets corresponding to X, must be of same size.
        samplers (list, optional): List of sampler names to evaluate. Defaults to ["random"].
        random_state (int, optional): The random seed used throughout astartes. Defaults to 0.
        samplers_hopts (dict, optional): Dictionary of dictionaries with the keys specifying
            the sampler and the values being another dictionary with the
            corresponding hyperparameters. Defaults to {}.
        train_size (float, optional): Fraction of dataset to use in training set. Defaults to 0.8.
        val_size (float, optional): Fraction of dataset to use in validation set. Defaults to 0.1.
        test_size (float, optional): Fraction of dataset to use in test set. Defaults to 0.1.
        print_results (bool, optional): Whether to print the resulting dictionary as a neat table. Defaults to False.

    Raises:
        InvalidModelTypeError: If sklearn_model is not an sklearn estimator.

    Returns:
        dict: nested dictionary with the format of
            {
                sampler: {
                    'mae': {
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                    'rmse': {
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                    'R2': {
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                },
            }
    """
    if not isinstance(sklearn_model, sklearn.base.BaseEstimator):
        raise InvalidModelTypeError("Model must be an sklearn model")

    final_dict = {}
    for sampler in samplers:
        error_dict = {
            "mae": {
                "train": [],
                "val": [],
                "test": [],
            },
            "rmse": {
                "train": [],
                "val": [],
                "test": [],
            },
            "R2": {
                "train": [],
                "val": [],
                "test": [],
            },
        }

        # obtain indices
        _, _, _, train_indices, val_indices, test_indices = train_val_test_split(
            X,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size,
            sampler=sampler,
            random_state=random_state,
            hopts=samplers_hopts.get(sampler, dict()),
            return_indices=True,
        )

        # create data splits
        X_train = X[train_indices]
        X_val = X[val_indices]
        X_test = X[test_indices]

        y_train = y[train_indices]
        y_val = y[val_indices]
        y_test = y[test_indices]

        # fit the model to the training data
        sklearn_model.fit(X_train, y_train)

        # get predictions
        y_pred_train = sklearn_model.predict(X_train)
        y_pred_val = sklearn_model.predict(X_val)
        y_pred_test = sklearn_model.predict(X_test)

        # store MAEs
        train_mae = mean_absolute_error(y_train, y_pred_train)
        error_dict["mae"]["train"].append(train_mae)

        val_mae = mean_absolute_error(y_val, y_pred_val)
        error_dict["mae"]["val"].append(val_mae)

        test_mae = mean_absolute_error(y_test, y_pred_test)
        error_dict["mae"]["test"].append(test_mae)

        # store RMSEs
        train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
        error_dict["rmse"]["train"].append(train_rmse)

        val_rmse = mean_squared_error(y_val, y_pred_val, squared=False)
        error_dict["rmse"]["val"].append(val_rmse)

        test_rmse = mean_squared_error(y_test, y_pred_test, squared=False)
        error_dict["rmse"]["test"].append(test_rmse)

        # store R2
        train_R2 = r2_score(y_train, y_pred_train)
        error_dict["R2"]["train"].append(train_R2)

        val_R2 = r2_score(y_val, y_pred_val)
        error_dict["R2"]["val"].append(val_R2)

        test_R2 = r2_score(y_test, y_pred_test)
        error_dict["R2"]["test"].append(test_R2)

        final_dict[sampler] = error_dict

        if print_results:
            print(f"\nDisplaying results for {sampler} sampler")
            display_results_as_table(error_dict)

    return final_dict


def display_results_as_table(error_dict):
    """Helper function to print a dictionary as a neat tabulate."""
    headers = ["Train", "Val", "Test"]
    table = []
    for key, val in error_dict.items():
        table_tmp = [key.upper()]
        table_tmp.extend([val[0] for val in val.values()])
        table.append(table_tmp)
    print(tabulate(table, headers=headers))
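
To make the nested return format documented above concrete, here is a small sketch of reading values back out of the dictionary; it assumes `results_dict` was produced with the default `random` sampler, as in the README example.

```
# each metric list holds the value(s) appended for that sampler;
# with a single call there is exactly one entry per list
random_results = results_dict["random"]
print(random_results["mae"]["train"][0])   # training MAE
print(random_results["rmse"]["test"][0])   # test RMSE
print(random_results["R2"]["val"][0])      # validation R2
```
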
119 changes: 119 additions & 0 deletions test/unit/utils/test_utils.py
@@ -0,0 +1,119 @@
import unittest

import numpy as np
from sklearn.svm import LinearSVR

from astartes.samplers.interpolation import Random
from astartes.utils.exceptions import InvalidModelTypeError
from astartes.utils.utils import generate_regression_results_dict


class Test_utils(unittest.TestCase):
    """
    Test functions within utils.py.
    """

    @classmethod
    def setUpClass(self):
        """Save re-used arrays as class attributes."""
        # X and y come from sklearn's make_regression function
        self.X = np.array(
            [
                [-0.86, -0.98],
                [-0.42, -0.87],
                [1.33, 0.20],
                [-0.25, 2.43],
                [-0.59, -0.91],
                [-0.33, 0.19],
                [-0.10, -0.01],
                [1.86, 1.15],
                [0.64, -1.51],
                [-0.36, 0.06],
                [0.6, -0.36],
                [1.56, -0.09],
                [-0.70, -1.66],
                [-0.33, 0.44],
                [1.58, 0.11],
                [0.25, -0.05],
                [-0.63, 0.79],
                [-0.11, 0.00],
                [-0.20, -1.19],
                [0.71, 1.00],
            ]
        )
        self.y = np.array(
            [
                -10.27,
                -6.19,
                12.13,
                4.90,
                -7.77,
                -2.31,
                -0.89,
                19.42,
                1.18,
                -2.97,
                4.18,
                13.26,
                -10.90,
                -1.58,
                14.01,
                2.00,
                -3.16,
                -0.91,
                -5.25,
                9.07,
            ]
        )

    def test_generate_regression_results_dict(self):
        """Generate results dictionary for simple regression task."""

        # test that error is raised if not using sklearn model
        with self.assertRaises(InvalidModelTypeError) as e:
            generate_regression_results_dict(
                Random,
                self.X,
                self.y,
                train_size=0.6,
                val_size=0.2,
                test_size=0.2,
            )

        # use default hyperparameters
        sklearn_model = LinearSVR()

        # test function call and also that a table can be printed without error
        results_dict = generate_regression_results_dict(
            sklearn_model,
            self.X,
            self.y,
            train_size=0.6,
            val_size=0.2,
            test_size=0.2,
            print_results=True,
        )

        # test that only results for the default random sampler are included
        self.assertEqual(
            len(results_dict),
            1,
            msg=f"results_dict contained {results_dict.keys()}. Expected just random sampler.",
        )
        # test that results for mae, rmse, and r2 are included
        self.assertTrue(
            "mae" in results_dict["random"].keys(),
            msg="results_dict did not contain MAE results.",
        )
        self.assertTrue(
            "rmse" in results_dict["random"].keys(),
            msg="results_dict did not contain RMSE results.",
        )
        self.assertTrue(
            "R2" in results_dict["random"].keys(),
            msg="results_dict did not contain R2 results.",
        )


if __name__ == "__main__":
    unittest.main()
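
Because of the `if __name__ == "__main__"` guard, these tests can be run directly with `python test/unit/utils/test_utils.py` (the path shown in the diff header above) or through the standard `unittest` discovery mechanism.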
