Add function to obtain values for a table (#122)
This PR adds a new function to compute (and optionally print as a formatted table) regression error metrics (MAE, RMSE, and R2) for different samplers on a user-provided model.
  • Loading branch information
2 parents 170fbfc + f54892c commit 13f5ba5
Showing 4 changed files with 309 additions and 0 deletions.
24 changes: 24 additions & 0 deletions README.md
@@ -128,6 +128,30 @@ Running `astartes` with the default settings will always produce the exact same
We have verified this behavior on Debian Ubuntu, Windows, and Intel Macs from Python versions 3.7 through 3.11 (with appropriate dependencies for each version).
We are limited in our ability to test on M1 Macs, but from our manual testing we achieve perfect reproducibility in all cases _except occasionally_ with `KMeans` on Apple silicon, which has produced _slightly_ different results between platforms regardless of `random_state`, with up to two clusters assigned differently, resulting in data splits that are >99% identical. `astartes` is still consistent between runs on the same platform in all cases.

## Evaluate the impact of splitting algorithms
The `generate_regression_results_dict` function allows users to quickly evaluate the impact of different splitting techniques on the performance of any `sklearn`-compatible regression model. Results are returned as a nested dictionary and can also be printed as a neatly formatted table via the optional `print_results` argument.

```
from sklearn.svm import LinearSVR
from astartes.utils.utils import generate_regression_results_dict

# X and y hold your feature vectors and regression targets
sklearn_model = LinearSVR()
results_dict = generate_regression_results_dict(
    sklearn_model,
    X,
    y,
    print_results=True,
)

# printed output:
        Train      Val     Test
----  -------  -------  -------
MAE   1.41522  3.13435  2.17091
RMSE  2.03062  3.73721  2.40041
R2    0.90745  0.80787  0.78412
```
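
Because `samplers` accepts a list, several splitting techniques can be compared in a single call. A minimal sketch is shown below; the `"kmeans"` sampler name and its `n_clusters` hyperparameter are illustrative assumptions here, so consult the `astartes` documentation for the samplers and hyperparameters actually available.

```
# sketch only: "kmeans" and its "n_clusters" hyperparameter are assumed for
# illustration; check the astartes docs for the supported samplers and hopts
results_dict = generate_regression_results_dict(
    sklearn_model,
    X,
    y,
    samplers=["random", "kmeans"],
    samplers_hopts={"kmeans": {"n_clusters": 4}},
    print_results=True,
)
```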

## Online Documentation
[The online documentation](https://JacksonBurns.github.io/astartes/) contains everything you see in this README with an additional tutorial for [moving from `train_test_split` in `sklearn` to `astartes`](https://jacksonburns.github.io/astartes/sklearn_to_astartes.html).

8 changes: 8 additions & 0 deletions astartes/utils/exceptions.py
@@ -9,6 +9,14 @@ def __init__(self, message=None):
        super().__init__(message)


class InvalidModelTypeError(RuntimeError):
    """Used when user-provided model is invalid."""

    def __init__(self, message=None):
        self.message = message
        super().__init__(message)


class InvalidConfigurationError(RuntimeError):
    """Used when user-requested split/data would not work."""

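For context on how the new exception surfaces to callers, here is a minimal hedged sketch; it assumes `X` and `y` are defined as in the README example above.

```
# minimal sketch: passing anything that is not an sklearn BaseEstimator
# (here, a plain string) triggers the new exception
from astartes.utils.exceptions import InvalidModelTypeError
from astartes.utils.utils import generate_regression_results_dict

try:
    generate_regression_results_dict("not a model", X, y)
except InvalidModelTypeError as error:
    print(error)  # Model must be an sklearn model
```
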
158 changes: 158 additions & 0 deletions astartes/utils/utils.py
@@ -0,0 +1,158 @@
import sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tabulate import tabulate

from astartes import train_val_test_split
from astartes.utils.exceptions import InvalidModelTypeError


def generate_regression_results_dict(
    sklearn_model,
    X,
    y,
    samplers=["random"],
    random_state=0,
    samplers_hopts={},
    train_size=0.8,
    val_size=0.1,
    test_size=0.1,
    print_results=False,
):
    """Helper function to train an sklearn model using the provided data and sampler types.

    Args:
        sklearn_model (sklearn.base.BaseEstimator): The regression model to train and evaluate.
        X (np.array, pd.DataFrame): Numpy array or pandas DataFrame of feature vectors.
        y (np.array, pd.Series): Targets corresponding to X, must be of same size.
        samplers (list, optional): List of sampler names to evaluate. Defaults to ["random"].
        random_state (int, optional): The random seed used throughout astartes. Defaults to 0.
        samplers_hopts (dict, optional): Dictionary of dictionaries with the keys specifying
            the sampler and the values being another dictionary with the
            corresponding hyperparameters. Defaults to {}.
        train_size (float, optional): Fraction of dataset to use in training set. Defaults to 0.8.
        val_size (float, optional): Fraction of dataset to use in validation set. Defaults to 0.1.
        test_size (float, optional): Fraction of dataset to use in test set. Defaults to 0.1.
        print_results (bool, optional): Whether to print the resulting dictionary as a neat table. Defaults to False.

    Raises:
        InvalidModelTypeError: If sklearn_model is not an sklearn estimator.

    Returns:
        dict: nested dictionary with the format of
            {
                sampler: {
                    'mae': {
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                    'rmse': {
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                    'R2': {
                        'train': [],
                        'val': [],
                        'test': [],
                    },
                },
            }
    """
    if not isinstance(sklearn_model, sklearn.base.BaseEstimator):
        raise InvalidModelTypeError("Model must be an sklearn model")

    final_dict = {}
    for sampler in samplers:
        error_dict = {
            "mae": {
                "train": [],
                "val": [],
                "test": [],
            },
            "rmse": {
                "train": [],
                "val": [],
                "test": [],
            },
            "R2": {
                "train": [],
                "val": [],
                "test": [],
            },
        }

        # obtain indices
        _, _, _, train_indices, val_indices, test_indices = train_val_test_split(
            X,
            train_size=train_size,
            val_size=val_size,
            test_size=test_size,
            sampler=sampler,
            random_state=random_state,
            hopts=samplers_hopts.get(sampler, dict()),
            return_indices=True,
        )

        # create data splits
        X_train = X[train_indices]
        X_val = X[val_indices]
        X_test = X[test_indices]

        y_train = y[train_indices]
        y_val = y[val_indices]
        y_test = y[test_indices]

        # fit the model to the training data
        sklearn_model.fit(X_train, y_train)

        # get predictions
        y_pred_train = sklearn_model.predict(X_train)
        y_pred_val = sklearn_model.predict(X_val)
        y_pred_test = sklearn_model.predict(X_test)

        # store MAEs
        train_mae = mean_absolute_error(y_train, y_pred_train)
        error_dict["mae"]["train"].append(train_mae)

        val_mae = mean_absolute_error(y_val, y_pred_val)
        error_dict["mae"]["val"].append(val_mae)

        test_mae = mean_absolute_error(y_test, y_pred_test)
        error_dict["mae"]["test"].append(test_mae)

        # store RMSEs
        train_rmse = mean_squared_error(y_train, y_pred_train, squared=False)
        error_dict["rmse"]["train"].append(train_rmse)

        val_rmse = mean_squared_error(y_val, y_pred_val, squared=False)
        error_dict["rmse"]["val"].append(val_rmse)

        test_rmse = mean_squared_error(y_test, y_pred_test, squared=False)
        error_dict["rmse"]["test"].append(test_rmse)

        # store R2
        train_R2 = r2_score(y_train, y_pred_train)
        error_dict["R2"]["train"].append(train_R2)

        val_R2 = r2_score(y_val, y_pred_val)
        error_dict["R2"]["val"].append(val_R2)

        test_R2 = r2_score(y_test, y_pred_test)
        error_dict["R2"]["test"].append(test_R2)

        final_dict[sampler] = error_dict

        if print_results:
            print(f"\nDisplaying results for {sampler} sampler")
            display_results_as_table(error_dict)

    return final_dict


def display_results_as_table(error_dict):
    """Helper function to print a dictionary as a neat tabulate."""
    headers = ["Train", "Val", "Test"]
    table = []
    for key, val in error_dict.items():
        table_tmp = [key.upper()]
        table_tmp.extend([val[0] for val in val.values()])
        table.append(table_tmp)
    print(tabulate(table, headers=headers))
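
To make the nested return format documented above concrete, here is a small sketch of reading values back out of the dictionary; it assumes `results_dict` was produced with the default `random` sampler, as in the README example.

```
# each metric list holds the value(s) appended for that sampler;
# with a single call there is exactly one entry per list
random_results = results_dict["random"]
print(random_results["mae"]["train"][0])   # training MAE
print(random_results["rmse"]["test"][0])   # test RMSE
print(random_results["R2"]["val"][0])      # validation R2
```
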
119 changes: 119 additions & 0 deletions test/unit/utils/test_utils.py
@@ -0,0 +1,119 @@
import unittest

import numpy as np
from sklearn.svm import LinearSVR

from astartes.samplers.interpolation import Random
from astartes.utils.exceptions import InvalidModelTypeError
from astartes.utils.utils import generate_regression_results_dict


class Test_utils(unittest.TestCase):
    """
    Test functions within utils.py.
    """

    @classmethod
    def setUpClass(self):
        """Save re-used arrays as class attributes."""
        # X and y come from sklearn's make_regression function
        self.X = np.array(
            [
                [-0.86, -0.98],
                [-0.42, -0.87],
                [1.33, 0.20],
                [-0.25, 2.43],
                [-0.59, -0.91],
                [-0.33, 0.19],
                [-0.10, -0.01],
                [1.86, 1.15],
                [0.64, -1.51],
                [-0.36, 0.06],
                [0.6, -0.36],
                [1.56, -0.09],
                [-0.70, -1.66],
                [-0.33, 0.44],
                [1.58, 0.11],
                [0.25, -0.05],
                [-0.63, 0.79],
                [-0.11, 0.00],
                [-0.20, -1.19],
                [0.71, 1.00],
            ]
        )
        self.y = np.array(
            [
                -10.27,
                -6.19,
                12.13,
                4.90,
                -7.77,
                -2.31,
                -0.89,
                19.42,
                1.18,
                -2.97,
                4.18,
                13.26,
                -10.90,
                -1.58,
                14.01,
                2.00,
                -3.16,
                -0.91,
                -5.25,
                9.07,
            ]
        )

    def test_generate_regression_results_dict(self):
        """Generate results dictionary for simple regression task."""

        # test that error is raised if not using sklearn model
        with self.assertRaises(InvalidModelTypeError) as e:
            generate_regression_results_dict(
                Random,
                self.X,
                self.y,
                train_size=0.6,
                val_size=0.2,
                test_size=0.2,
            )

        # use default hyperparameters
        sklearn_model = LinearSVR()

        # test function call and also that a table can be printed without error
        results_dict = generate_regression_results_dict(
            sklearn_model,
            self.X,
            self.y,
            train_size=0.6,
            val_size=0.2,
            test_size=0.2,
            print_results=True,
        )

        # test that only results for the default random sampler are included
        self.assertEqual(
            len(results_dict),
            1,
            msg=f"results_dict contained {results_dict.keys()}. Expected just random sampler.",
        )
        # test that results for mae, rmse, and r2 are included
        self.assertTrue(
            "mae" in results_dict["random"].keys(),
            msg="results_dict did not contain MAE results.",
        )
        self.assertTrue(
            "rmse" in results_dict["random"].keys(),
            msg="results_dict did not contain RMSE results.",
        )
        self.assertTrue(
            "R2" in results_dict["random"].keys(),
            msg="results_dict did not contain R2 results.",
        )


if __name__ == "__main__":
    unittest.main()
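
Because of the `if __name__ == "__main__"` guard, these tests can be run directly with `python test/unit/utils/test_utils.py` (the path shown in the diff header above) or through the standard `unittest` discovery mechanism.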
