From 2263551ddc648eaa02805940b261c88e90ee2ada Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Thu, 1 Jun 2023 16:56:43 -0400 Subject: [PATCH 01/10] Draft a function to obtain values for a table --- examples/table_script/table_script.py | 157 ++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 examples/table_script/table_script.py diff --git a/examples/table_script/table_script.py b/examples/table_script/table_script.py new file mode 100644 index 0000000..c9311ab --- /dev/null +++ b/examples/table_script/table_script.py @@ -0,0 +1,157 @@ +""" +Draft script to train a simple sklearn model +and output the results that could be used to create +a table similar to what is from the paper. + +""" + +import numpy as np +import pandas as pd +from pprint import pprint + +from rdkit import Chem, DataStructs +from rdkit.Chem import AllChem + +from sklearn.svm import LinearSVR +from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error + +from astartes import train_val_test_split + + +# read in the data +CSV_PATH = '../barrier_prediction_with_RDB7/ccsdtf12_dz.csv' +df = pd.read_csv(CSV_PATH) +df + +# helper function to featurize the data with 2048 morgan fingerprint +# https://github.com/chemprop/chemprop/blob/master/chemprop/features/features_generators.py +MORGAN_RADIUS = 2 +MORGAN_NUM_BITS = 2048 +def morgan_counts_features_generator(mol, + radius= MORGAN_RADIUS, + num_bits= MORGAN_NUM_BITS): + """ + Generates a counts-based Morgan fingerprint for a molecule. + :param mol: A molecule (i.e., either a SMILES or an RDKit molecule). + :param radius: Morgan fingerprint radius. + :param num_bits: Number of bits in Morgan fingerprint. + :return: A 1D numpy array containing the counts-based Morgan fingerprint. + """ + mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol + features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits) + features = np.zeros((1,)) + DataStructs.ConvertToNumpyArray(features_vec, features) + + return features + +# create X and y +params = Chem.SmilesParserParams() +params.removeHs = False + +X = np.zeros((len(df), 2048*2)) +for i, row in df.iterrows(): + rsmi, psmi = row.rsmi, row.psmi + + rmol = Chem.MolFromSmiles(rsmi, params) + r_morgan = morgan_counts_features_generator(rmol) + + pmol = Chem.MolFromSmiles(psmi, params) + p_morgan = morgan_counts_features_generator(pmol) + + X[i, :] = np.concatenate((r_morgan, + p_morgan - r_morgan), + axis=0) + +y = df.dE0.values + +def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},): + final_dict = {} + for sampler in samplers: + error_dict = {'mae': {'train': [], + 'val': [], + 'test': [], + }, + 'rmse': {'train': [], + 'val': [], + 'test': [], + }, + 'R2': {'train': [], + 'val': [], + 'test': [], + }, + } + + # obtain indices + _,_,_, train_indices, val_indices, test_indices = train_val_test_split(X, + train_size=0.85, + val_size=0.05, + test_size=0.1, + sampler=sampler, + random_state=seed, + hopts=hopts, + return_indices=True, + ) + + + # create data splits + X_train = X[train_indices] + X_val = X[val_indices] + X_test = X[test_indices] + + y_train = y[train_indices] + y_val = y[val_indices] + y_test = y[test_indices] + + + # fit the model to the training data + sklearn_model.fit(X_train, y_train) + + # get predictions + y_pred_train = sklearn_model.predict(X_train) + y_pred_val = sklearn_model.predict(X_val) + y_pred_test = sklearn_model.predict(X_test) + + + # store MAEs + train_mae = mean_absolute_error(y_train, y_pred_train) + 
error_dict['mae']['train'].append(train_mae) + + val_mae = mean_absolute_error(y_val, y_pred_val) + error_dict['mae']['val'].append(val_mae) + + test_mae = mean_absolute_error(y_test, y_pred_test) + error_dict['mae']['test'].append(test_mae) + + + # store RMSEs + train_rmse = mean_squared_error(y_train, y_pred_train, squared=False) + error_dict['rmse']['train'].append(train_rmse) + + val_rmse = mean_squared_error(y_val, y_pred_val, squared=False) + error_dict['rmse']['val'].append(val_rmse) + + test_rmse = mean_squared_error(y_test, y_pred_test, squared=False) + error_dict['rmse']['test'].append(test_rmse) + + + # store R2 + train_R2 = r2_score(y_train, y_pred_train) + error_dict['R2']['train'].append(train_R2) + + val_R2 = r2_score(y_val, y_pred_val) + error_dict['R2']['val'].append(val_R2) + + test_R2 = r2_score(y_test, y_pred_test) + error_dict['R2']['test'].append(test_R2) + + final_dict[sampler] = error_dict + + return final_dict + + +# use default hyperparameters +sklearn_model = LinearSVR() + +final_dict = sample_function(sklearn_model, X, y) +pprint(final_dict) + From 6398242f7e3cad8271316cf8e4adb6f1ce416a60 Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Mon, 12 Jun 2023 14:35:17 -0400 Subject: [PATCH 02/10] Remove whitespace --- examples/table_script/table_script.py | 36 ++++++++++++--------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/examples/table_script/table_script.py b/examples/table_script/table_script.py index c9311ab..91458e7 100644 --- a/examples/table_script/table_script.py +++ b/examples/table_script/table_script.py @@ -21,7 +21,7 @@ # read in the data CSV_PATH = '../barrier_prediction_with_RDB7/ccsdtf12_dz.csv' df = pd.read_csv(CSV_PATH) -df +print(df.shape) # helper function to featurize the data with 2048 morgan fingerprint # https://github.com/chemprop/chemprop/blob/master/chemprop/features/features_generators.py @@ -51,13 +51,13 @@ def morgan_counts_features_generator(mol, X = np.zeros((len(df), 2048*2)) for i, row in df.iterrows(): rsmi, psmi = row.rsmi, row.psmi - + rmol = Chem.MolFromSmiles(rsmi, params) r_morgan = morgan_counts_features_generator(rmol) - + pmol = Chem.MolFromSmiles(psmi, params) p_morgan = morgan_counts_features_generator(pmol) - + X[i, :] = np.concatenate((r_morgan, p_morgan - r_morgan), axis=0) @@ -80,7 +80,7 @@ def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},) 'test': [], }, } - + # obtain indices _,_,_, train_indices, val_indices, test_indices = train_val_test_split(X, train_size=0.85, @@ -90,28 +90,27 @@ def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},) random_state=seed, hopts=hopts, return_indices=True, - ) - - + ) + # create data splits X_train = X[train_indices] X_val = X[val_indices] X_test = X[test_indices] - + y_train = y[train_indices] y_val = y[val_indices] y_test = y[test_indices] - - + + # fit the model to the training data sklearn_model.fit(X_train, y_train) - + # get predictions y_pred_train = sklearn_model.predict(X_train) y_pred_val = sklearn_model.predict(X_val) y_pred_test = sklearn_model.predict(X_test) - - + + # store MAEs train_mae = mean_absolute_error(y_train, y_pred_train) error_dict['mae']['train'].append(train_mae) @@ -121,8 +120,7 @@ def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},) test_mae = mean_absolute_error(y_test, y_pred_test) error_dict['mae']['test'].append(test_mae) - - + # store RMSEs train_rmse = mean_squared_error(y_train, y_pred_train, squared=False) 
error_dict['rmse']['train'].append(train_rmse) @@ -133,7 +131,6 @@ def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},) test_rmse = mean_squared_error(y_test, y_pred_test, squared=False) error_dict['rmse']['test'].append(test_rmse) - # store R2 train_R2 = r2_score(y_train, y_pred_train) error_dict['R2']['train'].append(train_R2) @@ -143,9 +140,9 @@ def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},) test_R2 = r2_score(y_test, y_pred_test) error_dict['R2']['test'].append(test_R2) - + final_dict[sampler] = error_dict - + return final_dict @@ -154,4 +151,3 @@ def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},) final_dict = sample_function(sklearn_model, X, y) pprint(final_dict) - From 47c56f9f0fd0e1c3460f8b91885a2eec1753dc38 Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Mon, 12 Jun 2023 14:52:40 -0400 Subject: [PATCH 03/10] Update script --- examples/table_script/table_script.py | 63 ++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/examples/table_script/table_script.py b/examples/table_script/table_script.py index 91458e7..a000a2a 100644 --- a/examples/table_script/table_script.py +++ b/examples/table_script/table_script.py @@ -5,19 +5,17 @@ """ -import numpy as np -import pandas as pd from pprint import pprint +import numpy as np +import pandas as pd from rdkit import Chem, DataStructs from rdkit.Chem import AllChem - +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from sklearn.svm import LinearSVR -from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error from astartes import train_val_test_split - # read in the data CSV_PATH = '../barrier_prediction_with_RDB7/ccsdtf12_dz.csv' df = pd.read_csv(CSV_PATH) @@ -64,7 +62,50 @@ def morgan_counts_features_generator(mol, y = df.dE0.values -def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},): +def produce_table(sklearn_model, + X, + y, + samplers=["random"], + seed=0, + sampler_hopts={}, + train_size=0.8, + val_size=0.1, + test_size=0.1, + ): + """ + Helper function to train a sklearn model using the provided data + and provided sampler types. + + Args: + X (np.array, pd.DataFrame): Numpy array or pandas DataFrame of feature vectors. + y (np.array, pd.Series): Targets corresponding to X, must be of same size. + train_size (float, optional): Fraction of dataset to use in training set. Defaults to 0.8. + val_size (float, optional): Fraction of dataset to use in validation set. Defaults to 0.1. + test_size (float, optional): Fraction of dataset to use in test set. Defaults to 0.1. + random_state (int, optional): The random seed used throughout astartes. 
+ + Returns: + dict: nested dictionary with the format of + { + sampler: { + 'mae':{ + 'train': [], + 'val': [], + 'test': [], + }, + 'rmse':{ + 'train': [], + 'val': [], + 'test': [], + }, + 'R2':{ + 'train': [], + 'val': [], + 'test': [], + }, + }, + } + """ final_dict = {} for sampler in samplers: error_dict = {'mae': {'train': [], @@ -83,12 +124,12 @@ def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},) # obtain indices _,_,_, train_indices, val_indices, test_indices = train_val_test_split(X, - train_size=0.85, - val_size=0.05, - test_size=0.1, + train_size=train_size, + val_size=val_size, + test_size=test_size, sampler=sampler, random_state=seed, - hopts=hopts, + hopts=sampler_hopts.get(sampler, dict()), return_indices=True, ) @@ -149,5 +190,5 @@ def produce_table(sklearn_model, X, y, samplers = ["random"], seed=0, hopts={},) # use default hyperparameters sklearn_model = LinearSVR() -final_dict = sample_function(sklearn_model, X, y) +final_dict = produce_table(sklearn_model, X, y) pprint(final_dict) From 14a3e9809b8c51b8a26f8bad4e73be214b71bbdf Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Mon, 12 Jun 2023 15:35:08 -0400 Subject: [PATCH 04/10] Update docstring --- examples/table_script/table_script.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/table_script/table_script.py b/examples/table_script/table_script.py index a000a2a..5119600 100644 --- a/examples/table_script/table_script.py +++ b/examples/table_script/table_script.py @@ -66,7 +66,7 @@ def produce_table(sklearn_model, X, y, samplers=["random"], - seed=0, + random_state=0, sampler_hopts={}, train_size=0.8, val_size=0.1, @@ -83,7 +83,11 @@ def produce_table(sklearn_model, val_size (float, optional): Fraction of dataset to use in validation set. Defaults to 0.1. test_size (float, optional): Fraction of dataset to use in test set. Defaults to 0.1. random_state (int, optional): The random seed used throughout astartes. - + sampler_hopts (dict, optional): Hyperparameters for the sampler used above. + Should be a dictionary of dictionaries with the keys specifying + the sampler and the values being another dictionary with the + hyperparameters. Defaults to {}. + Returns: dict: nested dictionary with the format of { @@ -128,7 +132,7 @@ def produce_table(sklearn_model, val_size=val_size, test_size=test_size, sampler=sampler, - random_state=seed, + random_state=random_state, hopts=sampler_hopts.get(sampler, dict()), return_indices=True, ) From 00b161e72b620bffb6149ea6cb1ccfcd4d5613ee Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Tue, 20 Jun 2023 09:27:53 -0400 Subject: [PATCH 05/10] Move function to utils folder --- .../utils/utils.py | 111 ++++-------------- 1 file changed, 22 insertions(+), 89 deletions(-) rename examples/table_script/table_script.py => astartes/utils/utils.py (58%) diff --git a/examples/table_script/table_script.py b/astartes/utils/utils.py similarity index 58% rename from examples/table_script/table_script.py rename to astartes/utils/utils.py index 5119600..d2fd528 100644 --- a/examples/table_script/table_script.py +++ b/astartes/utils/utils.py @@ -1,77 +1,18 @@ -""" -Draft script to train a simple sklearn model -and output the results that could be used to create -a table similar to what is from the paper. 
- -""" - -from pprint import pprint - -import numpy as np -import pandas as pd -from rdkit import Chem, DataStructs -from rdkit.Chem import AllChem from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score -from sklearn.svm import LinearSVR from astartes import train_val_test_split -# read in the data -CSV_PATH = '../barrier_prediction_with_RDB7/ccsdtf12_dz.csv' -df = pd.read_csv(CSV_PATH) -print(df.shape) - -# helper function to featurize the data with 2048 morgan fingerprint -# https://github.com/chemprop/chemprop/blob/master/chemprop/features/features_generators.py -MORGAN_RADIUS = 2 -MORGAN_NUM_BITS = 2048 -def morgan_counts_features_generator(mol, - radius= MORGAN_RADIUS, - num_bits= MORGAN_NUM_BITS): - """ - Generates a counts-based Morgan fingerprint for a molecule. - :param mol: A molecule (i.e., either a SMILES or an RDKit molecule). - :param radius: Morgan fingerprint radius. - :param num_bits: Number of bits in Morgan fingerprint. - :return: A 1D numpy array containing the counts-based Morgan fingerprint. - """ - mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol - features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits) - features = np.zeros((1,)) - DataStructs.ConvertToNumpyArray(features_vec, features) - - return features - -# create X and y -params = Chem.SmilesParserParams() -params.removeHs = False - -X = np.zeros((len(df), 2048*2)) -for i, row in df.iterrows(): - rsmi, psmi = row.rsmi, row.psmi - - rmol = Chem.MolFromSmiles(rsmi, params) - r_morgan = morgan_counts_features_generator(rmol) - - pmol = Chem.MolFromSmiles(psmi, params) - p_morgan = morgan_counts_features_generator(pmol) - - X[i, :] = np.concatenate((r_morgan, - p_morgan - r_morgan), - axis=0) - -y = df.dE0.values - -def produce_table(sklearn_model, - X, - y, - samplers=["random"], - random_state=0, - sampler_hopts={}, - train_size=0.8, - val_size=0.1, - test_size=0.1, - ): + +def generate_regression_results_dict(sklearn_model, + X, + y, + samplers=["random"], + random_state=0, + samplers_hopts={}, + train_size=0.8, + val_size=0.1, + test_size=0.1, + ): """ Helper function to train a sklearn model using the provided data and provided sampler types. @@ -83,10 +24,9 @@ def produce_table(sklearn_model, val_size (float, optional): Fraction of dataset to use in validation set. Defaults to 0.1. test_size (float, optional): Fraction of dataset to use in test set. Defaults to 0.1. random_state (int, optional): The random seed used throughout astartes. - sampler_hopts (dict, optional): Hyperparameters for the sampler used above. - Should be a dictionary of dictionaries with the keys specifying - the sampler and the values being another dictionary with the - hyperparameters. Defaults to {}. + samplers_hopts (dict, optional): Should be a dictionary of dictionaries with the keys specifying + the sampler and the values being another dictionary with the + corresponding hyperparameters. Defaults to {}. 
Returns: dict: nested dictionary with the format of @@ -128,14 +68,14 @@ def produce_table(sklearn_model, # obtain indices _,_,_, train_indices, val_indices, test_indices = train_val_test_split(X, - train_size=train_size, - val_size=val_size, - test_size=test_size, - sampler=sampler, - random_state=random_state, - hopts=sampler_hopts.get(sampler, dict()), - return_indices=True, - ) + train_size=train_size, + val_size=val_size, + test_size=test_size, + sampler=sampler, + random_state=random_state, + hopts=samplers_hopts.get(sampler, dict()), + return_indices=True, + ) # create data splits X_train = X[train_indices] @@ -189,10 +129,3 @@ def produce_table(sklearn_model, final_dict[sampler] = error_dict return final_dict - - -# use default hyperparameters -sklearn_model = LinearSVR() - -final_dict = produce_table(sklearn_model, X, y) -pprint(final_dict) From 3d5bdac32d2f219a58cd621f786a9f3c328cc22e Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Tue, 20 Jun 2023 09:28:08 -0400 Subject: [PATCH 06/10] Add unittests --- test/unit/utils/test_utils.py | 75 +++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 test/unit/utils/test_utils.py diff --git a/test/unit/utils/test_utils.py b/test/unit/utils/test_utils.py new file mode 100644 index 0000000..ea85e5b --- /dev/null +++ b/test/unit/utils/test_utils.py @@ -0,0 +1,75 @@ +import unittest + +import numpy as np +from sklearn.svm import LinearSVR + +from astartes.utils.utils import generate_regression_results_dict + + +class Test_utils(unittest.TestCase): + """ + Test functions within utils.py. + """ + + @classmethod + def setUpClass(self): + """Save re-used arrays as class attributes.""" + # X and y come from sklearn's make_regression function + self.X = np.array( + [ + [-0.86, -0.98], + [-0.42, -0.87], + [ 1.33, 0.20], + [-0.25, 2.43], + [-0.59, -0.91], + [-0.33, 0.19], + [-0.10, -0.01], + [ 1.86, 1.15], + [ 0.64, -1.51], + [-0.36, 0.06], + [ 0.6 , -0.36], + [ 1.56, -0.09], + [-0.70, -1.66], + [-0.33, 0.44], + [ 1.58, 0.11], + [ 0.25, -0.05], + [-0.63, 0.79], + [-0.11, 0.00], + [-0.20, -1.19], + [ 0.71, 1.00], + ] + ) + self.y = np.array([-10.27, -6.19, 12.13, 4.90 , -7.77, -2.31, -0.89, + 19.42, 1.18, -2.97, 4.18, 13.26, -10.90 , -1.58, + 14.01, 2.00, -3.16, -0.91, -5.25, 9.07]) + + def test_generate_regression_results_dict(self): + """Generate results dictionary for simple regression task.""" + + # use default hyperparameters + sklearn_model = LinearSVR() + + results_dict = generate_regression_results_dict(sklearn_model, self.X, self.y, + train_size=0.6, + val_size=0.2, + test_size=0.2, + ) + + # test that only results for the default random sampler are included + self.assertEqual(len(results_dict), 1, + msg=f"results_dict contained {results_dict.keys()}. 
Expected just random sampler.", + ) + # test that results for mae, rmse, and r2 are included + self.assertTrue('mae' in results_dict['random'].keys(), + msg=f"results_dict did not contain MAE results.", + ) + self.assertTrue('rmse' in results_dict['random'].keys(), + msg=f"results_dict did not contain RMSE results.", + ) + self.assertTrue('R2' in results_dict['random'].keys(), + msg=f"results_dict did not contain R2 results.", + ) + + +if __name__ == "__main__": + unittest.main() From 38420c615d289a8608b1d7e8f91055efb0c5adc1 Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Tue, 20 Jun 2023 09:36:56 -0400 Subject: [PATCH 07/10] Apply black formatting --- astartes/utils/utils.py | 94 ++++++++++++++++++----------------- test/unit/utils/test_utils.py | 93 ++++++++++++++++++++++------------ 2 files changed, 110 insertions(+), 77 deletions(-) diff --git a/astartes/utils/utils.py b/astartes/utils/utils.py index d2fd528..bd088df 100644 --- a/astartes/utils/utils.py +++ b/astartes/utils/utils.py @@ -3,16 +3,17 @@ from astartes import train_val_test_split -def generate_regression_results_dict(sklearn_model, - X, - y, - samplers=["random"], - random_state=0, - samplers_hopts={}, - train_size=0.8, - val_size=0.1, - test_size=0.1, - ): +def generate_regression_results_dict( + sklearn_model, + X, + y, + samplers=["random"], + random_state=0, + samplers_hopts={}, + train_size=0.8, + val_size=0.1, + test_size=0.1, +): """ Helper function to train a sklearn model using the provided data and provided sampler types. @@ -25,11 +26,11 @@ def generate_regression_results_dict(sklearn_model, test_size (float, optional): Fraction of dataset to use in test set. Defaults to 0.1. random_state (int, optional): The random seed used throughout astartes. samplers_hopts (dict, optional): Should be a dictionary of dictionaries with the keys specifying - the sampler and the values being another dictionary with the + the sampler and the values being another dictionary with the corresponding hyperparameters. Defaults to {}. 
Returns: - dict: nested dictionary with the format of + dict: nested dictionary with the format of { sampler: { 'mae':{ @@ -52,30 +53,35 @@ def generate_regression_results_dict(sklearn_model, """ final_dict = {} for sampler in samplers: - error_dict = {'mae': {'train': [], - 'val': [], - 'test': [], - }, - 'rmse': {'train': [], - 'val': [], - 'test': [], - }, - 'R2': {'train': [], - 'val': [], - 'test': [], - }, - } + error_dict = { + "mae": { + "train": [], + "val": [], + "test": [], + }, + "rmse": { + "train": [], + "val": [], + "test": [], + }, + "R2": { + "train": [], + "val": [], + "test": [], + }, + } # obtain indices - _,_,_, train_indices, val_indices, test_indices = train_val_test_split(X, - train_size=train_size, - val_size=val_size, - test_size=test_size, - sampler=sampler, - random_state=random_state, - hopts=samplers_hopts.get(sampler, dict()), - return_indices=True, - ) + _, _, _, train_indices, val_indices, test_indices = train_val_test_split( + X, + train_size=train_size, + val_size=val_size, + test_size=test_size, + sampler=sampler, + random_state=random_state, + hopts=samplers_hopts.get(sampler, dict()), + return_indices=True, + ) # create data splits X_train = X[train_indices] @@ -86,7 +92,6 @@ def generate_regression_results_dict(sklearn_model, y_val = y[val_indices] y_test = y[test_indices] - # fit the model to the training data sklearn_model.fit(X_train, y_train) @@ -95,36 +100,35 @@ def generate_regression_results_dict(sklearn_model, y_pred_val = sklearn_model.predict(X_val) y_pred_test = sklearn_model.predict(X_test) - # store MAEs train_mae = mean_absolute_error(y_train, y_pred_train) - error_dict['mae']['train'].append(train_mae) + error_dict["mae"]["train"].append(train_mae) val_mae = mean_absolute_error(y_val, y_pred_val) - error_dict['mae']['val'].append(val_mae) + error_dict["mae"]["val"].append(val_mae) test_mae = mean_absolute_error(y_test, y_pred_test) - error_dict['mae']['test'].append(test_mae) + error_dict["mae"]["test"].append(test_mae) # store RMSEs train_rmse = mean_squared_error(y_train, y_pred_train, squared=False) - error_dict['rmse']['train'].append(train_rmse) + error_dict["rmse"]["train"].append(train_rmse) val_rmse = mean_squared_error(y_val, y_pred_val, squared=False) - error_dict['rmse']['val'].append(val_rmse) + error_dict["rmse"]["val"].append(val_rmse) test_rmse = mean_squared_error(y_test, y_pred_test, squared=False) - error_dict['rmse']['test'].append(test_rmse) + error_dict["rmse"]["test"].append(test_rmse) # store R2 train_R2 = r2_score(y_train, y_pred_train) - error_dict['R2']['train'].append(train_R2) + error_dict["R2"]["train"].append(train_R2) val_R2 = r2_score(y_val, y_pred_val) - error_dict['R2']['val'].append(val_R2) + error_dict["R2"]["val"].append(val_R2) test_R2 = r2_score(y_test, y_pred_test) - error_dict['R2']['test'].append(test_R2) + error_dict["R2"]["test"].append(test_R2) final_dict[sampler] = error_dict diff --git a/test/unit/utils/test_utils.py b/test/unit/utils/test_utils.py index ea85e5b..fc1f3fb 100644 --- a/test/unit/utils/test_utils.py +++ b/test/unit/utils/test_utils.py @@ -19,29 +19,50 @@ def setUpClass(self): [ [-0.86, -0.98], [-0.42, -0.87], - [ 1.33, 0.20], - [-0.25, 2.43], + [1.33, 0.20], + [-0.25, 2.43], [-0.59, -0.91], - [-0.33, 0.19], + [-0.33, 0.19], [-0.10, -0.01], - [ 1.86, 1.15], - [ 0.64, -1.51], - [-0.36, 0.06], - [ 0.6 , -0.36], - [ 1.56, -0.09], + [1.86, 1.15], + [0.64, -1.51], + [-0.36, 0.06], + [0.6, -0.36], + [1.56, -0.09], [-0.70, -1.66], - [-0.33, 0.44], - [ 1.58, 0.11], - [ 0.25, -0.05], - 
[-0.63, 0.79], - [-0.11, 0.00], + [-0.33, 0.44], + [1.58, 0.11], + [0.25, -0.05], + [-0.63, 0.79], + [-0.11, 0.00], [-0.20, -1.19], - [ 0.71, 1.00], + [0.71, 1.00], + ] + ) + self.y = np.array( + [ + -10.27, + -6.19, + 12.13, + 4.90, + -7.77, + -2.31, + -0.89, + 19.42, + 1.18, + -2.97, + 4.18, + 13.26, + -10.90, + -1.58, + 14.01, + 2.00, + -3.16, + -0.91, + -5.25, + 9.07, ] ) - self.y = np.array([-10.27, -6.19, 12.13, 4.90 , -7.77, -2.31, -0.89, - 19.42, 1.18, -2.97, 4.18, 13.26, -10.90 , -1.58, - 14.01, 2.00, -3.16, -0.91, -5.25, 9.07]) def test_generate_regression_results_dict(self): """Generate results dictionary for simple regression task.""" @@ -49,25 +70,33 @@ def test_generate_regression_results_dict(self): # use default hyperparameters sklearn_model = LinearSVR() - results_dict = generate_regression_results_dict(sklearn_model, self.X, self.y, - train_size=0.6, - val_size=0.2, - test_size=0.2, - ) - + results_dict = generate_regression_results_dict( + sklearn_model, + self.X, + self.y, + train_size=0.6, + val_size=0.2, + test_size=0.2, + ) + # test that only results for the default random sampler are included - self.assertEqual(len(results_dict), 1, - msg=f"results_dict contained {results_dict.keys()}. Expected just random sampler.", - ) + self.assertEqual( + len(results_dict), + 1, + msg=f"results_dict contained {results_dict.keys()}. Expected just random sampler.", + ) # test that results for mae, rmse, and r2 are included - self.assertTrue('mae' in results_dict['random'].keys(), - msg=f"results_dict did not contain MAE results.", + self.assertTrue( + "mae" in results_dict["random"].keys(), + msg=f"results_dict did not contain MAE results.", ) - self.assertTrue('rmse' in results_dict['random'].keys(), - msg=f"results_dict did not contain RMSE results.", + self.assertTrue( + "rmse" in results_dict["random"].keys(), + msg=f"results_dict did not contain RMSE results.", ) - self.assertTrue('R2' in results_dict['random'].keys(), - msg=f"results_dict did not contain R2 results.", + self.assertTrue( + "R2" in results_dict["random"].keys(), + msg=f"results_dict did not contain R2 results.", ) From e6ca6d3cb03c0cf9aae0f0d090292d0c05c7f088 Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Wed, 28 Jun 2023 10:41:00 -0400 Subject: [PATCH 08/10] Fail gracefully if the passed model is not an sklearn model --- astartes/utils/exceptions.py | 8 ++++++++ astartes/utils/utils.py | 5 +++++ test/unit/utils/test_utils.py | 13 +++++++++++++ 3 files changed, 26 insertions(+) diff --git a/astartes/utils/exceptions.py b/astartes/utils/exceptions.py index 1fbc9a9..dd6e4b7 100644 --- a/astartes/utils/exceptions.py +++ b/astartes/utils/exceptions.py @@ -9,6 +9,14 @@ def __init__(self, message=None): super().__init__(message) +class InvalidModelTypeError(RuntimeError): + """Used when user-provided model is invalid.""" + + def __init__(self, message=None): + self.message = message + super().__init__(message) + + class InvalidConfigurationError(RuntimeError): """Used when user-requested split/data would not work.""" diff --git a/astartes/utils/utils.py b/astartes/utils/utils.py index bd088df..83a2ff2 100644 --- a/astartes/utils/utils.py +++ b/astartes/utils/utils.py @@ -1,6 +1,8 @@ +import sklearn from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from astartes import train_val_test_split +from astartes.utils.exceptions import InvalidModelTypeError def generate_regression_results_dict( @@ -51,6 +53,9 @@ def generate_regression_results_dict( }, } """ + if not 
isinstance(sklearn_model, sklearn.base.BaseEstimator): + raise InvalidModelTypeError("Model must be an sklearn model") + final_dict = {} for sampler in samplers: error_dict = { diff --git a/test/unit/utils/test_utils.py b/test/unit/utils/test_utils.py index fc1f3fb..074807d 100644 --- a/test/unit/utils/test_utils.py +++ b/test/unit/utils/test_utils.py @@ -3,6 +3,8 @@ import numpy as np from sklearn.svm import LinearSVR +from astartes.samplers.interpolation import Random +from astartes.utils.exceptions import InvalidModelTypeError from astartes.utils.utils import generate_regression_results_dict @@ -67,6 +69,17 @@ def setUpClass(self): def test_generate_regression_results_dict(self): """Generate results dictionary for simple regression task.""" + # test that error is raised if not using sklearn model + with self.assertRaises(InvalidModelTypeError) as e: + generate_regression_results_dict( + Random, + self.X, + self.y, + train_size=0.6, + val_size=0.2, + test_size=0.2, + ) + # use default hyperparameters sklearn_model = LinearSVR() From 14297a67865d8afb7113c66cc26e8b4829f49545 Mon Sep 17 00:00:00 2001 From: Kevin Spiekermann Date: Wed, 28 Jun 2023 10:41:43 -0400 Subject: [PATCH 09/10] Optionally print a nicely formatted table of results from each sampler --- astartes/utils/utils.py | 18 ++++++++++++++++++ test/unit/utils/test_utils.py | 2 ++ 2 files changed, 20 insertions(+) diff --git a/astartes/utils/utils.py b/astartes/utils/utils.py index 83a2ff2..8e65f7a 100644 --- a/astartes/utils/utils.py +++ b/astartes/utils/utils.py @@ -1,5 +1,6 @@ import sklearn from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score +from tabulate import tabulate from astartes import train_val_test_split from astartes.utils.exceptions import InvalidModelTypeError @@ -15,6 +16,7 @@ def generate_regression_results_dict( train_size=0.8, val_size=0.1, test_size=0.1, + print_results=False, ): """ Helper function to train a sklearn model using the provided data @@ -30,6 +32,7 @@ def generate_regression_results_dict( samplers_hopts (dict, optional): Should be a dictionary of dictionaries with the keys specifying the sampler and the values being another dictionary with the corresponding hyperparameters. Defaults to {}. 
+        print_results (bool, optional): whether to print the resulting dictionary as a neat table
 
     Returns:
         dict: nested dictionary with the format of
@@ -137,4 +140,19 @@ def generate_regression_results_dict(
 
         final_dict[sampler] = error_dict
 
+        if print_results:
+            print(f"\nDisplaying results for {sampler} sampler")
+            display_results_as_table(error_dict)
+
     return final_dict
+
+
+def display_results_as_table(error_dict):
+    """Helper function to print a dictionary as a neat tabulate"""
+    headers = ["Train", "Val", "Test"]
+    table = []
+    for key, val in error_dict.items():
+        table_tmp = [key.upper()]
+        table_tmp.extend([val[0] for val in val.values()])
+        table.append(table_tmp)
+    print(tabulate(table, headers=headers))
diff --git a/test/unit/utils/test_utils.py b/test/unit/utils/test_utils.py
index 074807d..bad81d4 100644
--- a/test/unit/utils/test_utils.py
+++ b/test/unit/utils/test_utils.py
@@ -83,6 +83,7 @@ def test_generate_regression_results_dict(self):
         # use default hyperparameters
         sklearn_model = LinearSVR()
 
+        # test function call and also that a table can be printed without error
         results_dict = generate_regression_results_dict(
             sklearn_model,
             self.X,
@@ -90,6 +91,7 @@ def test_generate_regression_results_dict(self):
             train_size=0.6,
             val_size=0.2,
             test_size=0.2,
+            print_results=True,
         )
 
         # test that only results for the default random sampler are included

From f54892cc7ae4fa08889ca1bfc85de251e0745894 Mon Sep 17 00:00:00 2001
From: Kevin Spiekermann
Date: Wed, 28 Jun 2023 10:43:32 -0400
Subject: [PATCH 10/10] Update documentation

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index 437215a..5636312 100644
--- a/README.md
+++ b/README.md
@@ -128,6 +128,30 @@ Running `astartes` with the default settings will always produce the exact same
 We have verified this behavior on Debian Ubuntu, Windows, and Intel Macs from Python versions 3.7 through 3.11 (with appropriate dependencies for each version). We are limited in our ability to test on M1 Macs, but from our limited manual testing we achieve perfect reproducibility in all cases _except occasionally_ with `KMeans` on Apple silicon. It has produced _slightly_ different results between platforms regardless of `random_state`, with up to two clusters being assigned differently resulting in data splits which are >99% identical. `astartes` is still consistent between runs on the same platform in all cases.
 
+## Evaluate the impact of splitting algorithms
+The `generate_regression_results_dict` function allows users to quickly evaluate the impact of different splitting techniques on any model supported by `sklearn`. All results are stored in a dictionary format and can be displayed in a neatly formatted table using the optional `print_results` argument.
+
+```
+from sklearn.svm import LinearSVR
+
+from astartes.utils.utils import generate_regression_results_dict
+
+sklearn_model = LinearSVR()
+results_dict = generate_regression_results_dict(
+    sklearn_model,
+    X,
+    y,
+    print_results=True,
+    )
+
+        Train      Val     Test
+----  -------  -------  -------
+MAE   1.41522  3.13435  2.17091
+RMSE  2.03062  3.73721  2.40041
+R2    0.90745  0.80787  0.78412
+
+```
+
 ## Online Documentation
 [The online documentation](https://JacksonBurns.github.io/astartes/) contains everything you see in this README with an additional tutorial for [moving from `train_test_split` in `sklearn` to `astartes`](https://jacksonburns.github.io/astartes/sklearn_to_astartes.html).
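
After the final patch, `generate_regression_results_dict` lives in `astartes/utils/utils.py` and accepts several samplers in a single call. Below is a minimal sketch of such a multi-sampler run; the synthetic `X` and `y`, the choice of `kmeans` as a second sampler, and its `n_clusters` hyperparameter are illustrative assumptions rather than anything taken from the patches above.

```python
# Sketch only: the synthetic data and the kmeans hyperparameter are assumptions.
from pprint import pprint

import numpy as np
from sklearn.svm import LinearSVR

from astartes.utils.utils import generate_regression_results_dict

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 2))  # placeholder feature matrix
y = 3.0 * X[:, 0] - X[:, 1] + rng.normal(scale=0.1, size=100)  # placeholder targets

# Compare an interpolative and an extrapolative split in one call;
# print_results=True tabulates train/val/test MAE, RMSE, and R2 per sampler.
results = generate_regression_results_dict(
    LinearSVR(),
    X,
    y,
    samplers=["random", "kmeans"],
    samplers_hopts={"kmeans": {"n_clusters": 10}},  # assumed sampler hyperparameter
    print_results=True,
)
pprint(results)
```

The returned dictionary is keyed by sampler name, then by metric (`'mae'`, `'rmse'`, `'R2'`), then by split (`'train'`, `'val'`, `'test'`), matching the format documented in the function's docstring.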