Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
robinruff committed Mar 14, 2023
1 parent 8e3af03 commit 88f467c
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 83 deletions.
64 changes: 7 additions & 57 deletions matbench/data_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,18 +108,13 @@ def score_array(true_array, pred_array, task_type):
for metric in metrics:
mfunc = METRIC_MAP[metric]

if metric == "rocauc":
# Both arrays must be in probability form
# if pred. array is given in probabilities
if isinstance(pred_array[0], float):
true_array = homogenize_clf_array(true_array, to_probs=True)

# Other clf metrics must always be converted to labels
elif metric in CLF_METRICS:
if isinstance(pred_array[0], float):
pred_array = homogenize_clf_array(pred_array, to_labels=True)

computed[metric] = mfunc(true_array, pred_array)
if metric in CLF_METRICS and metric != "rocauc":
# Discretize predictions, if metric is for classification and not rocauc.
pred_array_bool = (np.asarray(pred_array) > CLF_THRESH).tolist()
computed[metric] = mfunc(true_array, pred_array_bool)
else:
computed[metric] = mfunc(true_array, pred_array)

return computed


Expand Down Expand Up @@ -154,51 +149,6 @@ def mean_absolute_percentage_error(y_true, y_pred, threshold=1e-5):
return np.mean(np.fabs((y_true - y_pred) / y_true))


def homogenize_clf_array(array, to_probs=False, to_labels=False, thresh=CLF_THRESH):
    """
    Homogenize a classification array in exactly one of two directions:
        1. labels (True, False) to probabilities (1.0, 0.0)
        2. probabilities (floats) to labels (True, False), where a value
            strictly greater than thresh becomes True.

    Args:
        array ([bool], [float]): A list of bools (for to_probs) or a list
            of floats (for to_labels). Every element must be of the
            expected type; the 0-1 range of floats is not checked here.
        to_probs (bool): Convert the input array to all probabilities.
        to_labels (bool): Convert the input array to all labels based on
            the threshold value thresh.
        thresh (float): The cutoff applied to probabilities when
            to_labels is True. Ignored when to_probs is True.

    Returns:
        list: Floats (1.0/0.0) when to_probs is set, bools when
            to_labels is set.

    Raises:
        ValueError: If to_probs and to_labels do not sum to exactly one
            (i.e. neither or both were requested).
        TypeError: If any element of array is not a bool (to_probs) or
            not a float (to_labels).
    """
    if sum([to_probs, to_labels]) != 1:
        raise ValueError(
            "Set ONE of to_probs or to_labels to True to define "
            "the conversion, NOT both."
        )

    if to_probs:
        # The source array must consist purely of bools.
        if not all(isinstance(i, bool) for i in array):
            raise TypeError(
                "Cannot convert non-bool type in clf array to probabilities."
            )
        return [1.0 if i else 0.0 for i in array]

    # Past the guard above, a falsy to_probs implies to_labels was requested.
    # The source array must consist purely of float probabilities.
    if not all(isinstance(i, float) for i in array):
        # The original adjacent literals lacked a separating space and
        # rendered as "...clf array tolabels."; the message is fixed here.
        raise TypeError("Cannot convert non-float types in clf array to labels.")
    return (np.asarray(array) > thresh).tolist()


METRIC_MAP = {
"mae": mean_absolute_error,
"rmse": lambda true, pred: math.sqrt(mean_squared_error(true, pred)),
Expand Down
45 changes: 19 additions & 26 deletions matbench/tests/test_data_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from matbench.constants import CLF_KEY, REG_KEY
from matbench.data_ops import (
homogenize_clf_array,
load,
mean_absolute_percentage_error,
score_array,
Expand Down Expand Up @@ -54,13 +53,23 @@ def test_downloads_mbv01(self):
)
self.assertIn(df[metadata["target"]].dtypes, test_types)

def assertDictAlmostEqual(self, first, second, **kwargs):
    """
    Assert that two dicts have identical keys and almost-equal values.

    Nested dict values in first are compared recursively with this same
    method; every other value is compared with assertAlmostEqual.

    Args:
        first (dict): The dict whose items drive the comparison.
        second (dict): The dict compared against.
        **kwargs: Passed through unchanged to assertAlmostEqual
            (and to the recursive calls).
    """
    for candidate in (first, second):
        self.assertIsInstance(candidate, dict)
    self.assertEqual(first.keys(), second.keys())

    for key in first:
        actual = first[key]
        if not isinstance(actual, dict):
            self.assertAlmostEqual(actual, second[key], **kwargs)
            continue
        # Nested mapping: descend and apply the same checks one level down.
        self.assertDictAlmostEqual(actual, second[key], **kwargs)

def test_score_array(self):
# test for regression
true = [1, 2, 3, 4]
test = [1, 3, 3, 4]
ans = score_array(true, test, task_type=REG_KEY)
true_ans = {"mae": 0.25, "rmse": 0.5, "mape": 0.125, "max_error": 1}
self.assertDictEqual(ans, true_ans)
self.assertDictAlmostEqual(ans, true_ans)

# test for classification
true = [True, False]
Expand All @@ -72,13 +81,19 @@ def test_score_array(self):
"f1": 0.6666666666666666,
"rocauc": 0.5,
}
self.assertDictEqual(ans, true_ans)
self.assertDictAlmostEqual(ans, true_ans)

# test for probability clf
true = [True, False]
test = [0.7, 0.65]
ans = score_array(true, test, task_type=CLF_KEY)
self.assertDictEqual(ans, true_ans)
true_ans = {
"accuracy": 0.5,
"balanced_accuracy": 0.5,
"f1": 0.6666666666666666,
"rocauc": 1.0,
}
self.assertDictAlmostEqual(ans, true_ans)

def test_mean_absolute_percentage_error(self):

Expand All @@ -91,25 +106,3 @@ def test_mean_absolute_percentage_error(self):
self.assertAlmostEqual(mape, 0.09999999999999999)
self.assertAlmostEqual(mape, mape_masked)

def test_homogenize_clf_array(self):
    """
    Check homogenize_clf_array in both directions: bools to
    probabilities, and floats to labels at two different thresholds.
    """
    bools = [True, False, True, True]
    floats = [1.0, 0.3, 0.5001, 0.9]

    # Labels -> probabilities.
    probs = homogenize_clf_array(bools, to_probs=True)
    for idx, expected_prob in enumerate((1.0, 0.0, 1.0, 1.0)):
        self.assertAlmostEqual(probs[idx], expected_prob, places=5)

    # Probabilities -> labels, once per threshold, checked element by element.
    label_cases = (
        (0.5, (True, False, True, True)),
        (0.91, (True, False, False, False)),
    )
    for cutoff, expected_labels in label_cases:
        labels = homogenize_clf_array(floats, to_labels=True, thresh=cutoff)
        for idx, expected_label in enumerate(expected_labels):
            if expected_label:
                self.assertTrue(labels[idx])
            else:
                self.assertFalse(labels[idx])

0 comments on commit 88f467c

Please sign in to comment.