Add metric for loss value in active learning problems (#66)
jteijema authored Oct 31, 2024
1 parent 41c3298 commit 5a15875
Showing 5 changed files with 143 additions and 0 deletions.
42 changes: 42 additions & 0 deletions README.md
@@ -133,7 +133,44 @@ pinpoint hard-to-find papers. The ATD, on the other hand, measures performance
throughout the entire screening process, eliminating reliance on arbitrary
cut-off values, and can be used to compare different models.

### Loss
The Loss metric evaluates the performance of an active learning model by
quantifying how closely its recall curve approximates the ideal screening
process. The value is normalized between the ideal recall curve (loss of 0) and
the worst possible recall curve (loss of 1).

While metrics such as WSS, Recall, and ERF evaluate performance at specific
points on the recall curve, the Loss metric provides an overall measure of
performance across the entire curve.

To compute the loss, we start with three key concepts:

1. **Optimal AUC**: This is the area under a "perfect recall curve," where
relevant records are identified as early as possible. Mathematically, it is
computed as $Nx \times Ny - \frac{Ny \times (Ny - 1)}{2}$, where $Nx$ is the
total number of records, and $Ny$ is the number of relevant records.

2. **Worst AUC**: This represents the area under a worst-case recall curve,
where all relevant records appear at the end of the screening process. This
is calculated as $\frac{Ny \times (Ny + 1)}{2}$.

3. **Actual AUC**: This is the area under the recall curve produced by the model
during the screening process. It can be obtained by summing up the cumulative
recall values for the labeled records.

The normalized loss is calculated by taking the difference between the optimal
AUC and the actual AUC, divided by the difference between the optimal AUC and
the worst AUC.

$$\text{Normalized Loss} = \frac{Ny \times \left(Nx - \frac{Ny - 1}{2}\right) -
\sum \text{Cumulative Recall}}{Ny \times (Nx - Ny)}$$

The lower the loss, the closer the model is to the perfect recall curve,
indicating higher performance.
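
For a concrete illustration, here is a minimal standalone Python sketch of the
computation above (it mirrors the `_loss_value` helper added in this commit;
the `normalized_loss` name is used here only for illustration):

```python
import numpy as np

def normalized_loss(labels):
    """Normalized loss for a list of 0/1 screening labels in screening order."""
    Ny, Nx = sum(labels), len(labels)
    optimal = Nx * Ny - Ny * (Ny - 1) / 2   # area under the perfect recall curve
    worst = Ny * (Ny + 1) / 2               # area when all relevant records come last
    actual = np.cumsum(labels).sum()        # area under the observed recall curve
    return (optimal - actual) / (optimal - worst)

print(normalized_loss([1, 0, 1]))  # 0.5: optimal = 5, worst = 3, actual = 4
```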

![Recall plot illustrating loss metric](https://github.com/jteijema/asreview-insights/blob/loss-metric/figures/loss_metric_example.png?raw=true)

In this figure, the green area between the recall curve and the perfect recall line is the lost performance, which is normalized by the total area (green and red combined).
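
The same value can be obtained through the Python API introduced in this
commit; a minimal usage sketch, where `example.asreview` is a placeholder for
your own project file:

```python
from asreview import open_state

from asreviewcontrib.insights.metrics import loss

with open_state("example.asreview") as state:
    print(loss(state))  # prints a float between 0 and 1 (lower is better)
```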

## Basic usage

@@ -467,6 +504,11 @@ which results in
]
]
},
{
"id": "loss",
"title": "Loss",
"value": 0.01707543880041846
},
{
"id": "erf",
"title": "Extra Relevant record Found",
30 changes: 30 additions & 0 deletions asreviewcontrib/insights/algorithms.py
@@ -19,6 +19,36 @@ def _recall_values(labels, x_absolute=False, y_absolute=False):
return x.tolist(), y.tolist()


def _loss_value(labels):
Ny = sum(labels)
Nx = len(labels)

if Ny == 0 or Nx == Ny:
raise ValueError("Need both 0 and 1 labels")

# The normalized loss is computed based on:
#
# 1. The "optimal" possible AUC, representing the area under an optimal recall
#    curve, is the total area, Nx * Ny, minus the area above the stepwise
#    curve, (Ny * (Ny - 1)) / 2. Combined, this gives Ny * (Nx - (Ny - 1) / 2).
#
# 2. The "actual" AUC is the cumulative recall sum, calculated with
# np.cumsum(labels).sum().
#
# 3. The "worst" AUC, where all positive labels are clustered at the end, is
# calculated as (Ny * (Ny + 1)) / 2. To normalize, we need the difference
# between the optimal and worst AUCs. We simplify this difference:
#
# (Nx * Ny - ((Ny * (Ny - 1)) / 2)) - ((Ny * (Ny + 1)) / 2)
#
#    This difference simplifies to Ny * (Nx - Ny), which is the denominator in
#    our normalized loss.
#
# Finally, we compute the normalized loss as:
# (optimal - actual) / (optimal - worst).
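#
# For example, labels = [1, 0, 1] gives Ny = 2, Nx = 3, optimal = 5,
# worst = 3, actual = 1 + 1 + 2 = 4, and loss = (5 - 4) / (5 - 3) = 0.5.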
return float((Ny * (Nx - (Ny - 1) / 2) - np.cumsum(labels).sum()) / (Ny * (Nx - Ny))) # noqa: E501


def _wss_values(labels, x_absolute=False, y_absolute=False):
n_docs = len(labels)
n_pos_docs = sum(labels)
20 changes: 20 additions & 0 deletions asreviewcontrib/insights/metrics.py
@@ -6,6 +6,7 @@
from asreviewcontrib.insights.algorithms import _erf_values
from asreviewcontrib.insights.algorithms import _fn_values
from asreviewcontrib.insights.algorithms import _fp_values
from asreviewcontrib.insights.algorithms import _loss_value
from asreviewcontrib.insights.algorithms import _recall_values
from asreviewcontrib.insights.algorithms import _tn_values
from asreviewcontrib.insights.algorithms import _tp_values
@@ -169,6 +170,20 @@ def _tnr(labels, intercept, x_absolute=False):

return _slice_metric(x, y, intercept)

def loss(state_obj, priors=False):
"""Compute the loss for active learning problem.
Computes the loss for active learning problem where all relevant records
have to be seen by a human.
See the inline documentation for detailed description of loss calculation.
Returns:
float: The loss value.
"""
labels = _pad_simulation_labels(state_obj, priors=priors)

return _loss_value(labels)

def get_metrics(
state_obj,
@@ -225,6 +240,11 @@ def get_metrics(
"title": "Work Saved over Sampling",
"value": [(i, v) for i, v in zip(wss, wss_values)],
},
{
"id": "loss",
"title": "Loss",
"value": _loss_value(labels),
},
{
"id": "erf",
"title": "Extra Relevant record Found",
Binary file added figures/loss_metric_example.png
51 changes: 51 additions & 0 deletions tests/test_metrics.py
@@ -1,17 +1,24 @@
from pathlib import Path

import numpy as np
from asreview import open_state
from numpy import array_equal
from numpy.testing import assert_almost_equal
from numpy.testing import assert_raises

from asreviewcontrib.insights.algorithms import _loss_value
from asreviewcontrib.insights.metrics import _erf
from asreviewcontrib.insights.metrics import _recall
from asreviewcontrib.insights.metrics import _time_to_discovery
from asreviewcontrib.insights.metrics import _wss
from asreviewcontrib.insights.metrics import get_metrics
from asreviewcontrib.insights.metrics import loss
from asreviewcontrib.insights.metrics import recall

TEST_ASREVIEW_FILES = Path(Path(__file__).parent, "asreview_files")



def test_metric_recall_small_data():
labels = [1, 1, 1, 0]
r = _recall(labels, 0.5)
@@ -111,3 +118,47 @@ def test_label_padding():
stop_if_full = get_metrics(s)

assert stop_if_min == stop_if_full

def test_loss():
with open_state(
Path(TEST_ASREVIEW_FILES, "sim_van_de_schoot_2017_stop_if_min.asreview")
) as s:
loss_value = loss(s)
assert_almost_equal(loss_value, 0.011592855205548452)

def test_loss_value_function(seed=None):
test_cases = [
([1, 0], 0),
([0, 1], 1),
([1, 1, 0, 0, 0], 0),
([0, 0, 0, 1, 1], 1),
([1, 0, 1], 0.5)
]

for labels, expected_value in test_cases:
loss_value = _loss_value(labels)
assert_almost_equal(loss_value, expected_value)

error_cases = [[0, 0, 0], [0], [1]]
for labels in error_cases:
with assert_raises(ValueError):
_loss_value(labels)

if seed is not None:
np.random.seed(seed)

for _ in range(100):
length = np.random.randint(2, 100)
labels = np.random.randint(0, 2, length)

# Ensure labels are not all 0 or all 1
if np.all(labels == 0) or np.all(labels == 1):
labels[np.random.randint(0, length)] = 1 - labels[0]

loss_value = _loss_value(labels)
assert 0 <= loss_value <= 1

def test_single_value_formats():
assert isinstance(_wss([1,1,0,0], 0.5), float)
assert isinstance(_loss_value([1,1,0,0]), float)
assert isinstance(_erf([1,1,0,0], 0.5), float)
