diff --git a/nucleus/metrics/base.py b/nucleus/metrics/base.py
index 75d916e9..587dd5a5 100644
--- a/nucleus/metrics/base.py
+++ b/nucleus/metrics/base.py
@@ -1,7 +1,7 @@
 import sys
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Iterable, List
+from typing import Dict, Iterable, List

 from nucleus.annotation import AnnotationList
 from nucleus.prediction import PredictionList
@@ -10,6 +10,16 @@
 class MetricResult(ABC):
     """Base MetricResult class"""

+    @property
+    @abstractmethod
+    def results(self) -> Dict[str, float]:
+        """Interface for item results"""
+
+    @property
+    def extra_info(self) -> Dict[str, str]:
+        """Overload this to pass extra info about the item to show in the UI"""
+        return {}
+

 @dataclass
 class ScalarResult(MetricResult):
@@ -27,6 +37,14 @@ class ScalarResult(MetricResult):
     value: float
     weight: float = 1.0

+    @property
+    def results(self) -> Dict[str, float]:
+        return {"value": self.value}
+
+    @property
+    def extra_info(self) -> Dict[str, str]:
+        return {"weight": str(self.weight)}
+
     @staticmethod
     def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult":
         """Aggregates results using a weighted average."""
@@ -37,6 +55,22 @@ def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult":
         return ScalarResult(value, total_weight)


+@dataclass
+class GroupedScalarResult(MetricResult):
+    group_to_scalar: Dict[str, ScalarResult]
+
+    @property
+    def results(self) -> Dict[str, float]:
+        group_results = {
+            group: scalar.value
+            for group, scalar in self.group_to_scalar.items()
+        }
+        group_results["all_groups"] = ScalarResult.aggregate(
+            self.group_to_scalar.values()
+        ).value
+        return group_results
+
+
 class Metric(ABC):
     """Abstract class for defining a metric, which takes a list of annotations
     and predictions and returns a scalar.
@@ -93,7 +127,9 @@ def __call__(
         """A metric must override this method and return a metric result, given annotations and predictions."""

     @abstractmethod
-    def aggregate_score(self, results: List[MetricResult]) -> ScalarResult:
+    def aggregate_score(
+        self, results: List[MetricResult]
+    ) -> Dict[str, ScalarResult]:
         """A metric must define how to aggregate results from single items to a
         single ScalarResult. E.g. to calculate a R2 score with sklearn you could
         define a custom metric class ::
diff --git a/nucleus/metrics/categorization_metrics.py b/nucleus/metrics/categorization_metrics.py
index 416f831a..0d9f01cf 100644
--- a/nucleus/metrics/categorization_metrics.py
+++ b/nucleus/metrics/categorization_metrics.py
@@ -1,6 +1,6 @@
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import List, Set, Tuple, Union
+from typing import Dict, List, Set, Tuple, Union

 from sklearn.metrics import f1_score

@@ -33,16 +33,28 @@ class CategorizationResult(MetricResult):
     predictions: List[CategoryPrediction]

     @property
-    def value(self):
+    def results(self) -> Dict[str, float]:
         annotation_labels = to_taxonomy_labels(self.annotations)
         prediction_labels = to_taxonomy_labels(self.predictions)

         # TODO: Change task.py interface such that we can return label matching
-        # NOTE: Returning 1 if all taxonomy labels match else 0
-        value = f1_score(
-            list(annotation_labels), list(prediction_labels), average="macro"
-        )
-        return value
+        results = {
+            "f1_macro": f1_score(
+                list(annotation_labels),
+                list(prediction_labels),
+                average="macro",
+            )
+        }
+        return results
+
+    @property
+    def extra_info(self) -> Dict[str, str]:
+        annotation_labels = to_taxonomy_labels(self.annotations)
+        prediction_labels = to_taxonomy_labels(self.predictions)
+        return {
+            "annotations": ", ".join(annotation_labels),
+            "predictions": ", ".join(prediction_labels),
+        }


 class CategorizationMetric(Metric):
@@ -80,7 +92,7 @@ def eval(
         pass

     @abstractmethod
-    def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult:  # type: ignore[override]
+    def aggregate_score(self, results: List[CategorizationResult]) -> Dict[str, ScalarResult]:  # type: ignore[override]
         pass

     def __call__(
@@ -189,11 +201,18 @@ def eval(
             annotations=annotations, predictions=predictions
         )

-    def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult:  # type: ignore[override]
+    def aggregate_score(self, results: List[CategorizationResult]) -> Dict[str, ScalarResult]:  # type: ignore[override]
         gt = []
         predicted = []
         for result in results:
             gt.extend(list(to_taxonomy_labels(result.annotations)))
             predicted.extend(list(to_taxonomy_labels(result.predictions)))
-        value = f1_score(gt, predicted, average=self.f1_method)
-        return ScalarResult(value)
+        aggregate_scores = {}
+        aggregate_scores["macro"] = f1_score(gt, predicted, average="macro")
+        aggregate_scores["weighted"] = f1_score(
+            gt, predicted, average="weighted"
+        )
+        return {
+            result_label: ScalarResult(val)
+            for result_label, val in aggregate_scores.items()
+        }
diff --git a/nucleus/metrics/label_grouper.py b/nucleus/metrics/label_grouper.py
new file mode 100644
index 00000000..4a562637
--- /dev/null
+++ b/nucleus/metrics/label_grouper.py
@@ -0,0 +1,43 @@
+from typing import Any, List
+
+import numpy as np
+import pandas as pd
+
+
+class LabelsGrouper:
+    def __init__(self, annotations_or_predictions_list: List[Any]):
+        self.items = annotations_or_predictions_list
+        if len(self.items) > 0:
+            assert hasattr(
+                self.items[0], "label"
+            ), f"Expected items to have attribute 'label', found none on {repr(self.items[0])}"
+        self.codes, self.labels = pd.factorize(
+            [item.label for item in self.items]
+        )
+        self.group_idx = 0
+
+    def __iter__(self):
+        self.group_idx = 0
+        return self
+
+    def __next__(self):
+        if self.group_idx >= len(self.labels):
+            raise StopIteration
+        label = self.labels[self.group_idx]
+        label_items = list(
+            np.take(self.items, np.where(self.codes == self.group_idx)[0])
+        )
+        self.group_idx += 1
+        return label, label_items
+
+    def label_group(self, label: str) -> List[Any]:
+        if len(self.items) == 0:
+            return []
+        idx = np.where(self.labels == label)[0]
+        if idx.size > 0:
+            label_items = list(
+                np.take(self.items, np.where(self.codes == idx[0])[0])
+            )
+            return label_items
+        else:
+            return []
diff --git a/nucleus/metrics/polygon_metrics.py b/nucleus/metrics/polygon_metrics.py
index 7ebbf20d..eff482f0 100644
--- a/nucleus/metrics/polygon_metrics.py
+++ b/nucleus/metrics/polygon_metrics.py
@@ -1,14 +1,16 @@
 import sys
 from abc import abstractmethod
-from typing import List, Union
+from collections import defaultdict
+from typing import Dict, List, Union

 import numpy as np

 from nucleus.annotation import AnnotationList, BoxAnnotation, PolygonAnnotation
 from nucleus.prediction import BoxPrediction, PolygonPrediction, PredictionList

-from .base import Metric, ScalarResult
+from .base import GroupedScalarResult, Metric, ScalarResult
 from .filters import confidence_filter, polygon_label_filter
+from .label_grouper import LabelsGrouper
 from .metric_utils import compute_average_precision
 from .polygon_utils import (
     BoxOrPolygonAnnotation,
@@ -80,19 +82,44 @@ def eval(

     def __init__(
         self,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         confidence_threshold: float = 0.0,
     ):
         """Initializes PolygonMetric abstract object.

         Args:
-            enforce_label_match: whether to enforce that annotation and prediction labels must match. Default False
+            enforce_label_match: whether to enforce that annotation and prediction labels must match. Default True
            confidence_threshold: minimum confidence threshold for predictions. Must be in [0, 1]. Default 0.0
         """
         self.enforce_label_match = enforce_label_match
         assert 0 <= confidence_threshold <= 1
         self.confidence_threshold = confidence_threshold

+    def eval_grouped(
+        self,
+        annotations: List[Union[BoxAnnotation, PolygonAnnotation]],
+        predictions: List[Union[BoxPrediction, PolygonPrediction]],
+    ) -> GroupedScalarResult:
+        grouped_annotations = LabelsGrouper(annotations)
+        grouped_predictions = LabelsGrouper(predictions)
+        results = {}
+        for label, label_annotations in grouped_annotations:
+            # TODO(gunnar): Enforce label match -> Why is that a parameter? Should we generally allow IOU matches
+            #  between different labels?!?
+            match_predictions = (
+                grouped_predictions.label_group(label)
+                if self.enforce_label_match
+                else predictions
+            )
+            eval_fn = label_match_wrapper(self.eval)
+            result = eval_fn(
+                label_annotations,
+                match_predictions,
+                enforce_label_match=self.enforce_label_match,
+            )
+            results[label] = result
+        return GroupedScalarResult(group_to_scalar=results)
+
     @abstractmethod
     def eval(
         self,
@@ -102,12 +129,20 @@ def eval(
         # Main evaluation function that subclasses must override.
         pass

-    def aggregate_score(self, results: List[ScalarResult]) -> ScalarResult:  # type: ignore[override]
-        return ScalarResult.aggregate(results)
+    def aggregate_score(self, results: List[GroupedScalarResult]) -> Dict[str, ScalarResult]:  # type: ignore[override]
+        label_to_values = defaultdict(list)
+        for item_result in results:
+            for label, label_result in item_result.group_to_scalar.items():
+                label_to_values[label].append(label_result)
+        scores = {
+            label: ScalarResult.aggregate(values)
+            for label, values in label_to_values.items()
+        }
+        return scores

     def __call__(
         self, annotations: AnnotationList, predictions: PredictionList
-    ) -> ScalarResult:
+    ) -> GroupedScalarResult:
         if self.confidence_threshold > 0:
             predictions = confidence_filter(
                 predictions, self.confidence_threshold
@@ -119,11 +154,9 @@ def __call__(
         polygon_predictions.extend(predictions.box_predictions)
         polygon_predictions.extend(predictions.polygon_predictions)

-        eval_fn = label_match_wrapper(self.eval)
-        result = eval_fn(
+        result = self.eval_grouped(
             polygon_annotations,
             polygon_predictions,
-            enforce_label_match=self.enforce_label_match,
         )
         return result

@@ -166,7 +199,7 @@ class PolygonIOU(PolygonMetric):
     # TODO: Remove defaults once these are surfaced more cleanly to users.
     def __init__(
         self,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         iou_threshold: float = 0.0,
         confidence_threshold: float = 0.0,
     ):
@@ -234,7 +267,7 @@ class PolygonPrecision(PolygonMetric):
     # TODO: Remove defaults once these are surfaced more cleanly to users.
     def __init__(
         self,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         iou_threshold: float = 0.5,
         confidence_threshold: float = 0.0,
     ):
@@ -303,7 +336,7 @@ class PolygonRecall(PolygonMetric):
     # TODO: Remove defaults once these are surfaced more cleanly to users.
     def __init__(
         self,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         iou_threshold: float = 0.5,
         confidence_threshold: float = 0.0,
     ):
@@ -460,7 +493,7 @@ def __init__(
             0 <= iou_threshold <= 1
         ), "IoU threshold must be between 0 and 1."
         self.iou_threshold = iou_threshold
-        super().__init__(enforce_label_match=False, confidence_threshold=0)
+        super().__init__(enforce_label_match=True, confidence_threshold=0)

     def eval(
         self,
diff --git a/nucleus/metrics/polygon_utils.py b/nucleus/metrics/polygon_utils.py
index 8d746b51..d19bd8de 100644
--- a/nucleus/metrics/polygon_utils.py
+++ b/nucleus/metrics/polygon_utils.py
@@ -273,7 +273,7 @@ def wrapper(
         annotations: List[BoxOrPolygonAnnotation],
         predictions: List[BoxOrPolygonPrediction],
         *args,
-        enforce_label_match: bool = False,
+        enforce_label_match: bool = True,
         **kwargs,
     ) -> ScalarResult:
         # Simply return the metric if we are not enforcing label matches.
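For reference, below is a minimal sketch (not part of the diff) of how the new grouped results are meant to compose, using only the classes added in nucleus/metrics/base.py. The per-item values and weights are invented for illustration; in practice PolygonMetric.eval_grouped produces them, and the roll-up assumes the weighted-average behavior documented on ScalarResult.aggregate.

from collections import defaultdict

from nucleus.metrics.base import GroupedScalarResult, ScalarResult

# One GroupedScalarResult per dataset item, keyed by label (values are made up).
item_1 = GroupedScalarResult(
    group_to_scalar={
        "car": ScalarResult(0.9, weight=2),
        "pedestrian": ScalarResult(0.5, weight=1),
    }
)
item_2 = GroupedScalarResult(group_to_scalar={"car": ScalarResult(0.7, weight=1)})

# `results` exposes per-label values plus a weighted "all_groups" roll-up.
print(item_1.results)
# {'car': 0.9, 'pedestrian': 0.5, 'all_groups': 0.766...}  (= (0.9*2 + 0.5*1) / 3)

# PolygonMetric.aggregate_score then collects the per-label scalars across items
# and reduces each label with ScalarResult.aggregate, yielding Dict[str, ScalarResult].
label_to_values = defaultdict(list)
for item in (item_1, item_2):
    for label, scalar in item.group_to_scalar.items():
        label_to_values[label].append(scalar)
scores = {
    label: ScalarResult.aggregate(values)
    for label, values in label_to_values.items()
}
print({label: s.value for label, s in scores.items()})
# {'car': 0.833...} (= (0.9*2 + 0.7*1) / 3), {'pedestrian': 0.5}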
diff --git a/pyproject.toml b/pyproject.toml
index bb27ec32..e85193a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ click = ">=7.1.2,<9.0" # NOTE: COLAB has 7.1.2 and has problems updating
 rich = "^10.15.2"
 shellingham = "^1.4.0"
 scikit-learn = ">=0.24.0"
+pandas = ">=1.0"

 [tool.poetry.dev-dependencies]
 poetry = "^1.1.5"
diff --git a/tests/metrics/test_categorization_metrics.py b/tests/metrics/test_categorization_metrics.py
index 98c5407a..0dc47ef5 100644
--- a/tests/metrics/test_categorization_metrics.py
+++ b/tests/metrics/test_categorization_metrics.py
@@ -29,9 +29,10 @@ def test_perfect_match_f1_score():
             )
         )

-    assert results
+    assert [res.results for res in results]
     aggregate_result = metric.aggregate_score(results)
-    assert aggregate_result.value == 1
+    for result_label, scalar in aggregate_result.items():
+        assert scalar.value == 1


 def test_no_match_f1_score():
diff --git a/tests/metrics/test_polygon_metrics.py b/tests/metrics/test_polygon_metrics.py
index 6d7fc8fd..9fe57d75 100644
--- a/tests/metrics/test_polygon_metrics.py
+++ b/tests/metrics/test_polygon_metrics.py
@@ -30,36 +30,18 @@
             PolygonIOU,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonIOU,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_BOX_ANNOTATION_LIST,
             TEST_BOX_PREDICTION_LIST,
             PolygonPrecision,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonPrecision,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_BOX_ANNOTATION_LIST,
             TEST_BOX_PREDICTION_LIST,
             PolygonRecall,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonRecall,
-            {"enforce_label_match": False},
-        ),
         (TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonMAP, {}),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
@@ -67,36 +49,18 @@
             PolygonIOU,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonIOU,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
             PolygonPrecision,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonPrecision,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
             PolygonRecall,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonRecall,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
@@ -111,7 +75,8 @@ def test_perfect_match_polygon_metrics(
     # Test metrics on where annotations = predictions perfectly
     metric = metric_fn(**kwargs)
     result = metric(test_annotations, test_predictions)
-    assert_metric_eq(result, ScalarResult(1, len(test_annotations)))
+    for label, result_val in result.group_to_scalar.items():
+        assert_metric_eq(result_val, ScalarResult(1, 1))


 @pytest.mark.parametrize(
@@ -123,36 +88,18 @@ def test_perfect_match_polygon_metrics(
        (
            TEST_BOX_ANNOTATION_LIST,
            TEST_BOX_PREDICTION_LIST,
            PolygonIOU,
            {"enforce_label_match": True},
        ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonIOU,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_BOX_ANNOTATION_LIST,
             TEST_BOX_PREDICTION_LIST,
             PolygonPrecision,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonPrecision,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_BOX_ANNOTATION_LIST,
             TEST_BOX_PREDICTION_LIST,
             PolygonRecall,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_BOX_ANNOTATION_LIST,
-            TEST_BOX_PREDICTION_LIST,
-            PolygonRecall,
-            {"enforce_label_match": False},
-        ),
         (TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonMAP, {}),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
@@ -160,36 +107,18 @@ def test_perfect_match_polygon_metrics(
             PolygonIOU,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonIOU,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
             PolygonPrecision,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonPrecision,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
             PolygonRecall,
             {"enforce_label_match": True},
         ),
-        (
-            TEST_CONVEX_POLYGON_ANNOTATION_LIST,
-            TEST_CONVEX_POLYGON_PREDICTION_LIST,
-            PolygonRecall,
-            {"enforce_label_match": False},
-        ),
         (
             TEST_CONVEX_POLYGON_ANNOTATION_LIST,
             TEST_CONVEX_POLYGON_PREDICTION_LIST,
@@ -209,7 +138,8 @@ def test_perfect_unmatched_polygon_metrics(
         polygon.reference_id += "_bad"
     metric = metric_fn(**kwargs)
     result = metric(test_annotations, test_predictions_unmatch)
-    assert_metric_eq(result, ScalarResult(0, len(test_annotations)))
+    for label, label_result in result.group_to_scalar.items():
+        assert_metric_eq(label_result, ScalarResult(0, 1))


 @pytest.mark.parametrize(
@@ -219,56 +149,35 @@
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonIOU,
-            ScalarResult(109.0 / 300, 3),
+            {"car": ScalarResult(109.0 / 300, 3)},
             {"enforce_label_match": True},
         ),
-        (
-            TEST_ANNOTATION_LIST,
-            TEST_PREDICTION_LIST,
-            PolygonIOU,
-            ScalarResult(109.0 / 300, 3),
-            {"enforce_label_match": False},
-        ),
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonPrecision,
-            ScalarResult(1.0 / 3, 3),
+            {"car": ScalarResult(1.0 / 3, 3)},
             {"enforce_label_match": True},
         ),
-        (
-            TEST_ANNOTATION_LIST,
-            TEST_PREDICTION_LIST,
-            PolygonPrecision,
-            ScalarResult(1.0 / 3, 3),
-            {"enforce_label_match": False},
-        ),
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonRecall,
-            ScalarResult(0.5, 2),
+            {"car": ScalarResult(0.5, 2)},
             {"enforce_label_match": True},
         ),
-        (
-            TEST_ANNOTATION_LIST,
-            TEST_PREDICTION_LIST,
-            PolygonRecall,
-            ScalarResult(0.5, 2),
-            {"enforce_label_match": False},
-        ),
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonAveragePrecision,
-            ScalarResult(1.0 / 6, 1),
+            {"car": ScalarResult(1.0 / 6, 1)},
             {"label": "car"},
         ),
         (
             TEST_ANNOTATION_LIST,
             TEST_PREDICTION_LIST,
             PolygonMAP,
-            ScalarResult(1.0 / 6, 1),
+            {"car": ScalarResult(1.0 / 6, 1)},
             {},
         ),
     ],
@@ -279,4 +188,6 @@ def test_simple_2_boxes(
     # Test metrics on where annotations = predictions perfectly
     metric = metric_fn(**kwargs)
     result = metric(test_annotations, test_predictions)
-    assert_metric_eq(result, expected)
+    for label, value in result.group_to_scalar.items():
+        assert label in expected
+        assert_metric_eq(value, expected[label])