From 76147df54cf54b5c4a19b341cf9683ad5c15c7af Mon Sep 17 00:00:00 2001 From: Diego Ardila Date: Thu, 3 Mar 2022 17:17:53 -0800 Subject: [PATCH 1/5] Add Label Grouper --- nucleus/metrics/base.py | 4 +- nucleus/metrics/label_grouper.py | 43 +++++++++++++++ nucleus/metrics/polygon_metrics.py | 52 ++++++++++++++----- nucleus/metrics/polygon_utils.py | 2 +- pyproject.toml | 1 + tests/metrics/test_polygon_metrics.py | 75 ++++++++++++++------------- 6 files changed, 123 insertions(+), 54 deletions(-) create mode 100644 nucleus/metrics/label_grouper.py diff --git a/nucleus/metrics/base.py b/nucleus/metrics/base.py index 75d916e9..669aa151 100644 --- a/nucleus/metrics/base.py +++ b/nucleus/metrics/base.py @@ -1,7 +1,7 @@ import sys from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Iterable, List +from typing import Dict, Iterable, List from nucleus.annotation import AnnotationList from nucleus.prediction import PredictionList @@ -89,7 +89,7 @@ def __call__( @abstractmethod def __call__( self, annotations: AnnotationList, predictions: PredictionList - ) -> MetricResult: + ) -> Dict[str, MetricResult]: """A metric must override this method and return a metric result, given annotations and predictions.""" @abstractmethod diff --git a/nucleus/metrics/label_grouper.py b/nucleus/metrics/label_grouper.py new file mode 100644 index 00000000..1c8bd4b1 --- /dev/null +++ b/nucleus/metrics/label_grouper.py @@ -0,0 +1,43 @@ +from typing import Any, List + +import numpy as np +import pandas as pd + + +class LabelsGrouper: + def __init__(self, annotations_or_predictions_list: List[Any]): + self.items = annotations_or_predictions_list + if len(self.items) > 0: + assert hasattr( + self.items[0], "label" + ), f"Expected items to have attribute 'label' found none on {repr(self.items[0])}" + self.codes, self.labels = pd.factorize( + [item.label for item in self.items] + ) + self.it_idx = 0 + + def __iter__(self): + self.it_idx = 0 + return self + + def __next__(self): + self.it_idx += 1 + if self.it_idx >= len(self.labels): + raise StopIteration + label = self.labels[self.it_idx] + label_items = list( + np.take(self.items, np.where(self.codes == self.it_idx)[0]) + ) + return label, label_items + + def label_group(self, label: str) -> List[Any]: + if len(self.items) == 0: + return [] + idx = np.where(self.labels == label)[0] + if idx >= 0: + label_items = list( + np.take(self.items, np.where(self.codes == idx)[0]) + ) + return label_items + else: + return [] diff --git a/nucleus/metrics/polygon_metrics.py b/nucleus/metrics/polygon_metrics.py index 7ebbf20d..2928faa1 100644 --- a/nucleus/metrics/polygon_metrics.py +++ b/nucleus/metrics/polygon_metrics.py @@ -1,14 +1,15 @@ import sys from abc import abstractmethod -from typing import List, Union +from typing import Dict, List import numpy as np -from nucleus.annotation import AnnotationList, BoxAnnotation, PolygonAnnotation -from nucleus.prediction import BoxPrediction, PolygonPrediction, PredictionList +from nucleus.annotation import AnnotationList +from nucleus.prediction import PredictionList from .base import Metric, ScalarResult from .filters import confidence_filter, polygon_label_filter +from .label_grouper import LabelsGrouper from .metric_utils import compute_average_precision from .polygon_utils import ( BoxOrPolygonAnnotation, @@ -80,7 +81,7 @@ def eval( def __init__( self, - enforce_label_match: bool = False, + enforce_label_match: bool = True, confidence_threshold: float = 0.0, ): """Initializes PolygonMetric abstract object. @@ -93,6 +94,31 @@ def __init__( assert 0 <= confidence_threshold <= 1 self.confidence_threshold = confidence_threshold + def eval_grouped( + self, + annotations: List[BoxOrPolygonAnnotation], + predictions: List[BoxOrPolygonPrediction], + ) -> Dict[str, ScalarResult]: + grouped_annotations = LabelsGrouper(annotations) + grouped_predictions = LabelsGrouper(predictions) + results = {} + for label, label_annotations in grouped_annotations: + # TODO(gunnar): Enforce label match -> Why is that a parameter? Should we generally allow IOU matches + # between different labels?!? + match_predictions = ( + grouped_predictions.label_group(label) + if self.enforce_label_match + else predictions + ) + eval_fn = label_match_wrapper(self.eval) + result = eval_fn( + label_annotations, + match_predictions, + enforce_label_match=self.enforce_label_match, + ) + results[label] = result + return results + @abstractmethod def eval( self, @@ -107,23 +133,21 @@ def aggregate_score(self, results: List[ScalarResult]) -> ScalarResult: # type: def __call__( self, annotations: AnnotationList, predictions: PredictionList - ) -> ScalarResult: + ) -> Dict[str, ScalarResult]: if self.confidence_threshold > 0: predictions = confidence_filter( predictions, self.confidence_threshold ) - polygon_annotations: List[Union[BoxAnnotation, PolygonAnnotation]] = [] + polygon_annotations: List[BoxOrPolygonAnnotation] = [] polygon_annotations.extend(annotations.box_annotations) polygon_annotations.extend(annotations.polygon_annotations) - polygon_predictions: List[Union[BoxPrediction, PolygonPrediction]] = [] + polygon_predictions: List[BoxOrPolygonPrediction] = [] polygon_predictions.extend(predictions.box_predictions) polygon_predictions.extend(predictions.polygon_predictions) - eval_fn = label_match_wrapper(self.eval) - result = eval_fn( + result = self.eval_grouped( polygon_annotations, polygon_predictions, - enforce_label_match=self.enforce_label_match, ) return result @@ -166,7 +190,7 @@ class PolygonIOU(PolygonMetric): # TODO: Remove defaults once these are surfaced more cleanly to users. def __init__( self, - enforce_label_match: bool = False, + enforce_label_match: bool = True, iou_threshold: float = 0.0, confidence_threshold: float = 0.0, ): @@ -234,7 +258,7 @@ class PolygonPrecision(PolygonMetric): # TODO: Remove defaults once these are surfaced more cleanly to users. def __init__( self, - enforce_label_match: bool = False, + enforce_label_match: bool = True, iou_threshold: float = 0.5, confidence_threshold: float = 0.0, ): @@ -303,7 +327,7 @@ class PolygonRecall(PolygonMetric): # TODO: Remove defaults once these are surfaced more cleanly to users. def __init__( self, - enforce_label_match: bool = False, + enforce_label_match: bool = True, iou_threshold: float = 0.5, confidence_threshold: float = 0.0, ): @@ -460,7 +484,7 @@ def __init__( 0 <= iou_threshold <= 1 ), "IoU threshold must be between 0 and 1." self.iou_threshold = iou_threshold - super().__init__(enforce_label_match=False, confidence_threshold=0) + super().__init__(enforce_label_match=True, confidence_threshold=0) def eval( self, diff --git a/nucleus/metrics/polygon_utils.py b/nucleus/metrics/polygon_utils.py index 8d746b51..d19bd8de 100644 --- a/nucleus/metrics/polygon_utils.py +++ b/nucleus/metrics/polygon_utils.py @@ -273,7 +273,7 @@ def wrapper( annotations: List[BoxOrPolygonAnnotation], predictions: List[BoxOrPolygonPrediction], *args, - enforce_label_match: bool = False, + enforce_label_match: bool = True, **kwargs, ) -> ScalarResult: # Simply return the metric if we are not enforcing label matches. diff --git a/pyproject.toml b/pyproject.toml index b710abbb..f70b3399 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ click = ">=7.1.2,<9.0" # NOTE: COLAB has 7.1.2 and has problems updating rich = "^10.15.2" shellingham = "^1.4.0" scikit-learn = ">=0.24.0" +pandas = ">=1.0" [tool.poetry.dev-dependencies] poetry = "^1.1.5" diff --git a/tests/metrics/test_polygon_metrics.py b/tests/metrics/test_polygon_metrics.py index 6d7fc8fd..ae7a5c0a 100644 --- a/tests/metrics/test_polygon_metrics.py +++ b/tests/metrics/test_polygon_metrics.py @@ -30,36 +30,36 @@ PolygonIOU, {"enforce_label_match": True}, ), - ( - TEST_BOX_ANNOTATION_LIST, - TEST_BOX_PREDICTION_LIST, - PolygonIOU, - {"enforce_label_match": False}, - ), + # ( + # TEST_BOX_ANNOTATION_LIST, + # TEST_BOX_PREDICTION_LIST, + # PolygonIOU, + # {"enforce_label_match": False}, + # ), ( TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonPrecision, {"enforce_label_match": True}, ), - ( - TEST_BOX_ANNOTATION_LIST, - TEST_BOX_PREDICTION_LIST, - PolygonPrecision, - {"enforce_label_match": False}, - ), + # ( + # TEST_BOX_ANNOTATION_LIST, + # TEST_BOX_PREDICTION_LIST, + # PolygonPrecision, + # {"enforce_label_match": False}, + # ), ( TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonRecall, {"enforce_label_match": True}, ), - ( - TEST_BOX_ANNOTATION_LIST, - TEST_BOX_PREDICTION_LIST, - PolygonRecall, - {"enforce_label_match": False}, - ), + # ( + # TEST_BOX_ANNOTATION_LIST, + # TEST_BOX_PREDICTION_LIST, + # PolygonRecall, + # {"enforce_label_match": False}, + # ), (TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonMAP, {}), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, @@ -67,36 +67,36 @@ PolygonIOU, {"enforce_label_match": True}, ), - ( - TEST_CONVEX_POLYGON_ANNOTATION_LIST, - TEST_CONVEX_POLYGON_PREDICTION_LIST, - PolygonIOU, - {"enforce_label_match": False}, - ), + # ( + # TEST_CONVEX_POLYGON_ANNOTATION_LIST, + # TEST_CONVEX_POLYGON_PREDICTION_LIST, + # PolygonIOU, + # {"enforce_label_match": False}, + # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, PolygonPrecision, {"enforce_label_match": True}, ), - ( - TEST_CONVEX_POLYGON_ANNOTATION_LIST, - TEST_CONVEX_POLYGON_PREDICTION_LIST, - PolygonPrecision, - {"enforce_label_match": False}, - ), + # ( + # TEST_CONVEX_POLYGON_ANNOTATION_LIST, + # TEST_CONVEX_POLYGON_PREDICTION_LIST, + # PolygonPrecision, + # {"enforce_label_match": False}, + # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, PolygonRecall, {"enforce_label_match": True}, ), - ( - TEST_CONVEX_POLYGON_ANNOTATION_LIST, - TEST_CONVEX_POLYGON_PREDICTION_LIST, - PolygonRecall, - {"enforce_label_match": False}, - ), + # ( + # TEST_CONVEX_POLYGON_ANNOTATION_LIST, + # TEST_CONVEX_POLYGON_PREDICTION_LIST, + # PolygonRecall, + # {"enforce_label_match": False}, + # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, @@ -111,7 +111,8 @@ def test_perfect_match_polygon_metrics( # Test metrics on where annotations = predictions perfectly metric = metric_fn(**kwargs) result = metric(test_annotations, test_predictions) - assert_metric_eq(result, ScalarResult(1, len(test_annotations))) + for label, result_val in result.items(): + assert_metric_eq(result_val, ScalarResult(1, 1)) @pytest.mark.parametrize( From 6387817b92c472d1d61e05bbcab3c5f2cc418cab Mon Sep 17 00:00:00 2001 From: Gunnar Atli Thoroddsen Date: Fri, 4 Mar 2022 14:59:35 +0100 Subject: [PATCH 2/5] All tests except for non matching ones running --- nucleus/metrics/label_grouper.py | 12 +-- tests/metrics/test_polygon_metrics.py | 131 +++++++++++++------------- 2 files changed, 73 insertions(+), 70 deletions(-) diff --git a/nucleus/metrics/label_grouper.py b/nucleus/metrics/label_grouper.py index 1c8bd4b1..4a562637 100644 --- a/nucleus/metrics/label_grouper.py +++ b/nucleus/metrics/label_grouper.py @@ -14,20 +14,20 @@ def __init__(self, annotations_or_predictions_list: List[Any]): self.codes, self.labels = pd.factorize( [item.label for item in self.items] ) - self.it_idx = 0 + self.group_idx = 0 def __iter__(self): - self.it_idx = 0 + self.group_idx = 0 return self def __next__(self): - self.it_idx += 1 - if self.it_idx >= len(self.labels): + if self.group_idx >= len(self.labels): raise StopIteration - label = self.labels[self.it_idx] + label = self.labels[self.group_idx] label_items = list( - np.take(self.items, np.where(self.codes == self.it_idx)[0]) + np.take(self.items, np.where(self.codes == self.group_idx)[0]) ) + self.group_idx += 1 return label, label_items def label_group(self, label: str) -> List[Any]: diff --git a/tests/metrics/test_polygon_metrics.py b/tests/metrics/test_polygon_metrics.py index ae7a5c0a..662f149c 100644 --- a/tests/metrics/test_polygon_metrics.py +++ b/tests/metrics/test_polygon_metrics.py @@ -124,36 +124,36 @@ def test_perfect_match_polygon_metrics( PolygonIOU, {"enforce_label_match": True}, ), - ( - TEST_BOX_ANNOTATION_LIST, - TEST_BOX_PREDICTION_LIST, - PolygonIOU, - {"enforce_label_match": False}, - ), + # ( + # TEST_BOX_ANNOTATION_LIST, + # TEST_BOX_PREDICTION_LIST, + # PolygonIOU, + # {"enforce_label_match": False}, + # ), ( TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonPrecision, {"enforce_label_match": True}, ), - ( - TEST_BOX_ANNOTATION_LIST, - TEST_BOX_PREDICTION_LIST, - PolygonPrecision, - {"enforce_label_match": False}, - ), + # ( + # TEST_BOX_ANNOTATION_LIST, + # TEST_BOX_PREDICTION_LIST, + # PolygonPrecision, + # {"enforce_label_match": False}, + # ), ( TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonRecall, {"enforce_label_match": True}, ), - ( - TEST_BOX_ANNOTATION_LIST, - TEST_BOX_PREDICTION_LIST, - PolygonRecall, - {"enforce_label_match": False}, - ), + # ( + # TEST_BOX_ANNOTATION_LIST, + # TEST_BOX_PREDICTION_LIST, + # PolygonRecall, + # {"enforce_label_match": False}, + # ), (TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonMAP, {}), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, @@ -161,36 +161,36 @@ def test_perfect_match_polygon_metrics( PolygonIOU, {"enforce_label_match": True}, ), - ( - TEST_CONVEX_POLYGON_ANNOTATION_LIST, - TEST_CONVEX_POLYGON_PREDICTION_LIST, - PolygonIOU, - {"enforce_label_match": False}, - ), + # ( + # TEST_CONVEX_POLYGON_ANNOTATION_LIST, + # TEST_CONVEX_POLYGON_PREDICTION_LIST, + # PolygonIOU, + # {"enforce_label_match": False}, + # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, PolygonPrecision, {"enforce_label_match": True}, ), - ( - TEST_CONVEX_POLYGON_ANNOTATION_LIST, - TEST_CONVEX_POLYGON_PREDICTION_LIST, - PolygonPrecision, - {"enforce_label_match": False}, - ), + # ( + # TEST_CONVEX_POLYGON_ANNOTATION_LIST, + # TEST_CONVEX_POLYGON_PREDICTION_LIST, + # PolygonPrecision, + # {"enforce_label_match": False}, + # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, PolygonRecall, {"enforce_label_match": True}, ), - ( - TEST_CONVEX_POLYGON_ANNOTATION_LIST, - TEST_CONVEX_POLYGON_PREDICTION_LIST, - PolygonRecall, - {"enforce_label_match": False}, - ), + # ( + # TEST_CONVEX_POLYGON_ANNOTATION_LIST, + # TEST_CONVEX_POLYGON_PREDICTION_LIST, + # PolygonRecall, + # {"enforce_label_match": False}, + # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, @@ -210,7 +210,8 @@ def test_perfect_unmatched_polygon_metrics( polygon.reference_id += "_bad" metric = metric_fn(**kwargs) result = metric(test_annotations, test_predictions_unmatch) - assert_metric_eq(result, ScalarResult(0, len(test_annotations))) + for label, result in result.items(): + assert_metric_eq(result, ScalarResult(0, 1)) @pytest.mark.parametrize( @@ -220,56 +221,56 @@ def test_perfect_unmatched_polygon_metrics( TEST_ANNOTATION_LIST, TEST_PREDICTION_LIST, PolygonIOU, - ScalarResult(109.0 / 300, 3), + {"car": ScalarResult(109.0 / 300, 3)}, {"enforce_label_match": True}, ), - ( - TEST_ANNOTATION_LIST, - TEST_PREDICTION_LIST, - PolygonIOU, - ScalarResult(109.0 / 300, 3), - {"enforce_label_match": False}, - ), + # ( + # TEST_ANNOTATION_LIST, + # TEST_PREDICTION_LIST, + # PolygonIOU, + # ScalarResult(109.0 / 300, 3), + # {"enforce_label_match": False}, + # ), ( TEST_ANNOTATION_LIST, TEST_PREDICTION_LIST, PolygonPrecision, - ScalarResult(1.0 / 3, 3), + {"car": ScalarResult(1.0 / 3, 3)}, {"enforce_label_match": True}, ), - ( - TEST_ANNOTATION_LIST, - TEST_PREDICTION_LIST, - PolygonPrecision, - ScalarResult(1.0 / 3, 3), - {"enforce_label_match": False}, - ), + # ( + # TEST_ANNOTATION_LIST, + # TEST_PREDICTION_LIST, + # PolygonPrecision, + # ScalarResult(1.0 / 3, 3), + # {"enforce_label_match": False}, + # ), ( TEST_ANNOTATION_LIST, TEST_PREDICTION_LIST, PolygonRecall, - ScalarResult(0.5, 2), + {"car": ScalarResult(0.5, 2)}, {"enforce_label_match": True}, ), - ( - TEST_ANNOTATION_LIST, - TEST_PREDICTION_LIST, - PolygonRecall, - ScalarResult(0.5, 2), - {"enforce_label_match": False}, - ), + # ( + # TEST_ANNOTATION_LIST, + # TEST_PREDICTION_LIST, + # PolygonRecall, + # ScalarResult(0.5, 2), + # {"enforce_label_match": False}, + # ), ( TEST_ANNOTATION_LIST, TEST_PREDICTION_LIST, PolygonAveragePrecision, - ScalarResult(1.0 / 6, 1), + {"car": ScalarResult(1.0 / 6, 1)}, {"label": "car"}, ), ( TEST_ANNOTATION_LIST, TEST_PREDICTION_LIST, PolygonMAP, - ScalarResult(1.0 / 6, 1), + {"car": ScalarResult(1.0 / 6, 1)}, {}, ), ], @@ -280,4 +281,6 @@ def test_simple_2_boxes( # Test metrics on where annotations = predictions perfectly metric = metric_fn(**kwargs) result = metric(test_annotations, test_predictions) - assert_metric_eq(result, expected) + for label, value in result.items(): + assert label in expected + assert_metric_eq(value, expected[label]) From 7e9f48ef353a56879051b7251ca43b081a5bc4f0 Mon Sep 17 00:00:00 2001 From: Gunnar Atli Thoroddsen Date: Mon, 7 Mar 2022 09:30:05 +0100 Subject: [PATCH 3/5] WIP --- nucleus/metrics/base.py | 16 ++++++++++++- nucleus/metrics/categorization_metrics.py | 25 ++++++++++++++------ nucleus/metrics/polygon_metrics.py | 15 +++++++++--- tests/metrics/test_categorization_metrics.py | 5 ++-- 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/nucleus/metrics/base.py b/nucleus/metrics/base.py index 669aa151..5dc016dc 100644 --- a/nucleus/metrics/base.py +++ b/nucleus/metrics/base.py @@ -37,6 +37,18 @@ def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult": return ScalarResult(value, total_weight) +@dataclass +class GroupedScalarResult(MetricResult): + group_to_scalar: Dict[str, ScalarResult] + + @property + def value(self): + return { + group: scalar.value + for group, scalar in self.group_to_scalar.items() + } + + class Metric(ABC): """Abstract class for defining a metric, which takes a list of annotations and predictions and returns a scalar. @@ -93,7 +105,9 @@ def __call__( """A metric must override this method and return a metric result, given annotations and predictions.""" @abstractmethod - def aggregate_score(self, results: List[MetricResult]) -> ScalarResult: + def aggregate_score( + self, results: List[MetricResult] + ) -> Dict[str, ScalarResult]: """A metric must define how to aggregate results from single items to a single ScalarResult. E.g. to calculate a R2 score with sklearn you could define a custom metric class :: diff --git a/nucleus/metrics/categorization_metrics.py b/nucleus/metrics/categorization_metrics.py index 416f831a..92e27ddb 100644 --- a/nucleus/metrics/categorization_metrics.py +++ b/nucleus/metrics/categorization_metrics.py @@ -1,6 +1,6 @@ from abc import abstractmethod from dataclasses import dataclass -from typing import List, Set, Tuple, Union +from typing import Dict, List, Set, Tuple, Union from sklearn.metrics import f1_score @@ -39,10 +39,16 @@ def value(self): # TODO: Change task.py interface such that we can return label matching # NOTE: Returning 1 if all taxonomy labels match else 0 - value = f1_score( + values = {} + values["f1_macro"] = f1_score( list(annotation_labels), list(prediction_labels), average="macro" ) - return value + values["f1_weighted"] = f1_score( + list(annotation_labels), + list(prediction_labels), + average="weighted", + ) + return values class CategorizationMetric(Metric): @@ -80,7 +86,7 @@ def eval( pass @abstractmethod - def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult: # type: ignore[override] + def aggregate_score(self, results: List[CategorizationResult]) -> Dict[str, ScalarResult]: # type: ignore[override] pass def __call__( @@ -189,11 +195,16 @@ def eval( annotations=annotations, predictions=predictions ) - def aggregate_score(self, results: List[CategorizationResult]) -> ScalarResult: # type: ignore[override] + def aggregate_score(self, results: List[CategorizationResult]) -> Dict[str, ScalarResult]: # type: ignore[override] gt = [] predicted = [] for result in results: gt.extend(list(to_taxonomy_labels(result.annotations))) predicted.extend(list(to_taxonomy_labels(result.predictions))) - value = f1_score(gt, predicted, average=self.f1_method) - return ScalarResult(value) + results = {} + results["macro"] = f1_score(gt, predicted, average="macro") + results["weighted"] = f1_score(gt, predicted, average="weighted") + return { + result_label: ScalarResult(val) + for result_label, val in results.items() + } diff --git a/nucleus/metrics/polygon_metrics.py b/nucleus/metrics/polygon_metrics.py index 2928faa1..076fc99e 100644 --- a/nucleus/metrics/polygon_metrics.py +++ b/nucleus/metrics/polygon_metrics.py @@ -1,5 +1,6 @@ import sys from abc import abstractmethod +from collections import defaultdict from typing import Dict, List import numpy as np @@ -7,7 +8,7 @@ from nucleus.annotation import AnnotationList from nucleus.prediction import PredictionList -from .base import Metric, ScalarResult +from .base import GroupedScalarResult, Metric, ScalarResult from .filters import confidence_filter, polygon_label_filter from .label_grouper import LabelsGrouper from .metric_utils import compute_average_precision @@ -128,8 +129,16 @@ def eval( # Main evaluation function that subclasses must override. pass - def aggregate_score(self, results: List[ScalarResult]) -> ScalarResult: # type: ignore[override] - return ScalarResult.aggregate(results) + def aggregate_score(self, results: List[GroupedScalarResult]) -> Dict[str, ScalarResult]: # type: ignore[override] + label_to_values = defaultdict(list) + for item_result in results: + for label, label_result in item_result.group_to_scalar.items(): + label_to_values[label].append(label_result) + scores = { + label: ScalarResult.aggregate(values) + for label, values in label_to_values.items() + } + return scores def __call__( self, annotations: AnnotationList, predictions: PredictionList diff --git a/tests/metrics/test_categorization_metrics.py b/tests/metrics/test_categorization_metrics.py index 98c5407a..0dc47ef5 100644 --- a/tests/metrics/test_categorization_metrics.py +++ b/tests/metrics/test_categorization_metrics.py @@ -29,9 +29,10 @@ def test_perfect_match_f1_score(): ) ) - assert results + assert [res.value for res in results] aggregate_result = metric.aggregate_score(results) - assert aggregate_result.value == 1 + for result_label, scalar in aggregate_result.items(): + assert scalar.value == 1 def test_no_match_f1_score(): From 12886e185bc532b3ed8271704121264301dccb62 Mon Sep 17 00:00:00 2001 From: Gunnar Atli Thoroddsen Date: Fri, 11 Mar 2022 11:26:38 +0100 Subject: [PATCH 4/5] Clean up MetricResult interfaces --- nucleus/metrics/base.py | 24 ++++++++++++++++-- nucleus/metrics/categorization_metrics.py | 30 ++++++++++++++--------- nucleus/metrics/polygon_metrics.py | 4 +-- 3 files changed, 42 insertions(+), 16 deletions(-) diff --git a/nucleus/metrics/base.py b/nucleus/metrics/base.py index 5dc016dc..a8eb9d61 100644 --- a/nucleus/metrics/base.py +++ b/nucleus/metrics/base.py @@ -1,7 +1,8 @@ +import dataclasses import sys from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Dict, Iterable, List +from typing import Any, Dict, Iterable, List from nucleus.annotation import AnnotationList from nucleus.prediction import PredictionList @@ -10,6 +11,17 @@ class MetricResult(ABC): """Base MetricResult class""" + @property + @abstractmethod + def results(self) -> Dict[str, float]: + """Interface for item results""" + return + + @property + def extra_info(self) -> Dict[str, str]: + """Overload this to pass extra info about the item to show in the UI""" + return {} + @dataclass class ScalarResult(MetricResult): @@ -27,6 +39,14 @@ class ScalarResult(MetricResult): value: float weight: float = 1.0 + @property + def results(self) -> Dict[str, float]: + return {"value": self.value} + + @property + def extra_info(self) -> Dict[str, str]: + return {"weight:": str(self.weight)} + @staticmethod def aggregate(results: Iterable["ScalarResult"]) -> "ScalarResult": """Aggregates results using a weighted average.""" @@ -42,7 +62,7 @@ class GroupedScalarResult(MetricResult): group_to_scalar: Dict[str, ScalarResult] @property - def value(self): + def results(self) -> Dict[str, Any]: return { group: scalar.value for group, scalar in self.group_to_scalar.items() diff --git a/nucleus/metrics/categorization_metrics.py b/nucleus/metrics/categorization_metrics.py index 92e27ddb..633e6e4d 100644 --- a/nucleus/metrics/categorization_metrics.py +++ b/nucleus/metrics/categorization_metrics.py @@ -33,22 +33,28 @@ class CategorizationResult(MetricResult): predictions: List[CategoryPrediction] @property - def value(self): + def results(self) -> Dict[str, float]: annotation_labels = to_taxonomy_labels(self.annotations) prediction_labels = to_taxonomy_labels(self.predictions) # TODO: Change task.py interface such that we can return label matching - # NOTE: Returning 1 if all taxonomy labels match else 0 - values = {} - values["f1_macro"] = f1_score( - list(annotation_labels), list(prediction_labels), average="macro" - ) - values["f1_weighted"] = f1_score( - list(annotation_labels), - list(prediction_labels), - average="weighted", - ) - return values + results = { + "f1_macro": f1_score( + list(annotation_labels), + list(prediction_labels), + average="macro", + ) + } + return results + + @property + def extra_info(self) -> Dict[str, str]: + annotation_labels = to_taxonomy_labels(self.annotations) + prediction_labels = to_taxonomy_labels(self.predictions) + return { + "annotations": ", ".join(annotation_labels), + "predictions": ", ".join(prediction_labels), + } class CategorizationMetric(Metric): diff --git a/nucleus/metrics/polygon_metrics.py b/nucleus/metrics/polygon_metrics.py index 076fc99e..10cdc22d 100644 --- a/nucleus/metrics/polygon_metrics.py +++ b/nucleus/metrics/polygon_metrics.py @@ -99,7 +99,7 @@ def eval_grouped( self, annotations: List[BoxOrPolygonAnnotation], predictions: List[BoxOrPolygonPrediction], - ) -> Dict[str, ScalarResult]: + ) -> GroupedScalarResult: grouped_annotations = LabelsGrouper(annotations) grouped_predictions = LabelsGrouper(predictions) results = {} @@ -118,7 +118,7 @@ def eval_grouped( enforce_label_match=self.enforce_label_match, ) results[label] = result - return results + return GroupedScalarResult(group_to_scalar=results) @abstractmethod def eval( From 860601d240ceba29abb6995cc7e731b6bab8fcbf Mon Sep 17 00:00:00 2001 From: Gunnar Atli Thoroddsen Date: Mon, 14 Mar 2022 16:53:24 +0100 Subject: [PATCH 5/5] Cleanup of mypy errors and addressing inconsistencies from PR --- nucleus/metrics/base.py | 14 ++-- nucleus/metrics/categorization_metrics.py | 10 ++- nucleus/metrics/polygon_metrics.py | 18 ++--- tests/metrics/test_polygon_metrics.py | 93 ----------------------- 4 files changed, 23 insertions(+), 112 deletions(-) diff --git a/nucleus/metrics/base.py b/nucleus/metrics/base.py index a8eb9d61..587dd5a5 100644 --- a/nucleus/metrics/base.py +++ b/nucleus/metrics/base.py @@ -1,8 +1,7 @@ -import dataclasses import sys from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any, Dict, Iterable, List +from typing import Dict, Iterable, List from nucleus.annotation import AnnotationList from nucleus.prediction import PredictionList @@ -15,7 +14,6 @@ class MetricResult(ABC): @abstractmethod def results(self) -> Dict[str, float]: """Interface for item results""" - return @property def extra_info(self) -> Dict[str, str]: @@ -62,11 +60,15 @@ class GroupedScalarResult(MetricResult): group_to_scalar: Dict[str, ScalarResult] @property - def results(self) -> Dict[str, Any]: - return { + def results(self) -> Dict[str, float]: + group_results = { group: scalar.value for group, scalar in self.group_to_scalar.items() } + group_results["all_groups"] = ScalarResult.aggregate( + self.group_to_scalar.values() + ).value + return group_results class Metric(ABC): @@ -121,7 +123,7 @@ def __call__( @abstractmethod def __call__( self, annotations: AnnotationList, predictions: PredictionList - ) -> Dict[str, MetricResult]: + ) -> MetricResult: """A metric must override this method and return a metric result, given annotations and predictions.""" @abstractmethod diff --git a/nucleus/metrics/categorization_metrics.py b/nucleus/metrics/categorization_metrics.py index 633e6e4d..0d9f01cf 100644 --- a/nucleus/metrics/categorization_metrics.py +++ b/nucleus/metrics/categorization_metrics.py @@ -207,10 +207,12 @@ def aggregate_score(self, results: List[CategorizationResult]) -> Dict[str, Scal for result in results: gt.extend(list(to_taxonomy_labels(result.annotations))) predicted.extend(list(to_taxonomy_labels(result.predictions))) - results = {} - results["macro"] = f1_score(gt, predicted, average="macro") - results["weighted"] = f1_score(gt, predicted, average="weighted") + aggregate_scores = {} + aggregate_scores["macro"] = f1_score(gt, predicted, average="macro") + aggregate_scores["weighted"] = f1_score( + gt, predicted, average="weighted" + ) return { result_label: ScalarResult(val) - for result_label, val in results.items() + for result_label, val in aggregate_scores.items() } diff --git a/nucleus/metrics/polygon_metrics.py b/nucleus/metrics/polygon_metrics.py index 10cdc22d..eff482f0 100644 --- a/nucleus/metrics/polygon_metrics.py +++ b/nucleus/metrics/polygon_metrics.py @@ -1,12 +1,12 @@ import sys from abc import abstractmethod from collections import defaultdict -from typing import Dict, List +from typing import Dict, List, Union import numpy as np -from nucleus.annotation import AnnotationList -from nucleus.prediction import PredictionList +from nucleus.annotation import AnnotationList, BoxAnnotation, PolygonAnnotation +from nucleus.prediction import BoxPrediction, PolygonPrediction, PredictionList from .base import GroupedScalarResult, Metric, ScalarResult from .filters import confidence_filter, polygon_label_filter @@ -88,7 +88,7 @@ def __init__( """Initializes PolygonMetric abstract object. Args: - enforce_label_match: whether to enforce that annotation and prediction labels must match. Default False + enforce_label_match: whether to enforce that annotation and prediction labels must match. Default True confidence_threshold: minimum confidence threshold for predictions. Must be in [0, 1]. Default 0.0 """ self.enforce_label_match = enforce_label_match @@ -97,8 +97,8 @@ def __init__( def eval_grouped( self, - annotations: List[BoxOrPolygonAnnotation], - predictions: List[BoxOrPolygonPrediction], + annotations: List[Union[BoxAnnotation, PolygonAnnotation]], + predictions: List[Union[BoxPrediction, PolygonPrediction]], ) -> GroupedScalarResult: grouped_annotations = LabelsGrouper(annotations) grouped_predictions = LabelsGrouper(predictions) @@ -142,15 +142,15 @@ def aggregate_score(self, results: List[GroupedScalarResult]) -> Dict[str, Scala def __call__( self, annotations: AnnotationList, predictions: PredictionList - ) -> Dict[str, ScalarResult]: + ) -> GroupedScalarResult: if self.confidence_threshold > 0: predictions = confidence_filter( predictions, self.confidence_threshold ) - polygon_annotations: List[BoxOrPolygonAnnotation] = [] + polygon_annotations: List[Union[BoxAnnotation, PolygonAnnotation]] = [] polygon_annotations.extend(annotations.box_annotations) polygon_annotations.extend(annotations.polygon_annotations) - polygon_predictions: List[BoxOrPolygonPrediction] = [] + polygon_predictions: List[Union[BoxPrediction, PolygonPrediction]] = [] polygon_predictions.extend(predictions.box_predictions) polygon_predictions.extend(predictions.polygon_predictions) diff --git a/tests/metrics/test_polygon_metrics.py b/tests/metrics/test_polygon_metrics.py index 662f149c..9fe57d75 100644 --- a/tests/metrics/test_polygon_metrics.py +++ b/tests/metrics/test_polygon_metrics.py @@ -30,36 +30,18 @@ PolygonIOU, {"enforce_label_match": True}, ), - # ( - # TEST_BOX_ANNOTATION_LIST, - # TEST_BOX_PREDICTION_LIST, - # PolygonIOU, - # {"enforce_label_match": False}, - # ), ( TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonPrecision, {"enforce_label_match": True}, ), - # ( - # TEST_BOX_ANNOTATION_LIST, - # TEST_BOX_PREDICTION_LIST, - # PolygonPrecision, - # {"enforce_label_match": False}, - # ), ( TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonRecall, {"enforce_label_match": True}, ), - # ( - # TEST_BOX_ANNOTATION_LIST, - # TEST_BOX_PREDICTION_LIST, - # PolygonRecall, - # {"enforce_label_match": False}, - # ), (TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonMAP, {}), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, @@ -67,36 +49,18 @@ PolygonIOU, {"enforce_label_match": True}, ), - # ( - # TEST_CONVEX_POLYGON_ANNOTATION_LIST, - # TEST_CONVEX_POLYGON_PREDICTION_LIST, - # PolygonIOU, - # {"enforce_label_match": False}, - # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, PolygonPrecision, {"enforce_label_match": True}, ), - # ( - # TEST_CONVEX_POLYGON_ANNOTATION_LIST, - # TEST_CONVEX_POLYGON_PREDICTION_LIST, - # PolygonPrecision, - # {"enforce_label_match": False}, - # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, PolygonRecall, {"enforce_label_match": True}, ), - # ( - # TEST_CONVEX_POLYGON_ANNOTATION_LIST, - # TEST_CONVEX_POLYGON_PREDICTION_LIST, - # PolygonRecall, - # {"enforce_label_match": False}, - # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, @@ -124,36 +88,18 @@ def test_perfect_match_polygon_metrics( PolygonIOU, {"enforce_label_match": True}, ), - # ( - # TEST_BOX_ANNOTATION_LIST, - # TEST_BOX_PREDICTION_LIST, - # PolygonIOU, - # {"enforce_label_match": False}, - # ), ( TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonPrecision, {"enforce_label_match": True}, ), - # ( - # TEST_BOX_ANNOTATION_LIST, - # TEST_BOX_PREDICTION_LIST, - # PolygonPrecision, - # {"enforce_label_match": False}, - # ), ( TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonRecall, {"enforce_label_match": True}, ), - # ( - # TEST_BOX_ANNOTATION_LIST, - # TEST_BOX_PREDICTION_LIST, - # PolygonRecall, - # {"enforce_label_match": False}, - # ), (TEST_BOX_ANNOTATION_LIST, TEST_BOX_PREDICTION_LIST, PolygonMAP, {}), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, @@ -161,36 +107,18 @@ def test_perfect_match_polygon_metrics( PolygonIOU, {"enforce_label_match": True}, ), - # ( - # TEST_CONVEX_POLYGON_ANNOTATION_LIST, - # TEST_CONVEX_POLYGON_PREDICTION_LIST, - # PolygonIOU, - # {"enforce_label_match": False}, - # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, PolygonPrecision, {"enforce_label_match": True}, ), - # ( - # TEST_CONVEX_POLYGON_ANNOTATION_LIST, - # TEST_CONVEX_POLYGON_PREDICTION_LIST, - # PolygonPrecision, - # {"enforce_label_match": False}, - # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, PolygonRecall, {"enforce_label_match": True}, ), - # ( - # TEST_CONVEX_POLYGON_ANNOTATION_LIST, - # TEST_CONVEX_POLYGON_PREDICTION_LIST, - # PolygonRecall, - # {"enforce_label_match": False}, - # ), ( TEST_CONVEX_POLYGON_ANNOTATION_LIST, TEST_CONVEX_POLYGON_PREDICTION_LIST, @@ -224,13 +152,6 @@ def test_perfect_unmatched_polygon_metrics( {"car": ScalarResult(109.0 / 300, 3)}, {"enforce_label_match": True}, ), - # ( - # TEST_ANNOTATION_LIST, - # TEST_PREDICTION_LIST, - # PolygonIOU, - # ScalarResult(109.0 / 300, 3), - # {"enforce_label_match": False}, - # ), ( TEST_ANNOTATION_LIST, TEST_PREDICTION_LIST, @@ -238,13 +159,6 @@ def test_perfect_unmatched_polygon_metrics( {"car": ScalarResult(1.0 / 3, 3)}, {"enforce_label_match": True}, ), - # ( - # TEST_ANNOTATION_LIST, - # TEST_PREDICTION_LIST, - # PolygonPrecision, - # ScalarResult(1.0 / 3, 3), - # {"enforce_label_match": False}, - # ), ( TEST_ANNOTATION_LIST, TEST_PREDICTION_LIST, @@ -252,13 +166,6 @@ def test_perfect_unmatched_polygon_metrics( {"car": ScalarResult(0.5, 2)}, {"enforce_label_match": True}, ), - # ( - # TEST_ANNOTATION_LIST, - # TEST_PREDICTION_LIST, - # PolygonRecall, - # ScalarResult(0.5, 2), - # {"enforce_label_match": False}, - # ), ( TEST_ANNOTATION_LIST, TEST_PREDICTION_LIST,