diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index fe27c8f0f..bf5d75227 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -27,6 +27,7 @@ import time from dataclasses import asdict, is_dataclass from datetime import datetime +from enum import Enum from pathlib import Path from datasets import Dataset, load_dataset @@ -59,6 +60,8 @@ def default(self, o): return asdict(o) if callable(o): return o.__name__ + if isinstance(o, Enum): + return o.name return super().default(o) diff --git a/src/lighteval/metrics/utils.py b/src/lighteval/metrics/utils.py index e0db60897..52e8e0665 100644 --- a/src/lighteval/metrics/utils.py +++ b/src/lighteval/metrics/utils.py @@ -69,7 +69,7 @@ def compute(self, **kwargs) -> dict: # result: Union[list[ModelReturn], ModelRe return {} if isinstance(self, MetricGrouping): return self.sample_level_fn(**kwargs) # result, formatted_doc, - return {self.name: self.sample_level_fn(**kwargs)} # result, formatted_doc, + return {self.metric_name: self.sample_level_fn(**kwargs)} # result, formatted_doc, @dataclass