Doc metrics + README #2

Merged
merged 14 commits on Jan 26, 2024
99 changes: 50 additions & 49 deletions README.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/lighteval/metrics/metrics.py
@@ -13,6 +13,7 @@
from lighteval.metrics.metrics_sample import (
BLEU,
BLEURT,
MRR,
ROUGE,
BertScore,
ExactMatches,
@@ -23,7 +24,6 @@
acc_golds_likelihood,
extractiveness,
faithfulness,
mrr,
)
from lighteval.metrics.normalizations import (
bigbench_normalizer,
@@ -277,7 +277,7 @@ class Metrics(Enum):
)
mrr = SampleLevelMetric(
metric="mrr",
sample_level_fn=mrr,
sample_level_fn=MRR().compute,
category=MetricCategory.MULTICHOICE,
use_case=MetricUseCase.ACCURACY,
corpus_level_fn=np.mean,
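
For context on the MRR().compute change above, here is a minimal sketch of what a mean-reciprocal-rank sample function can look like for a multichoice item. The argument names (choices_logprob, gold_ixs) are illustrative assumptions, not necessarily lighteval's exact signature:

import numpy as np

class MRR:
    def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) -> float:
        # Rank the choices from most to least likely under the model.
        ranked = np.argsort(choices_logprob)[::-1]
        # Reciprocal rank of the best-ranked gold choice (1 for rank 1, 0.5 for rank 2, ...).
        best_gold_rank = min(int(np.where(ranked == gold)[0][0]) for gold in gold_ixs) + 1
        return 1.0 / best_gold_rank

Registered as a SampleLevelMetric with corpus_level_fn=np.mean, these per-sample reciprocal ranks are then averaged into the corpus-level MRR.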
47 changes: 38 additions & 9 deletions src/lighteval/metrics/metrics_corpus.py
@@ -1,4 +1,4 @@
"""This module manages all the score aggregations and computations occurring at the corpus level.
"""This module manages all the metrics occurring at the corpus level.
Some metrics (such as corpus BLEU) are not computed at the individual item level, but over the whole corpus.
A number of these aggregations come from the EleutherAI Harness.
"""
@@ -10,6 +10,7 @@

from lighteval.metrics.sample_preparator import (
GenerativeCorpusMetricInput,
LogprobCorpusMetricInput,
PerplexityCorpusMetricInput,
)
from lighteval.utils import as_list
@@ -20,7 +21,7 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:
"""Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)).

Args:
items (list[dict]): List of the correctly formatted dictionarinput
items (list[dict]): List of GenerativeCorpusMetricInput

Returns:
float: Score
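
As a quick illustration of the scikit-learn call this function wraps, with made-up binary labels rather than lighteval data:

import sklearn.metrics

golds = [1, 1, 0, 0, 1]
preds = [1, 0, 0, 0, 1]
# Returns a value in [-1, 1]; 1 is perfect agreement, 0 is chance-level prediction.
print(sklearn.metrics.matthews_corrcoef(golds, preds))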
@@ -32,13 +33,23 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:

class CorpusLevelF1Score:
def __init__(self, average: str, num_classes: int = 2):
# If num_classes > 2, we compute multi_f1_corpus_aggregation
self.average = average # weighted, macro, micro
"""Stores the relevant parameters for the task's corpus level f1 score.

Args:
average (str): Method to use to compute the f1 score. Can be weighted, macro, micro.
num_classes (int, optional): Number of possible choice classes. Defaults to 2. If this parameter is above 2, a multi-class (one-vs-rest) f1 corpus score is computed instead.
"""
if average not in ["weighted", "macro", "micro", None]:
raise ValueError(
f"A CorpusLevelF1Score must be initialized with weighted, macro, micro, or None as an average function. {average} was used."
)
self.average = average
self.num_classes = num_classes

def compute(self, items):
golds = [i["golds"] for i in items]
preds = [i["preds"] for i in items]
def compute(self, items: list[LogprobCorpusMetricInput]):
"""Computes the metric score over all the corpus generated items, by using the scikit learn implementation."""
golds = [i.golds for i in items]
preds = [i.preds for i in items]
# Single f1
if self.num_classes == 2:
fscore = sklearn.metrics.f1_score(golds, preds, average=self.average)
@@ -48,11 +59,16 @@ def compute(self, items):
f1s = []
for i in range(self.num_classes):
f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
return np.mean(f1s)
return float(np.mean(f1s))
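
A small sketch of what the multi-class branch above does, using hypothetical integer labels; the arrays are cast to numpy so that the == i comparisons are element-wise:

import numpy as np
import sklearn.metrics

golds = np.asarray([0, 1, 2, 2, 1, 0])
preds = np.asarray([0, 2, 2, 2, 1, 1])
# One-vs-rest f1 for each class, then an unweighted mean over the classes.
f1s = [sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i) for i in range(3)]
print(float(np.mean(f1s)))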


class CorpusLevelTranslationMetric:
def __init__(self, metric_type: str):
"""Stores the relevant parameters for a corpus level translation metric.

Args:
metric_type (str): Can be any of bleu, chrf, or ter, selecting which sacrebleu metric to use.
"""
if metric_type == "bleu":
self.metric = sacrebleu.corpus_bleu
elif metric_type == "chrf":
@@ -63,19 +79,32 @@ def __init__(self, metric_type: str):
raise ValueError(f"Unknown corpus level translation metric type : {metric_type}")

def compute(self, items: list[GenerativeCorpusMetricInput]) -> float:
"""Computes the metric score over all the corpus generated items, by using the sacrebleu implementation."""
golds = [i.golds for i in items]
preds = [as_list(i.preds) for i in items]
return self.metric(hypotheses=preds, references=golds).score
return float(self.metric(hypotheses=preds, references=golds).score)
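
For illustration, a toy call following sacrebleu's documented convention, where references is a list of reference streams aligned with the hypotheses (the exact shapes lighteval passes depend on its sample preparators):

import sacrebleu

hypotheses = ["the cat sat on the mat", "there is a dog in the garden"]
# One reference stream: references[0][i] is the reference for hypotheses[i].
references = [["the cat is on the mat", "there is a dog in the garden"]]
print(float(sacrebleu.corpus_bleu(hypotheses, references).score))

sacrebleu's corpus_chrf and corpus_ter follow the same calling convention.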


class CorpusLevelPerplexityMetric:
def __init__(self, metric_type: str):
"""Stores the relevant parameter for a corpus level perplexity metric.
Perplexity metrics compute more or less the same thing, which is a variation on the
average of log-probabilities over a sequence, but the normalization and processing applied
is different depending on the metric type.
Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential
and the number of words as weights for the log-prob average, and bits per byte uses the number of bits
for normalization and divides the results by log(2).

Args:
metric_type (str): Can be any of `perplexity`, `weighted_perplexity` or `bits_per_byte`
"""
if metric_type not in ["perplexity", "weighted_perplexity", "bits_per_byte"]:
raise ValueError(f"Unknown corpus level perplexity metric type : {metric_type}")

self.metric_type = metric_type

def compute(self, items: list[PerplexityCorpusMetricInput]):
"""Computes the metric score over all the corpus generated items."""
logprobs = [i.logprobs for i in items]
weights = [i.weights for i in items]
