Doc metrics + README #2

Merged
merged 14 commits on Jan 26, 2024
99 changes: 50 additions & 49 deletions README.md

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/lighteval/metrics/metrics.py
@@ -13,6 +13,7 @@
from lighteval.metrics.metrics_sample import (
BLEU,
BLEURT,
MRR,
ROUGE,
BertScore,
ExactMatches,
@@ -23,7 +24,6 @@
acc_golds_likelihood,
extractiveness,
faithfulness,
mrr,
)
from lighteval.metrics.normalizations import (
bigbench_normalizer,
@@ -277,7 +277,7 @@ class Metrics(Enum):
)
mrr = SampleLevelMetric(
metric="mrr",
sample_level_fn=mrr,
sample_level_fn=MRR().compute,
category=MetricCategory.MULTICHOICE,
use_case=MetricUseCase.ACCURACY,
corpus_level_fn=np.mean,
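
For context on the MRR().compute change above, here is a minimal sketch of what a mean-reciprocal-rank sample function can look like for a multichoice item. The argument names (choices_logprob, gold_ixs) are illustrative assumptions, not necessarily lighteval's exact signature:

import numpy as np

class MRR:
    def compute(self, choices_logprob: list[float], gold_ixs: list[int], **kwargs) -> float:
        # Rank the choices from most to least likely under the model.
        ranked = np.argsort(choices_logprob)[::-1]
        # Reciprocal rank of the best-ranked gold choice (1 for rank 1, 0.5 for rank 2, ...).
        best_gold_rank = min(int(np.where(ranked == gold)[0][0]) for gold in gold_ixs) + 1
        return 1.0 / best_gold_rank

Registered as a SampleLevelMetric with corpus_level_fn=np.mean, these per-sample reciprocal ranks are then averaged into the corpus-level MRR.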
47 changes: 38 additions & 9 deletions src/lighteval/metrics/metrics_corpus.py
@@ -1,4 +1,4 @@
"""This module manages all the score aggregations and computations occurring at the corpus level.
"""This module manages all the metrics occurring at the corpus level.
Some metrics (such as corpus BLEU) are not computed at the individual item level, but over the whole corpus.
A number of these aggregations come from the EleutherAI Harness.
"""
@@ -10,6 +10,7 @@

from lighteval.metrics.sample_preparator import (
GenerativeCorpusMetricInput,
LogprobCorpusMetricInput,
PerplexityCorpusMetricInput,
)
from lighteval.utils import as_list
@@ -20,7 +21,7 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:
"""Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)).

Args:
items (list[dict]): List of the correctly formatted dictionarinput
items (list[dict]): List of GenerativeCorpusMetricInput

Returns:
float: Score
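
As a quick illustration of the scikit-learn call this function wraps, with made-up binary labels rather than lighteval data:

import sklearn.metrics

golds = [1, 1, 0, 0, 1]
preds = [1, 0, 0, 0, 1]
# Returns a value in [-1, 1]; 1 is perfect agreement, 0 is chance-level prediction.
print(sklearn.metrics.matthews_corrcoef(golds, preds))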
@@ -32,13 +33,23 @@ def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float:

class CorpusLevelF1Score:
def __init__(self, average: str, num_classes: int = 2):
# If num_classes > 2, we compute multi_f1_corpus_aggregation
self.average = average # weighted, macro, micro
"""Stores the relevant parameters for the task's corpus level f1 score.

Args:
average (str): Method to use to compute the f1 score. Can be weighted, macro, micro.
num_classes (int, optional): Number of possible choice classes. Defaults to 2. If this parameter is above 2, a multi-class (one-vs-rest) f1 corpus score is computed instead.
"""
if average not in ["weighted", "macro", "micro", None]:
raise ValueError(
f"A CorpusLevelF1Score must be initialized with weighted, macro, micro, or None as an average function. {average} was used."
)
self.average = average
self.num_classes = num_classes

def compute(self, items):
golds = [i["golds"] for i in items]
preds = [i["preds"] for i in items]
def compute(self, items: list[LogprobCorpusMetricInput]):
"""Computes the metric score over all the corpus generated items, by using the scikit learn implementation."""
golds = [i.golds for i in items]
preds = [i.preds for i in items]
# Single f1
if self.num_classes == 2:
fscore = sklearn.metrics.f1_score(golds, preds, average=self.average)
@@ -48,11 +59,16 @@ def compute(self, items):
f1s = []
for i in range(self.num_classes):
f1s.append(sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i))
return np.mean(f1s)
return float(np.mean(f1s))
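
A small sketch of what the multi-class branch above does, using hypothetical integer labels; the arrays are cast to numpy so that the == i comparisons are element-wise:

import numpy as np
import sklearn.metrics

golds = np.asarray([0, 1, 2, 2, 1, 0])
preds = np.asarray([0, 2, 2, 2, 1, 1])
# One-vs-rest f1 for each class, then an unweighted mean over the classes.
f1s = [sklearn.metrics.f1_score(y_true=golds == i, y_pred=preds == i) for i in range(3)]
print(float(np.mean(f1s)))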


class CorpusLevelTranslationMetric:
def __init__(self, metric_type: str):
"""Stores the relevant parameters for a corpus level translation metric.

Args:
metric_type (str): Can be any of bleu, chrf, or ter, selecting which sacrebleu metric to use.
"""
if metric_type == "bleu":
self.metric = sacrebleu.corpus_bleu
elif metric_type == "chrf":
@@ -63,19 +79,32 @@ def __init__(self, metric_type: str):
raise ValueError(f"Unknown corpus level translation metric type : {metric_type}")

def compute(self, items: list[GenerativeCorpusMetricInput]) -> float:
"""Computes the metric score over all the corpus generated items, by using the sacrebleu implementation."""
golds = [i.golds for i in items]
preds = [as_list(i.preds) for i in items]
return self.metric(hypotheses=preds, references=golds).score
return float(self.metric(hypotheses=preds, references=golds).score)
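
For illustration, a toy call following sacrebleu's documented convention, where references is a list of reference streams aligned with the hypotheses (the exact shapes lighteval passes depend on its sample preparators):

import sacrebleu

hypotheses = ["the cat sat on the mat", "there is a dog in the garden"]
# One reference stream: references[0][i] is the reference for hypotheses[i].
references = [["the cat is on the mat", "there is a dog in the garden"]]
print(float(sacrebleu.corpus_bleu(hypotheses, references).score))

sacrebleu's corpus_chrf and corpus_ter follow the same calling convention.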


class CorpusLevelPerplexityMetric:
def __init__(self, metric_type: str):
"""Stores the relevant parameter for a corpus level perplexity metric.
Perplexity metrics compute more or less the same thing, which is a variation on the
average of log-probabilities over a sequence, but the normalization and processing applied
is different depending on the metric type.
Perplexity uses an exponential and no weights for the average, weighted perplexity uses an exponential
and the number of words as weights for the log-prob average, and bits per byte uses the number of bits
for normalization and divides the results by log(2).

Args:
metric_type (str): Can be any of `perplexity`, `weighted_perplexity` or `bits_per_byte`
"""
if metric_type not in ["perplexity", "weighted_perplexity", "bits_per_byte"]:
raise ValueError(f"Unknown corpus level perplexity metric type : {metric_type}")

self.metric_type = metric_type

def compute(self, items: list[PerplexityCorpusMetricInput]):
"""Computes the metric score over all the corpus generated items."""
logprobs = [i.logprobs for i in items]
weights = [i.weights for i in items]
