From 930cbc572e2d235e176e2918d063f2c6efb244ff Mon Sep 17 00:00:00 2001
From: Joel Niklaus
Date: Thu, 2 Jan 2025 14:20:44 +0100
Subject: [PATCH] Disabled short metrics for evaluation of longer sequences.

---
 community_tasks/swiss_legal_evals.py | 32 +++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/community_tasks/swiss_legal_evals.py b/community_tasks/swiss_legal_evals.py
index 7bcb77c7..90280875 100644
--- a/community_tasks/swiss_legal_evals.py
+++ b/community_tasks/swiss_legal_evals.py
@@ -954,15 +954,31 @@ def prompt_fn(line: dict, task_name: str = None):
 # However, these are only fine-tuned on English data and we need multilingual support.
 
 
-def get_metrics(METRICS_TO_USE, target_lang: str):
+def get_metrics(METRICS_TO_USE, target_lang: str, generation_size: int):
     metrics = []
     for metric in METRICS_TO_USE:
-        if metric in METRICS:
-            if metric == "bert_score":
-                # Add only the BERTScore for the target language
-                metrics.append(METRICS["bert_score"][target_lang])
-            else:
-                metrics.append(METRICS[metric])
+        if metric not in METRICS:
+            logger.debug(f"Skipping {metric} because it is not available. Available metrics: {METRICS}")
+            continue
+        short_metrics = [
+            "bleu_sentence",
+            "chrf_sentence",
+            "ter_sentence",
+            "bert_score",
+            "bleurt_tiny",
+            "bleurt_base",
+            "bleurt_large",
+        ]
+        if generation_size > 512 and metric in short_metrics:
+            logger.debug(
+                f"Skipping {metric} for generation size {generation_size} because the maximum supported sequence length is 512."
+            )
+            continue
+        if metric == "bert_score":
+            # Add only the BERTScore for the target language
+            metrics.append(METRICS["bert_score"][target_lang])
+        else:
+            metrics.append(METRICS[metric])
     return metrics
 
 
@@ -987,7 +1003,7 @@ def __init__(
             few_shots_split="validation",
             few_shots_select="sequential",
             generation_size=level_config.generation_size,
-            metric=get_metrics(METRICS_TO_USE, target_lang),
+            metric=get_metrics(METRICS_TO_USE, target_lang, level_config.generation_size),
             stop_sequence=level_config.stop_sequence,
             trust_dataset=True,
             # Remove the target language in the beginning if it exists: e.g., FR: {translation}