From cb163be1fb2e6eefc6753c35311f2927a6da672a Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomas@huggingface.co>
Date: Wed, 7 Feb 2024 11:30:20 +0000
Subject: [PATCH 1/3] moving custom tasks to code

---
 .pre-commit-config.yaml                       |   1 +
 README.md                                     |  10 +-
 src/lighteval/logging/__init__.py             |   0
 src/lighteval/logging/evaluation_tracker.py   |   4 +-
 src/lighteval/models/__init__.py              |   0
 src/lighteval/tasks/__init__.py               |   0
 src/lighteval/tasks/lighteval_task.py         |  37 ++++
 .../custom_tasks/custom_evaluation_tasks.py   |  55 +++---
 .../custom_tasks/custom_evaluation_utils.py   | 159 ------------------
 tasks_examples/open_llm_leaderboard_tasks.txt |   2 +-
 10 files changed, 71 insertions(+), 197 deletions(-)
 create mode 100644 src/lighteval/logging/__init__.py
 create mode 100644 src/lighteval/models/__init__.py
 create mode 100644 src/lighteval/tasks/__init__.py
 delete mode 100644 tasks_examples/custom_tasks/custom_evaluation_utils.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7ec4856f8..441ff70ad 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,4 +37,5 @@ repos:
     rev: 'v0.1.6'
     hooks:
       - id: ruff
+        args: ['--fix']
       - id: ruff-format
diff --git a/README.md b/README.md
index 2e2f35e6d..c04a66118 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ LightEval is an evaluation suite which gathers a selection of features from wide
 
 It is still an early, internal version - it should be nice to use but don't expect 100% stability!
 
-In case of problems or question, feel free to open an issue! 
+In case of problems or question, feel free to open an issue!
 
 ## How to install and use
 ### Requirements
@@ -50,11 +50,11 @@ Lastly, create a **line summary** of your evaluation, in `metadata_table.json`.
 - `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different tasks implementation, and is used a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval"]; you can add also add new ones (for test, we recommend using "custom").
 - `prompt_function` (str), the name of the prompt function you defined in the step above
 - `hf_repo` (str), the path to your evaluation dataset on the hub
-- `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`) 
+- `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`)
 - `hf_avail_splits` (list), all the splits available for your dataset (train, valid or validation, test, other...)
 - `evaluation_splits` (list), the splits you want to use for evaluation
 - `few_shots_split` (str, can be `null`), the specific split from which you want to select samples for your few-shot examples. It should be different from the sets included in `evaluation_splits`
-- `few_shots_select` (str, can be `null`), the method that you will use to select items for your few-shot examples. Can be `null`, or one of: 
+- `few_shots_select` (str, can be `null`), the method that you will use to select items for your few-shot examples. Can be `null`, or one of:
     - `balanced` selects examples from the `few_shots_split` with balanced labels, to avoid skewing the few shot examples (hence the model generations) towards one specific label
     - `random` selects examples at random from the `few_shots_split`
     - `random_sampling` selects new examples at random from the `few_shots_split` for every new item, but if a sampled item is equal to the current one, it is removed from the available samples
@@ -102,7 +102,7 @@ These metrics need the model to generate an output. They are therefore slower.
     - `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed
     - `f1_score_quasi` (HELM): Average F1 score in terms of word overlap between the model output and gold, with both being normalized first
     - `f1_score`:  Average F1 score in terms of word overlap between the model output and gold without normalisation
-    - `f1_score_macro`: Corpus level macro F1 score 
+    - `f1_score_macro`: Corpus level macro F1 score
     - `f1_score_macro`: Corpus level micro F1 score
 - Summarization:
     - `rouge` (Harness): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/)
@@ -141,7 +141,7 @@ These metrics need both the generation and its logprob. They are not working at
 - `prediction_perplexity` (HELM): Measure of the logprob of a given input.
 
 ## Adding a new metric
-If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. Then, follow the example in `src.lighteval.metrics.metrics` to register your metric. 
+If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. Then, follow the example in `src.lighteval.metrics.metrics` to register your metric.
 
 ## Examples of scripts to launch lighteval on the cluster
 ### Evaluate a whole suite on one node, 8 GPUs
diff --git a/src/lighteval/logging/__init__.py b/src/lighteval/logging/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 0515af461..e7628f845 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -556,7 +556,7 @@ def push_results_to_tensorboard(  # noqa: C901
 
         tb_context.close()  # flushes the unfinished write operations
         time.sleep(5)
-        files = os.listdir(output_dir_tb)
+        files = os.listdir(str(output_dir_tb))
         for file in files:
             os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))
 
@@ -566,5 +566,3 @@ def push_results_to_tensorboard(  # noqa: C901
             f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
             f" at {output_dir_tb} and global_step {global_step}"
         )
-        # except Exception as e:
-        #     logger.warning(f"Could not push to tensorboard\n{e}")
diff --git a/src/lighteval/models/__init__.py b/src/lighteval/models/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/lighteval/tasks/__init__.py b/src/lighteval/tasks/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index e16963a9c..76be14215 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -1,5 +1,6 @@
 import collections
 import random
+from dataclasses import dataclass
 from multiprocessing import Pool
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional, Tuple
@@ -39,6 +40,42 @@
     from lighteval.logging.evaluation_tracker import EvaluationTracker
 
 
+@dataclass
+class CustomEvaluationTaskConfig:
+    name: str
+    prompt_function: str
+    hf_repo: str
+    hf_subset: str
+    metric: Tuple[Metrics]
+    hf_avail_splits: Optional[Tuple[str]] = None
+    evaluation_splits: Optional[Tuple[str]] = None
+    few_shots_split: Optional[str] = None
+    few_shots_select: Optional[str] = None
+    generation_size: int = -1
+    stop_sequence: Optional[Tuple[str]] = None
+    output_regex: Optional[str] = None
+
+    frozen: bool = False
+    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
+
+    def __post_init__(self):
+        if self.suite is None:
+            self.suite = ["custom"]
+        if self.hf_avail_splits is None:
+            self.hf_avail_splits = ["train", "validation", "test"]
+        if self.evaluation_splits is None:
+            self.evaluation_splits = ["validation"]
+        if self.stop_sequence is None:
+            self.stop_sequence = ["\n"]
+
+        # Convert list to tuple for hashing
+        self.metric = tuple(self.metric)
+        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
+        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
+        self.suite = tuple(self.suite) if self.suite else None
+        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
+
+
 class LightevalTask:
     def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
         """
diff --git a/tasks_examples/custom_tasks/custom_evaluation_tasks.py b/tasks_examples/custom_tasks/custom_evaluation_tasks.py
index b0dae200c..296db3720 100644
--- a/tasks_examples/custom_tasks/custom_evaluation_tasks.py
+++ b/tasks_examples/custom_tasks/custom_evaluation_tasks.py
@@ -6,44 +6,41 @@
 """
 import re
 from dataclasses import asdict
-from typing import Dict, List
+from typing import Dict, List, Tuple
 
+from lighteval.metrics import MetricCategory, Metrics
+from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
 from lighteval.tasks.requests import Doc
+from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
 
-from .custom_evaluation_utils import *
 
-
-# fmt: off
-LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
-# fmt: on
-
-_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
-_TASKS: List[CustomEvaluationTask] = []
+_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
+_TASKS: List[CustomEvaluationTaskConfig] = []
 
 ## COMMON_SENSE_REASONING_TASKS ##
 COMMON_SENSE_REASONING_TASKS = [
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="hellaswag",
         prompt_function="hellaswag_prompt",
         hf_repo="hellaswag",
         hf_subset="default",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="winogrande",
         prompt_function="winogrande",
         hf_repo="winogrande",
         hf_subset="winogrande_xl",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="piqa",
         prompt_function="piqa_harness",
         hf_repo="piqa",
         hf_subset="plain_text",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="siqa",
         prompt_function="siqa_prompt",
         hf_repo="lighteval/siqa",
@@ -51,14 +48,14 @@
         hf_avail_splits=["train", "validation"],
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="openbookqa",
         prompt_function="openbookqa",
         hf_repo="openbookqa",
         hf_subset="main",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="arc:easy",
         prompt_function="arc",
         hf_repo="ai2_arc",
@@ -67,7 +64,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="arc:challenge",
         prompt_function="arc",
         hf_repo="ai2_arc",
@@ -76,7 +73,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="commonsense_qa",
         prompt_function="commonsense_qa_prompt",
         hf_repo="commonsense_qa",
@@ -134,7 +131,7 @@ def preprocess(text):
 ## WORLD_KNOWLEDGE_TASKS ##
 
 WORLD_KNOWLEDGE_TASKS = [
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="trivia_qa",
         prompt_function="triviaqa",
         hf_repo="trivia_qa",
@@ -143,7 +140,7 @@ def preprocess(text):
         generation_size=20,
         stop_sequence=["\n", ".", ","],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="natural_questions",
         prompt_function="natural_questions_prompt",
         hf_repo="lighteval/natural_questions_clean",
@@ -173,14 +170,14 @@ def natural_questions_prompt(line, task_name: str = None):
 ## Reading comprehension ##
 
 READING_COMP_TASKS = [
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="super_glue:boolq",
         prompt_function="boolq_prompt",
         hf_repo="super_glue",
         hf_subset="boolq",
         metric=["target_perplexity"],
     ),
-    CustomEvaluationTask(
+    CustomEvaluationTaskConfig(
         name="quac",
         prompt_function="quac",
         hf_repo="lighteval/quac_helm",
@@ -207,7 +204,7 @@ def boolq_prompt(line, task_name: str = None):
 
 
 ## MATH ##
-class CustomMathEvaluationTask(CustomEvaluationTask):
+class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
     """Custom class for math tasks with all the defaults set"""
 
     def __init__(
@@ -254,7 +251,7 @@ def __init__(
     CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
     CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
 ]
-GSM8K = CustomEvaluationTask(
+GSM8K = CustomEvaluationTaskConfig(
     name="gsm8k",
     prompt_function="gsm8k",
     hf_repo="gsm8k",
@@ -275,7 +272,7 @@ def __init__(
 
 
 ## MMLU ##
-class CustomMMLUEvaluationTask(CustomEvaluationTask):
+class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
     def __init__(
         self,
         name,
@@ -418,7 +415,7 @@ def mmlu_prompt(line, task_name: str = None):
 ## BBH ##
 
 
-class CustomBBHEvaluationTask(CustomEvaluationTask):
+class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
     def __init__(
         self,
         name,
@@ -509,7 +506,7 @@ def bbh_prompt(line, task_name: str = None):
 
 
 ## AGI eval ##
-class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
+class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
     def __init__(
         self,
         name,
@@ -620,7 +617,7 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 
 
 ## HUMAN EVAL ##
-# human_eval = CustomEvaluationTask(
+# human_eval = CustomEvaluationTaskConfig(
 #         name="human_eval",
 #         prompt_function="human_eval",
 #         hf_repo="lighteval/human_eval",
@@ -628,9 +625,9 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 #     ),
 
 
-def has_generative_metrics(task: CustomEvaluationTask) -> bool:
+def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
     for metric in task.metric:
-        if metric in NEEDS_GENERATION_ONLY:
+        if metric.category == MetricCategory.GENERATIVE:
             return True
     return False
 
diff --git a/tasks_examples/custom_tasks/custom_evaluation_utils.py b/tasks_examples/custom_tasks/custom_evaluation_utils.py
deleted file mode 100644
index d3f005db1..000000000
--- a/tasks_examples/custom_tasks/custom_evaluation_utils.py
+++ /dev/null
@@ -1,159 +0,0 @@
-"""
-Custom evaluation tasks for lighteval
-"""
-from dataclasses import dataclass
-from enum import Enum, auto
-from typing import Optional, Tuple, Union
-
-
-class Metrics(Enum):
-    any_target_loglikelihood_acc = auto()
-    bert_score = auto()
-    bias = auto()
-    bits_per_byte = auto()
-    bleu = auto()
-    bleu_1 = auto()
-    bleu_4 = auto()
-    byte_perplexity = auto()
-    chrf = auto()
-    code_eval_APPS = auto()
-    code_eval_HE = auto()
-    copyright = auto()
-    disinformation = auto()
-    exact_match = auto()
-    exact_set_match = auto()
-    extractiveness = auto()
-    f1_from_bags = auto()
-    f1_quasi = auto()
-    f1_sequence = auto()
-    f1_set_match = auto()
-    faithfulness = auto()
-    iou_set_match = auto()
-    log_prob = auto()
-    loglikelihood_acc = auto()
-    loglikelihood_acc_norm = auto()
-    loglikelihood_acc_norm_nospace = auto()
-    loglikelihood_acc_norm_single_token = auto()
-    loglikelihood_acc_single_token = auto()
-    loglikelihood_f1 = auto()
-    loglikelihood_f1_single_token = auto()
-    math_quasi_exact_match = auto()
-    mc_taco = auto()
-    mcc = auto()
-    mcc_single_token = auto()
-    mrr = auto()
-    mrr_single_token = auto()
-    multi_fi_numeric = auto()
-    one_choice_loglikelihood_acc = auto()
-    perfect_exact_match = auto()
-    prediction_perplexity = auto()
-    prefix_exact_match = auto()
-    prefix_quasi_exact_match = auto()
-    quasi_exact_match = auto()
-    quasi_exact_match2 = auto()
-    ranking = auto()
-    recall_at_1_single_token = auto()
-    recall_at_2_single_token = auto()
-    recall_at_1 = auto()
-    recall_at_2 = auto()
-    rouge = auto()
-    rouge_1 = auto()
-    rouge_2 = auto()
-    rouge_l = auto()
-    target_perplexity = auto()
-    ter = auto()
-    toxicity = auto()
-    truthfulqa_mc_metrics = auto()
-    word_perplexity = auto()
-
-    def __str__(self):
-        return self.name.replace("_at_", "@")
-
-
-NEEDS_GENERATION_ONLY = [
-    "perfect_exact_match",
-    "exact_match",
-    "quasi_exact_match",
-    "quasi_exact_match2",
-    "prefix_exact_match",
-    "prefix_quasi_exact_match",
-    "math_quasi_exact_match",
-    "iou_set_match",
-    "exact_set_match",
-    "f1_sequence",
-    "f1_quasi",
-    "f1_set_match",
-    "f1_from_bags",
-    "chrf",
-    "ter",
-    "rouge",
-    "rouge_1",
-    "rouge_2",
-    "rouge_l",
-    "faithfulness",
-    "extractiveness",
-    "bert_score",
-    "bleu",
-    "bleu_1",
-    "bleu_4",
-    "bias",
-    "toxicity",
-    "code_eval_HE",
-    "code_eval_APPS",
-    "copyright",
-]
-
-
-@dataclass(unsafe_hash=True)
-class CustomEvaluationTask:
-    name: str
-    prompt_function: str
-    hf_repo: str
-    hf_subset: str
-    metric: Tuple[Union[str, Metrics]]
-    hf_avail_splits: Optional[Tuple[str]] = None
-    evaluation_splits: Optional[Tuple[str]] = None
-    few_shots_split: Optional[str] = None
-    few_shots_select: Optional[str] = None
-    generation_size: int = -1
-    stop_sequence: Optional[Tuple[str]] = None
-    output_regex: Optional[str] = None
-
-    frozen: bool = False
-    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
-
-    def __post_init__(self):
-        self.metric = [str(m) for m in self.metric]
-        if self.suite is None:
-            self.suite = ["custom"]
-        if self.hf_avail_splits is None:
-            self.hf_avail_splits = ["train", "validation", "test"]
-        if self.evaluation_splits is None:
-            self.evaluation_splits = ["validation"]
-        if self.stop_sequence is None:
-            self.stop_sequence = ["\n"]
-
-        # Convert list to tuple for hashing
-        self.metric = tuple(self.metric)
-        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
-        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
-        self.suite = tuple(self.suite) if self.suite else None
-        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
-
-
-@dataclass(unsafe_hash=True)
-class BigCodeEvaluationTask:
-    name: str
-    bigcode_task: str
-    bigcode_task_kwargs: Optional[dict] = None
-    n_samples: int = 1
-    prefix: Optional[str] = None
-
-    suite: Tuple[str] = None
-
-    def __post_init__(self):
-        if self.suite is None:
-            self.suite = ("bigcode",)
-
-        # Convert list to tuple for hashing
-        self.suite = tuple(self.suite)
diff --git a/tasks_examples/open_llm_leaderboard_tasks.txt b/tasks_examples/open_llm_leaderboard_tasks.txt
index 41c0ff35a..5736e9537 100644
--- a/tasks_examples/open_llm_leaderboard_tasks.txt
+++ b/tasks_examples/open_llm_leaderboard_tasks.txt
@@ -57,4 +57,4 @@ lighteval|mmlu:security_studies|5|0
 lighteval|mmlu:sociology|5|0
 lighteval|mmlu:us_foreign_policy|5|0
 lighteval|mmlu:virology|5|0
-lighteval|mmlu:world_religions|5|0
\ No newline at end of file
+lighteval|mmlu:world_religions|5|0

From 37db422f5fec9a05df8915923e7dcb9e87dbfbab Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomas@huggingface.co>
Date: Wed, 7 Feb 2024 11:31:41 +0000
Subject: [PATCH 2/3] Revert "moving custom tasks to code"

This reverts commit cb163be1fb2e6eefc6753c35311f2927a6da672a.
---
 .pre-commit-config.yaml                       |   1 -
 src/lighteval/logging/__init__.py             |   0
 src/lighteval/logging/evaluation_tracker.py   |   4 +-
 src/lighteval/models/__init__.py              |   0
 src/lighteval/tasks/__init__.py               |   0
 src/lighteval/tasks/lighteval_task.py         |  37 ----
 .../custom_tasks/custom_evaluation_tasks.py   |  55 +++---
 .../custom_tasks/custom_evaluation_utils.py   | 159 ++++++++++++++++++
 8 files changed, 191 insertions(+), 65 deletions(-)
 delete mode 100644 src/lighteval/logging/__init__.py
 delete mode 100644 src/lighteval/models/__init__.py
 delete mode 100644 src/lighteval/tasks/__init__.py
 create mode 100644 tasks_examples/custom_tasks/custom_evaluation_utils.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 441ff70ad..7ec4856f8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,5 +37,4 @@ repos:
     rev: 'v0.1.6'
     hooks:
       - id: ruff
-        args: ['--fix']
       - id: ruff-format
diff --git a/src/lighteval/logging/__init__.py b/src/lighteval/logging/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index e7628f845..0515af461 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -556,7 +556,7 @@ def push_results_to_tensorboard(  # noqa: C901
 
         tb_context.close()  # flushes the unfinished write operations
         time.sleep(5)
-        files = os.listdir(str(output_dir_tb))
+        files = os.listdir(output_dir_tb)
         for file in files:
             os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))
 
@@ -566,3 +566,5 @@ def push_results_to_tensorboard(  # noqa: C901
             f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
             f" at {output_dir_tb} and global_step {global_step}"
         )
+        # except Exception as e:
+        #     logger.warning(f"Could not push to tensorboard\n{e}")
diff --git a/src/lighteval/models/__init__.py b/src/lighteval/models/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/tasks/__init__.py b/src/lighteval/tasks/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 76be14215..e16963a9c 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -1,6 +1,5 @@
 import collections
 import random
-from dataclasses import dataclass
 from multiprocessing import Pool
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional, Tuple
@@ -40,42 +39,6 @@
     from lighteval.logging.evaluation_tracker import EvaluationTracker
 
 
-@dataclass
-class CustomEvaluationTaskConfig:
-    name: str
-    prompt_function: str
-    hf_repo: str
-    hf_subset: str
-    metric: Tuple[Metrics]
-    hf_avail_splits: Optional[Tuple[str]] = None
-    evaluation_splits: Optional[Tuple[str]] = None
-    few_shots_split: Optional[str] = None
-    few_shots_select: Optional[str] = None
-    generation_size: int = -1
-    stop_sequence: Optional[Tuple[str]] = None
-    output_regex: Optional[str] = None
-
-    frozen: bool = False
-    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task
-
-    def __post_init__(self):
-        if self.suite is None:
-            self.suite = ["custom"]
-        if self.hf_avail_splits is None:
-            self.hf_avail_splits = ["train", "validation", "test"]
-        if self.evaluation_splits is None:
-            self.evaluation_splits = ["validation"]
-        if self.stop_sequence is None:
-            self.stop_sequence = ["\n"]
-
-        # Convert list to tuple for hashing
-        self.metric = tuple(self.metric)
-        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
-        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
-        self.suite = tuple(self.suite) if self.suite else None
-        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
-
-
 class LightevalTask:
     def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
         """
diff --git a/tasks_examples/custom_tasks/custom_evaluation_tasks.py b/tasks_examples/custom_tasks/custom_evaluation_tasks.py
index 296db3720..b0dae200c 100644
--- a/tasks_examples/custom_tasks/custom_evaluation_tasks.py
+++ b/tasks_examples/custom_tasks/custom_evaluation_tasks.py
@@ -6,41 +6,44 @@
 """
 import re
 from dataclasses import asdict
-from typing import Dict, List, Tuple
+from typing import Dict, List
 
-from lighteval.metrics import MetricCategory, Metrics
-from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
 from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
 
+from .custom_evaluation_utils import *
 
-_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
-_TASKS: List[CustomEvaluationTaskConfig] = []
+
+# fmt: off
+LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
+# fmt: on
+
+_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
+_TASKS: List[CustomEvaluationTask] = []
 
 ## COMMON_SENSE_REASONING_TASKS ##
 COMMON_SENSE_REASONING_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="hellaswag",
         prompt_function="hellaswag_prompt",
         hf_repo="hellaswag",
         hf_subset="default",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="winogrande",
         prompt_function="winogrande",
         hf_repo="winogrande",
         hf_subset="winogrande_xl",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="piqa",
         prompt_function="piqa_harness",
         hf_repo="piqa",
         hf_subset="plain_text",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="siqa",
         prompt_function="siqa_prompt",
         hf_repo="lighteval/siqa",
@@ -48,14 +51,14 @@
         hf_avail_splits=["train", "validation"],
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="openbookqa",
         prompt_function="openbookqa",
         hf_repo="openbookqa",
         hf_subset="main",
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="arc:easy",
         prompt_function="arc",
         hf_repo="ai2_arc",
@@ -64,7 +67,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="arc:challenge",
         prompt_function="arc",
         hf_repo="ai2_arc",
@@ -73,7 +76,7 @@
         generation_size=1,
         metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="commonsense_qa",
         prompt_function="commonsense_qa_prompt",
         hf_repo="commonsense_qa",
@@ -131,7 +134,7 @@ def preprocess(text):
 ## WORLD_KNOWLEDGE_TASKS ##
 
 WORLD_KNOWLEDGE_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="trivia_qa",
         prompt_function="triviaqa",
         hf_repo="trivia_qa",
@@ -140,7 +143,7 @@ def preprocess(text):
         generation_size=20,
         stop_sequence=["\n", ".", ","],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="natural_questions",
         prompt_function="natural_questions_prompt",
         hf_repo="lighteval/natural_questions_clean",
@@ -170,14 +173,14 @@ def natural_questions_prompt(line, task_name: str = None):
 ## Reading comprehension ##
 
 READING_COMP_TASKS = [
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="super_glue:boolq",
         prompt_function="boolq_prompt",
         hf_repo="super_glue",
         hf_subset="boolq",
         metric=["target_perplexity"],
     ),
-    CustomEvaluationTaskConfig(
+    CustomEvaluationTask(
         name="quac",
         prompt_function="quac",
         hf_repo="lighteval/quac_helm",
@@ -204,7 +207,7 @@ def boolq_prompt(line, task_name: str = None):
 
 
 ## MATH ##
-class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMathEvaluationTask(CustomEvaluationTask):
     """Custom class for math tasks with all the defaults set"""
 
     def __init__(
@@ -251,7 +254,7 @@ def __init__(
     CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
     CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
 ]
-GSM8K = CustomEvaluationTaskConfig(
+GSM8K = CustomEvaluationTask(
     name="gsm8k",
     prompt_function="gsm8k",
     hf_repo="gsm8k",
@@ -272,7 +275,7 @@ def __init__(
 
 
 ## MMLU ##
-class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
+class CustomMMLUEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -415,7 +418,7 @@ def mmlu_prompt(line, task_name: str = None):
 ## BBH ##
 
 
-class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
+class CustomBBHEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -506,7 +509,7 @@ def bbh_prompt(line, task_name: str = None):
 
 
 ## AGI eval ##
-class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
+class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
     def __init__(
         self,
         name,
@@ -617,7 +620,7 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 
 
 ## HUMAN EVAL ##
-# human_eval = CustomEvaluationTaskConfig(
+# human_eval = CustomEvaluationTask(
 #         name="human_eval",
 #         prompt_function="human_eval",
 #         hf_repo="lighteval/human_eval",
@@ -625,9 +628,9 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):
 #     ),
 
 
-def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
+def has_generative_metrics(task: CustomEvaluationTask) -> bool:
     for metric in task.metric:
-        if metric.category == MetricCategory.GENERATIVE:
+        if metric in NEEDS_GENERATION_ONLY:
             return True
     return False
 
diff --git a/tasks_examples/custom_tasks/custom_evaluation_utils.py b/tasks_examples/custom_tasks/custom_evaluation_utils.py
new file mode 100644
index 000000000..d3f005db1
--- /dev/null
+++ b/tasks_examples/custom_tasks/custom_evaluation_utils.py
@@ -0,0 +1,159 @@
+"""
+Custom evaluation tasks for lighteval
+"""
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from typing import Optional, Tuple, Union
+
+
+class Metrics(Enum):  # Names of the metrics a task can list; values come from auto().
+    any_target_loglikelihood_acc = auto()
+    bert_score = auto()
+    bias = auto()
+    bits_per_byte = auto()
+    bleu = auto()
+    bleu_1 = auto()
+    bleu_4 = auto()
+    byte_perplexity = auto()
+    chrf = auto()
+    code_eval_APPS = auto()
+    code_eval_HE = auto()
+    copyright = auto()
+    disinformation = auto()
+    exact_match = auto()
+    exact_set_match = auto()
+    extractiveness = auto()
+    f1_from_bags = auto()
+    f1_quasi = auto()
+    f1_sequence = auto()
+    f1_set_match = auto()
+    faithfulness = auto()
+    iou_set_match = auto()
+    log_prob = auto()
+    loglikelihood_acc = auto()
+    loglikelihood_acc_norm = auto()
+    loglikelihood_acc_norm_nospace = auto()
+    loglikelihood_acc_norm_single_token = auto()
+    loglikelihood_acc_single_token = auto()
+    loglikelihood_f1 = auto()
+    loglikelihood_f1_single_token = auto()
+    math_quasi_exact_match = auto()
+    mc_taco = auto()
+    mcc = auto()
+    mcc_single_token = auto()
+    mrr = auto()
+    mrr_single_token = auto()
+    multi_fi_numeric = auto()
+    one_choice_loglikelihood_acc = auto()
+    perfect_exact_match = auto()
+    prediction_perplexity = auto()
+    prefix_exact_match = auto()
+    prefix_quasi_exact_match = auto()
+    quasi_exact_match = auto()
+    quasi_exact_match2 = auto()
+    ranking = auto()
+    recall_at_1_single_token = auto()
+    recall_at_2_single_token = auto()
+    recall_at_1 = auto()
+    recall_at_2 = auto()
+    rouge = auto()
+    rouge_1 = auto()
+    rouge_2 = auto()
+    rouge_l = auto()
+    target_perplexity = auto()
+    ter = auto()
+    toxicity = auto()
+    truthfulqa_mc_metrics = auto()
+    word_perplexity = auto()
+
+    def __str__(self):  # member name with "_at_" rendered as "@", e.g. recall_at_1 -> "recall@1"
+        return self.name.replace("_at_", "@")
+
+
+NEEDS_GENERATION_ONLY = [  # Metrics member names; presumably those needing generated text (TODO confirm)
+    "perfect_exact_match",
+    "exact_match",
+    "quasi_exact_match",
+    "quasi_exact_match2",
+    "prefix_exact_match",
+    "prefix_quasi_exact_match",
+    "math_quasi_exact_match",
+    "iou_set_match",
+    "exact_set_match",
+    "f1_sequence",
+    "f1_quasi",
+    "f1_set_match",
+    "f1_from_bags",
+    "chrf",
+    "ter",
+    "rouge",
+    "rouge_1",
+    "rouge_2",
+    "rouge_l",
+    "faithfulness",
+    "extractiveness",
+    "bert_score",
+    "bleu",
+    "bleu_1",
+    "bleu_4",
+    "bias",
+    "toxicity",
+    "code_eval_HE",
+    "code_eval_APPS",
+    "copyright",
+]
+
+
+@dataclass(unsafe_hash=True)
+class CustomEvaluationTask:
+    """Settings of one custom evaluation task; sequence fields are stored as tuples so instances can be hashed."""
+
+    name: str
+    prompt_function: str
+    hf_repo: str
+    hf_subset: str
+    metric: Tuple[Union[str, Metrics], ...]  # stored as a tuple of str after __post_init__
+    hf_avail_splits: Optional[Tuple[str, ...]] = None  # defaults to ("train", "validation", "test")
+    evaluation_splits: Optional[Tuple[str, ...]] = None  # defaults to ("validation",)
+    few_shots_split: Optional[str] = None
+    few_shots_select: Optional[str] = None
+    generation_size: int = -1
+    stop_sequence: Optional[Tuple[str, ...]] = None  # defaults to ("\n",)
+    output_regex: Optional[str] = None
+
+    frozen: bool = False
+    suite: Optional[Tuple[str, ...]] = None  # used to know whether this is a custom lighteval or a bigcode task
+
+    def __post_init__(self):
+        """Apply the defaults and turn each sequence field into a tuple (an empty optional sequence becomes None)."""
+        # A lone metric would otherwise be iterated character by character; lists would break the generated __hash__.
+        if isinstance(self.metric, (str, Metrics)):
+            self.metric = (self.metric,)
+        self.metric = tuple(str(m) for m in self.metric)
+        defaults = {
+            "suite": ("custom",),
+            "hf_avail_splits": ("train", "validation", "test"),
+            "evaluation_splits": ("validation",),
+            "stop_sequence": ("\n",),
+        }
+        for attr, default in defaults.items():
+            value = default if getattr(self, attr) is None else getattr(self, attr)
+            setattr(self, attr, tuple(value) if value else None)
+
+
+@dataclass(unsafe_hash=True)
+class BigCodeEvaluationTask:
+    """Task settings for the "bigcode" suite (the default `suite`); instances stay hashable."""
+
+    name: str
+    bigcode_task: str
+    # A dict is unhashable, so it is left out of the generated __hash__ (it still takes part in __eq__).
+    bigcode_task_kwargs: Optional[dict] = field(default=None, hash=False)
+    n_samples: int = 1
+    prefix: Optional[str] = None
+    suite: Optional[Tuple[str, ...]] = None  # defaults to ("bigcode",)
+
+    def __post_init__(self):
+        """Fill in the default suite and store it as a tuple so the instance can be hashed."""
+        # Callers may pass a list; a list field would make the generated __hash__ raise TypeError.
+        self.suite = ("bigcode",) if self.suite is None else tuple(self.suite)

From 77eee8cb025c0df900fcc0872e5204b9b96b55cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?=
 <22726840+clefourrier@users.noreply.github.com>
Date: Wed, 7 Feb 2024 14:53:41 +0100
Subject: [PATCH 3/3] Adding the target perplexity fix back (#15)

---------

Co-authored-by: Thomas Wolf <thomas@huggingface.co>
---
 src/lighteval/metrics/__init__.py           | 17 +++++++++++++----
 src/lighteval/metrics/metrics_sample.py     |  9 +++++----
 src/lighteval/metrics/sample_preparator.py  |  6 +++---
 tests/reference_scores/harness_metrics.json |  4 ++--
 4 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py
index 3b17854e7..3a0984bfc 100644
--- a/src/lighteval/metrics/__init__.py
+++ b/src/lighteval/metrics/__init__.py
@@ -8,11 +8,18 @@
 
 def apply_target_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[str]):
     outputs = {}
-    current_results = [results.pop(0) for _ in range(len(formatted_doc.get_golds()))]
+    reference_text = formatted_doc.get_golds()[0]  # only the first gold serves as reference text
+    current_result = results.pop(0)  # consumes exactly one entry of `results` for this document
+    target_logprob = current_result.result[0]  # NOTE(review): assumes result is a (logprob, acc) pair - confirm
+    target_acc = current_result.result[1]
 
     for metric in metrics:
-        if Metrics[metric].value.category == MetricCategory.PERPLEXITY:
-            outputs.update(Metrics[metric].value.compute(results=current_results))
+        if Metrics[metric].value.category == MetricCategory.TARGET_PERPLEXITY:  # other categories are skipped
+            outputs.update(
+                Metrics[metric].value.compute(
+                    logprobs=target_logprob, target_acc=target_acc, reference_text=reference_text
+                )
+            )
 
     return results, outputs
 
@@ -30,7 +37,9 @@ def apply_perplexity_metric(results: list[ModelReturn], formatted_doc: Doc, metr
 
     for metric in metrics:
         if Metrics[metric].value.category == MetricCategory.PERPLEXITY:
-            outputs.update(Metrics[metric].value.compute(results=current_result, reference_text=reference_text))
+            outputs.update(
+                Metrics[metric].value.compute(logprobs=current_result.result, reference_text=reference_text)
+            )
 
     return results, outputs
 
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index ec123741b..e87e3bb58 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -1,6 +1,8 @@
 """This module manages all the metrics occurring at the sample level. The results of said metrics are then aggregated
 using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category.
 """
+from typing import Union
+
 import nltk
 import numpy as np
 from nltk.metrics.distance import edit_distance
@@ -275,17 +277,16 @@ def compute(self, choices_logprob: list[float], gold_ixs: list[float], formatted
         return 1.0 / (min(ranked_choices) + 1)
 
 
-def acc_golds_likelihood(results: list[tuple[float, int]], **kwargs) -> int:
+def acc_golds_likelihood(target_acc: Union[list[int], int], **kwargs) -> int:
     """Tests if at least one of predicted gold targets' log-likelihood is above 0.5.
 
     Args:
-        results (list[int]): List of tuples containing, for each gold, the predictions log-probabilities associated with whether they are above 0.5 aggregated.
-        formatted_doc (Doc): _description_
+        target_acc (Union[list[int], int]): Score, or list of scores, indicating whether the predictions' aggregated log-probabilities are above 0.5.
 
     Returns:
         int: 1 if at least one of the possible golds had a log-likelihood above 0.5.
     """
-    return max([int(acc_ppl) for _, acc_ppl in results])
+    return max([int(acc_ppl) for acc_ppl in as_list(target_acc)])
 
 
 class ROUGE:
diff --git a/src/lighteval/metrics/sample_preparator.py b/src/lighteval/metrics/sample_preparator.py
index 659022920..c28ed2470 100644
--- a/src/lighteval/metrics/sample_preparator.py
+++ b/src/lighteval/metrics/sample_preparator.py
@@ -106,14 +106,14 @@ def count_units(self, text: str) -> int:
         if self.units_type == "bytes":
             return len(text.encode("utf-8"))
 
-    def prepare(self, results, reference_text, **kwargs):
+    def prepare(self, logprobs: list[float] | float, reference_text: str, **kwargs):
         """Prepares an individual perplexity example to the format expected by metrics computed at the corpus level (aggregated).
 
         Args:
-            results (list[float]): List of the logprobabilities computed for each item
+            logprobs (list[float] | float): Log-probabilities computed for each item of the sequence, or a single log-probability aggregated over the sequence
             reference_text (str): Current reference text for which to compute the length in self.units_type
 
         Returns:
             PerplexityCorpusMetricInput: Stores the measured logprobs and associated text lengths, counted in the reference unit.
         """
-        return PerplexityCorpusMetricInput(logprobs=results.result, weights=self.count_units(reference_text))
+        return PerplexityCorpusMetricInput(logprobs=logprobs, weights=self.count_units(reference_text))
diff --git a/tests/reference_scores/harness_metrics.json b/tests/reference_scores/harness_metrics.json
index a6c506f34..1c8c5b91d 100644
--- a/tests/reference_scores/harness_metrics.json
+++ b/tests/reference_scores/harness_metrics.json
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a1965f0b9c66cfe1b1f3cc380a80949e32eab92ae8eac079c0339506ce827093
-size 48373142
+oid sha256:408956938a6b7a18b03658bb9772b471efcea4aa04afb0b35d76cecfca6a706e
+size 48376580