
Commit

Merge branch 'main' into dev-gucci
clefourrier authored Jul 17, 2024
2 parents ffe3249 + 4550cb7 commit 8c66761
Showing 26 changed files with 3,255 additions and 1,997 deletions.
35 changes: 21 additions & 14 deletions README.md
@@ -78,8 +78,8 @@ pre-commit install

We provide two main entry points to evaluate models:

* `run_evals_accelerate.py`: evaluate models on CPU or one or more GPUs using [🤗 Accelerate](https://github.com/huggingface/accelerate).
* `run_evals_nanotron.py`: evaluate models in distributed settings using [⚡️ Nanotron](https://github.com/huggingface/nanotron).
* `lighteval accelerate`: evaluate models on CPU or one or more GPUs using [🤗 Accelerate](https://github.com/huggingface/accelerate).
* `lighteval nanotron`: evaluate models in distributed settings using [⚡️ Nanotron](https://github.com/huggingface/nanotron).

For most users, we recommend using the 🤗 Accelerate backend - see below for specific commands.

@@ -94,7 +94,8 @@ accelerate config
You can then evaluate a model using data parallelism as follows:

```shell
accelerate launch --multi_gpu --num_processes=<num_gpus> run_evals_accelerate.py \
accelerate launch --multi_gpu --num_processes=<num_gpus> -m \
lighteval accelerate \
--model_args="pretrained=<path to model on the hub>" \
--tasks <task parameters> \
--output_dir output_dir
@@ -109,7 +110,8 @@ suite|task|num_few_shot|{0 or 1 to automatically reduce `num_few_shot` if prompt
or a file path like [`examples/tasks/recommended_set.txt`](./examples/tasks/recommended_set.txt), which specifies multiple task configurations. For example, to evaluate GPT-2 on the Truthful QA benchmark, run:

```shell
accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
accelerate launch --multi_gpu --num_processes=8 -m \
lighteval accelerate \
--model_args "pretrained=gpt2" \
--tasks "lighteval|truthfulqa:mc|0|0" \
--override_batch_size 1 \
@@ -119,7 +121,8 @@ accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
Here, `--override_batch_size` defines the _batch size per device_, so the effective batch size will be `override_batch_size x num_gpus` (for example, a per-device batch size of 1 on 8 GPUs gives an effective batch size of 8). To evaluate on multiple benchmarks, separate each task configuration with a comma, e.g.

```shell
accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
accelerate launch --multi_gpu --num_processes=8 -m \
lighteval accelerate \
--model_args "pretrained=gpt2" \
--tasks "leaderboard|truthfulqa:mc|0|0,leaderboard|gsm8k|0|0" \
--override_batch_size 1 \
@@ -133,7 +136,8 @@ See the [`examples/tasks/recommended_set.txt`](./examples/tasks/recommended_set.
If you want to spin up inference endpoints, use adapter/delta weights, or pass more complex configuration options, you can load the model from a configuration file. This is done as follows:

```shell
accelerate launch --multi_gpu --num_processes=<num_gpus> run_evals_accelerate.py \
accelerate launch --multi_gpu --num_processes=<num_gpus> -m \
lighteval accelerate \
--model_config_path="<path to your model configuration>" \
--tasks <task parameters> \
--output_dir output_dir
@@ -147,13 +151,15 @@ To evaluate models larger than ~40B parameters in 16-bit precision, you will need

```shell
# PP=2, DP=4 - good for models < 70B params
accelerate launch --multi_gpu --num_processes=4 run_evals_accelerate.py \
accelerate launch --multi_gpu --num_processes=4 -m \
lighteval accelerate \
--model_args="pretrained=<path to model on the hub>,model_parallel=True" \
--tasks <task parameters> \
--output_dir output_dir

# PP=4, DP=2 - good for huge models >= 70B params
accelerate launch --multi_gpu --num_processes=2 run_evals_accelerate.py \
accelerate launch --multi_gpu --num_processes=2 -m \
lighteval accelerate \
--model_args="pretrained=<path to model on the hub>,model_parallel=True" \
--tasks <task parameters> \
--output_dir output_dir
@@ -164,7 +170,8 @@ accelerate launch --multi_gpu --num_processes=2 run_evals_accelerate.py \
To evaluate a model on all the benchmarks of the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) using a single node of 8 GPUs, run:

```shell
accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
accelerate launch --multi_gpu --num_processes=8 -m \
lighteval accelerate \
--model_args "pretrained=<model name>" \
--tasks examples/tasks/open_llm_leaderboard_tasks.txt \
--override_batch_size 1 \
@@ -176,7 +183,7 @@ accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
You can also use `lighteval` to evaluate models on CPU, although note that this will typically be very slow for large models. To do so, run:

```shell
python run_evals_accelerate.py \
lighteval accelerate \
--model_args="pretrained=<path to model on the hub>"\
--tasks <task parameters> \
--output_dir output_dir
@@ -211,7 +218,7 @@ Independently of the default tasks provided in `lighteval` that you will find in

For example, to run an extended task like `ifeval`, you can run:
```shell
python run_evals_accelerate.py \
lighteval accelerate \
--model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
--use_chat_template \ # optional, if you want to run the evaluation with the chat template
--tasks "extended|ifeval|0|0" \
@@ -221,7 +228,7 @@ python run_evals_accelerate.py \
To run a community or custom task, use the following command (note the `--custom_tasks` flag):

```shell
python run_evals_accelerate.py \
lighteval accelerate \
--model_args="pretrained=<path to model on the hub>"\
--tasks <task parameters> \
--custom_tasks <path to your custom or community task> \
@@ -231,7 +238,7 @@ python run_evals_accelerate.py \
For example, to launch `lighteval` on `arabic_mmlu:abstract_algebra` for `HuggingFaceH4/zephyr-7b-beta`, run:

```shell
python run_evals_accelerate.py \
lighteval accelerate \
--model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
--use_chat_template \ # optional, if you want to run the evaluation with the chat template
--tasks "community|arabic_mmlu:abstract_algebra|5|1" \
@@ -464,7 +471,7 @@ source <path_to_your_venv>/activate #or conda activate yourenv
cd <path_to_your_lighteval>/lighteval

export CUDA_LAUNCH_BLOCKING=1
srun accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py --model_args "pretrained=your model name" --tasks examples/tasks/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir=your output dir
srun accelerate launch --multi_gpu --num_processes=8 -m lighteval accelerate --model_args "pretrained=your model name" --tasks examples/tasks/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir=your output dir
```

## Releases
8 changes: 3 additions & 5 deletions community_tasks/_template.py
@@ -68,7 +68,7 @@ def prompt_fn(line, task_name: str = None):
evaluation_splits=[],
few_shots_split="",
few_shots_select="",
metric=[""],
metric=[], # select your metric in Metrics
)

# EVALS WITH SUBSET
@@ -91,7 +91,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=prompt_fn, # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
hf_repo="",
metric=[""],
metric=[custom_metric], # select your metric in Metrics or use your custom_metric
hf_avail_splits=[],
evaluation_splits=[],
few_shots_split="",
@@ -111,16 +111,14 @@ def __init__(

# CUSTOM METRIC IF NEEDED
custom_metric = SampleLevelMetric(
metric="my_custom_metric_name",
metric_name="my_custom_metric_name",
higher_is_better=True,
category=MetricCategory.IGNORED,
use_case=MetricUseCase.NONE,
sample_level_fn=lambda x: x, # how to compute score for one sample
corpus_level_fn=np.mean, # aggregation
)

extend_enum(Metrics, "my_custom_metric_name", custom_metric)

# MODULE LOGIC
# You should not need to touch this
# Convert to dict for lighteval
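Taken together, the template changes above give the following flow for a custom metric: build it with the corrected `metric_name=` keyword, register it on the `Metrics` enum, then reference the object from your task config. The sketch below is a minimal illustration only; the import paths are assumptions based on what `community_tasks/_template.py` imports and may need adjusting to your checkout.

```python
import numpy as np
from aenum import extend_enum  # assumption: metrics are registered via aenum, as in the template

# Assumed import paths; confirm against community_tasks/_template.py in your checkout.
from lighteval.metrics.metrics import Metrics
from lighteval.metrics.utils import MetricCategory, MetricUseCase, SampleLevelMetric

# Build the metric. Note the corrected keyword: `metric_name=`, not `metric=`.
custom_metric = SampleLevelMetric(
    metric_name="my_custom_metric_name",
    higher_is_better=True,
    category=MetricCategory.IGNORED,
    use_case=MetricUseCase.NONE,
    sample_level_fn=lambda x: x,  # score a single sample
    corpus_level_fn=np.mean,      # aggregate the per-sample scores
)

# Register it so it behaves like any built-in member of Metrics.
extend_enum(Metrics, "my_custom_metric_name", custom_metric)

# A task config can then select it directly, e.g. metric=[custom_metric].
```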
3 changes: 2 additions & 1 deletion community_tasks/aimo_evals.py
@@ -25,6 +25,7 @@
Task to evaluate LLMs on the training set of the Kaggle AIMO competition: https://www.kaggle.com/competitions/ai-mathematical-olympiad-prize
"""

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

@@ -48,7 +49,7 @@ def aimo_prompt(line, task_name: str = None):
evaluation_splits=["train"],
few_shots_split="train",
few_shots_select="sequential",
metric=["quasi_exact_match_math"],
metric=[Metrics.quasi_exact_match_math],
generation_size=2048,
stop_sequence=None,
)
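The edit above shows the pattern repeated across the community task files in this commit: `metric` entries move from bare strings to members of the `Metrics` enum. A minimal, hedged sketch of the before and after:

```python
from lighteval.metrics.metrics import Metrics

# Before this commit, metrics were referenced by name:
#   metric=["quasi_exact_match_math"]
# Afterwards, task configs reference the enum member directly, so a misspelled
# metric name raises an AttributeError as soon as the module is imported.
metric = [Metrics.quasi_exact_match_math]
```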
31 changes: 16 additions & 15 deletions community_tasks/arabic_evals.py
Expand Up @@ -29,6 +29,7 @@
import random
import re

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
@@ -86,7 +87,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=mmlu_arabic,
hf_repo="OALL/Arabic_MMLU",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "dev"],
evaluation_splits=["test"],
few_shots_split="dev",
@@ -143,7 +144,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=acva,
hf_repo="OALL/ACVA",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -195,7 +196,7 @@ def arabic_exams(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -245,7 +246,7 @@ def __init__(
hf_subset=hf_subset,
prompt_function=alghafa_prompt,
hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
hf_avail_splits=["test", "validation"],
evaluation_splits=["test"],
few_shots_split="validation",
@@ -273,7 +274,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -290,7 +291,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -307,7 +308,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -324,7 +325,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -341,7 +342,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -358,7 +359,7 @@ def __init__(
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -400,7 +401,7 @@ def boolq_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -436,7 +437,7 @@ def copa_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -481,7 +482,7 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -519,7 +520,7 @@ def toxigen_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
@@ -571,7 +572,7 @@ def sciq_prompt_arabic(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="validation",
few_shots_select="sequential",
metric=["loglikelihood_acc_norm"],
metric=[Metrics.loglikelihood_acc_norm],
trust_dataset=True,
version=0,
)
9 changes: 5 additions & 4 deletions community_tasks/german_rag_evals.py
@@ -30,6 +30,7 @@
See: https://huggingface.co/datasets/deutsche-telekom/Ger-RAG-eval
"""

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc

@@ -161,7 +162,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -178,7 +179,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -196,7 +197,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

@@ -213,7 +214,7 @@ def prompt_fn_context_question_match(line, task_name: str = None):
evaluation_splits=["test"],
few_shots_split="test",
few_shots_select="sequential",
metric=["loglikelihood_acc"],
metric=[Metrics.loglikelihood_acc],
version=1,
)

2 changes: 1 addition & 1 deletion examples/model_configs/endpoint_model.yaml
@@ -16,7 +16,7 @@ model:
endpoint_type: "protected"
namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
env_vars:
env_vars:
null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
generation:
add_special_tokens: true
2 changes: 1 addition & 1 deletion examples/model_configs/tgi_model.yaml
@@ -3,4 +3,4 @@ model:
instance:
inference_server_address: ""
inference_server_auth: null
model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory
model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory