From 51c06d660bb6273947bb9520999263c0666306bc Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Tue, 7 May 2024 22:41:15 +0330
Subject: [PATCH 01/11] Fix a tiny typo & a tiny bug

---
 src/lighteval/logging/evaluation_tracker.py | 2 +-
 src/lighteval/models/model_config.py        | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py
index 35a835bc1..f4bdf9566 100644
--- a/src/lighteval/logging/evaluation_tracker.py
+++ b/src/lighteval/logging/evaluation_tracker.py
@@ -511,7 +511,7 @@ def push_results_to_tensorboard(  # noqa: C901
         self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
     ):
         if not is_nanotron_available():
-            hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping")
+            hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping")
             return
         config: Config = self.general_config_logger.config
         lighteval_config = config.lighteval
diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py
index d62a85d3c..ee76f4524 100644
--- a/src/lighteval/models/model_config.py
+++ b/src/lighteval/models/model_config.py
@@ -85,9 +85,9 @@ class BaseModelConfig:
            If `None`, the default value will be set to `True` for seq2seq models (e.g. T5) and
             `False` for causal models.
         model_parallel (bool, optional, defaults to False):
-            True/False: force to uses or not the `accelerate` library to load a large
+            True/False: force to use or not the `accelerate` library to load a large
             model across multiple devices.
-            Default: None which correspond to comparing the number of process with
+            Default: None which corresponds to comparing the number of processes with
                 the number of GPUs. If it's smaller => model-parallelism, else not.
         dtype (Union[str, torch.dtype], optional, defaults to None):):
             Converts the model weights to `dtype`, if specified. Strings get
@@ -279,8 +279,8 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
 
     if config["type"] == "tgi":
         return TGIModelConfig(
-            inference_server_address=args["instance"]["inference_server_address"],
-            inference_server_auth=args["instance"]["inference_server_auth"],
+            inference_server_address=config["instance"]["inference_server_address"],
+            inference_server_auth=config["instance"]["inference_server_auth"],
         )
 
     if config["type"] == "endpoint":

From f91717e90f467f21cf22b5b1f341a61e28e55920 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Sun, 12 May 2024 23:47:47 +0330
Subject: [PATCH 02/11] Fix some other typos

---
 run_evals_accelerate.py               | 4 ++--
 src/lighteval/evaluator.py            | 2 +-
 src/lighteval/models/base_model.py    | 6 +++---
 src/lighteval/tasks/lighteval_task.py | 4 ++--
 src/lighteval/tasks/requests.py       | 4 +---
 5 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py
index a743cb496..99c62a402 100644
--- a/run_evals_accelerate.py
+++ b/run_evals_accelerate.py
@@ -64,13 +64,13 @@ def get_parser():
         "--custom_tasks",
         type=str,
         default=None,
-        help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
+        help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)",
     )
     group.add_argument(
         "--tasks",
         type=str,
         default=None,
-        help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5' or path to a texte file with a list of tasks",
+        help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5' or path to a text file with a list of tasks",
     )
     parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots")
     return parser
diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py
index e837b9225..883e5ef70 100644
--- a/src/lighteval/evaluator.py
+++ b/src/lighteval/evaluator.py
@@ -67,7 +67,7 @@ def evaluate(  # noqa: C901
     # A request output tupe is a Tuple where the first element is the index of
     # the request for one document of one task i.e.
     # task: "arc_easy", doc: "0"# request: "0" -> request_index = 0,
-    # We can have multiple request per doc for multi choice tasks for example.
+    # We can have multiple requests per doc for multi choice tasks for example.
 
     # all responses for each (task, doc)
     RequestIndexModelResponseTuple = collections.namedtuple(
diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py
index 3913fd80b..8b8bcfdf1 100644
--- a/src/lighteval/models/base_model.py
+++ b/src/lighteval/models/base_model.py
@@ -819,7 +819,7 @@ def _loglikelihood_tokens(
                     )
                     res.append(answer)
 
-                # Clean up GPUS
+                # Clean up GPUs
                 del model_output
                 del logits
                 del batched_inputs
@@ -852,7 +852,7 @@ def prepare_batch_logprob(
             hlog_warn("max_context is None, using max_length")
             max_context = self.max_length
 
-        # Each sample is concatenated and cut to lenght or padded to max_length
+        # Each sample is concatenated and cut to length or padded to max_length
         for orig_tokens in inputs:
             truncated.append(max(len(orig_tokens) - max_context, 0))
 
@@ -1030,7 +1030,7 @@ def _loglikelihood_single_token(
                     )
                     res.append(answer)
 
-                # Clean up GPUS
+                # Clean up GPUs
                 del out
                 del batch_probs
                 del batched_inputs
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index f5c7a1f9e..1699a25ea 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -25,7 +25,7 @@
 from dataclasses import dataclass
 from multiprocessing import Pool
 from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 from datasets import load_dataset
 
@@ -440,7 +440,7 @@ def get_request_type(self) -> list[RequestType]:  # noqa C901
 
     def construct_requests(
         self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
-    ) -> List[Request]:
+    ) -> Dict[RequestType: List[Request]]:
         """
         Constructs a list of requests from the task based on the given parameters.
 
diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py
index 283e6959f..9e312fe26 100644
--- a/src/lighteval/tasks/requests.py
+++ b/src/lighteval/tasks/requests.py
@@ -187,9 +187,7 @@ def get_golds(self, few_shot: bool = False):
             choices = self.choices
         golds = []
         for gold_ix in gold_indices:
-            local_golds = as_list(choices[gold_ix])
-            for local_gold in local_golds:
-                golds.append(local_gold)
+            golds.extend(as_list(choices[gold_ix]))
         return golds
 
     def __repr__(self):

From 0f9638e93a754263e65af503d857adc1a8a820a7 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Mon, 13 May 2024 00:00:27 +0330
Subject: [PATCH 03/11] Fix the last typo

---
 src/lighteval/tasks/registry.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index abaa17451..df5e4da6a 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -117,7 +117,7 @@ def get_task_dict(
 
         Args:
             task_name_list (List[str]): A list of task names.
-            custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
+            custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
             extended_tasks (Optional[str]): The path to the extended tasks group of submodules
 
         Returns:
@@ -159,7 +159,7 @@ def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleTy
     """Creates a custom task module to load tasks defined by the user in their own file.
 
     Args:
-        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
+        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
 
     Returns:
         ModuleType: The newly imported/created custom tasks modules
@@ -178,7 +178,7 @@ def get_custom_tasks(custom_tasks: Union[str, ModuleType]) -> Tuple[ModuleType,
     """Get all the custom tasks available from the given custom tasks file or module.
 
     Args:
-        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module it-self
+        custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
     """
     custom_tasks_module = create_custom_tasks_module(custom_tasks=custom_tasks)
     tasks_string = ""

From 60c671098c4d7c3e109424caed9eed24b36edb7d Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Wed, 15 May 2024 14:13:35 +0330
Subject: [PATCH 04/11] Fix my own typo :) and two other things

---
 pyproject.toml                        | 2 ++
 run_evals_accelerate.py               | 4 ++--
 src/lighteval/models/base_model.py    | 2 +-
 src/lighteval/tasks/lighteval_task.py | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a9fe4bc7a..15f6339f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,6 +74,8 @@ dependencies = [
     "sentencepiece>=0.1.99",
     "protobuf==3.20.*", # pinned for sentencepiece compat
     "pycountry",
+    # LLM judges
+    "openai",
 ]
 
 [project.optional-dependencies]
diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py
index 99c62a402..54f971bbe 100644
--- a/run_evals_accelerate.py
+++ b/run_evals_accelerate.py
@@ -50,7 +50,7 @@ def get_parser():
     parser.add_argument(
         "--public_run", default=False, action="store_true", help="Push results and details to a public repo"
     )
-    parser.add_argument("--cache_dir", type=str, default=CACHE_DIR)
+    parser.add_argument("--cache_dir", type=str, default=CACHE_DIR, help="Cache directory for downloaded datasets & model, defaults to `HF_HOME` environment variable")
     parser.add_argument(
         "--results_org",
         type=str,
@@ -70,7 +70,7 @@ def get_parser():
         "--tasks",
         type=str,
         default=None,
-        help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5' or path to a text file with a list of tasks",
+        help="Comma-separated ids of tasks, e.g. 'original|mmlu:abstract_algebra|5' or path to a text file with a list of tasks",
     )
     parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots")
     return parser
diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py
index 8b8bcfdf1..df7b3e92c 100644
--- a/src/lighteval/models/base_model.py
+++ b/src/lighteval/models/base_model.py
@@ -79,7 +79,7 @@ def __init__(
         self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
         self._tokenizer = self._create_auto_tokenizer(config, env_config)
 
-        # If model_parallel is not set we compare the number of process with the number of GPUs
+        # If model_parallel is not set we compare the number of processes with the number of GPUs
         self.model = self._create_auto_model(config, env_config)
         self.model.eval()
         torch.set_grad_enabled(False)
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 1699a25ea..5943b67a4 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -440,7 +440,7 @@ def get_request_type(self) -> list[RequestType]:  # noqa C901
 
     def construct_requests(
         self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
-    ) -> Dict[RequestType: List[Request]]:
+    ) -> Dict[RequestType, List[Request]]:
         """
         Constructs a list of requests from the task based on the given parameters.
 

From adb26a9366101c1f52e99ea508c8b9b33fd74a19 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Wed, 15 May 2024 14:24:54 +0330
Subject: [PATCH 05/11] Revert change in pyproject.toml

---
 pyproject.toml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 15f6339f3..a9fe4bc7a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,8 +74,6 @@ dependencies = [
     "sentencepiece>=0.1.99",
     "protobuf==3.20.*", # pinned for sentencepiece compat
     "pycountry",
-    # LLM judges
-    "openai",
 ]
 
 [project.optional-dependencies]

From 5ac8f40363d8ce18ca87f01a1d026fd3e3f65e3a Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Wed, 15 May 2024 15:12:58 +0330
Subject: [PATCH 06/11] Apply precommit

---
 run_evals_accelerate.py                | 7 ++++++-
 src/lighteval/models/model_loader.py   | 4 ++--
 src/lighteval/models/nanotron_model.py | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py
index 54f971bbe..f36603624 100644
--- a/run_evals_accelerate.py
+++ b/run_evals_accelerate.py
@@ -50,7 +50,12 @@ def get_parser():
     parser.add_argument(
         "--public_run", default=False, action="store_true", help="Push results and details to a public repo"
     )
-    parser.add_argument("--cache_dir", type=str, default=CACHE_DIR, help="Cache directory for downloaded datasets & model, defaults to `HF_HOME` environment variable")
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=CACHE_DIR,
+        help="Cache directory for downloaded datasets & model, defaults to `HF_HOME` environment variable",
+    )
     parser.add_argument(
         "--results_org",
         type=str,
diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py
index 3af5be263..ed372d5df 100644
--- a/src/lighteval/models/model_loader.py
+++ b/src/lighteval/models/model_loader.py
@@ -57,8 +57,8 @@ def load_model(  # noqa: C901
     config: Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig],
     env_config: EnvConfig,
 ) -> Tuple[Union[BaseModel, AdapterModel, DeltaModel, ModelClient], ModelInfo]:
-    """Will load either a model from an inference server or a model from a checkpoint. depending
-    on the arguments passed to the program.
+    """Will load either a model from an inference server or a model from a checkpoint, depending
+    on the config type.
 
     Args:
         args (Namespace): arguments passed to the program
diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py
index 977b2b198..efe207091 100644
--- a/src/lighteval/models/nanotron_model.py
+++ b/src/lighteval/models/nanotron_model.py
@@ -846,7 +846,7 @@ def _loglikelihood_single_token(
 
                     tq.desc = f"loglikelihood_single_token Subset {s} Node {dist.get_rank(self.parallel_context.world_pg)} - {human_format(tokens_per_sec)} tokens/s"
 
-                    # Clean up GPUS
+                    # Clean up GPUs
                     del out
                     del batch_probs
                     del batched_inputs
@@ -1083,7 +1083,7 @@ def _loglikelihood_tokens(
                     tokens_per_sec = batched_inputs.numel() / (elapsed_time_per_iteration_ms / 1000)
                     tq.desc = f"loglikelihood Subset {s} Node {dist.get_rank(self.parallel_context.world_pg)} - {human_format(tokens_per_sec)} tokens/s"
 
-                    # Clean up GPUS
+                    # Clean up GPUs
                     del out
                     del logits
                     del batched_inputs

From ff30e2e764d1d0bdf7540bdcb4153f39f10291ab Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Thu, 16 May 2024 10:03:06 +0330
Subject: [PATCH 07/11] Remove 'args.model_config' from create_model_config

---
 src/lighteval/models/model_config.py | 155 +++++++++++++--------------
 1 file changed, 76 insertions(+), 79 deletions(-)

diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py
index 7819268be..0d6200496 100644
--- a/src/lighteval/models/model_config.py
+++ b/src/lighteval/models/model_config.py
@@ -271,85 +271,82 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
 
         return BaseModelConfig(**args_dict)
 
-    if args.model_config:
-        config = args.model_config["model"]
-    else:
-        with open(args.model_config_path, "r") as f:
-            config = yaml.safe_load(f)["model"]
-
-    if config["type"] == "tgi":
-        return TGIModelConfig(
-            inference_server_address=config["instance"]["inference_server_address"],
-            inference_server_auth=config["instance"]["inference_server_auth"],
-        )
+    with open(args.model_config_path, "r") as f:
+        config = yaml.safe_load(f)["model"]
 
-    if config["type"] == "endpoint":
-        reuse_existing_endpoint = config["base_params"]["reuse_existing"]
-        complete_config_endpoint = all(
-            val not in [None, ""]
-            for key, val in config["instance"].items()
-            if key not in InferenceEndpointModelConfig.nullable_keys()
-        )
-        if reuse_existing_endpoint or complete_config_endpoint:
-            return InferenceEndpointModelConfig(
-                name=config["base_params"]["endpoint_name"].replace(".", "-").lower(),
-                repository=config["base_params"]["model"],
-                model_dtype=config["base_params"]["dtype"],
-                revision=config["base_params"]["revision"] or "main",
-                should_reuse_existing=reuse_existing_endpoint,
-                accelerator=config["instance"]["accelerator"],
-                region=config["instance"]["region"],
-                vendor=config["instance"]["vendor"],
-                instance_size=config["instance"]["instance_size"],
-                instance_type=config["instance"]["instance_type"],
-                namespace=config["instance"]["namespace"],
-            )
-        return InferenceModelConfig(model=config["base_params"]["endpoint_name"])
-
-    if config["type"] == "base":
-        # Tests on the multichoice space parameters
-        multichoice_continuations_start_space = config["generation"]["multichoice_continuations_start_space"]
-        no_multichoice_continuations_start_space = config["generation"]["no_multichoice_continuations_start_space"]
-        if not multichoice_continuations_start_space and not no_multichoice_continuations_start_space:
-            multichoice_continuations_start_space = None
-        if multichoice_continuations_start_space and no_multichoice_continuations_start_space:
-            raise ValueError(
-                "You cannot force both the multichoice continuations to start with a space and not to start with a space"
+        if config["type"] == "tgi":
+            return TGIModelConfig(
+                inference_server_address=config["instance"]["inference_server_address"],
+                inference_server_auth=config["instance"]["inference_server_auth"],
             )
 
-        # Creating optional quantization configuration
-        if config["base_params"]["dtype"] == "4bit":
-            quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
-        elif config["base_params"]["dtype"] == "8bit":
-            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-        else:
-            quantization_config = None
-
-        # We extract the model args
-        args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")}
-
-        # We store the relevant other args
-        args_dict["base_model"] = config["merged_weights"]["base_model"]
-        args_dict["dtype"] = config["base_params"]["dtype"]
-        args_dict["accelerator"] = accelerator
-        args_dict["quantization_config"] = quantization_config
-        args_dict["batch_size"] = args.override_batch_size
-        args_dict["multichoice_continuations_start_space"] = multichoice_continuations_start_space
-        args_dict["use_chat_template"] = args.use_chat_template
-
-        # Keeping only non null params
-        args_dict = {k: v for k, v in args_dict.items() if v is not None}
-
-        if config["merged_weights"]["delta_weights"]:
-            if config["merged_weights"]["base_model"] is None:
-                raise ValueError("You need to specify a base model when using delta weights")
-            return DeltaModelConfig(**args_dict)
-        if config["merged_weights"]["adapter_weights"]:
-            if config["merged_weights"]["base_model"] is None:
-                raise ValueError("You need to specify a base model when using adapter weights")
-            return AdapterModelConfig(**args_dict)
-        if config["merged_weights"]["base_model"] not in ["", None]:
-            raise ValueError("You can't specify a base model if you are not using delta/adapter weights")
-        return BaseModelConfig(**args_dict)
-
-    raise ValueError(f"Unknown model type in your model config file: {config['type']}")
+        if config["type"] == "endpoint":
+            reuse_existing_endpoint = config["base_params"]["reuse_existing"]
+            complete_config_endpoint = all(
+                val not in [None, ""]
+                for key, val in config["instance"].items()
+                if key not in InferenceEndpointModelConfig.nullable_keys()
+            )
+            if reuse_existing_endpoint or complete_config_endpoint:
+                return InferenceEndpointModelConfig(
+                    name=config["base_params"]["endpoint_name"].replace(".", "-").lower(),
+                    repository=config["base_params"]["model"],
+                    model_dtype=config["base_params"]["dtype"],
+                    revision=config["base_params"]["revision"] or "main",
+                    should_reuse_existing=reuse_existing_endpoint,
+                    accelerator=config["instance"]["accelerator"],
+                    region=config["instance"]["region"],
+                    vendor=config["instance"]["vendor"],
+                    instance_size=config["instance"]["instance_size"],
+                    instance_type=config["instance"]["instance_type"],
+                    namespace=config["instance"]["namespace"],
+                )
+            return InferenceModelConfig(model=config["base_params"]["endpoint_name"])
+
+        if config["type"] == "base":
+            # Tests on the multichoice space parameters
+            multichoice_continuations_start_space = config["generation"]["multichoice_continuations_start_space"]
+            no_multichoice_continuations_start_space = config["generation"]["no_multichoice_continuations_start_space"]
+            if not multichoice_continuations_start_space and not no_multichoice_continuations_start_space:
+                multichoice_continuations_start_space = None
+            if multichoice_continuations_start_space and no_multichoice_continuations_start_space:
+                raise ValueError(
+                    "You cannot force both the multichoice continuations to start with a space and not to start with a space"
+                )
+
+            # Creating optional quantization configuration
+            if config["base_params"]["dtype"] == "4bit":
+                quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+            elif config["base_params"]["dtype"] == "8bit":
+                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+            else:
+                quantization_config = None
+
+            # We extract the model args
+            args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")}
+
+            # We store the relevant other args
+            args_dict["base_model"] = config["merged_weights"]["base_model"]
+            args_dict["dtype"] = config["base_params"]["dtype"]
+            args_dict["accelerator"] = accelerator
+            args_dict["quantization_config"] = quantization_config
+            args_dict["batch_size"] = args.override_batch_size
+            args_dict["multichoice_continuations_start_space"] = multichoice_continuations_start_space
+            args_dict["use_chat_template"] = args.use_chat_template
+
+            # Keeping only non null params
+            args_dict = {k: v for k, v in args_dict.items() if v is not None}
+
+            if config["merged_weights"]["delta_weights"]:
+                if config["merged_weights"]["base_model"] is None:
+                    raise ValueError("You need to specify a base model when using delta weights")
+                return DeltaModelConfig(**args_dict)
+            if config["merged_weights"]["adapter_weights"]:
+                if config["merged_weights"]["base_model"] is None:
+                    raise ValueError("You need to specify a base model when using adapter weights")
+                return AdapterModelConfig(**args_dict)
+            if config["merged_weights"]["base_model"] not in ["", None]:
+                raise ValueError("You can't specify a base model if you are not using delta/adapter weights")
+            return BaseModelConfig(**args_dict)
+
+        raise ValueError(f"Unknown model type in your model config file: {config['type']}")

From fbc2920e106820b6fa3f8d429fa0a5218ce6e313 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Sat, 18 May 2024 09:48:59 +0330
Subject: [PATCH 08/11] Revert a mistake: dedent create_model_config branches out of the `with` block

---
 src/lighteval/models/model_config.py | 148 +++++++++++++--------------
 1 file changed, 74 insertions(+), 74 deletions(-)

diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py
index 0d6200496..47ad03970 100644
--- a/src/lighteval/models/model_config.py
+++ b/src/lighteval/models/model_config.py
@@ -274,79 +274,79 @@ def create_model_config(args: Namespace, accelerator: Union["Accelerator", None]
     with open(args.model_config_path, "r") as f:
         config = yaml.safe_load(f)["model"]
 
-        if config["type"] == "tgi":
-            return TGIModelConfig(
-                inference_server_address=config["instance"]["inference_server_address"],
-                inference_server_auth=config["instance"]["inference_server_auth"],
-            )
+    if config["type"] == "tgi":
+        return TGIModelConfig(
+            inference_server_address=config["instance"]["inference_server_address"],
+            inference_server_auth=config["instance"]["inference_server_auth"],
+        )
 
-        if config["type"] == "endpoint":
-            reuse_existing_endpoint = config["base_params"]["reuse_existing"]
-            complete_config_endpoint = all(
-                val not in [None, ""]
-                for key, val in config["instance"].items()
-                if key not in InferenceEndpointModelConfig.nullable_keys()
+    if config["type"] == "endpoint":
+        reuse_existing_endpoint = config["base_params"]["reuse_existing"]
+        complete_config_endpoint = all(
+            val not in [None, ""]
+            for key, val in config["instance"].items()
+            if key not in InferenceEndpointModelConfig.nullable_keys()
+        )
+        if reuse_existing_endpoint or complete_config_endpoint:
+            return InferenceEndpointModelConfig(
+                name=config["base_params"]["endpoint_name"].replace(".", "-").lower(),
+                repository=config["base_params"]["model"],
+                model_dtype=config["base_params"]["dtype"],
+                revision=config["base_params"]["revision"] or "main",
+                should_reuse_existing=reuse_existing_endpoint,
+                accelerator=config["instance"]["accelerator"],
+                region=config["instance"]["region"],
+                vendor=config["instance"]["vendor"],
+                instance_size=config["instance"]["instance_size"],
+                instance_type=config["instance"]["instance_type"],
+                namespace=config["instance"]["namespace"],
             )
-            if reuse_existing_endpoint or complete_config_endpoint:
-                return InferenceEndpointModelConfig(
-                    name=config["base_params"]["endpoint_name"].replace(".", "-").lower(),
-                    repository=config["base_params"]["model"],
-                    model_dtype=config["base_params"]["dtype"],
-                    revision=config["base_params"]["revision"] or "main",
-                    should_reuse_existing=reuse_existing_endpoint,
-                    accelerator=config["instance"]["accelerator"],
-                    region=config["instance"]["region"],
-                    vendor=config["instance"]["vendor"],
-                    instance_size=config["instance"]["instance_size"],
-                    instance_type=config["instance"]["instance_type"],
-                    namespace=config["instance"]["namespace"],
-                )
-            return InferenceModelConfig(model=config["base_params"]["endpoint_name"])
-
-        if config["type"] == "base":
-            # Tests on the multichoice space parameters
-            multichoice_continuations_start_space = config["generation"]["multichoice_continuations_start_space"]
-            no_multichoice_continuations_start_space = config["generation"]["no_multichoice_continuations_start_space"]
-            if not multichoice_continuations_start_space and not no_multichoice_continuations_start_space:
-                multichoice_continuations_start_space = None
-            if multichoice_continuations_start_space and no_multichoice_continuations_start_space:
-                raise ValueError(
-                    "You cannot force both the multichoice continuations to start with a space and not to start with a space"
-                )
-
-            # Creating optional quantization configuration
-            if config["base_params"]["dtype"] == "4bit":
-                quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
-            elif config["base_params"]["dtype"] == "8bit":
-                quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-            else:
-                quantization_config = None
-
-            # We extract the model args
-            args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")}
-
-            # We store the relevant other args
-            args_dict["base_model"] = config["merged_weights"]["base_model"]
-            args_dict["dtype"] = config["base_params"]["dtype"]
-            args_dict["accelerator"] = accelerator
-            args_dict["quantization_config"] = quantization_config
-            args_dict["batch_size"] = args.override_batch_size
-            args_dict["multichoice_continuations_start_space"] = multichoice_continuations_start_space
-            args_dict["use_chat_template"] = args.use_chat_template
-
-            # Keeping only non null params
-            args_dict = {k: v for k, v in args_dict.items() if v is not None}
-
-            if config["merged_weights"]["delta_weights"]:
-                if config["merged_weights"]["base_model"] is None:
-                    raise ValueError("You need to specify a base model when using delta weights")
-                return DeltaModelConfig(**args_dict)
-            if config["merged_weights"]["adapter_weights"]:
-                if config["merged_weights"]["base_model"] is None:
-                    raise ValueError("You need to specify a base model when using adapter weights")
-                return AdapterModelConfig(**args_dict)
-            if config["merged_weights"]["base_model"] not in ["", None]:
-                raise ValueError("You can't specify a base model if you are not using delta/adapter weights")
-            return BaseModelConfig(**args_dict)
-
-        raise ValueError(f"Unknown model type in your model config file: {config['type']}")
+        return InferenceModelConfig(model=config["base_params"]["endpoint_name"])
+
+    if config["type"] == "base":
+        # Tests on the multichoice space parameters
+        multichoice_continuations_start_space = config["generation"]["multichoice_continuations_start_space"]
+        no_multichoice_continuations_start_space = config["generation"]["no_multichoice_continuations_start_space"]
+        if not multichoice_continuations_start_space and not no_multichoice_continuations_start_space:
+            multichoice_continuations_start_space = None
+        if multichoice_continuations_start_space and no_multichoice_continuations_start_space:
+            raise ValueError(
+                "You cannot force both the multichoice continuations to start with a space and not to start with a space"
+            )
+
+        # Creating optional quantization configuration
+        if config["base_params"]["dtype"] == "4bit":
+            quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+        elif config["base_params"]["dtype"] == "8bit":
+            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+        else:
+            quantization_config = None
+
+        # We extract the model args
+        args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")}
+
+        # We store the relevant other args
+        args_dict["base_model"] = config["merged_weights"]["base_model"]
+        args_dict["dtype"] = config["base_params"]["dtype"]
+        args_dict["accelerator"] = accelerator
+        args_dict["quantization_config"] = quantization_config
+        args_dict["batch_size"] = args.override_batch_size
+        args_dict["multichoice_continuations_start_space"] = multichoice_continuations_start_space
+        args_dict["use_chat_template"] = args.use_chat_template
+
+        # Keeping only non null params
+        args_dict = {k: v for k, v in args_dict.items() if v is not None}
+
+        if config["merged_weights"]["delta_weights"]:
+            if config["merged_weights"]["base_model"] is None:
+                raise ValueError("You need to specify a base model when using delta weights")
+            return DeltaModelConfig(**args_dict)
+        if config["merged_weights"]["adapter_weights"]:
+            if config["merged_weights"]["base_model"] is None:
+                raise ValueError("You need to specify a base model when using adapter weights")
+            return AdapterModelConfig(**args_dict)
+        if config["merged_weights"]["base_model"] not in ["", None]:
+            raise ValueError("You can't specify a base model if you are not using delta/adapter weights")
+        return BaseModelConfig(**args_dict)
+
+    raise ValueError(f"Unknown model type in your model config file: {config['type']}")

From c61fa7272fd6794ca2bc72d544d13769bf6e0d02 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Wed, 22 May 2024 13:38:13 +0330
Subject: [PATCH 09/11] A few other improvements

---
 src/lighteval/metrics/imports/bert_scorer.py | 2 +-
 src/lighteval/metrics/metrics.py             | 5 +++--
 src/lighteval/metrics/metrics_sample.py      | 2 +-
 src/lighteval/tasks/requests.py              | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py
index dd8c0ee84..126282101 100644
--- a/src/lighteval/metrics/imports/bert_scorer.py
+++ b/src/lighteval/metrics/imports/bert_scorer.py
@@ -163,7 +163,7 @@ def greedy_cos_idf(
         - :param: `ref_masks` (torch.LongTensor): BxKxK, BERT attention mask for
                    reference sentences.
         - :param: `ref_idf` (torch.Tensor): BxK, idf score of each word
-                   piece in the reference setence
+                   piece in the reference sentence
         - :param: `hyp_embedding` (torch.Tensor):
                    embeddings of candidate sentences, BxKxd,
                    B: batch size, K: longest length, d: bert dimenison
diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index f7eaedba1..9ce0eb653 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -22,6 +22,7 @@
 
 import numpy as np
 from aenum import Enum
+from pkg_resources import resource_filename
 
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
 from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics
@@ -232,7 +233,7 @@ class Metrics(Enum):
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
             judge_model_name="gpt-3.5-turbo",
-            template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
+            template_path=resource_filename("lighteval", "tasks/extended/mt_bench/judge_prompts.jsonl"),
             multi_turn=True,
         ).compute,
         corpus_level_fn={
@@ -247,7 +248,7 @@ class Metrics(Enum):
         use_case=MetricUseCase.SUMMARIZATION,
         sample_level_fn=JudgeLLM(
             judge_model_name="gpt-3.5-turbo",
-            template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
+            template_path=resource_filename("lighteval", "tasks/extended/mt_bench/judge_prompts.jsonl"),
             multi_turn=False,
         ).compute,
         corpus_level_fn={
diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py
index 6210f13ed..5e1b2c0ab 100644
--- a/src/lighteval/metrics/metrics_sample.py
+++ b/src/lighteval/metrics/metrics_sample.py
@@ -648,7 +648,7 @@ def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[
         """
         Compute the score of a generative task using a llm as a judge.
         The generative task can be multiturn with 2 turns max, in that case, we
-        return scores for turn 1 and 2. Also returns user_prompt and judgment
+        return scores for turn 1 and 2. Also returns user_prompt and judgement
         which are ignored later by the aggregator.
         """
 
diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py
index 9e312fe26..6dd307868 100644
--- a/src/lighteval/tasks/requests.py
+++ b/src/lighteval/tasks/requests.py
@@ -143,7 +143,7 @@ class TaskExampleId(NamedTuple):
     Represents the identifier for an example in a task.
 
     Attributes:
-        task_name (str): The name of the task.
+        task_name (str): The name of the task in `name|num_fewshot` format.
         doc_id_seed (str): The document id with the seed used for few_shot appended at the end.
     """
 

From 2f5817e50c5a928e4195c834a01ed943dd4e829e Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Wed, 22 May 2024 16:59:20 +0330
Subject: [PATCH 10/11] Fix a typo in judge_prompts.jsonl

---
 src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl b/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl
index 4ec7524cb..a43ef34c1 100644
--- a/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl
+++ b/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl
@@ -4,5 +4,5 @@
 {"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"}
 {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
 {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
-{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
-{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
+{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Your evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
+{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Your evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}

From ee4b2550e0b2b265dc62b307eb3f872ad8fd42f7 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Thu, 4 Jul 2024 23:12:09 +0330
Subject: [PATCH 11/11] Remove pkg_resources

---
 src/lighteval/metrics/metrics.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index df66c70f0..262b20a09 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -24,7 +24,6 @@
 
 import numpy as np
 from aenum import Enum
-from pkg_resources import resource_filename
 
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
 from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics