
Commit

Merge branch 'main' into simplify_task_system
clefourrier authored Jul 9, 2024
2 parents 1a0efde + 70f7fc6 commit da016b7
Showing 17 changed files with 223 additions and 83 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -239,6 +239,17 @@ python run_evals_accelerate.py \
--output_dir "./evals"
```

### Using the dummy model
To debug or obtain random baseline scores for a given set of tasks, you can use the `dummy` model:
```shell
python run_evals_accelerate.py \
--model_args "dummy"\
--tasks <task parameters> \
--output_dir output_dir
```
This "model" randomly generates logprobs (for selection/accuracy tasks) and the string "random baseline" for generation tasks.
You can also select a specific seed for the random logprob values generated by the dummy model: `--model_args "dummy,seed=123"`.
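For instance, a run that fixes the seed so the random baseline is reproducible (same placeholders as in the invocation above) would look like:
```shell
python run_evals_accelerate.py \
    --model_args "dummy,seed=123" \
    --tasks <task parameters> \
    --output_dir output_dir
```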

## Deep thanks
`lighteval` was originally built on top of the great [Eleuther AI Harness](https://github.com/EleutherAI/lm-evaluation-harness) (we use the latter to power the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). We also took a lot of inspiration from the amazing [HELM](https://crfm.stanford.edu/helm/latest/), notably for metrics.

11 changes: 8 additions & 3 deletions run_evals_accelerate.py
@@ -51,7 +51,12 @@ def get_parser():
parser.add_argument(
"--public_run", default=False, action="store_true", help="Push results and details to a public repo"
)
parser.add_argument("--cache_dir", type=str, default=CACHE_DIR)
parser.add_argument(
"--cache_dir",
type=str,
default=CACHE_DIR,
help="Cache directory for downloaded datasets & model, defaults to `HF_HOME` environment variable",
)
parser.add_argument(
"--results_org",
type=str,
@@ -65,13 +70,13 @@ def get_parser():
"--custom_tasks",
type=str,
default=None,
help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)",
help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)",
)
group.add_argument(
"--tasks",
type=str,
default=None,
help="Id of a task, e.g. 'original|mmlu:abstract_algebra|5|0' or path to a texte file with a list of tasks",
help="Comma-separated ids of tasks, e.g. 'original|mmlu:abstract_algebra|5' or path to a text file with a list of tasks",
)
parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots")
return parser
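For illustration, a hypothetical invocation combining these arguments might look as follows; the task id is taken from the help text above, and the other values are placeholders rather than part of this commit:
```shell
python run_evals_accelerate.py \
    --model_args "dummy" \
    --tasks "original|mmlu:abstract_algebra|5" \
    --num_fewshot_seeds 1 \
    --output_dir "./evals"
```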
2 changes: 1 addition & 1 deletion src/lighteval/evaluator.py
@@ -67,7 +67,7 @@ def evaluate( # noqa: C901
# A request output tupe is a Tuple where the first element is the index of
# the request for one document of one task i.e.
# task: "arc_easy", doc: "0"# request: "0" -> request_index = 0,
# We can have multiple request per doc for multi choice tasks for example.
# We can have multiple requests per doc for multi choice tasks for example.

# all responses for each (task, doc)
RequestIndexModelResponseTuple = collections.namedtuple(
2 changes: 1 addition & 1 deletion src/lighteval/logging/evaluation_tracker.py
@@ -511,7 +511,7 @@ def push_results_to_tensorboard( # noqa: C901
self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail]
):
if not is_nanotron_available():
hlog_warn("You cannot push results to tensorboard with having nanotron installed. Skipping")
hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping")
return
config: Config = self.general_config_logger.config
lighteval_config = config.lighteval
2 changes: 1 addition & 1 deletion src/lighteval/metrics/imports/bert_scorer.py
@@ -163,7 +163,7 @@ def greedy_cos_idf(
- :param: `ref_masks` (torch.LongTensor): BxKxK, BERT attention mask for
reference sentences.
- :param: `ref_idf` (torch.Tensor): BxK, idf score of each word
piece in the reference setence
piece in the reference sentence
- :param: `hyp_embedding` (torch.Tensor):
embeddings of candidate sentences, BxKxd,
B: batch size, K: longest length, d: bert dimenison
4 changes: 2 additions & 2 deletions src/lighteval/metrics/judge_prompts.jsonl
@@ -4,5 +4,5 @@
{"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"}
{"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
{"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
{"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Your evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Your evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"}
2 changes: 1 addition & 1 deletion src/lighteval/metrics/metrics.py
@@ -249,7 +249,7 @@ class Metrics(Enum):
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="gpt-3.5-turbo",
template_path=os.path.join(os.path.dirname(__file__), "", "judge_prompts.jsonl"),
template_path=os.path.join(os.path.dirname(__file__), "judge_prompts.jsonl"),
multi_turn=False,
).compute,
corpus_level_fn={
4 changes: 2 additions & 2 deletions src/lighteval/metrics/metrics_sample.py
@@ -622,7 +622,7 @@ def edit_similarity(self, s1, s2):


class JudgeLLM:
available_models = ["gpt-3.5-turbo"]
available_models = ["gpt-3.5-turbo", "gpt-4o", "gpt-4-turbo", "gpt-4"]

def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
if judge_model_name not in self.available_models:
@@ -644,7 +644,7 @@ def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[
"""
Compute the score of a generative task using a llm as a judge.
The generative task can be multiturn with 2 turns max, in that case, we
return scores for turn 1 and 2. Also returns user_prompt and judgment
return scores for turn 1 and 2. Also returns user_prompt and judgement
which are ignored later by the aggregator.
"""

45 changes: 21 additions & 24 deletions src/lighteval/models/base_model.py
@@ -79,7 +79,7 @@ def __init__(
self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
self._tokenizer = self._create_auto_tokenizer(config, env_config)

# If model_parallel is not set we compare the number of process with the number of GPUs
# If model_parallel is not set we compare the number of processes with the number of GPUs
self.model = self._create_auto_model(config, env_config)
self.model.eval()
torch.set_grad_enabled(False)
@@ -530,27 +530,7 @@ def greedy_until(
returns_logits = batch[0].use_logits
num_samples = batch[0].num_samples

# The main question for this step is the following:
# Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk
# of loosing some meaning, or have some generations that are exceedingly short?
# The choice we go for here is to avoid truncating the prompt if we can, since it
# should have been managed by the prompt creator/few shot manager if requested by the user.
context = [c.context for c in batch]
smallest_context = min(len(c) for c in context)
biggest_context = max(len(c) for c in context)
if smallest_context > self.max_length:
hlog_warn(
f"The smallest context of your batch ({smallest_context}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in"
+ str({i.task_name for i in batch})
+ ". This is likely to lead to some errors." # noqa C401
)

if (
biggest_context > self.max_length
): # There will be truncation of at least one sample, maximum generation size will be one
max_new_tokens = 1
else: # We can't allow generation of more than max_length
max_new_tokens = min(self.max_length - biggest_context, max_new_tokens)

# See doc https://huggingface.co/docs/transformers/v4.38.2/en/pad_truncation#padding-and-truncation
# Will do left truncation and padding, as defined when creating the tokenizer
@@ -563,6 +543,23 @@
add_special_tokens=self.add_special_tokens,
).to(self.device)

# The main question for this step is the following:
# Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk
# of losing some meaning, or have some generations that are exceedingly short?
# The choice we go for here is to avoid truncating the prompt if we can, since it
# should have been managed by the prompt creator/few shot manager if requested by the user.
context_size = tokenized["input_ids"].shape[1]
if context_size > self.max_length:
hlog_warn(
f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in"
+ str({i.task_name for i in batch})
+ ". This is likely to lead to some errors." # noqa C401
)
# There will be truncation of at least one sample, maximum generation size will be one
max_new_tokens = 1
else: # We can't allow generation of more than max_length
max_new_tokens = min(self.max_length - context_size, max_new_tokens)

prepared_batch = Batch(
input_ids=tokenized["input_ids"],
input_lengths=[len(item == 1) for item in tokenized["attention_mask"]],
@@ -819,7 +816,7 @@ def _loglikelihood_tokens(
)
res.append(answer)

# Clean up GPUS
# Clean up GPUs
del model_output
del logits
del batched_inputs
@@ -852,7 +849,7 @@ def prepare_batch_logprob(
hlog_warn("max_context is None, using max_length")
max_context = self.max_length

# Each sample is concatenated and cut to lenght or padded to max_length
# Each sample is concatenated and cut to length or padded to max_length
for orig_tokens in inputs:
truncated.append(max(len(orig_tokens) - max_context, 0))

@@ -1030,7 +1027,7 @@ def _loglikelihood_single_token(
)
res.append(answer)

# Clean up GPUS
# Clean up GPUs
del out
del batch_probs
del batched_inputs
