Make evaluator invariant of input request type order #215

Merged
src/lighteval/evaluator.py (11 changes: 5 additions & 6 deletions)

```diff
@@ -64,7 +64,7 @@ def evaluate(  # noqa: C901
     :return
         Dictionary of results
     """
-    # A request output tupe is a Tuple where the first element is the index of
+    # A request output tuple is a Tuple where the first element is the index of
     # the request for one document of one task i.e.
     # task: "arc_easy", doc: "0"# request: "0" -> request_index = 0,
     # We can have multiple requests per doc for multi choice tasks for example.
@@ -75,8 +75,11 @@ def evaluate(  # noqa: C901
     )
    example_id_response_dict: dict[TaskExampleId, list[RequestIndexModelResponseTuple]] = collections.defaultdict(list)

-    for request_type, requests in requests_dict.items():
+    for request_type in RequestType:
+        if request_type not in requests_dict:
+            continue
         hlog(f"Running {request_type} requests")
+        requests = requests_dict[request_type]
         # These are all the request type from the request factory at the moment
         if request_type == RequestType.LOGLIKELIHOOD:
             full_resps = lm.loglikelihood(requests, override_bs=override_bs)
@@ -99,10 +102,6 @@ def evaluate(  # noqa: C901

     # ===== unpack results and sort back in order and return control to Task =====
     for task_example_id, prediction_list in example_id_response_dict.items():
-        # ===== Unpack the request =====
-        prediction_list.sort(
-            key=lambda x: x.request_index
-        )  # When we use Loglikelihood for several tokens we have all the options here
```
@sadra-barikbin (Contributor, Author), Jul 5, 2024:

This sort seems unnecessary, as the lm responses are already reordered back to their original order inside the lm's methods. I removed it in this PR because, for example, when the responses for a document are [LoglikelihoodReturn(index=0), LoglikelihoodReturn(index=1), GreedyUntilReturn(index=0)] (which happens now because RequestType.LOGLIKELIHOOD comes before RequestType.GREEDY_UNTIL in RequestType), this statement wrongly reorders them to [LoglikelihoodReturn(index=0), GreedyUntilReturn(index=0), LoglikelihoodReturn(index=1)]. If you think we should keep it, we could add the request type to the sort key.

Member:

The list sorted here will only contain requests of the same type. It is used for loglikelihood requests: for example, one loglikelihood request with 4 choices gives us 4 different requests. We sort them here so that we keep the same ordering as in the task doc.

@sadra-barikbin (Contributor, Author):

But looking at the outer for loop, we can see that all responses of all types associated with a single task and a single example go into this list.

Member:

Oh yeah, I forgot that we could have both greedy and multichoice returns for one task. This could be an issue when computing results. Did you check that removing the sorting indeed fixes this issue?

@sadra-barikbin (Contributor, Author):

Yes, I did. It can be verified with the example I put in #193.
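As an aside, here is a minimal, runnable sketch of the reordering issue described in this thread, using hypothetical stand-in classes rather than lighteval's actual return types:

```python
from dataclasses import dataclass


@dataclass
class LoglikelihoodReturn:
    # Hypothetical stand-in for lighteval's loglikelihood response type.
    index: int


@dataclass
class GreedyUntilReturn:
    # Hypothetical stand-in for lighteval's greedy-until response type.
    index: int


# Responses for one document, already grouped by request type:
predictions = [LoglikelihoodReturn(0), LoglikelihoodReturn(1), GreedyUntilReturn(0)]

# Sorting by index alone interleaves the two types:
print(sorted(predictions, key=lambda r: r.index))
# -> [LoglikelihoodReturn(index=0), GreedyUntilReturn(index=0), LoglikelihoodReturn(index=1)]

# Adding the request type to the key keeps each type's responses grouped,
# which is the alternative fix suggested above:
print(sorted(predictions, key=lambda r: (type(r).__name__, r.index)))
# -> [GreedyUntilReturn(index=0), LoglikelihoodReturn(index=0), LoglikelihoodReturn(index=1)]
```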

```diff
         model_responses = [x.model_response for x in prediction_list]
         cur_task_name = task_example_id.task_name.rsplit("|", 1)[0]
```
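For context on the main loop change above, a small sketch with a toy enum standing in for lighteval's RequestType: iterating over the Enum follows definition order, so processing no longer depends on how the request factory happened to populate the dict.

```python
from enum import Enum, auto


class ToyRequestType(Enum):
    # Toy enum standing in for lighteval's RequestType; member order is what matters.
    LOGLIKELIHOOD = auto()
    GREEDY_UNTIL = auto()


# The request factory may insert entries in any order; dicts keep insertion order.
requests_dict = {
    ToyRequestType.GREEDY_UNTIL: ["req_a"],
    ToyRequestType.LOGLIKELIHOOD: ["req_b", "req_c"],
}

# Iterating over the dict follows insertion order (GREEDY_UNTIL first here):
print([rt.name for rt in requests_dict])  # ['GREEDY_UNTIL', 'LOGLIKELIHOOD']

# Iterating over the Enum follows definition order, independent of the dict:
for request_type in ToyRequestType:
    if request_type not in requests_dict:
        continue  # skip request types with no requests, as in the patched loop
    print(request_type.name, requests_dict[request_type])
# LOGLIKELIHOOD ['req_b', 'req_c']
# GREEDY_UNTIL ['req_a']
```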
src/lighteval/metrics/__init__.py (8 changes: 3 additions & 5 deletions)

```diff
@@ -116,24 +116,22 @@ def apply_generative_metric(

 def apply_multichoice_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[Metric]):
     outputs = {}
-    if len(formatted_doc.choices) != len(results):
-        raise ValueError("Length of results is not equal to the length of the choices")
+    mc_results = results[: len(formatted_doc.choices)]
     if len(formatted_doc.choices) <= 1:
         raise ValueError(
             "You can't use a multi choice metric with only one choice. Use `acc_golds_likelihood` instead."
         )

     # Todo: make better system with return_bool_score instead of taking first element
-    choices_logprob = [results[i].result[0] for i in range(len(formatted_doc.choices))]  # sum(
+    choices_logprob = [mc_results[i].result[0] for i in range(len(formatted_doc.choices))]  # sum(
     gold_ixs = as_list(formatted_doc.gold_index)

     for metric in metrics:
         if metric.category == MetricCategory.MULTICHOICE:
             outputs.update(
                 metric.compute(choices_logprob=choices_logprob, gold_ixs=gold_ixs, formatted_doc=formatted_doc)
             )

-    return results, outputs
+    return results[len(formatted_doc.choices) :], outputs


 def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc: Doc, metrics: list[Metric]):
```
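A rough sketch of the consume-and-return-remainder pattern this diff introduces, with simplified dict-based results instead of lighteval's ModelReturn objects: the multichoice metric reads only the first len(choices) results and returns the rest, so responses meant for other metric categories are passed along untouched.

```python
# Simplified illustration; the real code works on ModelReturn objects, not dicts.
def apply_multichoice_metric_sketch(results: list[dict], choices: list[str]) -> tuple[list[dict], list[float]]:
    # Take only the responses that belong to the multichoice (loglikelihood) requests...
    mc_results = results[: len(choices)]
    choices_logprob = [r["logprob"] for r in mc_results]
    # ...and return the remainder so the next apply_* call consumes its own slice.
    return results[len(choices):], choices_logprob


results = [
    {"logprob": -1.2},             # loglikelihood of choice A
    {"logprob": -0.4},             # loglikelihood of choice B
    {"text": "generated answer"},  # greedy-until response for the same doc
]
remaining, logprobs = apply_multichoice_metric_sketch(results, choices=["A", "B"])
print(logprobs)   # [-1.2, -0.4]
print(remaining)  # [{'text': 'generated answer'}]
```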
src/lighteval/tasks/lighteval_task.py (22 changes: 11 additions & 11 deletions)

```diff
@@ -539,6 +539,16 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic
                 results=results, formatted_doc=formatted_doc, metrics=self.metrics
             )
             outputs.update(cur_outputs)
+        if self.has_metric_category[MetricCategory.MULTICHOICE]:
+            results, cur_outputs = apply_multichoice_metric(
+                results=results, formatted_doc=formatted_doc, metrics=self.metrics
+            )
+            outputs.update(cur_outputs)
+        if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]:
+            results, cur_outputs = apply_multichoice_metric_one_token(
+                results=results, formatted_doc=formatted_doc, metrics=self.metrics
+            )
+            outputs.update(cur_outputs)
         if self.has_metric_category[MetricCategory.PERPLEXITY]:
             results, cur_outputs = apply_perplexity_metric(
                 results=results, formatted_doc=formatted_doc, metrics=self.metrics
@@ -557,16 +567,6 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic
                 max_num_samples=max(self.num_samples),
             )
             outputs.update(cur_outputs)
-        if self.has_metric_category[MetricCategory.MULTICHOICE]:
-            results, cur_outputs = apply_multichoice_metric(
-                results=results, formatted_doc=formatted_doc, metrics=self.metrics
-            )
-            outputs.update(cur_outputs)
-        if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]:
-            results, cur_outputs = apply_multichoice_metric_one_token(
-                results=results, formatted_doc=formatted_doc, metrics=self.metrics
-            )
-            outputs.update(cur_outputs)
         if (
             self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]
             or self.has_metric_category[MetricCategory.LLM_AS_JUDGE]
@@ -643,7 +643,7 @@ def create_requests_from_tasks(  # noqa: C901
 ) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]:
     """
     Takes a task dict and a fewshot dict and returns a dict of requests, a dict
-    of docs, and a dict of requests origins. The construction of prompts and
+    of docs, and a dict of requests origins. The construction of prompts and
     thus the managing of few shots is done here.

     Args:
```
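A schematic sketch, with invented consumer functions rather than lighteval's real apply_* helpers, of why the order of these blocks matters: each call strips its own responses from the front of results and hands the remainder on, so the calls must run in the same order in which a document's responses are laid out.

```python
# Invented consumers, for illustration only: each "metric" takes a fixed number
# of responses from the front of the list and passes the remainder on.
def consume(results: list[str], n: int, name: str) -> list[str]:
    taken, remaining = results[:n], results[n:]
    print(f"{name} got {taken}")
    return remaining


# Responses for one doc: the loglikelihood choices come first, then a generation.
results = ["loglik_choice_0", "loglik_choice_1", "greedy_generation"]

# Running the consumers in the same order as the responses gives each metric its
# own slice; swapping them would hand the generation to the multichoice metric.
results = consume(results, 2, "multichoice")  # multichoice got ['loglik_choice_0', 'loglik_choice_1']
results = consume(results, 1, "generative")   # generative got ['greedy_generation']
```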