diff --git a/src/lighteval/evaluator.py b/src/lighteval/evaluator.py
index 883e5ef70..331b5f386 100644
--- a/src/lighteval/evaluator.py
+++ b/src/lighteval/evaluator.py
@@ -64,7 +64,7 @@ def evaluate(  # noqa: C901
     :return
         Dictionary of results
     """
-    # A request output tupe is a Tuple where the first element is the index of
+    # A request output tuple is a Tuple where the first element is the index of
     # the request for one document of one task i.e.
     # task: "arc_easy", doc: "0"# request: "0" -> request_index = 0,
     # We can have multiple requests per doc for multi choice tasks for example.
@@ -75,8 +75,11 @@ def evaluate(  # noqa: C901
     )
     example_id_response_dict: dict[TaskExampleId, list[RequestIndexModelResponseTuple]] = collections.defaultdict(list)
 
-    for request_type, requests in requests_dict.items():
+    for request_type in RequestType:
+        if request_type not in requests_dict:
+            continue
         hlog(f"Running {request_type} requests")
+        requests = requests_dict[request_type]
         # These are all the request type from the request factory at the moment
         if request_type == RequestType.LOGLIKELIHOOD:
             full_resps = lm.loglikelihood(requests, override_bs=override_bs)
@@ -99,10 +102,6 @@ def evaluate(  # noqa: C901
 
     # ===== unpack results and sort back in order and return control to Task =====
     for task_example_id, prediction_list in example_id_response_dict.items():
-        # ===== Unpack the request =====
-        prediction_list.sort(
-            key=lambda x: x.request_index
-        )  # When we use Loglikelihood for several tokens we have all the options here
         model_responses = [x.model_response for x in prediction_list]
         cur_task_name = task_example_id.task_name.rsplit("|", 1)[0]
 
diff --git a/src/lighteval/metrics/__init__.py b/src/lighteval/metrics/__init__.py
index 5b1e8b7cd..1b105d74a 100644
--- a/src/lighteval/metrics/__init__.py
+++ b/src/lighteval/metrics/__init__.py
@@ -116,15 +116,14 @@ def apply_generative_metric(
 
 def apply_multichoice_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[Metric]):
     outputs = {}
-    if len(formatted_doc.choices) != len(results):
-        raise ValueError("Length of results is not equal to the length of the choices")
+    mc_results = results[: len(formatted_doc.choices)]
     if len(formatted_doc.choices) <= 1:
         raise ValueError(
             "You can't use a multi choice metric with only one choice. Use `acc_golds_likelihood` instead."
         )
 
     # Todo: make better system with return_bool_score instead of taking first element
-    choices_logprob = [results[i].result[0] for i in range(len(formatted_doc.choices))]  # sum(
+    choices_logprob = [mc_results[i].result[0] for i in range(len(formatted_doc.choices))]  # sum(
     gold_ixs = as_list(formatted_doc.gold_index)
 
     for metric in metrics:
@@ -132,8 +131,7 @@ def apply_multichoice_metric(results: list[ModelReturn], formatted_doc: Doc, met
             outputs.update(
                 metric.compute(choices_logprob=choices_logprob, gold_ixs=gold_ixs, formatted_doc=formatted_doc)
             )
-
-    return results, outputs
+    return results[len(formatted_doc.choices) :], outputs
 
 
 def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc: Doc, metrics: list[Metric]):
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 6595571fe..07251d696 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -539,6 +539,16 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic
                 results=results, formatted_doc=formatted_doc, metrics=self.metrics
             )
             outputs.update(cur_outputs)
+        if self.has_metric_category[MetricCategory.MULTICHOICE]:
+            results, cur_outputs = apply_multichoice_metric(
+                results=results, formatted_doc=formatted_doc, metrics=self.metrics
+            )
+            outputs.update(cur_outputs)
+        if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]:
+            results, cur_outputs = apply_multichoice_metric_one_token(
+                results=results, formatted_doc=formatted_doc, metrics=self.metrics
+            )
+            outputs.update(cur_outputs)
         if self.has_metric_category[MetricCategory.PERPLEXITY]:
             results, cur_outputs = apply_perplexity_metric(
                 results=results, formatted_doc=formatted_doc, metrics=self.metrics
@@ -557,16 +567,6 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic
                 max_num_samples=max(self.num_samples),
             )
             outputs.update(cur_outputs)
-        if self.has_metric_category[MetricCategory.MULTICHOICE]:
-            results, cur_outputs = apply_multichoice_metric(
-                results=results, formatted_doc=formatted_doc, metrics=self.metrics
-            )
-            outputs.update(cur_outputs)
-        if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]:
-            results, cur_outputs = apply_multichoice_metric_one_token(
-                results=results, formatted_doc=formatted_doc, metrics=self.metrics
-            )
-            outputs.update(cur_outputs)
         if (
             self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]
             or self.has_metric_category[MetricCategory.LLM_AS_JUDGE]
@@ -643,7 +643,7 @@ def create_requests_from_tasks(  # noqa: C901
 ) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]:
     """
     Takes a task dict and a fewshot dict and returns a dict of requests, a dict
-    of docs, and a dict of requests origins. The construction of prompts and
+    of docs, and a dict of requests origins. The construction of prompts and
     thus the managing of few shots is done here.
 
     Args: