Make evaluator invariant of input request type order #215

Merged
src/lighteval/evaluator.py (11 changes: 5 additions & 6 deletions)

```diff
@@ -64,7 +64,7 @@ def evaluate(  # noqa: C901
     :return
         Dictionary of results
     """
-    # A request output tupe is a Tuple where the first element is the index of
+    # A request output tuple is a Tuple where the first element is the index of
     # the request for one document of one task i.e.
     # task: "arc_easy", doc: "0"# request: "0" -> request_index = 0,
     # We can have multiple requests per doc for multi choice tasks for example.
@@ -75,8 +75,11 @@ def evaluate(  # noqa: C901
     )
    example_id_response_dict: dict[TaskExampleId, list[RequestIndexModelResponseTuple]] = collections.defaultdict(list)

-    for request_type, requests in requests_dict.items():
+    for request_type in RequestType:
+        if request_type not in requests_dict:
+            continue
         hlog(f"Running {request_type} requests")
+        requests = requests_dict[request_type]
         # These are all the request type from the request factory at the moment
         if request_type == RequestType.LOGLIKELIHOOD:
             full_resps = lm.loglikelihood(requests, override_bs=override_bs)
@@ -99,10 +102,6 @@ def evaluate(  # noqa: C901

     # ===== unpack results and sort back in order and return control to Task =====
     for task_example_id, prediction_list in example_id_response_dict.items():
-        # ===== Unpack the request =====
-        prediction_list.sort(
-            key=lambda x: x.request_index
-        )  # When we use Loglikelihood for several tokens we have all the options here
```
@sadra-barikbin (Contributor, Author), Jul 5, 2024:

This sort seems unnecessary, as the lm responses are already reordered back to their original order inside the lm's methods. I removed it in this PR because, for example, when the responses for a document are [LoglikelihoodReturn(index=0), LoglikelihoodReturn(index=1), GreedyUntilReturn(index=0)] (which happens now because RequestType.LOGLIKELIHOOD comes before RequestType.GREEDY_UNTIL in RequestType), this statement wrongly reorders them to [LoglikelihoodReturn(index=0), GreedyUntilReturn(index=0), LoglikelihoodReturn(index=1)]. If you think we should keep it, we could add the request type to the sort key.

Member:

The list sorted here will only contain requests of the same type. It is used for loglikelihood requests: for example, one loglikelihood request with 4 choices gives us 4 different requests. We sort them here so that we keep the same ordering as in the task doc.

@sadra-barikbin (Contributor, Author):

But looking at the outer for loop, we can see that all responses of all types associated with a single task and a single example go into this list.

Member:

Oh yeah, I forgot that we could have both greedy and multichoice returns for one task. This could be an issue when computing results. Did you check that removing the sorting indeed fixes this issue?

@sadra-barikbin (Contributor, Author):

Yes, I did. It can be verified with the example I put in #193.
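As an aside, here is a minimal, runnable sketch of the reordering issue described in this thread, using hypothetical stand-in classes rather than lighteval's actual return types:

```python
from dataclasses import dataclass


@dataclass
class LoglikelihoodReturn:
    # Hypothetical stand-in for lighteval's loglikelihood response type.
    index: int


@dataclass
class GreedyUntilReturn:
    # Hypothetical stand-in for lighteval's greedy-until response type.
    index: int


# Responses for one document, already grouped by request type:
predictions = [LoglikelihoodReturn(0), LoglikelihoodReturn(1), GreedyUntilReturn(0)]

# Sorting by index alone interleaves the two types:
print(sorted(predictions, key=lambda r: r.index))
# -> [LoglikelihoodReturn(index=0), GreedyUntilReturn(index=0), LoglikelihoodReturn(index=1)]

# Adding the request type to the key keeps each type's responses grouped,
# which is the alternative fix suggested above:
print(sorted(predictions, key=lambda r: (type(r).__name__, r.index)))
# -> [GreedyUntilReturn(index=0), LoglikelihoodReturn(index=0), LoglikelihoodReturn(index=1)]
```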

```diff
         model_responses = [x.model_response for x in prediction_list]
         cur_task_name = task_example_id.task_name.rsplit("|", 1)[0]
```
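For context on the main loop change above, a small sketch with a toy enum standing in for lighteval's RequestType: iterating over the Enum follows definition order, so processing no longer depends on how the request factory happened to populate the dict.

```python
from enum import Enum, auto


class ToyRequestType(Enum):
    # Toy enum standing in for lighteval's RequestType; member order is what matters.
    LOGLIKELIHOOD = auto()
    GREEDY_UNTIL = auto()


# The request factory may insert entries in any order; dicts keep insertion order.
requests_dict = {
    ToyRequestType.GREEDY_UNTIL: ["req_a"],
    ToyRequestType.LOGLIKELIHOOD: ["req_b", "req_c"],
}

# Iterating over the dict follows insertion order (GREEDY_UNTIL first here):
print([rt.name for rt in requests_dict])  # ['GREEDY_UNTIL', 'LOGLIKELIHOOD']

# Iterating over the Enum follows definition order, independent of the dict:
for request_type in ToyRequestType:
    if request_type not in requests_dict:
        continue  # skip request types with no requests, as in the patched loop
    print(request_type.name, requests_dict[request_type])
# LOGLIKELIHOOD ['req_b', 'req_c']
# GREEDY_UNTIL ['req_a']
```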
src/lighteval/metrics/__init__.py (8 changes: 3 additions & 5 deletions)

```diff
@@ -116,24 +116,22 @@ def apply_generative_metric(

 def apply_multichoice_metric(results: list[ModelReturn], formatted_doc: Doc, metrics: list[Metric]):
     outputs = {}
-    if len(formatted_doc.choices) != len(results):
-        raise ValueError("Length of results is not equal to the length of the choices")
+    mc_results = results[: len(formatted_doc.choices)]
     if len(formatted_doc.choices) <= 1:
         raise ValueError(
             "You can't use a multi choice metric with only one choice. Use `acc_golds_likelihood` instead."
         )

     # Todo: make better system with return_bool_score instead of taking first element
-    choices_logprob = [results[i].result[0] for i in range(len(formatted_doc.choices))]  # sum(
+    choices_logprob = [mc_results[i].result[0] for i in range(len(formatted_doc.choices))]  # sum(
     gold_ixs = as_list(formatted_doc.gold_index)

     for metric in metrics:
         if metric.category == MetricCategory.MULTICHOICE:
             outputs.update(
                 metric.compute(choices_logprob=choices_logprob, gold_ixs=gold_ixs, formatted_doc=formatted_doc)
             )

-    return results, outputs
+    return results[len(formatted_doc.choices) :], outputs


 def apply_multichoice_metric_one_token(results: list[ModelReturn], formatted_doc: Doc, metrics: list[Metric]):
```
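A rough sketch of the consume-and-return-remainder pattern this diff introduces, with simplified dict-based results instead of lighteval's ModelReturn objects: the multichoice metric reads only the first len(choices) results and returns the rest, so responses meant for other metric categories are passed along untouched.

```python
# Simplified illustration; the real code works on ModelReturn objects, not dicts.
def apply_multichoice_metric_sketch(results: list[dict], choices: list[str]) -> tuple[list[dict], list[float]]:
    # Take only the responses that belong to the multichoice (loglikelihood) requests...
    mc_results = results[: len(choices)]
    choices_logprob = [r["logprob"] for r in mc_results]
    # ...and return the remainder so the next apply_* call consumes its own slice.
    return results[len(choices):], choices_logprob


results = [
    {"logprob": -1.2},             # loglikelihood of choice A
    {"logprob": -0.4},             # loglikelihood of choice B
    {"text": "generated answer"},  # greedy-until response for the same doc
]
remaining, logprobs = apply_multichoice_metric_sketch(results, choices=["A", "B"])
print(logprobs)   # [-1.2, -0.4]
print(remaining)  # [{'text': 'generated answer'}]
```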
src/lighteval/tasks/lighteval_task.py (22 changes: 11 additions & 11 deletions)

```diff
@@ -539,6 +539,16 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic
                 results=results, formatted_doc=formatted_doc, metrics=self.metrics
             )
             outputs.update(cur_outputs)
+        if self.has_metric_category[MetricCategory.MULTICHOICE]:
+            results, cur_outputs = apply_multichoice_metric(
+                results=results, formatted_doc=formatted_doc, metrics=self.metrics
+            )
+            outputs.update(cur_outputs)
+        if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]:
+            results, cur_outputs = apply_multichoice_metric_one_token(
+                results=results, formatted_doc=formatted_doc, metrics=self.metrics
+            )
+            outputs.update(cur_outputs)
         if self.has_metric_category[MetricCategory.PERPLEXITY]:
             results, cur_outputs = apply_perplexity_metric(
                 results=results, formatted_doc=formatted_doc, metrics=self.metrics
@@ -557,16 +567,6 @@ def process_results(self, formatted_doc: Doc, results: list[ModelReturn]) -> dic
                 max_num_samples=max(self.num_samples),
             )
             outputs.update(cur_outputs)
-        if self.has_metric_category[MetricCategory.MULTICHOICE]:
-            results, cur_outputs = apply_multichoice_metric(
-                results=results, formatted_doc=formatted_doc, metrics=self.metrics
-            )
-            outputs.update(cur_outputs)
-        if self.has_metric_category[MetricCategory.MULTICHOICE_ONE_TOKEN]:
-            results, cur_outputs = apply_multichoice_metric_one_token(
-                results=results, formatted_doc=formatted_doc, metrics=self.metrics
-            )
-            outputs.update(cur_outputs)
         if (
             self.has_metric_category[MetricCategory.LLM_AS_JUDGE_MULTI_TURN]
             or self.has_metric_category[MetricCategory.LLM_AS_JUDGE]
@@ -643,7 +643,7 @@ def create_requests_from_tasks(  # noqa: C901
 ) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]:
     """
     Takes a task dict and a fewshot dict and returns a dict of requests, a dict
-    of docs, and a dict of requests origins. The construction of prompts and
+    of docs, and a dict of requests origins. The construction of prompts and
     thus the managing of few shots is done here.

     Args:
```
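A schematic sketch, with invented consumer functions rather than lighteval's real apply_* helpers, of why the order of these blocks matters: each call strips its own responses from the front of results and hands the remainder on, so the calls must run in the same order in which a document's responses are laid out.

```python
# Invented consumers, for illustration only: each "metric" takes a fixed number
# of responses from the front of the list and passes the remainder on.
def consume(results: list[str], n: int, name: str) -> list[str]:
    taken, remaining = results[:n], results[n:]
    print(f"{name} got {taken}")
    return remaining


# Responses for one doc: the loglikelihood choices come first, then a generation.
results = ["loglik_choice_0", "loglik_choice_1", "greedy_generation"]

# Running the consumers in the same order as the responses gives each metric its
# own slice; swapping them would hand the generation to the multichoice metric.
results = consume(results, 2, "multichoice")  # multichoice got ['loglik_choice_0', 'loglik_choice_1']
results = consume(results, 1, "generative")   # generative got ['greedy_generation']
```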