Skip to content

Commit

Permalink
Pr sadra (#393)
Browse files Browse the repository at this point in the history

---------

Co-authored-by: Sadra Barikbin <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
  • Loading branch information
3 people authored and Hynek Kydlicek committed Nov 26, 2024
1 parent dba45cc commit ad0ba3a
Show file tree
Hide file tree
Showing 10 changed files with 55 additions and 73 deletions.
9 changes: 0 additions & 9 deletions community_tasks/arabic_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def mmlu_arabic(line, task_name: str = None):
choices=LETTER_INDICES_AR[:4],
gold_index=gold_ix,
instruction=instruction,
target_for_fewshot_sorting=LETTER_INDICES_AR[gold_ix],
)


Expand Down Expand Up @@ -181,7 +180,6 @@ def arabic_exams(line, task_name: str = None):
choices=LETTER_INDICES_AR[:4],
gold_index=answer_index,
instruction=instruction,
target_for_fewshot_sorting=choices[answer_index],
)


Expand Down Expand Up @@ -231,7 +229,6 @@ def alghafa_prompt(line, task_name: str = None):
choices=choices,
gold_index=answer_index,
instruction=instruction,
target_for_fewshot_sorting=choices[answer_index],
)


Expand Down Expand Up @@ -371,7 +368,6 @@ def __init__(
def boolq_prompt_arabic(line, task_name: str = None):
question = line["question"]
passage = line["passage"]
answer = "نعم" if line["answer"] else "لا"
instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا"
query = f"""{instruction}
المقطع :
Expand All @@ -387,7 +383,6 @@ def boolq_prompt_arabic(line, task_name: str = None):
choices=["نعم", "لا"],
gold_index=0 if line["answer"] else 1,
instruction=instruction,
target_for_fewshot_sorting=answer,
)


Expand Down Expand Up @@ -423,7 +418,6 @@ def copa_prompt_arabic(line, task_name: str = None):
choices=choices,
gold_index=answer,
instruction="",
target_for_fewshot_sorting=choices[answer],
)


Expand Down Expand Up @@ -468,7 +462,6 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
choices=endings,
gold_index=answer_index,
instruction=instruction,
target_for_fewshot_sorting=endings[answer_index],
)


Expand Down Expand Up @@ -506,7 +499,6 @@ def toxigen_prompt_arabic(line, task_name: str = None):
choices=["لا", "نعم"],
gold_index=label,
instruction=instruction,
target_for_fewshot_sorting="نعم" if label == 1 else "لا",
)


Expand Down Expand Up @@ -558,7 +550,6 @@ def sciq_prompt_arabic(line, task_name: str = None):
choices=choices,
gold_index=answer_index,
instruction=instruction,
target_for_fewshot_sorting=choices[answer_index],
)


Expand Down
6 changes: 0 additions & 6 deletions community_tasks/serbian_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,6 @@ def serbian_eval_prompt(line: dict, task_name: Optional[str] = None) -> Doc:
- choices (list of str): The list of available answer choices.
- gold_index (int): The index of the correct answer.
- instruction (str): The instruction shown to the user in Serbian.
- target_for_fewshot_sorting (Union[str, list of str]): The correct answer, either as a
string (for regular tasks) or a list of strings (for MMLU tasks).
"""

question = line["query"]
Expand All @@ -226,16 +224,12 @@ def serbian_eval_prompt(line: dict, task_name: Optional[str] = None) -> Doc:

query += "\n\nKrajnji odgovor:"

# Finalize target_for_fewshot_sorting as we handle mmlu task group as string
target_for_fewshot_sorting = [choices[gold_index]] if task_name and "mmlu" in task_name else choices[gold_index]

return Doc(
task_name=task_name,
query=query,
choices=choices,
gold_index=gold_index,
instruction=instruction,
target_for_fewshot_sorting=target_for_fewshot_sorting,
)


Expand Down
22 changes: 22 additions & 0 deletions examples/model_configs/test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
model:
type: "endpoint"
base_params:
endpoint_name: "smollm-360m-instruct-v0-2-q8-lvy" # needs to be lower case without special characters
model: HuggingFaceTB/SmolLM-360M-Instruct
revision: "main"
dtype: "default" # can be any of "awq", "eetq", "gptq", "4bit" or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
reuse_existing: true # if true, ignore all params in instance, and don't delete the endpoint after evaluation
instance:
accelerator: "gpu"
region: "eu-west-1"
vendor: "aws"
instance_size: "medium"
instance_type: "g5.2xlarge"
framework: "pytorch"
endpoint_type: "protected"
namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
env_vars:
null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
generation:
add_special_tokens: true
1 change: 0 additions & 1 deletion examples/nanotron/custom_evaluation_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,6 @@ def mmlu_harness(line, task_name: str = None):
task_name=task_name,
query=prompt,
choices=[" A", " B", " C", " D"],
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
gold_index=gold_ix,
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
)
Expand Down
4 changes: 2 additions & 2 deletions examples/nanotron/custom_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def mmlu_signs(line, topic):
return {
"query": prompt,
"choices": [" +", " *", " =", " #"] if is_few_shots else ["+", "*", "=", "#"],
"target_for_fewshot_sorting": [" +", " *", " =", " #"][gold_ix],
"fewshot_sorting_class": [" +", " *", " =", " #"][gold_ix],
"gold_index": gold_ix,
"instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
}
Expand All @@ -58,7 +58,7 @@ def mmlu_numbers(line, topic):
return {
"query": prompt,
"choices": [" 1", " 2", " 3", " 4"] if is_few_shots else ["1", "2", "3", "4"],
"target_for_fewshot_sorting": [" 1", " 2", " 3", " 4"][gold_ix],
"fewshot_sorting_class": [" 1", " 2", " 3", " 4"][gold_ix],
"gold_index": gold_ix,
"instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
}
Expand Down
20 changes: 3 additions & 17 deletions src/lighteval/tasks/default_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ def bbh_harness(line, task_name: str = None):
query=query,
choices=choices,
gold_index=correct_index,
target_for_fewshot_sorting=choices,
instruction=line.get("task_prefix", None),
)

Expand All @@ -196,7 +195,6 @@ def bbh_lighteval(line, task_name: str = None):
query=query,
choices=LETTER_INDICES[: len(line["choices"])],
gold_index=line["target_idx"],
target_for_fewshot_sorting=LETTER_INDICES[: len(line["choices"])],
instruction=line.get("task_prefix", None),
)

Expand All @@ -207,7 +205,6 @@ def bbh(line, instruction, choices, task_name: str = None):
query=f"{instruction}Q: {line['input']}\nA:",
choices=choices,
gold_index=choices.index(line["target"]),
target_for_fewshot_sorting=[f" {c}" for c in choices],
instruction=instruction,
)

Expand Down Expand Up @@ -799,7 +796,6 @@ def hellaswag_generative(line, task_name: str = None):
choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]],
gold_index=gold_ix, # -1 for test,
instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
target_for_fewshot_sorting=line["endings"][gold_ix] if gold_ix > -1 else "",
)


Expand Down Expand Up @@ -1352,7 +1348,6 @@ def mmlu(line, topic, task_name: str = None):
choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
gold_index=gold_ix,
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
)


Expand All @@ -1373,7 +1368,6 @@ def custom_mmlu_thom(line, task_name: str = None):
choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
gold_index=gold_ix,
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
)


Expand Down Expand Up @@ -1613,15 +1607,13 @@ def mmlu_harness(line, task_name: str = None):
query += "Answer:"

gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
"__few_shots" in line and line["__few_shots"] is True # We are adding few shots

return Doc(
task_name=task_name,
query=query,
choices=[" A", " B", " C", " D"],
gold_index=gold_ix,
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
)


Expand All @@ -1638,8 +1630,8 @@ def mmlu_helm(line, task_name: str = None):
query=query,
choices=[" A", " B", " C", " D"],
gold_index=gold_ix,
fewshot_sorting_class=line["choices"][gold_ix],
instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
target_for_fewshot_sorting=line["choices"][gold_ix], # specific to HELM evals
)


Expand Down Expand Up @@ -1816,7 +1808,6 @@ def openbookqa_helm(line, task_name: str = None):
choices=["A", "B", "C", "D", "E"],
gold_index=gold_ix,
instruction="The following are multiple choice questions (with answers) about common sense.\n",
target_for_fewshot_sorting=line["choices"]["text"][gold_ix], # specific to HELM evals
)


Expand All @@ -1837,14 +1828,13 @@ def piqa_helm(line, task_name: str = None):
query += "Answer: "

gold_ix = int(line["label"])

is_few_shots = line.get("__few_shots", False)
return Doc(
task_name=task_name,
query=query,
choices=["A", "B"],
choices=["A", "B"] if not is_few_shots else [line["sol1"], line["sol2"]],
gold_index=gold_ix,
instruction="The following are multiple choice questions (with answers) about common sense.\n",
target_for_fewshot_sorting=[line["sol1"], line["sol2"]][gold_ix],
)


Expand Down Expand Up @@ -1877,13 +1867,11 @@ def pubmed_qa_helm(line, task_name: str = None):
)
query += f"\n\nQuestion: {line['question']}\nAnswer: "
gold_ix = ["yes", "no", "maybe"].index(line["final_decision"])

return Doc(
task_name=task_name,
query=query,
choices=["A", "B", "C"],
gold_index=gold_ix,
target_for_fewshot_sorting=["yes", "no", "maybe"][gold_ix],
)


Expand Down Expand Up @@ -2263,13 +2251,11 @@ def truthful_qa_helm(line, task_name: str = None):
query = f"Question: {line['question']}\n"
query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
query += "Answer:"

return Doc(
task_name=task_name,
query=query,
choices=LETTER_INDICES[: len(line["choices"])],
gold_index=line["gold_index"],
target_for_fewshot_sorting=line["choices"][line["gold_index"]],
)


Expand Down
15 changes: 0 additions & 15 deletions src/lighteval/tasks/lighteval_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,21 +340,6 @@ def eval_docs(self) -> list[Doc]:
self._docs = self.remove_duplicate_docs(self._docs)
return self._docs

def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
"""
Returns the target of the given document.
Args:
formatted_doc (Doc): Formatted document.
few_shot (bool, optional): Whether the document is used for few
shot examples. Defaults to False.
Returns:
str: Target of the document, which is the correct answer for a document.
"""
# likely we mostly need one example not all
return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]

def construct_requests(
self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
) -> Dict[RequestType, List[Request]]:
Expand Down
35 changes: 23 additions & 12 deletions src/lighteval/tasks/prompt_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,20 +65,33 @@ def doc_to_text(doc: Doc, return_instructions: bool = False) -> Union[str, Tuple
)

@staticmethod
def doc_to_target(formatted_doc: Doc, few_shot: bool = False) -> str:
def doc_to_target(formatted_doc: Doc) -> str:
"""
Returns the target of the given document.
Args:
formatted_doc (Doc): Formatted document.
few_shot (bool, optional): Whether the document is used for few
shot examples. Defaults to False.
Returns:
str: Target of the document, which is the correct answer for a document.
"""
# likely we mostly need one example not all
return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
return as_list(formatted_doc.get_golds())[0]

@staticmethod
def doc_to_fewshot_sorting_class(formatted_doc: Doc) -> str:
"""
Returns the class used to balance few-shot sample selection for the given document.
In some cases, when selecting few-shot samples, we want to use specific document classes
which need to be specified separately from the target.
For example, a document where the gold is a json might want to use only one of the keys of
the json to define sorting classes in few shot samples. Else we take the gold.
Args:
formatted_doc (Doc): Formatted document.
Returns:
str: Class of the document: its `fewshot_sorting_class` if set (truthy), otherwise the
gold target returned by `doc_to_target`.
"""
return formatted_doc.fewshot_sorting_class or PromptManager.doc_to_target(formatted_doc)

def add_context_to_doc(
self,
Expand Down Expand Up @@ -255,9 +268,7 @@ def get_examples(
class FewShotSelectionMethod:
sorting: str # sorting method for the overall few shot pool (balanced, random, sequential)
with_sampling: bool # samples item randomly from the few shot pool
fewshotpool_unique: (
bool
) # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set
fewshotpool_unique: bool # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set


class FewShotSelection(Enum):
Expand Down Expand Up @@ -356,16 +367,16 @@ def _init_fewshot_sampling_balanced(
):
fewshotpool = self.task.fewshot_docs()

# rnd = random.Random(variance_seed)
random.seed(variance_seed)

# Build up balanced selection based on labels
# Sort by counts of labels
# Build up balanced selection based on fewshot_sorting_class
# (or the gold target, if the class is undefined)
label_to_instances = defaultdict(list)
for instance in fewshotpool:
target = PromptManager.doc_to_target(instance, few_shot=True)
target = PromptManager.doc_to_fewshot_sorting_class(instance)
label_to_instances[target].append(instance)

# Sort by counts of class labels
counts_to_labels = defaultdict(list)
for label, instances in sorted(label_to_instances.items()):
counts_to_labels[len(instances)].append(label)
Expand Down
12 changes: 3 additions & 9 deletions src/lighteval/tasks/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ class Doc:

# For few-shot
instruction: Optional[str] = ""
target_for_fewshot_sorting: Optional[str] = None # will probably have to be removed in the future
fewshot_sorting_class: Optional[str] = None # class to use to select balanced few-shot samples

# Filled when parsing and adding the few-shot context
ctx: Optional[str] = ""
Expand All @@ -194,18 +194,12 @@ def __post_init__(self):
if self.instruction is None:
self.instruction = ""

def get_golds(self, few_shot: bool = False):
def get_golds(self):
"""Return gold targets extracted from the target dict"""
gold_indices = as_list(self.gold_index)
if few_shot and self.target_for_fewshot_sorting is not None:
choices = self.target_for_fewshot_sorting
if isinstance(choices, str): # correct choice is already selected
return choices
else:
choices = self.choices
golds = []
for gold_ix in gold_indices:
golds.extend(as_list(choices[gold_ix]))
golds.extend(as_list(self.choices[gold_ix]))
return golds

def __repr__(self):
Expand Down
Loading

0 comments on commit ad0ba3a

Please sign in to comment.