From 3aaec227c9ae9924b423f88a6c46fdb81249d215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:42:07 +0200 Subject: [PATCH] Use only dataclasses for task init (#212) * replaced json tasks by python tasks --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- community_tasks/_template.py | 6 +- community_tasks/aimo_evals.py | 6 +- community_tasks/arabic_evals.py | 7 +- community_tasks/german_rag_evals.py | 6 +- examples/nanotron/custom_evaluation_tasks.py | 2 +- src/lighteval/tasks/default_tasks.py | 22665 ++++++++++++++++ src/lighteval/tasks/extended/ifeval/main.py | 4 +- src/lighteval/tasks/extended/mt_bench/main.py | 4 +- .../tasks/extended/tiny_benchmarks/main.py | 6 +- src/lighteval/tasks/lighteval_task.py | 22 +- src/lighteval/tasks/registry.py | 26 +- src/lighteval/tasks/tasks_table.jsonl | 1235 - src/lighteval/utils.py | 2 +- 13 files changed, 22692 insertions(+), 1299 deletions(-) create mode 100644 src/lighteval/tasks/default_tasks.py delete mode 100644 src/lighteval/tasks/tasks_table.jsonl diff --git a/community_tasks/_template.py b/community_tasks/_template.py index 6b52f9f4d..fe0d8e1d4 100644 --- a/community_tasks/_template.py +++ b/community_tasks/_template.py @@ -106,7 +106,7 @@ def prompt_fn(line, task_name: str = None): # STORE YOUR EVALS SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS] -_TASKS = SUBSET_TASKS + [task] +TASKS_TABLE = SUBSET_TASKS + [task] # CUSTOM METRIC IF NEEDED @@ -124,8 +124,6 @@ def prompt_fn(line, task_name: str = None): # MODULE LOGIC # You should not need to touch this # Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] - if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) + print(t.name for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git a/community_tasks/aimo_evals.py 
b/community_tasks/aimo_evals.py index 556ae6632..5262a013a 100644 --- a/community_tasks/aimo_evals.py +++ b/community_tasks/aimo_evals.py @@ -55,14 +55,12 @@ def aimo_prompt(line, task_name: str = None): # STORE YOUR EVALS -_TASKS = [task] +TASKS_TABLE = [task] # MODULE LOGIC # You should not need to touch this -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) + print(t.name for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git a/community_tasks/arabic_evals.py b/community_tasks/arabic_evals.py index 9e65bade0..495c95d9e 100644 --- a/community_tasks/arabic_evals.py +++ b/community_tasks/arabic_evals.py @@ -577,7 +577,7 @@ def sciq_prompt_arabic(line, task_name: str = None): ) -_TASKS = ( +TASKS_TABLE = ( ARABIC_MMLU_TASKS + ACVA_TASKS + ALGHAFA_TASKS @@ -595,9 +595,6 @@ def sciq_prompt_arabic(line, task_name: str = None): + [sciq_ar_task] ) -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] - if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) + print(t.name for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git a/community_tasks/german_rag_evals.py b/community_tasks/german_rag_evals.py index fdda9d7ab..0d2c76c02 100644 --- a/community_tasks/german_rag_evals.py +++ b/community_tasks/german_rag_evals.py @@ -219,14 +219,12 @@ def prompt_fn_context_question_match(line, task_name: str = None): # STORE YOUR EVALS -_TASKS = [task1, task2, task3, task4] +TASKS_TABLE = [task1, task2, task3, task4] # MODULE LOGIC # You should not need to touch this -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] if __name__ == "__main__": - print(t["name"] for t in TASKS_TABLE) + print(t.name for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py index cdca83854..62aa8dc40 100644 --- 
a/examples/nanotron/custom_evaluation_tasks.py +++ b/examples/nanotron/custom_evaluation_tasks.py @@ -679,7 +679,7 @@ def agi_eval_prompt_no_letters(line, task_name: str = None): EARLY_SIGNAL_TASKS = ",".join([t[1] for t in COMMON_SENSE_REASONING_STRING] + [t[1] for t in MMLU_STRING]) # Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] +TASKS_TABLE = _TASKS # You can have a few pre-organised groups of tasks TASKS_GROUPS = { "all": ",".join(t[1] for t in _TASKS_STRINGS), diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py new file mode 100644 index 000000000..dbfdfe09a --- /dev/null +++ b/src/lighteval/tasks/default_tasks.py @@ -0,0 +1,22665 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from lighteval.tasks.lighteval_task import LightevalTaskConfig + + +abstract_narrative_understanding_bigbench = LightevalTaskConfig( + name="abstract_narrative_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="abstract_narrative_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_aqua_rat_lighteval = LightevalTaskConfig( + name="agieval:aqua-rat", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-aqua-rat", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_biology_lighteval = LightevalTaskConfig( + name="agieval:gaokao-biology", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-biology", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_chemistry_lighteval = LightevalTaskConfig( + name="agieval:gaokao-chemistry", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-chemistry", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + 
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_chinese_lighteval = LightevalTaskConfig( + name="agieval:gaokao-chinese", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-chinese", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_english_lighteval = LightevalTaskConfig( + name="agieval:gaokao-english", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-english", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_geography_lighteval = LightevalTaskConfig( + name="agieval:gaokao-geography", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-geography", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_history_lighteval = LightevalTaskConfig( + name="agieval:gaokao-history", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-history", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_mathqa_lighteval = LightevalTaskConfig( + name="agieval:gaokao-mathqa", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-mathqa", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_gaokao_physics_lighteval = LightevalTaskConfig( + name="agieval:gaokao-physics", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-gaokao-physics", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_logiqa_en_lighteval = LightevalTaskConfig( + name="agieval:logiqa-en", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-logiqa-en", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_logiqa_zh_lighteval = LightevalTaskConfig( + name="agieval:logiqa-zh", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-logiqa-zh", + hf_subset="default", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_lsat_ar_lighteval = LightevalTaskConfig( + name="agieval:lsat-ar", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-lsat-ar", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_lsat_lr_lighteval = LightevalTaskConfig( + name="agieval:lsat-lr", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-lsat-lr", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_lsat_rc_lighteval = LightevalTaskConfig( + name="agieval:lsat-rc", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-lsat-rc", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_sat_en_lighteval = LightevalTaskConfig( + name="agieval:sat-en", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-sat-en", + hf_subset="default", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_sat_en_without_passage_lighteval = LightevalTaskConfig( + name="agieval:sat-en-without-passage", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-sat-en-without-passage", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +agieval_sat_math_lighteval = LightevalTaskConfig( + name="agieval:sat-math", + suite=["lighteval"], + prompt_function="agieval", + hf_repo="dmayhem93/agieval-sat-math", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=None, + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anachronisms_bigbench = LightevalTaskConfig( + name="anachronisms", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="anachronisms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +analogical_similarity_bigbench = LightevalTaskConfig( + name="analogical_similarity", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + 
hf_subset="analogical_similarity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +analytic_entailment_bigbench = LightevalTaskConfig( + name="analytic_entailment", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="analytic_entailment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anli_lighteval = LightevalTaskConfig( + name="anli", + suite=["lighteval", "anli"], + prompt_function="anli", + hf_repo="anli", + hf_subset="plain_text", + hf_avail_splits=[ + "train_r1", + "dev_r1", + "train_r2", + "dev_r2", + "train_r3", + "dev_r3", + "test_r1", + "test_r2", + "test_r3", + ], + evaluation_splits=["test_r1", "test_r2", "test_r3"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anli_r1_lighteval = LightevalTaskConfig( + name="anli:r1", + suite=["lighteval", "anli"], + prompt_function="anli", + hf_repo="anli", + hf_subset="plain_text", + hf_avail_splits=["train_r1", "dev_r1", "test_r1"], + evaluation_splits=["test_r1"], + few_shots_split="train_r1", + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anli_r2_lighteval = LightevalTaskConfig( + name="anli:r2", + suite=["lighteval", "anli"], + prompt_function="anli", + 
hf_repo="anli", + hf_subset="plain_text", + hf_avail_splits=["train_r2", "dev_r2", "test_r2"], + evaluation_splits=["test_r2"], + few_shots_split="train_r2", + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +anli_r3_lighteval = LightevalTaskConfig( + name="anli:r3", + suite=["lighteval", "anli"], + prompt_function="anli", + hf_repo="anli", + hf_subset="plain_text", + hf_avail_splits=["train_r3", "dev_r3", "test_r3"], + evaluation_splits=["test_r3"], + few_shots_split="train_r3", + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_c_letters_original = LightevalTaskConfig( + name="arc:c:letters", + suite=["original", "arc"], + prompt_function="arc_with_options_letters_predict", + hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_c_options_original = LightevalTaskConfig( + name="arc:c:options", + suite=["original", "arc"], + prompt_function="arc_with_options", + hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_c_simple_original = LightevalTaskConfig( + name="arc:c:simple", + suite=["original", "arc"], + prompt_function="arc", + 
hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_challenge_leaderboard = LightevalTaskConfig( + name="arc:challenge", + suite=["leaderboard", "arc"], + prompt_function="arc", + hf_repo="ai2_arc", + hf_subset="ARC-Challenge", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arc_easy_lighteval = LightevalTaskConfig( + name="arc:easy", + suite=["lighteval", "arc"], + prompt_function="arc", + hf_repo="ai2_arc", + hf_subset="ARC-Easy", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_1dc_lighteval = LightevalTaskConfig( + name="arithmetic:1dc", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_1dc", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_2da_lighteval = LightevalTaskConfig( + name="arithmetic:2da", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + 
hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_2dm_lighteval = LightevalTaskConfig( + name="arithmetic:2dm", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2dm", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_2ds_lighteval = LightevalTaskConfig( + name="arithmetic:2ds", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_2ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_3da_lighteval = LightevalTaskConfig( + name="arithmetic:3da", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_3da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_3ds_lighteval = LightevalTaskConfig( + name="arithmetic:3ds", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_3ds", + 
hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_4da_lighteval = LightevalTaskConfig( + name="arithmetic:4da", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_4da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_4ds_lighteval = LightevalTaskConfig( + name="arithmetic:4ds", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_4ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_5da_lighteval = LightevalTaskConfig( + name="arithmetic:5da", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_5da", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_5ds_lighteval = LightevalTaskConfig( + name="arithmetic:5ds", + suite=["lighteval", "arithmetic"], + prompt_function="arithmetic", + hf_repo="EleutherAI/arithmetic", + hf_subset="arithmetic_5ds", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], 
+ few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +arithmetic_bb_bigbench = LightevalTaskConfig( + name="arithmetic_bb", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="arithmetic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ascii_word_recognition_bigbench = LightevalTaskConfig( + name="ascii_word_recognition", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="ascii_word_recognition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +asdiv_lighteval = LightevalTaskConfig( + name="asdiv", + suite=["lighteval"], + prompt_function="asdiv", + hf_repo="EleutherAI/asdiv", + hf_subset="asdiv", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +authorship_verification_bigbench = LightevalTaskConfig( + name="authorship_verification", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="authorship_verification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +auto_categorization_bigbench = LightevalTaskConfig( + name="auto_categorization", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="auto_categorization", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +auto_debugging_bigbench_lite = LightevalTaskConfig( + name="auto_debugging", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_and_after_query", + hf_repo="bigbench", + hf_subset="auto_debugging", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["perfect_exact_match"], + stop_sequence=None, + output_regex="[^\\.\\?\\!\\;\\n]+", + trust_dataset=True, + version=0, +) +babi_qa_helm = LightevalTaskConfig( + name="babi_qa", + suite=["helm"], + prompt_function="babi_qa", + hf_repo="facebook/babi_qa", + hf_subset="en-valid-qa1", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_causal_judgment_lighteval = LightevalTaskConfig( + name="bigbench:causal_judgment", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="causal_judgement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_date_understanding_lighteval = LightevalTaskConfig( + name="bigbench:date_understanding", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="date_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_disambiguation_qa_lighteval = LightevalTaskConfig( + name="bigbench:disambiguation_qa", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="disambiguation_qa", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_geometric_shapes_lighteval = LightevalTaskConfig( + name="bigbench:geometric_shapes", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="geometric_shapes", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_five_objects_lighteval = LightevalTaskConfig( + name="bigbench:logical_deduction_five_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_five_objects", + 
hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_seven_objects_lighteval = LightevalTaskConfig( + name="bigbench:logical_deduction_seven_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_three_objects_lighteval = LightevalTaskConfig( + name="bigbench:logical_deduction_three_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_movie_recommendation_lighteval = LightevalTaskConfig( + name="bigbench:movie_recommendation", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="movie_recommendation", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_navigate_lighteval = LightevalTaskConfig( + name="bigbench:navigate", + suite=["lighteval"], + 
prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="navigate", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_reasoning_about_colored_objects_lighteval = LightevalTaskConfig( + name="bigbench:reasoning_about_colored_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_ruin_names_lighteval = LightevalTaskConfig( + name="bigbench:ruin_names", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="ruin_names", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_salient_translation_error_detection_lighteval = LightevalTaskConfig( + name="bigbench:salient_translation_error_detection", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_snarks_lighteval = 
LightevalTaskConfig( + name="bigbench:snarks", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="snarks", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_sports_understanding_lighteval = LightevalTaskConfig( + name="bigbench:sports_understanding", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="sports_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_temporal_sequences_lighteval = LightevalTaskConfig( + name="bigbench:temporal_sequences", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="temporal_sequences", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_five_objects_lighteval = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_five_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_five_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_seven_objects_lighteval = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_seven_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_three_objects_lighteval = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_three_objects", + suite=["lighteval"], + prompt_function="bbh_lighteval", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_causal_judgment_harness = LightevalTaskConfig( + name="bigbench:causal_judgment", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="causal_judgement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_date_understanding_harness = LightevalTaskConfig( + name="bigbench:date_understanding", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="date_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_disambiguation_qa_harness = LightevalTaskConfig( + name="bigbench:disambiguation_qa", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="disambiguation_qa", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_geometric_shapes_harness = LightevalTaskConfig( + name="bigbench:geometric_shapes", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="geometric_shapes", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_five_objects_harness = LightevalTaskConfig( + name="bigbench:logical_deduction_five_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_five_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_seven_objects_harness = 
LightevalTaskConfig( + name="bigbench:logical_deduction_seven_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_three_objects_harness = LightevalTaskConfig( + name="bigbench:logical_deduction_three_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="logical_deduction_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_movie_recommendation_harness = LightevalTaskConfig( + name="bigbench:movie_recommendation", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="movie_recommendation", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_navigate_harness = LightevalTaskConfig( + name="bigbench:navigate", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="navigate", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + 
metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_reasoning_about_colored_objects_harness = LightevalTaskConfig( + name="bigbench:reasoning_about_colored_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_ruin_names_harness = LightevalTaskConfig( + name="bigbench:ruin_names", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="ruin_names", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_salient_translation_error_detection_harness = LightevalTaskConfig( + name="bigbench:salient_translation_error_detection", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_snarks_harness = LightevalTaskConfig( + name="bigbench:snarks", + 
suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="snarks", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_sports_understanding_harness = LightevalTaskConfig( + name="bigbench:sports_understanding", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="sports_understanding", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_temporal_sequences_harness = LightevalTaskConfig( + name="bigbench:temporal_sequences", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="temporal_sequences", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_five_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_five_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], 
+ stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_seven_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_seven_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bigbench_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( + name="bigbench:tracking_shuffled_objects_three_objects", + suite=["harness"], + prompt_function="bbh_harness", + hf_repo="lighteval/bbh", + hf_subset="tracking_shuffled_objects_three_objects", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + must_remove_duplicate_docs=True, + trust_dataset=True, + version=0, +) +bbh_boolean_expressions_harness = LightevalTaskConfig( + name="bbh:boolean_expressions", + suite=["harness"], + prompt_function="bbh_boolean_expressions", + hf_repo="lukaemon/bbh", + hf_subset="boolean_expressions", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_causal_judgment_harness = 
LightevalTaskConfig( + name="bbh:causal_judgment", + suite=["harness"], + prompt_function="bbh_causal_judgment", + hf_repo="lukaemon/bbh", + hf_subset="causal_judgement", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_date_understanding_harness = LightevalTaskConfig( + name="bbh:date_understanding", + suite=["harness"], + prompt_function="bbh_date_understanding", + hf_repo="lukaemon/bbh", + hf_subset="date_understanding", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_disambiguation_qa_harness = LightevalTaskConfig( + name="bbh:disambiguation_qa", + suite=["harness"], + prompt_function="bbh_disambiguation_qa", + hf_repo="lukaemon/bbh", + hf_subset="disambiguation_qa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_dyck_languages_harness = LightevalTaskConfig( + name="bbh:dyck_languages", + suite=["harness"], + prompt_function="bbh_dyck_languages", + hf_repo="lukaemon/bbh", + hf_subset="dyck_languages", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_formal_fallacies_harness = LightevalTaskConfig( + name="bbh:formal_fallacies", + suite=["harness"], + prompt_function="bbh_formal_fallacies", + hf_repo="lukaemon/bbh", + hf_subset="formal_fallacies", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_geometric_shapes_harness = LightevalTaskConfig( + name="bbh:geometric_shapes", + suite=["harness"], + prompt_function="bbh_geometric_shapes", + hf_repo="lukaemon/bbh", + hf_subset="geometric_shapes", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_hyperbaton_harness = LightevalTaskConfig( + name="bbh:hyperbaton", + suite=["harness"], + prompt_function="bbh_hyperbaton", + hf_repo="lukaemon/bbh", + hf_subset="hyperbaton", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + 
version=0, +) +bbh_logical_deduction_five_objects_harness = LightevalTaskConfig( + name="bbh:logical_deduction_five_objects", + suite=["harness"], + prompt_function="bbh_logical_deduction_five_objects", + hf_repo="lukaemon/bbh", + hf_subset="logical_deduction_five_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_logical_deduction_seven_objects_harness = LightevalTaskConfig( + name="bbh:logical_deduction_seven_objects", + suite=["harness"], + prompt_function="bbh_logical_deduction_seven_objects", + hf_repo="lukaemon/bbh", + hf_subset="logical_deduction_seven_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_logical_deduction_three_objects_harness = LightevalTaskConfig( + name="bbh:logical_deduction_three_objects", + suite=["harness"], + prompt_function="bbh_logical_deduction_three_objects", + hf_repo="lukaemon/bbh", + hf_subset="logical_deduction_three_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_movie_recommendation_harness = LightevalTaskConfig( + 
name="bbh:movie_recommendation", + suite=["harness"], + prompt_function="bbh_movie_recommendation", + hf_repo="lukaemon/bbh", + hf_subset="movie_recommendation", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_multistep_arithmetic_two_harness = LightevalTaskConfig( + name="bbh:multistep_arithmetic_two", + suite=["harness"], + prompt_function="bbh_multistep_arithmetic_two", + hf_repo="lukaemon/bbh", + hf_subset="multistep_arithmetic_two", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_navigate_harness = LightevalTaskConfig( + name="bbh:navigate", + suite=["harness"], + prompt_function="bbh_navigate", + hf_repo="lukaemon/bbh", + hf_subset="navigate", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_object_counting_harness = LightevalTaskConfig( + name="bbh:object_counting", + suite=["harness"], + prompt_function="bbh_object_counting", + hf_repo="lukaemon/bbh", + hf_subset="object_counting", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_penguins_in_a_table_harness = LightevalTaskConfig( + name="bbh:penguins_in_a_table", + suite=["harness"], + prompt_function="bbh_penguins_in_a_table", + hf_repo="lukaemon/bbh", + hf_subset="penguins_in_a_table", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_reasoning_about_colored_objects_harness = LightevalTaskConfig( + name="bbh:reasoning_about_colored_objects", + suite=["harness"], + prompt_function="bbh_reasoning_about_colored_objects", + hf_repo="lukaemon/bbh", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_ruin_names_harness = LightevalTaskConfig( + name="bbh:ruin_names", + suite=["harness"], + prompt_function="bbh_ruin_names", + hf_repo="lukaemon/bbh", + hf_subset="ruin_names", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", 
"\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_salient_translation_error_detection_harness = LightevalTaskConfig( + name="bbh:salient_translation_error_detection", + suite=["harness"], + prompt_function="bbh_salient_translation_error_detection", + hf_repo="lukaemon/bbh", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_snarks_harness = LightevalTaskConfig( + name="bbh:snarks", + suite=["harness"], + prompt_function="bbh_snarks", + hf_repo="lukaemon/bbh", + hf_subset="snarks", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_sports_understanding_harness = LightevalTaskConfig( + name="bbh:sports_understanding", + suite=["harness"], + prompt_function="bbh_sports_understanding", + hf_repo="lukaemon/bbh", + hf_subset="sports_understanding", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_temporal_sequences_harness = LightevalTaskConfig( + name="bbh:temporal_sequences", + suite=["harness"], + 
prompt_function="bbh_temporal_sequences", + hf_repo="lukaemon/bbh", + hf_subset="temporal_sequences", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_tracking_shuffled_objects_five_objects_harness = LightevalTaskConfig( + name="bbh:tracking_shuffled_objects_five_objects", + suite=["harness"], + prompt_function="bbh_tracking_shuffled_objects_five_objects", + hf_repo="lukaemon/bbh", + hf_subset="tracking_shuffled_objects_five_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_tracking_shuffled_objects_seven_objects_harness = LightevalTaskConfig( + name="bbh:tracking_shuffled_objects_seven_objects", + suite=["harness"], + prompt_function="bbh_tracking_shuffled_objects_seven_objects", + hf_repo="lukaemon/bbh", + hf_subset="tracking_shuffled_objects_seven_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_tracking_shuffled_objects_three_objects_harness = LightevalTaskConfig( + name="bbh:tracking_shuffled_objects_three_objects", + suite=["harness"], + 
prompt_function="bbh_tracking_shuffled_objects_three_objects", + hf_repo="lukaemon/bbh", + hf_subset="tracking_shuffled_objects_three_objects", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_web_of_lies_harness = LightevalTaskConfig( + name="bbh:web_of_lies", + suite=["harness"], + prompt_function="bbh_web_of_lies", + hf_repo="lukaemon/bbh", + hf_subset="web_of_lies", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbh_word_sorting_harness = LightevalTaskConfig( + name="bbh:word_sorting", + suite=["harness"], + prompt_function="bbh_word_sorting", + hf_repo="lukaemon/bbh", + hf_subset="word_sorting", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["", "Q=", "\n\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_helm = LightevalTaskConfig( + name="bbq", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + 
"prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Age_helm = LightevalTaskConfig( + name="bbq:Age", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Age", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Disability_status_helm = LightevalTaskConfig( + name="bbq:Disability_status", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Disability_status", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Gender_identity_helm = LightevalTaskConfig( + name="bbq:Gender_identity", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Gender_identity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Nationality_helm = LightevalTaskConfig( + name="bbq=Nationality", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Nationality", + hf_avail_splits=["train", 
"test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Physical_appearance_helm = LightevalTaskConfig( + name="bbq:Physical_appearance", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Physical_appearance", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Race_ethnicity_helm = LightevalTaskConfig( + name="bbq:Race_ethnicity", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Race_ethnicity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Race_x_SES_helm = LightevalTaskConfig( + name="bbq:Race_x_SES", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Race_x_SES", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + 
version=0, +) +bbq_Race_x_gender_helm = LightevalTaskConfig( + name="bbq:Race_x_gender", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Race_x_gender", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Religion_helm = LightevalTaskConfig( + name="bbq:Religion", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Religion", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_SES_helm = LightevalTaskConfig( + name="bbq:SES", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="SES", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_Sexual_orientation_helm = LightevalTaskConfig( + name="bbq:Sexual_orientation", + suite=["helm"], + prompt_function="bbq", + hf_repo="lighteval/bbq_helm", + hf_subset="Sexual_orientation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=[ + "exact_match", + 
"quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "perfect_exact_match", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bbq_lite_json_bigbench_lite = LightevalTaskConfig( + name="bbq_lite_json", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="bbq_lite_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_auto_debugging_helm = LightevalTaskConfig( + name="bigbench:auto_debugging", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="auto_debugging", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_age_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:age_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-age_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_age_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:age_disambig", + 
suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-age_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_disability_status_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:disability_status_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-disability_status_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_disability_status_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:disability_status_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-disability_status_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_gender_identity_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:gender_identity_ambig", + suite=["helm", "bigbench_scenario"], + 
prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-gender_identity_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_gender_identity_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:gender_identity_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-gender_identity_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_nationality_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:nationality_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-nationality_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_nationality_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:nationality_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + 
hf_subset="bbq_lite_json-nationality_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_physical_appearance_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:physical_appearance_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-physical_appearance_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_physical_appearance_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:physical_appearance_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-physical_appearance_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_race_ethnicity_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:race_ethnicity_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-race_ethnicity_ambig", + 
hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_race_ethnicity_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:race_ethnicity_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-race_ethnicity_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_religion_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:religion_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-religion_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_religion_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:religion_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-religion_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_ses_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:ses_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-ses_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_ses_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:ses_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-ses_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_sexual_orientation_ambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:sexual_orientation_ambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-sexual_orientation_ambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", 
"prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_bbq_lite_json_sexual_orientation_disambig_helm = LightevalTaskConfig( + name="bigbench:bbq_lite_json:sexual_orientation_disambig", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="bbq_lite_json-sexual_orientation_disambig", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_code_line_description_helm = LightevalTaskConfig( + name="bigbench:code_line_description", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="code_line_description", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_contradictions_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:contradictions", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-contradictions", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_emergent_properties_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:emergent_properties", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-emergent_properties", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_fanciful_fictional_combinations_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:fanciful_fictional_combinations", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-fanciful_fictional_combinations", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conceptual_combinations_homonyms_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:homonyms", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-homonyms", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + 
version=0, +) +bigbench_conceptual_combinations_invented_words_helm = LightevalTaskConfig( + name="bigbench:conceptual_combinations:invented_words", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conceptual_combinations-invented_words", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_adna_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:adna_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-adna_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_adna_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:adna_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-adna_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_atikampe_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:atikampe_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + 
hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-atikampe_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_atikampe_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:atikampe_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-atikampe_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_gornam_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:gornam_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-gornam_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_gornam_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:gornam_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-gornam_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_holuan_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:holuan_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-holuan_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_holuan_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:holuan_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-holuan_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_mkafala_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:mkafala_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-mkafala_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_mkafala_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:mkafala_to", + suite=["helm", "bigbench_scenario"], + 
prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-mkafala_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_postpositive_english_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:postpositive_english_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-postpositive_english_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_postpositive_english_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:postpositive_english_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-postpositive_english_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_unapuri_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:unapuri_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-unapuri_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_unapuri_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:unapuri_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-unapuri_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_vaomi_from_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:vaomi_from", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-vaomi_from", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_conlang_translation_vaomi_to_helm = LightevalTaskConfig( + name="bigbench:conlang_translation:vaomi_to", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="conlang_translation-vaomi_to", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge1", "rouge2", "rougeL"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_emoji_movie_helm = LightevalTaskConfig( + name="bigbench:emoji_movie", + 
suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="emoji_movie", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_formal_fallacies_syllogisms_negation_helm = LightevalTaskConfig( + name="bigbench:formal_fallacies_syllogisms_negation", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="formal_fallacies_syllogisms_negation", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_hindu_knowledge_helm = LightevalTaskConfig( + name="bigbench:hindu_knowledge", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="hindu_knowledge", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_known_unknowns_helm = LightevalTaskConfig( + name="bigbench:known_unknowns", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="known_unknowns", + hf_avail_splits=["train", "test", "validation"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_language_identification_helm = LightevalTaskConfig( + name="bigbench:language_identification", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="language_identification", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_linguistics_puzzles_helm = LightevalTaskConfig( + name="bigbench:linguistics_puzzles", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="linguistics_puzzles", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logic_grid_puzzle_helm = LightevalTaskConfig( + name="bigbench:logic_grid_puzzle", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="logic_grid_puzzle", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +bigbench_logical_deduction_five_objects_helm = LightevalTaskConfig( + name="bigbench:logical_deduction-five_objects", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="logical_deduction-five_objects", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_seven_objects_helm = LightevalTaskConfig( + name="bigbench:logical_deduction-seven_objects", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="logical_deduction-seven_objects", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_logical_deduction_three_objects_helm = LightevalTaskConfig( + name="bigbench:logical_deduction-three_objects", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="logical_deduction-three_objects", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_misconceptions_russian_helm = LightevalTaskConfig( + 
name="bigbench:misconceptions_russian", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="misconceptions_russian", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_novel_concepts_helm = LightevalTaskConfig( + name="bigbench:novel_concepts", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="novel_concepts", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_operators_helm = LightevalTaskConfig( + name="bigbench:operators", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="operators", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_parsinlu_reading_comprehension_helm = LightevalTaskConfig( + name="bigbench:parsinlu_reading_comprehension", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="parsinlu_reading_comprehension", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_play_dialog_same_or_different_helm = LightevalTaskConfig( + name="bigbench:play_dialog_same_or_different", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="play_dialog_same_or_different", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_repeat_copy_logic_helm = LightevalTaskConfig( + name="bigbench:repeat_copy_logic", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="repeat_copy_logic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_strange_stories_boolean_helm = LightevalTaskConfig( + name="bigbench:strange_stories-boolean", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="strange_stories-boolean", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+bigbench_strange_stories_multiple_choice_helm = LightevalTaskConfig( + name="bigbench:strange_stories-multiple_choice", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="strange_stories-multiple_choice", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_strategyqa_helm = LightevalTaskConfig( + name="bigbench:strategyqa", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="strategyqa", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_adversarial_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-adversarial", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-adversarial", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_emoji_agnostic_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-emoji_agnostic", + suite=["helm", "bigbench_scenario"], + 
prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-emoji_agnostic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_name_agnostic_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-name_agnostic", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-name_agnostic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_plain_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-plain", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-plain", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_symbol_interpretation_tricky_helm = LightevalTaskConfig( + name="bigbench:symbol_interpretation-tricky", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="symbol_interpretation-tricky", + 
hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_vitaminc_fact_verification_helm = LightevalTaskConfig( + name="bigbench:vitaminc_fact_verification", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="vitaminc_fact_verification", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bigbench_winowhy_helm = LightevalTaskConfig( + name="bigbench:winowhy", + suite=["helm", "bigbench_scenario"], + prompt_function="bigbench_helm", + hf_repo="lighteval/bigbench_helm", + hf_subset="winowhy", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_adjunct_island_lighteval = LightevalTaskConfig( + name="blimp:adjunct_island", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="adjunct_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_adjunct_island_helm = 
LightevalTaskConfig( + name="blimp:adjunct_island", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="adjunct_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_anaphor_gender_agreement_lighteval = LightevalTaskConfig( + name="blimp:anaphor_gender_agreement", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="anaphor_gender_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_anaphor_gender_agreement_helm = LightevalTaskConfig( + name="blimp:anaphor_gender_agreement", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="anaphor_gender_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_anaphor_number_agreement_lighteval = LightevalTaskConfig( + name="blimp:anaphor_number_agreement", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="anaphor_number_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_anaphor_number_agreement_helm = LightevalTaskConfig( + 
name="blimp:anaphor_number_agreement", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="anaphor_number_agreement", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_animate_subject_passive_lighteval = LightevalTaskConfig( + name="blimp:animate_subject_passive", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="animate_subject_passive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_animate_subject_passive_helm = LightevalTaskConfig( + name="blimp:animate_subject_passive", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="animate_subject_passive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_animate_subject_trans_lighteval = LightevalTaskConfig( + name="blimp:animate_subject_trans", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="animate_subject_trans", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_animate_subject_trans_helm = LightevalTaskConfig( + name="blimp:animate_subject_trans", 
+ suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="animate_subject_trans", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_causative_lighteval = LightevalTaskConfig( + name="blimp:causative", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="causative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_causative_helm = LightevalTaskConfig( + name="blimp:causative", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="causative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_complex_NP_island_lighteval = LightevalTaskConfig( + name="blimp:complex_NP_island", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="complex_NP_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_complex_NP_island_helm = LightevalTaskConfig( + name="blimp:complex_NP_island", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="complex_NP_island", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_coordinate_structure_constraint_complex_left_branch_lighteval = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_complex_left_branch", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="coordinate_structure_constraint_complex_left_branch", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_coordinate_structure_constraint_complex_left_branch_helm = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_complex_left_branch", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="coordinate_structure_constraint_complex_left_branch", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_coordinate_structure_constraint_object_extraction_lighteval = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_object_extraction", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="coordinate_structure_constraint_object_extraction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+blimp_coordinate_structure_constraint_object_extraction_helm = LightevalTaskConfig( + name="blimp:coordinate_structure_constraint_object_extraction", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="coordinate_structure_constraint_object_extraction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_1_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_1_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_2_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_2_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_irregular_1_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_irregular_1_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_irregular_2_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_irregular_2", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_irregular_2_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_irregular_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_2_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_2_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_irregular_1_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_1", + 
suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_irregular_1_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_irregular_2_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adj_irregular_2_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adj_irregular_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adj_irregular_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adjective_1_lighteval = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adjective_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adjective_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_determiner_noun_agreement_with_adjective_1_helm = LightevalTaskConfig( + name="blimp:determiner_noun_agreement_with_adjective_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="determiner_noun_agreement_with_adjective_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_distractor_agreement_relational_noun_lighteval = LightevalTaskConfig( + name="blimp:distractor_agreement_relational_noun", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="distractor_agreement_relational_noun", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_distractor_agreement_relational_noun_helm = LightevalTaskConfig( + name="blimp:distractor_agreement_relational_noun", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="distractor_agreement_relational_noun", + 
hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_distractor_agreement_relative_clause_lighteval = LightevalTaskConfig( + name="blimp:distractor_agreement_relative_clause", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="distractor_agreement_relative_clause", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_distractor_agreement_relative_clause_helm = LightevalTaskConfig( + name="blimp:distractor_agreement_relative_clause", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="distractor_agreement_relative_clause", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_drop_argument_lighteval = LightevalTaskConfig( + name="blimp:drop_argument", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="drop_argument", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_drop_argument_helm = LightevalTaskConfig( + name="blimp:drop_argument", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="drop_argument", + 
hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_ellipsis_n_bar_1_lighteval = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="ellipsis_n_bar_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_ellipsis_n_bar_1_helm = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="ellipsis_n_bar_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_ellipsis_n_bar_2_lighteval = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="ellipsis_n_bar_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_ellipsis_n_bar_2_helm = LightevalTaskConfig( + name="blimp:ellipsis_n_bar_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="ellipsis_n_bar_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_object_raising_lighteval = LightevalTaskConfig( + name="blimp:existential_there_object_raising", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="existential_there_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_object_raising_helm = LightevalTaskConfig( + name="blimp:existential_there_object_raising", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="existential_there_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_quantifiers_1_lighteval = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="existential_there_quantifiers_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_quantifiers_1_helm = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="existential_there_quantifiers_1", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_quantifiers_2_lighteval = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="existential_there_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_quantifiers_2_helm = LightevalTaskConfig( + name="blimp:existential_there_quantifiers_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="existential_there_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_subject_raising_lighteval = LightevalTaskConfig( + name="blimp:existential_there_subject_raising", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="existential_there_subject_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_existential_there_subject_raising_helm = LightevalTaskConfig( + name="blimp:existential_there_subject_raising", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + 
hf_subset="existential_there_subject_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_expletive_it_object_raising_lighteval = LightevalTaskConfig( + name="blimp:expletive_it_object_raising", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="expletive_it_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_expletive_it_object_raising_helm = LightevalTaskConfig( + name="blimp:expletive_it_object_raising", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="expletive_it_object_raising", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_inchoative_lighteval = LightevalTaskConfig( + name="blimp:inchoative", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="inchoative", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_inchoative_helm = LightevalTaskConfig( + name="blimp:inchoative", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="inchoative", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_intransitive_lighteval = LightevalTaskConfig( + name="blimp:intransitive", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="intransitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_intransitive_helm = LightevalTaskConfig( + name="blimp:intransitive", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="intransitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_past_participle_adjectives_lighteval = LightevalTaskConfig( + name="blimp:irregular_past_participle_adjectives", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="irregular_past_participle_adjectives", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_past_participle_adjectives_helm = LightevalTaskConfig( + name="blimp:irregular_past_participle_adjectives", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="irregular_past_participle_adjectives", + hf_avail_splits=["train"], + 
evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_past_participle_verbs_lighteval = LightevalTaskConfig( + name="blimp:irregular_past_participle_verbs", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="irregular_past_participle_verbs", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_past_participle_verbs_helm = LightevalTaskConfig( + name="blimp:irregular_past_participle_verbs", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="irregular_past_participle_verbs", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="irregular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_1", + suite=["helm", "blimp"], + 
prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="irregular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="irregular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_irregular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( + name="blimp:irregular_plural_subject_verb_agreement_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="irregular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_left_branch_island_echo_question_lighteval = LightevalTaskConfig( + name="blimp:left_branch_island_echo_question", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="left_branch_island_echo_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+blimp_left_branch_island_echo_question_helm = LightevalTaskConfig( + name="blimp:left_branch_island_echo_question", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="left_branch_island_echo_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_left_branch_island_simple_question_lighteval = LightevalTaskConfig( + name="blimp:left_branch_island_simple_question", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="left_branch_island_simple_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_left_branch_island_simple_question_helm = LightevalTaskConfig( + name="blimp:left_branch_island_simple_question", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="left_branch_island_simple_question", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_matrix_question_npi_licensor_present_lighteval = LightevalTaskConfig( + name="blimp:matrix_question_npi_licensor_present", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="matrix_question_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_matrix_question_npi_licensor_present_helm = LightevalTaskConfig( + name="blimp:matrix_question_npi_licensor_present", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="matrix_question_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_npi_present_1_lighteval = LightevalTaskConfig( + name="blimp:npi_present_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="npi_present_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_npi_present_1_helm = LightevalTaskConfig( + name="blimp:npi_present_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="npi_present_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_npi_present_2_lighteval = LightevalTaskConfig( + name="blimp:npi_present_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="npi_present_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + 
frozen=False, + trust_dataset=True, + version=0, +) +blimp_npi_present_2_helm = LightevalTaskConfig( + name="blimp:npi_present_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="npi_present_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_only_npi_licensor_present_lighteval = LightevalTaskConfig( + name="blimp:only_npi_licensor_present", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="only_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_only_npi_licensor_present_helm = LightevalTaskConfig( + name="blimp:only_npi_licensor_present", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="only_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_only_npi_scope_lighteval = LightevalTaskConfig( + name="blimp:only_npi_scope", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="only_npi_scope", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+blimp_only_npi_scope_helm = LightevalTaskConfig( + name="blimp:only_npi_scope", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="only_npi_scope", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_passive_1_lighteval = LightevalTaskConfig( + name="blimp:passive_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="passive_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_passive_1_helm = LightevalTaskConfig( + name="blimp:passive_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="passive_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_passive_2_lighteval = LightevalTaskConfig( + name="blimp:passive_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="passive_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_passive_2_helm = LightevalTaskConfig( + name="blimp:passive_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="passive_2", + 
hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_c_command_lighteval = LightevalTaskConfig( + name="blimp:principle_A_c_command", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_c_command", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_c_command_helm = LightevalTaskConfig( + name="blimp:principle_A_c_command", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_c_command", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_case_1_lighteval = LightevalTaskConfig( + name="blimp:principle_A_case_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_case_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_case_1_helm = LightevalTaskConfig( + name="blimp:principle_A_case_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_case_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_case_2_lighteval = LightevalTaskConfig( + name="blimp:principle_A_case_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_case_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_case_2_helm = LightevalTaskConfig( + name="blimp:principle_A_case_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_case_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_1_lighteval = LightevalTaskConfig( + name="blimp:principle_A_domain_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_domain_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_1_helm = LightevalTaskConfig( + name="blimp:principle_A_domain_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_domain_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_2_lighteval = LightevalTaskConfig( + name="blimp:principle_A_domain_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_domain_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_2_helm = LightevalTaskConfig( + name="blimp:principle_A_domain_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_domain_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_3_lighteval = LightevalTaskConfig( + name="blimp:principle_A_domain_3", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_domain_3", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_domain_3_helm = LightevalTaskConfig( + name="blimp:principle_A_domain_3", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_domain_3", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_reconstruction_lighteval = LightevalTaskConfig( + name="blimp:principle_A_reconstruction", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="principle_A_reconstruction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_principle_A_reconstruction_helm = LightevalTaskConfig( + name="blimp:principle_A_reconstruction", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="principle_A_reconstruction", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_regular_plural_subject_verb_agreement_1_lighteval = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="regular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_regular_plural_subject_verb_agreement_1_helm = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="regular_plural_subject_verb_agreement_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_regular_plural_subject_verb_agreement_2_lighteval = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="regular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_regular_plural_subject_verb_agreement_2_helm = LightevalTaskConfig( + name="blimp:regular_plural_subject_verb_agreement_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="regular_plural_subject_verb_agreement_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_sentential_negation_npi_licensor_present_lighteval = LightevalTaskConfig( + name="blimp:sentential_negation_npi_licensor_present", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="sentential_negation_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_sentential_negation_npi_licensor_present_helm = LightevalTaskConfig( + name="blimp:sentential_negation_npi_licensor_present", + suite=["helm", "blimp"], + 
prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="sentential_negation_npi_licensor_present", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_sentential_negation_npi_scope_lighteval = LightevalTaskConfig( + name="blimp:sentential_negation_npi_scope", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="sentential_negation_npi_scope", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_sentential_negation_npi_scope_helm = LightevalTaskConfig( + name="blimp:sentential_negation_npi_scope", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="sentential_negation_npi_scope", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_sentential_subject_island_lighteval = LightevalTaskConfig( + name="blimp:sentential_subject_island", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="sentential_subject_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_sentential_subject_island_helm = LightevalTaskConfig( + 
name="blimp:sentential_subject_island", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="sentential_subject_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_superlative_quantifiers_1_lighteval = LightevalTaskConfig( + name="blimp:superlative_quantifiers_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="superlative_quantifiers_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_superlative_quantifiers_1_helm = LightevalTaskConfig( + name="blimp:superlative_quantifiers_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="superlative_quantifiers_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_superlative_quantifiers_2_lighteval = LightevalTaskConfig( + name="blimp:superlative_quantifiers_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="superlative_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_superlative_quantifiers_2_helm = LightevalTaskConfig( + 
name="blimp:superlative_quantifiers_2", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="superlative_quantifiers_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_tough_vs_raising_1_lighteval = LightevalTaskConfig( + name="blimp:tough_vs_raising_1", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="tough_vs_raising_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_tough_vs_raising_1_helm = LightevalTaskConfig( + name="blimp:tough_vs_raising_1", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="tough_vs_raising_1", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_tough_vs_raising_2_lighteval = LightevalTaskConfig( + name="blimp:tough_vs_raising_2", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="tough_vs_raising_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_tough_vs_raising_2_helm = LightevalTaskConfig( + name="blimp:tough_vs_raising_2", + suite=["helm", "blimp"], + 
prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="tough_vs_raising_2", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_transitive_lighteval = LightevalTaskConfig( + name="blimp:transitive", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="transitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_transitive_helm = LightevalTaskConfig( + name="blimp:transitive", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="transitive", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_island_lighteval = LightevalTaskConfig( + name="blimp:wh_island", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_island_helm = LightevalTaskConfig( + name="blimp:wh_island", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_island", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, 
+ generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_object_gap_lighteval = LightevalTaskConfig( + name="blimp:wh_questions_object_gap", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_questions_object_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_object_gap_helm = LightevalTaskConfig( + name="blimp:wh_questions_object_gap", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_questions_object_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_subject_gap_lighteval = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_questions_subject_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_subject_gap_helm = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_questions_subject_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_subject_gap_long_distance_lighteval = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap_long_distance", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_questions_subject_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_questions_subject_gap_long_distance_helm = LightevalTaskConfig( + name="blimp:wh_questions_subject_gap_long_distance", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_questions_subject_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_no_gap_lighteval = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_vs_that_no_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_no_gap_helm = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_vs_that_no_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_no_gap_long_distance_lighteval = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap_long_distance", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_vs_that_no_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_no_gap_long_distance_helm = LightevalTaskConfig( + name="blimp:wh_vs_that_no_gap_long_distance", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_vs_that_no_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_with_gap_lighteval = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_vs_that_with_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_with_gap_helm = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_vs_that_with_gap", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_with_gap_long_distance_lighteval = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap_long_distance", + suite=["lighteval", "blimp"], + prompt_function="blimp", + hf_repo="blimp", + hf_subset="wh_vs_that_with_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +blimp_wh_vs_that_with_gap_long_distance_helm = LightevalTaskConfig( + name="blimp:wh_vs_that_with_gap_long_distance", + suite=["helm", "blimp"], + prompt_function="blimp_helm", + hf_repo="blimp", + hf_subset="wh_vs_that_with_gap_long_distance", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_helm = LightevalTaskConfig( + name="bold", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_gender_helm = LightevalTaskConfig( + name="bold:gender", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="gender", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + 
metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_political_ideology_helm = LightevalTaskConfig( + name="bold:political_ideology", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="political_ideology", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_profession_helm = LightevalTaskConfig( + name="bold:profession", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="profession", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_race_helm = LightevalTaskConfig( + name="bold:race", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="race", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bold_religious_ideology_helm = LightevalTaskConfig( + name="bold:religious_ideology", + suite=["helm"], + prompt_function="bold", + hf_repo="lighteval/bold_helm", + hf_subset="religious_ideology", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +boolq_helm = LightevalTaskConfig( + 
name="boolq", + suite=["helm", "helm_general"], + prompt_function="boolq_helm", + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +boolq_contrastset_helm = LightevalTaskConfig( + name="boolq:contrastset", + suite=["helm"], + prompt_function="boolq_helm_contrastset", + hf_repo="lighteval/boolq_helm", + hf_subset="default", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +bridging_anaphora_resolution_barqa_bigbench = LightevalTaskConfig( + name="bridging_anaphora_resolution_barqa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="bridging_anaphora_resolution_barqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +causal_judgment_bigbench = LightevalTaskConfig( + name="causal_judgment", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="causal_judgment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + 
frozen=False, + trust_dataset=True, + version=0, +) +cause_and_effect_bigbench = LightevalTaskConfig( + name="cause_and_effect", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cause_and_effect", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +checkmate_in_one_bigbench = LightevalTaskConfig( + name="checkmate_in_one", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="checkmate_in_one", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +chess_state_tracking_bigbench = LightevalTaskConfig( + name="chess_state_tracking", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="chess_state_tracking", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +chinese_remainder_theorem_bigbench = LightevalTaskConfig( + name="chinese_remainder_theorem", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="chinese_remainder_theorem", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +cifar10_classification_bigbench = LightevalTaskConfig( + name="cifar10_classification", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cifar10_classification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_helm = LightevalTaskConfig( + name="civil_comments", + suite=["helm", "helm_general"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="all", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_LGBTQ_helm = LightevalTaskConfig( + name="civil_comments:LGBTQ", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="LGBTQ", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_black_helm = LightevalTaskConfig( + name="civil_comments:black", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="black", + hf_avail_splits=["train", "test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_christian_helm = LightevalTaskConfig( + name="civil_comments:christian", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="christian", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_female_helm = LightevalTaskConfig( + name="civil_comments:female", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="female", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_male_helm = LightevalTaskConfig( + name="civil_comments:male", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="male", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + 
"f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_muslim_helm = LightevalTaskConfig( + name="civil_comments:muslim", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="muslim", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_other_religions_helm = LightevalTaskConfig( + name="civil_comments:other_religions", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="other_religions", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +civil_comments_white_helm = LightevalTaskConfig( + name="civil_comments:white", + suite=["helm"], + prompt_function="civil_comments", + hf_repo="lighteval/civil_comments_helm", + hf_subset="white", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +code_line_description_bigbench_lite = LightevalTaskConfig( + 
name="code_line_description", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_and_after_query", + hf_repo="bigbench", + hf_subset="code_line_description", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +codenames_bigbench = LightevalTaskConfig( + name="codenames", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="codenames", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +color_bigbench = LightevalTaskConfig( + name="color", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="color", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +common_morpheme_bigbench = LightevalTaskConfig( + name="common_morpheme", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="common_morpheme", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +commonsenseqa_helm = LightevalTaskConfig( + 
name="commonsenseqa", + suite=["helm", "commonsense_scenario"], + prompt_function="commonsense_qa", + hf_repo="commonsense_qa", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +conceptual_combinations_bigbench_lite = LightevalTaskConfig( + name="conceptual_combinations", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="conceptual_combinations", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +conlang_translation_bigbench_lite = LightevalTaskConfig( + name="conlang_translation", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="conlang_translation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["rouge_t5", "bleu", "perfect_exact_match"], + stop_sequence=[".", ";", "!", "?"], + output_regex="[^\\.\\?\\!\\;\\n]+", + trust_dataset=True, + version=0, +) +contextual_parametric_knowledge_conflicts_bigbench = LightevalTaskConfig( + name="contextual_parametric_knowledge_conflicts", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="contextual_parametric_knowledge_conflicts", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_1_prefix_length_125_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_1-prefix_length_125", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_1-prefix_length_125", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_1_prefix_length_25_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_1-prefix_length_25", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_1-prefix_length_25", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_1_prefix_length_5_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_1-prefix_length_5", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_1-prefix_length_5", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_3_prefix_length_125_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_3-prefix_length_125", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_3-prefix_length_125", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_3_prefix_length_25_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_3-prefix_length_25", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_3-prefix_length_25", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_n_books_1000_extractions_per_book_3_prefix_length_5_helm = LightevalTaskConfig( + name="copyright:n_books_1000-extractions_per_book_3-prefix_length_5", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="n_books_1000-extractions_per_book_3-prefix_length_5", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_oh_the_places_helm = LightevalTaskConfig( + name="copyright:oh_the_places", + suite=["helm", "copyright_scenario"], + 
prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="oh_the_places", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_pilot_helm = LightevalTaskConfig( + name="copyright:pilot", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="pilot", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_10_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_10", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_10", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_125_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_125", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_125", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_25_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_25", + 
suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_25", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_250_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_250", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_250", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_5_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_5", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_5", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_popular_books_prefix_length_50_helm = LightevalTaskConfig( + name="copyright:popular_books-prefix_length_50", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="popular_books-prefix_length_50", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, 
+) +copyright_prompt_num_line_1_min_lines_20_helm = LightevalTaskConfig( + name="copyright:prompt_num_line_1-min_lines_20", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="prompt_num_line_1-min_lines_20", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_prompt_num_line_10_min_lines_20_helm = LightevalTaskConfig( + name="copyright:prompt_num_line_10-min_lines_20", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="prompt_num_line_10-min_lines_20", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +copyright_prompt_num_line_5_min_lines_20_helm = LightevalTaskConfig( + name="copyright:prompt_num_line_5-min_lines_20", + suite=["helm", "copyright_scenario"], + prompt_function="copyright", + hf_repo="lighteval/copyright_helm", + hf_subset="prompt_num_line_5-min_lines_20", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["copyright"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +coqa_lighteval = LightevalTaskConfig( + name="coqa", + suite=["lighteval"], + prompt_function="coqa", + hf_repo="coqa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["perfect_exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, 
+ trust_dataset=True, + version=0, +) +coqa_bb_lighteval = LightevalTaskConfig( + name="coqa_bb", + suite=["lighteval", "bigbench_programmatic", "bigbench"], + prompt_function="coqa", + hf_repo="coqa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["perfect_exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +covid_dialogue_helm = LightevalTaskConfig( + name="covid_dialogue", + suite=["helm"], + prompt_function="covid_dialogue", + hf_repo="lighteval/covid_dialogue", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +crash_blossom_bigbench = LightevalTaskConfig( + name="crash_blossom", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="crash_blossom", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +crass_ai_bigbench = LightevalTaskConfig( + name="crass_ai", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="crass_ai", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, 
+) +cryobiology_spanish_bigbench = LightevalTaskConfig( + name="cryobiology_spanish", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cryobiology_spanish", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +cryptonite_bigbench = LightevalTaskConfig( + name="cryptonite", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cryptonite", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +cs_algorithms_bigbench = LightevalTaskConfig( + name="cs_algorithms", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="cs_algorithms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dark_humor_detection_bigbench = LightevalTaskConfig( + name="dark_humor_detection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="dark_humor_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +date_understanding_bigbench = 
LightevalTaskConfig( + name="date_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="date_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +disambiguation_qa_bigbench = LightevalTaskConfig( + name="disambiguation_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="disambiguation_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +discourse_marker_prediction_bigbench = LightevalTaskConfig( + name="discourse_marker_prediction", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="discourse_marker_prediction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +disfl_qa_bigbench = LightevalTaskConfig( + name="disfl_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="disfl_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +drop_lighteval = LightevalTaskConfig( + name="drop", + 
suite=["lighteval"], + prompt_function="drop", + hf_repo="lighteval/drop_harness", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split="train", + few_shots_select="random_sampling_from_train", + generation_size=None, + metric=["drop"], + stop_sequence=["."], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dyck_language_2_helm = LightevalTaskConfig( + name="dyck_language:2", + suite=["helm"], + prompt_function="dyck_language", + hf_repo="lighteval/DyckLanguage", + hf_subset="2", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dyck_language_3_helm = LightevalTaskConfig( + name="dyck_language:3", + suite=["helm"], + prompt_function="dyck_language", + hf_repo="lighteval/DyckLanguage", + hf_subset="3", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dyck_language_4_helm = LightevalTaskConfig( + name="dyck_language:4", + suite=["helm"], + prompt_function="dyck_language", + hf_repo="lighteval/DyckLanguage", + hf_subset="4", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +dyck_languages_bigbench = LightevalTaskConfig( + name="dyck_languages", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="dyck_languages", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], 
+ few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +elementary_math_qa_bigbench = LightevalTaskConfig( + name="elementary_math_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="elementary_math_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +emoji_movie_bigbench_lite = LightevalTaskConfig( + name="emoji_movie", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="emoji_movie", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +emojis_emotion_prediction_bigbench = LightevalTaskConfig( + name="emojis_emotion_prediction", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="emojis_emotion_prediction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +empirical_judgments_bigbench = LightevalTaskConfig( + name="empirical_judgments", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="empirical_judgments", + 
hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +english_proverbs_bigbench = LightevalTaskConfig( + name="english_proverbs", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="english_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +english_russian_proverbs_bigbench = LightevalTaskConfig( + name="english_russian_proverbs", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="english_russian_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entailed_polarity_bigbench = LightevalTaskConfig( + name="entailed_polarity", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="entailed_polarity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entailed_polarity_hindi_bigbench = LightevalTaskConfig( + name="entailed_polarity_hindi", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="entailed_polarity_hindi", + 
hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_data_imputation_Buy_helm = LightevalTaskConfig( + name="entity_data_imputation:Buy", + suite=["helm"], + prompt_function="entity_data_imputation", + hf_repo="lighteval/Buy", + hf_subset="default", + hf_avail_splits=["train", "test", "valid"], + evaluation_splits=["valid", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_data_imputation_Restaurant_helm = LightevalTaskConfig( + name="entity_data_imputation:Restaurant", + suite=["helm"], + prompt_function="entity_data_imputation", + hf_repo="lighteval/Restaurant", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Abt_Buy_helm = LightevalTaskConfig( + name="entity_matching:Abt_Buy", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Abt_Buy", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Amazon_Google_helm = 
LightevalTaskConfig( + name="entity_matching:Amazon_Google", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Amazon_Google", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Beer_helm = LightevalTaskConfig( + name="entity_matching:Beer", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Beer", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Company_helm = LightevalTaskConfig( + name="entity_matching:Company", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Company", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_DBLP_ACM_helm = LightevalTaskConfig( + name="entity_matching:DBLP_ACM", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="DBLP_ACM", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_DBLP_GoogleScholar_helm = LightevalTaskConfig( + name="entity_matching:DBLP_GoogleScholar", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="DBLP_GoogleScholar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Dirty_DBLP_ACM_helm = LightevalTaskConfig( + name="entity_matching:Dirty_DBLP_ACM", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_DBLP_ACM", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Dirty_DBLP_GoogleScholar_helm = LightevalTaskConfig( + name="entity_matching:Dirty_DBLP_GoogleScholar", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_DBLP_GoogleScholar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, 
+ trust_dataset=True, + version=0, +) +entity_matching_Dirty_Walmart_Amazon_helm = LightevalTaskConfig( + name="entity_matching:Dirty_Walmart_Amazon", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_Walmart_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Dirty_iTunes_Amazon_helm = LightevalTaskConfig( + name="entity_matching:Dirty_iTunes_Amazon", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Dirty_iTunes_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Fodors_Zagats_helm = LightevalTaskConfig( + name="entity_matching:Fodors_Zagats", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="Fodors_Zagats", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_Walmart_Amazon_helm = LightevalTaskConfig( + name="entity_matching:Walmart_Amazon", + suite=["helm"], + prompt_function="entity_matching", + 
hf_repo="lighteval/EntityMatching", + hf_subset="Walmart_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +entity_matching_iTunes_Amazon_helm = LightevalTaskConfig( + name="entity_matching:iTunes_Amazon", + suite=["helm"], + prompt_function="entity_matching", + hf_repo="lighteval/EntityMatching", + hf_subset="iTunes_Amazon", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +epistemic_reasoning_bigbench = LightevalTaskConfig( + name="epistemic_reasoning", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="epistemic_reasoning", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_commonsense_lighteval = LightevalTaskConfig( + name="ethics:commonsense", + suite=["lighteval", "ethics"], + prompt_function="ethics_commonsense", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="commonsense", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +ethics_deontology_lighteval = LightevalTaskConfig( + name="ethics:deontology", + suite=["lighteval", "ethics"], + prompt_function="ethics_deontology", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="deontology", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_justice_lighteval = LightevalTaskConfig( + name="ethics:justice", + suite=["lighteval", "ethics"], + prompt_function="ethics_justice", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="justice", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_utilitarianism_lighteval = LightevalTaskConfig( + name="ethics:utilitarianism", + suite=["lighteval", "ethics"], + prompt_function="ethics_utilitarianism", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="utilitarianism", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ethics_virtue_lighteval = LightevalTaskConfig( + name="ethics:virtue", + suite=["lighteval", "ethics"], + prompt_function="ethics_virtue", + hf_repo="lighteval/hendrycks_ethics", + hf_subset="virtue", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +evaluating_information_essentiality_bigbench = LightevalTaskConfig( + name="evaluating_information_essentiality", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="evaluating_information_essentiality", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +fact_checker_bigbench = LightevalTaskConfig( + name="fact_checker", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="fact_checker", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +fantasy_reasoning_bigbench = LightevalTaskConfig( + name="fantasy_reasoning", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="fantasy_reasoning", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +few_shot_nlg_bigbench = LightevalTaskConfig( + name="few_shot_nlg", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="few_shot_nlg", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "bleurt"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +figure_of_speech_detection_bigbench = LightevalTaskConfig( + name="figure_of_speech_detection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="figure_of_speech_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +formal_fallacies_syllogisms_negation_bigbench_lite = LightevalTaskConfig( + name="formal_fallacies_syllogisms_negation", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="formal_fallacies_syllogisms_negation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gem_bigbench = LightevalTaskConfig( + name="gem", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="gem", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gender_inclusive_sentences_german_bigbench = LightevalTaskConfig( + name="gender_inclusive_sentences_german", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="gender_inclusive_sentences_german", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +general_knowledge_bigbench = LightevalTaskConfig( + name="general_knowledge", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="general_knowledge", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +geometric_shapes_bigbench = LightevalTaskConfig( + name="geometric_shapes", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="geometric_shapes", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_cola_lighteval = LightevalTaskConfig( + name="glue:cola", + suite=["lighteval", "glue"], + prompt_function="cola", + hf_repo="glue", + hf_subset="cola", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token", "mcc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_mnli_lighteval = LightevalTaskConfig( + name="glue:mnli", + suite=["lighteval", "glue"], + prompt_function="mnli", + hf_repo="glue", + hf_subset="mnli_matched", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_mnli_mismatched_lighteval = LightevalTaskConfig( + name="glue:mnli_mismatched", + suite=["lighteval", "glue"], + prompt_function="mnli", + hf_repo="glue", + hf_subset="mnli_mismatched", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_mrpc_lighteval = LightevalTaskConfig( + name="glue:mrpc", + suite=["lighteval", "glue"], + prompt_function="mrpc", + hf_repo="glue", + hf_subset="mrpc", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_f1"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_qnli_lighteval = LightevalTaskConfig( + name="glue:qnli", + suite=["lighteval", "glue"], + prompt_function="qnli", + hf_repo="glue", + hf_subset="qnli", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_qqp_lighteval = LightevalTaskConfig( + name="glue:qqp", + suite=["lighteval", "glue"], + prompt_function="qqp", + hf_repo="glue", + hf_subset="qqp", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "loglikelihood_f1"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_rte_lighteval = LightevalTaskConfig( + name="glue:rte", + suite=["lighteval", "glue"], + prompt_function="rte", + hf_repo="glue", + hf_subset="rte", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_sst2_lighteval = LightevalTaskConfig( + name="glue:sst2", + suite=["lighteval", "glue"], + prompt_function="sst", + hf_repo="glue", + hf_subset="sst2", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_stsb_lighteval = LightevalTaskConfig( + name="glue:stsb", + suite=["lighteval", "glue"], + prompt_function="stsb", + hf_repo="glue", + hf_subset="stsb", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +glue_wnli_lighteval = LightevalTaskConfig( + name="glue:wnli", + suite=["lighteval", "glue"], + prompt_function="wnli", + hf_repo="glue", + hf_subset="wnli", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +goal_step_wikihow_bigbench = LightevalTaskConfig( + name="goal_step_wikihow", + 
suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="goal_step_wikihow", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gpqa_lighteval = LightevalTaskConfig( + name="gpqa", + suite=["lighteval"], + prompt_function="gpqa", + hf_repo="Idavidrein/gpqa", + hf_subset="gpqa_main", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gre_reading_comprehension_bigbench = LightevalTaskConfig( + name="gre_reading_comprehension", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="gre_reading_comprehension", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gsm8k_leaderboard = LightevalTaskConfig( + name="gsm8k", + suite=["leaderboard"], + prompt_function="gsm8k", + hf_repo="gsm8k", + hf_subset="main", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=256, + metric=["quasi_exact_match_gsm8k"], + stop_sequence=["Question=", "Question", "="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +gsm8k_lighteval = LightevalTaskConfig( + name="gsm8k", + suite=["lighteval"], + prompt_function="gsm8k", + hf_repo="gsm8k", + hf_subset="main", + 
hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=256, + metric=["quasi_exact_match_gsm8k", "maj_at_8_gsm8k"], + stop_sequence=["Question="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +headqa_en_lighteval = LightevalTaskConfig( + name="headqa:en", + suite=["lighteval", "headqa"], + prompt_function="headqa", + hf_repo="lighteval/headqa_harness", + hf_subset="en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +headqa_es_lighteval = LightevalTaskConfig( + name="headqa:es", + suite=["lighteval", "headqa"], + prompt_function="headqa", + hf_repo="lighteval/headqa_harness", + hf_subset="es", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hellaswag_leaderboard = LightevalTaskConfig( + name="hellaswag", + suite=["leaderboard"], + prompt_function="hellaswag_harness", + hf_repo="hellaswag", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select="random_sampling_from_train", + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hellaswag_helm = LightevalTaskConfig( + name="hellaswag", + suite=["helm", "helm_general"], + prompt_function="hellaswag_helm", + hf_repo="hellaswag", + 
hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hhh_alignment_bigbench = LightevalTaskConfig( + name="hhh_alignment", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="hhh_alignment", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hindi_question_answering_bigbench = LightevalTaskConfig( + name="hindi_question_answering", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="hindi_question_answering", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hindu_knowledge_bigbench_lite = LightevalTaskConfig( + name="hindu_knowledge", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="hindu_knowledge", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hinglish_toxicity_bigbench = LightevalTaskConfig( + 
name="hinglish_toxicity", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="hinglish_toxicity", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +human_organs_senses_bigbench = LightevalTaskConfig( + name="human_organs_senses", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="human_organs_senses", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +humaneval_helm = LightevalTaskConfig( + name="humaneval", + suite=["helm", "code_scenario"], + prompt_function="humaneval", + hf_repo="openai_humaneval", + hf_subset="openai_humaneval", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=600, + metric=["code_humaneval"], + stop_sequence=["\nclass", "\ndef", "\nif", "\nprint"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +hyperbaton_bigbench = LightevalTaskConfig( + name="hyperbaton", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="hyperbaton", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +identify_math_theorems_bigbench = LightevalTaskConfig( + name="identify_math_theorems", + suite=["bigbench", 
"bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="identify_math_theorems", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +identify_odd_metaphor_bigbench = LightevalTaskConfig( + name="identify_odd_metaphor", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="identify_odd_metaphor", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +imdb_helm = LightevalTaskConfig( + name="imdb", + suite=["helm", "helm_general"], + prompt_function="imdb", + hf_repo="lighteval/IMDB_helm", + hf_subset="default", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +imdb_contrastset_helm = LightevalTaskConfig( + name="imdb:contrastset", + suite=["helm"], + prompt_function="imdb_contrastset", + hf_repo="lighteval/IMDB_helm", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + 
version=0, +) +implicatures_bigbench = LightevalTaskConfig( + name="implicatures", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="implicatures", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +implicit_relations_bigbench = LightevalTaskConfig( + name="implicit_relations", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="implicit_relations", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +intent_recognition_bigbench = LightevalTaskConfig( + name="intent_recognition", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="intent_recognition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_abstract_algebra_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:abstract_algebra", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_abstract_algebra", + hf_repo="lighteval/mmlu", + hf_subset="abstract_algebra", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_college_chemistry_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:college_chemistry", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_college_chemistry", + hf_repo="lighteval/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_global_facts_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:global_facts", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_global_facts", + hf_repo="lighteval/mmlu", + hf_subset="global_facts", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_miscellaneous_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:miscellaneous", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_miscellaneous", + hf_repo="lighteval/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_nutrition_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:nutrition", + suite=["helm", 
"interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_nutrition", + hf_repo="lighteval/mmlu", + hf_subset="nutrition", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +interactive_qa_mmlu_us_foreign_policy_helm = LightevalTaskConfig( + name="interactive_qa_mmlu:us_foreign_policy", + suite=["helm", "interactive_qa_mmlu_scenario"], + prompt_function="mmlu_qa_us_foreign_policy", + hf_repo="lighteval/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["dev", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +international_phonetic_alphabet_nli_bigbench = LightevalTaskConfig( + name="international_phonetic_alphabet_nli", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="international_phonetic_alphabet_nli", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +international_phonetic_alphabet_transliterate_bigbench = LightevalTaskConfig( + name="international_phonetic_alphabet_transliterate", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="international_phonetic_alphabet_transliterate", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +intersect_geometry_bigbench = LightevalTaskConfig( + name="intersect_geometry", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="intersect_geometry", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +irony_identification_bigbench = LightevalTaskConfig( + name="irony_identification", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="irony_identification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_ar_en_lighteval = LightevalTaskConfig( + name="iwslt17:ar-en", + suite=["lighteval", "harness_selection"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ar-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_de_en_lighteval = LightevalTaskConfig( + name="iwslt17:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, 
+ few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_ar_lighteval = LightevalTaskConfig( + name="iwslt17:en-ar", + suite=["lighteval", "harness_selection"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ar-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_de_lighteval = LightevalTaskConfig( + name="iwslt17:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_fr_lighteval = LightevalTaskConfig( + name="iwslt17:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_ja_lighteval = LightevalTaskConfig( + name="iwslt17:en-ja", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + 
metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_ko_lighteval = LightevalTaskConfig( + name="iwslt17:en-ko", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-ko", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_en_zh_lighteval = LightevalTaskConfig( + name="iwslt17:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_fr_en_lighteval = LightevalTaskConfig( + name="iwslt17:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_ja_en_lighteval = LightevalTaskConfig( + name="iwslt17:ja-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ja-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_ko_en_lighteval = LightevalTaskConfig( + name="iwslt17:ko-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_ko-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +iwslt17_zh_en_lighteval = LightevalTaskConfig( + name="iwslt17:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="iwslt17_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +kanji_ascii_bigbench = LightevalTaskConfig( + name="kanji_ascii", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="kanji_ascii", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +kannada_bigbench = LightevalTaskConfig( + name="kannada", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="kannada", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+key_value_maps_bigbench = LightevalTaskConfig( + name="key_value_maps", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="key_value_maps", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +known_unknowns_bigbench_lite = LightevalTaskConfig( + name="known_unknowns", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="known_unknowns", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_standard_lighteval = LightevalTaskConfig( + name="lambada:standard", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="lambada", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_standard_cloze_lighteval = LightevalTaskConfig( + name="lambada:standard_cloze", + suite=["lighteval", "lambada"], + prompt_function="lambada_cloze", + hf_repo="lambada", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+lambada_openai_lighteval = LightevalTaskConfig( + name="lambada:openai", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_de_lighteval = LightevalTaskConfig( + name="lambada:openai:de", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_en_lighteval = LightevalTaskConfig( + name="lambada:openai:en", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_es_lighteval = LightevalTaskConfig( + name="lambada:openai:es", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_fr_lighteval = LightevalTaskConfig( + name="lambada:openai:fr", + suite=["lighteval", "lambada"], + prompt_function="lambada", + 
hf_repo="EleutherAI/lambada_openai", + hf_subset="fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_it_lighteval = LightevalTaskConfig( + name="lambada:openai:it", + suite=["lighteval", "lambada"], + prompt_function="lambada", + hf_repo="EleutherAI/lambada_openai", + hf_subset="it", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lambada_openai_cloze_lighteval = LightevalTaskConfig( + name="lambada:openai_cloze", + suite=["lighteval", "lambada"], + prompt_function="lambada_cloze", + hf_repo="EleutherAI/lambada_openai", + hf_subset="en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["target_perplexity"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +language_games_bigbench = LightevalTaskConfig( + name="language_games", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="language_games", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +language_identification_bigbench_lite = LightevalTaskConfig( + name="language_identification", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="language_identification", + 
hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +legal_summarization_billsum_helm = LightevalTaskConfig( + name="legal_summarization:billsum", + suite=["helm"], + prompt_function="legal_summarization", + hf_repo="lighteval/legal_summarization", + hf_subset="BillSum", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1024, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +legal_summarization_eurlexsum_helm = LightevalTaskConfig( + name="legal_summarization:eurlexsum", + suite=["helm"], + prompt_function="legal_summarization", + hf_repo="lighteval/legal_summarization", + hf_subset="EurLexSum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +legal_summarization_multilexsum_helm = LightevalTaskConfig( + name="legal_summarization:multilexsum", + suite=["helm"], + prompt_function="multilexsum", + hf_repo="lighteval/legal_summarization", + hf_subset="MultiLexSum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=256, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+legalsupport_helm = LightevalTaskConfig( + name="legalsupport", + suite=["helm"], + prompt_function="legal_support", + hf_repo="lighteval/LegalSupport", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["loglikelihood_acc", "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_case_hold_helm = LightevalTaskConfig( + name="lexglue:case_hold", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_case_hold", + hf_repo="lighteval/lexglue", + hf_subset="case_hold", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_ecthr_a_helm = LightevalTaskConfig( + name="lexglue:ecthr_a", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_ecthr_a", + hf_repo="lighteval/lexglue", + hf_subset="ecthr_a", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_ecthr_b_helm = LightevalTaskConfig( + name="lexglue:ecthr_b", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_ecthr_b", + hf_repo="lighteval/lexglue", + hf_subset="ecthr_b", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_eurlex_helm = LightevalTaskConfig( + name="lexglue:eurlex", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_eurlex", + hf_repo="lighteval/lexglue", + hf_subset="eurlex", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_ledgar_helm = LightevalTaskConfig( + name="lexglue:ledgar", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_ledgar", + hf_repo="lighteval/lexglue", + hf_subset="ledgar", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_scotus_helm = LightevalTaskConfig( + name="lexglue:scotus", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_scotus", + hf_repo="lighteval/lexglue", + hf_subset="scotus", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lexglue_unfair_tos_helm = LightevalTaskConfig( + 
name="lexglue:unfair_tos", + suite=["helm", "lex_glue_scenario"], + prompt_function="lex_glue_unfair_tos", + hf_repo="lighteval/lexglue", + hf_subset="unfair_tos", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_brazilian_court_decisions_judgment_helm = LightevalTaskConfig( + name="lextreme:brazilian_court_decisions_judgment", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_brazilian_court_decisions_judgment", + hf_repo="lighteval/lextreme", + hf_subset="brazilian_court_decisions_judgment", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_brazilian_court_decisions_unanimity_helm = LightevalTaskConfig( + name="lextreme:brazilian_court_decisions_unanimity", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_brazilian_court_decisions_unanimity", + hf_repo="lighteval/lextreme", + hf_subset="brazilian_court_decisions_unanimity", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_covid19_emergency_event_helm = LightevalTaskConfig( + name="lextreme:covid19_emergency_event", + suite=["helm", 
"lextreme_scenario"], + prompt_function="lextreme_covid19_emergency_event", + hf_repo="lighteval/lextreme", + hf_subset="covid19_emergency_event", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_german_argument_mining_helm = LightevalTaskConfig( + name="lextreme:german_argument_mining", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_german_argument_mining", + hf_repo="lighteval/lextreme", + hf_subset="german_argument_mining", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_greek_legal_code_chapter_helm = LightevalTaskConfig( + name="lextreme:greek_legal_code_chapter", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_greek_legal_code_chapter", + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_code_chapter", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_greek_legal_code_subject_helm = LightevalTaskConfig( + name="lextreme:greek_legal_code_subject", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_greek_legal_code_subject", + hf_repo="lighteval/lextreme", + 
hf_subset="greek_legal_code_subject", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_greek_legal_code_volume_helm = LightevalTaskConfig( + name="lextreme:greek_legal_code_volume", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_greek_legal_code_volume", + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_code_volume", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_greek_legal_ner_helm = LightevalTaskConfig( + name="lextreme:greek_legal_ner", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_greek_legal_ner", + hf_repo="lighteval/lextreme", + hf_subset="greek_legal_ner", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=430, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_legalnero_helm = LightevalTaskConfig( + name="lextreme:legalnero", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_legalnero", + hf_repo="lighteval/lextreme", + hf_subset="legalnero", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=788, + metric=["exact_match", 
"quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_lener_br_helm = LightevalTaskConfig( + name="lextreme:lener_br", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_lener_br", + hf_repo="lighteval/lextreme", + hf_subset="lener_br", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=338, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_mapa_coarse_helm = LightevalTaskConfig( + name="lextreme:mapa_coarse", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_mapa_coarse", + hf_repo="lighteval/lextreme", + hf_subset="mapa_coarse", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=274, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_mapa_fine_helm = LightevalTaskConfig( + name="lextreme:mapa_fine", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_mapa_fine", + hf_repo="lighteval/lextreme", + hf_subset="mapa_fine", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=274, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_multi_eurlex_level_1_helm = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_1", + 
suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_multi_eurlex_level_1", + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_1", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_multi_eurlex_level_2_helm = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_2", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_multi_eurlex_level_2", + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_2", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_multi_eurlex_level_3_helm = LightevalTaskConfig( + name="lextreme:multi_eurlex_level_3", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_multi_eurlex_level_3", + hf_repo="lighteval/lextreme", + hf_subset="multi_eurlex_level_3", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_online_terms_of_service_clause_topics_helm = LightevalTaskConfig( + name="lextreme:online_terms_of_service_clause_topics", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_online_terms_of_service_clause_topics", + 
hf_repo="lighteval/lextreme", + hf_subset="online_terms_of_service_clause_topics", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_online_terms_of_service_unfairness_levels_helm = LightevalTaskConfig( + name="lextreme:online_terms_of_service_unfairness_levels", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_online_terms_of_service_unfairness_levels", + hf_repo="lighteval/lextreme", + hf_subset="online_terms_of_service_unfairness_levels", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=10, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lextreme_swiss_judgment_prediction_helm = LightevalTaskConfig( + name="lextreme:swiss_judgment_prediction", + suite=["helm", "lextreme_scenario"], + prompt_function="lextreme_swiss_judgment_prediction", + hf_repo="lighteval/lextreme", + hf_subset="swiss_judgment_prediction", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "f1_score", "f1_score_macro", "f1_score_micro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +linguistic_mappings_bigbench = LightevalTaskConfig( + name="linguistic_mappings", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="linguistic_mappings", + 
hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +linguistics_puzzles_bigbench_lite = LightevalTaskConfig( + name="linguistics_puzzles", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="linguistics_puzzles", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=None, + output_regex="[^\\.\\?\\!\\;\\n]+", + trust_dataset=True, + version=0, +) +logic_grid_puzzle_bigbench_lite = LightevalTaskConfig( + name="logic_grid_puzzle", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="logic_grid_puzzle", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logical_args_bigbench = LightevalTaskConfig( + name="logical_args", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="logical_args", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logical_deduction_bigbench_lite = LightevalTaskConfig( + name="logical_deduction", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + 
prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="logical_deduction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logical_fallacy_detection_bigbench = LightevalTaskConfig( + name="logical_fallacy_detection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="logical_fallacy_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logical_sequence_bigbench = LightevalTaskConfig( + name="logical_sequence", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="logical_sequence", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +logiqa_lighteval = LightevalTaskConfig( + name="logiqa", + suite=["lighteval"], + prompt_function="logiqa", + hf_repo="lighteval/logiqa_harness", + hf_subset="logiqa", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_helm = LightevalTaskConfig( + name="lsat_qa", + suite=["helm", "lsat_qa_scenario"], + 
prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="all", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_assignment_helm = LightevalTaskConfig( + name="lsat_qa:assignment", + suite=["helm", "lsat_qa_scenario"], + prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="assignment", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_grouping_helm = LightevalTaskConfig( + name="lsat_qa:grouping", + suite=["helm", "lsat_qa_scenario"], + prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="grouping", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_miscellaneous_helm = LightevalTaskConfig( + name="lsat_qa:miscellaneous", + suite=["helm", "lsat_qa_scenario"], + prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="miscellaneous", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", 
"prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +lsat_qa_ordering_helm = LightevalTaskConfig( + name="lsat_qa:ordering", + suite=["helm", "lsat_qa_scenario"], + prompt_function="lsat_qa", + hf_repo="lighteval/lsat_qa", + hf_subset="ordering", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_algebra_lighteval = LightevalTaskConfig( + name="math:algebra", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="algebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_counting_and_probability_lighteval = LightevalTaskConfig( + name="math:counting_and_probability", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="counting_and_probability", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_geometry_lighteval = LightevalTaskConfig( + name="math:geometry", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="geometry", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_intermediate_algebra_lighteval = LightevalTaskConfig( + name="math:intermediate_algebra", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="intermediate_algebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_number_theory_lighteval = LightevalTaskConfig( + name="math:number_theory", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="number_theory", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_prealgebra_lighteval = LightevalTaskConfig( + name="math:prealgebra", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="prealgebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_precalculus_lighteval = LightevalTaskConfig( + name="math:precalculus", + suite=["lighteval", "math"], + prompt_function="math", + hf_repo="lighteval/MATH", + hf_subset="precalculus", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=1, +) +math_cot_algebra_lighteval = LightevalTaskConfig( + name="math_cot:algebra", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="algebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_counting_and_probability_lighteval = LightevalTaskConfig( + name="math_cot:counting_and_probability", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="counting_and_probability", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_geometry_lighteval = LightevalTaskConfig( + name="math_cot:geometry", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="geometry", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_intermediate_algebra_lighteval = LightevalTaskConfig( + name="math_cot:intermediate_algebra", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="intermediate_algebra", + 
hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_number_theory_lighteval = LightevalTaskConfig( + name="math_cot:number_theory", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="number_theory", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_prealgebra_lighteval = LightevalTaskConfig( + name="math_cot:prealgebra", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="prealgebra", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +math_cot_precalculus_lighteval = LightevalTaskConfig( + name="math_cot:precalculus", + suite=["lighteval", "math"], + prompt_function="math_cot", + hf_repo="lighteval/MATH", + hf_subset="precalculus", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=2048, + metric=["quasi_exact_match_math", "maj_at_4_math"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mathematical_induction_bigbench = LightevalTaskConfig( + name="mathematical_induction", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + 
hf_repo="bigbench", + hf_subset="mathematical_induction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mathqa_lighteval = LightevalTaskConfig( + name="mathqa", + suite=["lighteval"], + prompt_function="mathqa", + hf_repo="math_qa", + hf_subset="default", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +matrixshapes_bigbench = LightevalTaskConfig( + name="matrixshapes", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="matrixshapes", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +me_q_sum_helm = LightevalTaskConfig( + name="me_q_sum", + suite=["helm"], + prompt_function="me_q_sum", + hf_repo="lighteval/me_q_sum", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_dialog_healthcaremagic_helm = LightevalTaskConfig( + name="med_dialog:healthcaremagic", + suite=["helm"], + prompt_function="med_dialog", + hf_repo="lighteval/med_dialog", + 
hf_subset="healthcaremagic", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_dialog_icliniq_helm = LightevalTaskConfig( + name="med_dialog:icliniq", + suite=["helm"], + prompt_function="med_dialog", + hf_repo="lighteval/med_dialog", + hf_subset="icliniq", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_mcqa_helm = LightevalTaskConfig( + name="med_mcqa", + suite=["helm"], + prompt_function="med_mcqa", + hf_repo="lighteval/med_mcqa", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc", "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +med_paragraph_simplification_helm = LightevalTaskConfig( + name="med_paragraph_simplification", + suite=["helm"], + prompt_function="med_paragraph_simplification", + hf_repo="lighteval/med_paragraph_simplification", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=512, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + 
frozen=False, + trust_dataset=True, + version=0, +) +med_qa_helm = LightevalTaskConfig( + name="med_qa", + suite=["helm"], + prompt_function="med_qa", + hf_repo="bigbio/med_qa", + hf_subset="med_qa_en_source", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc", "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +metaphor_boolean_bigbench = LightevalTaskConfig( + name="metaphor_boolean", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="metaphor_boolean", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +metaphor_understanding_bigbench = LightevalTaskConfig( + name="metaphor_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="metaphor_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_en_lighteval = LightevalTaskConfig( + name="mgsm:en", + suite=["lighteval"], + prompt_function="mgsm_en", + hf_repo="juletxara/mgsm", + hf_subset="en", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Question="], + output_regex=None, + 
frozen=False, + trust_dataset=True, + version=0, +) +mgsm_es_lighteval = LightevalTaskConfig( + name="mgsm:es", + suite=["lighteval"], + prompt_function="mgsm_es", + hf_repo="juletxara/mgsm", + hf_subset="es", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Pregunta="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_fr_lighteval = LightevalTaskConfig( + name="mgsm:fr", + suite=["lighteval"], + prompt_function="mgsm_fr", + hf_repo="juletxara/mgsm", + hf_subset="fr", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Question="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_de_lighteval = LightevalTaskConfig( + name="mgsm:de", + suite=["lighteval"], + prompt_function="mgsm_de", + hf_repo="juletxara/mgsm", + hf_subset="de", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Frage="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_ru_lighteval = LightevalTaskConfig( + name="mgsm:ru", + suite=["lighteval"], + prompt_function="mgsm_ru", + hf_repo="juletxara/mgsm", + hf_subset="ru", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u0417\u0430\u0434\u0430\u0447\u0430="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_zh_lighteval = LightevalTaskConfig( + name="mgsm:zh", + 
suite=["lighteval"], + prompt_function="mgsm_zh", + hf_repo="juletxara/mgsm", + hf_subset="zh", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u95ee\u9898="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_ja_lighteval = LightevalTaskConfig( + name="mgsm:ja", + suite=["lighteval"], + prompt_function="mgsm_ja", + hf_repo="juletxara/mgsm", + hf_subset="ja", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u554f\u984c="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_th_lighteval = LightevalTaskConfig( + name="mgsm:th", + suite=["lighteval"], + prompt_function="mgsm_th", + hf_repo="juletxara/mgsm", + hf_subset="th", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u0e42\u0e08\u0e17\u0e22\u0e4c="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_sw_lighteval = LightevalTaskConfig( + name="mgsm:sw", + suite=["lighteval"], + prompt_function="mgsm_sw", + hf_repo="juletxara/mgsm", + hf_subset="sw", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "Swali="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_bn_lighteval = LightevalTaskConfig( + name="mgsm:bn", + suite=["lighteval"], + prompt_function="mgsm_bn", + hf_repo="juletxara/mgsm", + hf_subset="bn", + hf_avail_splits=["train", 
"test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mgsm_te_lighteval = LightevalTaskConfig( + name="mgsm:te", + suite=["lighteval"], + prompt_function="mgsm_te", + hf_repo="juletxara/mgsm", + hf_subset="te", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n", "=", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28="], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +minute_mysteries_qa_bigbench = LightevalTaskConfig( + name="minute_mysteries_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="minute_mysteries_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +misconceptions_bigbench = LightevalTaskConfig( + name="misconceptions", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="misconceptions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +misconceptions_russian_bigbench_lite = LightevalTaskConfig( + name="misconceptions_russian", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + 
hf_subset="misconceptions_russian", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_helm = LightevalTaskConfig( + name="mmlu", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="all", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_original = LightevalTaskConfig( + name="mmlu", + suite=["original"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="all", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=5, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_abstract_algebra_original = LightevalTaskConfig( + name="mmlu:abstract_algebra", + suite=["original", "mmlu"], + prompt_function="mmlu_abstract_algebra", + hf_repo="cais/mmlu", + hf_subset="abstract_algebra", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_abstract_algebra_leaderboard = LightevalTaskConfig( + name="mmlu:abstract_algebra", + suite=["leaderboard", "mmlu"], + 
prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="abstract_algebra", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_abstract_algebra_helm = LightevalTaskConfig( + name="mmlu:abstract_algebra", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="abstract_algebra", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_anatomy_original = LightevalTaskConfig( + name="mmlu:anatomy", + suite=["original", "mmlu"], + prompt_function="mmlu_anatomy", + hf_repo="cais/mmlu", + hf_subset="anatomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_anatomy_leaderboard = LightevalTaskConfig( + name="mmlu:anatomy", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="anatomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_anatomy_helm = 
LightevalTaskConfig( + name="mmlu:anatomy", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="anatomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_astronomy_original = LightevalTaskConfig( + name="mmlu:astronomy", + suite=["original", "mmlu"], + prompt_function="mmlu_astronomy", + hf_repo="cais/mmlu", + hf_subset="astronomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_astronomy_leaderboard = LightevalTaskConfig( + name="mmlu:astronomy", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="astronomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_astronomy_helm = LightevalTaskConfig( + name="mmlu:astronomy", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="astronomy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_business_ethics_original = LightevalTaskConfig( + name="mmlu:business_ethics", + suite=["original", "mmlu"], + prompt_function="mmlu_business_ethics", + hf_repo="cais/mmlu", + hf_subset="business_ethics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_business_ethics_leaderboard = LightevalTaskConfig( + name="mmlu:business_ethics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="business_ethics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_business_ethics_helm = LightevalTaskConfig( + name="mmlu:business_ethics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="business_ethics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_clinical_knowledge_original = LightevalTaskConfig( + name="mmlu:clinical_knowledge", + suite=["original", "mmlu"], + prompt_function="mmlu_clinical_knowledge", + hf_repo="cais/mmlu", + hf_subset="clinical_knowledge", + hf_avail_splits=["auxiliary_train", "test", 
"validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_clinical_knowledge_leaderboard = LightevalTaskConfig( + name="mmlu:clinical_knowledge", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="clinical_knowledge", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_clinical_knowledge_helm = LightevalTaskConfig( + name="mmlu:clinical_knowledge", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="clinical_knowledge", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_biology_original = LightevalTaskConfig( + name="mmlu:college_biology", + suite=["original", "mmlu"], + prompt_function="mmlu_college_biology", + hf_repo="cais/mmlu", + hf_subset="college_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_biology_leaderboard = LightevalTaskConfig( + 
name="mmlu:college_biology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_biology_helm = LightevalTaskConfig( + name="mmlu:college_biology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_chemistry_original = LightevalTaskConfig( + name="mmlu:college_chemistry", + suite=["original", "mmlu"], + prompt_function="mmlu_college_chemistry", + hf_repo="cais/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_chemistry_leaderboard = LightevalTaskConfig( + name="mmlu:college_chemistry", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_chemistry_helm = LightevalTaskConfig( + name="mmlu:college_chemistry", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_computer_science_original = LightevalTaskConfig( + name="mmlu:college_computer_science", + suite=["original", "mmlu"], + prompt_function="mmlu_college_computer_science", + hf_repo="cais/mmlu", + hf_subset="college_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_computer_science_leaderboard = LightevalTaskConfig( + name="mmlu:college_computer_science", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_computer_science_helm = LightevalTaskConfig( + name="mmlu:college_computer_science", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + 
hf_repo="lighteval/mmlu", + hf_subset="college_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_mathematics_original = LightevalTaskConfig( + name="mmlu:college_mathematics", + suite=["original", "mmlu"], + prompt_function="mmlu_college_mathematics", + hf_repo="cais/mmlu", + hf_subset="college_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_mathematics_leaderboard = LightevalTaskConfig( + name="mmlu:college_mathematics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_mathematics_helm = LightevalTaskConfig( + name="mmlu:college_mathematics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_medicine_original = LightevalTaskConfig( + name="mmlu:college_medicine", + suite=["original", "mmlu"], + prompt_function="mmlu_college_medicine", + hf_repo="cais/mmlu", + hf_subset="college_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_medicine_leaderboard = LightevalTaskConfig( + name="mmlu:college_medicine", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_medicine_helm = LightevalTaskConfig( + name="mmlu:college_medicine", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_physics_original = LightevalTaskConfig( + name="mmlu:college_physics", + suite=["original", "mmlu"], + prompt_function="mmlu_college_physics", + hf_repo="cais/mmlu", + hf_subset="college_physics", + hf_avail_splits=["auxiliary_train", "test", 
"validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_physics_leaderboard = LightevalTaskConfig( + name="mmlu:college_physics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="college_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_college_physics_helm = LightevalTaskConfig( + name="mmlu:college_physics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="college_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_computer_security_original = LightevalTaskConfig( + name="mmlu:computer_security", + suite=["original", "mmlu"], + prompt_function="mmlu_computer_security", + hf_repo="cais/mmlu", + hf_subset="computer_security", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_computer_security_leaderboard = LightevalTaskConfig( + name="mmlu:computer_security", 
+ suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="computer_security", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_computer_security_helm = LightevalTaskConfig( + name="mmlu:computer_security", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="computer_security", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_conceptual_physics_original = LightevalTaskConfig( + name="mmlu:conceptual_physics", + suite=["original", "mmlu"], + prompt_function="mmlu_conceptual_physics", + hf_repo="cais/mmlu", + hf_subset="conceptual_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_conceptual_physics_leaderboard = LightevalTaskConfig( + name="mmlu:conceptual_physics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="conceptual_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_conceptual_physics_helm = LightevalTaskConfig( + name="mmlu:conceptual_physics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="conceptual_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_econometrics_original = LightevalTaskConfig( + name="mmlu:econometrics", + suite=["original", "mmlu"], + prompt_function="mmlu_econometrics", + hf_repo="cais/mmlu", + hf_subset="econometrics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_econometrics_leaderboard = LightevalTaskConfig( + name="mmlu:econometrics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="econometrics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_econometrics_helm = LightevalTaskConfig( + name="mmlu:econometrics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="econometrics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + 
evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_electrical_engineering_original = LightevalTaskConfig( + name="mmlu:electrical_engineering", + suite=["original", "mmlu"], + prompt_function="mmlu_electrical_engineering", + hf_repo="cais/mmlu", + hf_subset="electrical_engineering", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_electrical_engineering_leaderboard = LightevalTaskConfig( + name="mmlu:electrical_engineering", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="electrical_engineering", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_electrical_engineering_helm = LightevalTaskConfig( + name="mmlu:electrical_engineering", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="electrical_engineering", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+mmlu_elementary_mathematics_original = LightevalTaskConfig( + name="mmlu:elementary_mathematics", + suite=["original", "mmlu"], + prompt_function="mmlu_elementary_mathematics", + hf_repo="cais/mmlu", + hf_subset="elementary_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_elementary_mathematics_leaderboard = LightevalTaskConfig( + name="mmlu:elementary_mathematics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="elementary_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_elementary_mathematics_helm = LightevalTaskConfig( + name="mmlu:elementary_mathematics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="elementary_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_formal_logic_original = LightevalTaskConfig( + name="mmlu:formal_logic", + suite=["original", "mmlu"], + prompt_function="mmlu_formal_logic", + hf_repo="cais/mmlu", + hf_subset="formal_logic", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + 
few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_formal_logic_leaderboard = LightevalTaskConfig( + name="mmlu:formal_logic", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="formal_logic", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_formal_logic_helm = LightevalTaskConfig( + name="mmlu:formal_logic", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="formal_logic", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_global_facts_original = LightevalTaskConfig( + name="mmlu:global_facts", + suite=["original", "mmlu"], + prompt_function="mmlu_global_facts", + hf_repo="cais/mmlu", + hf_subset="global_facts", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_global_facts_leaderboard = LightevalTaskConfig( + name="mmlu:global_facts", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + 
hf_subset="global_facts", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_global_facts_helm = LightevalTaskConfig( + name="mmlu:global_facts", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="global_facts", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_biology_original = LightevalTaskConfig( + name="mmlu:high_school_biology", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_biology", + hf_repo="cais/mmlu", + hf_subset="high_school_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_biology_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_biology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+mmlu_high_school_biology_helm = LightevalTaskConfig( + name="mmlu:high_school_biology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_biology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_chemistry_original = LightevalTaskConfig( + name="mmlu:high_school_chemistry", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_chemistry", + hf_repo="cais/mmlu", + hf_subset="high_school_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_chemistry_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_chemistry", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_chemistry_helm = LightevalTaskConfig( + name="mmlu:high_school_chemistry", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_chemistry", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], 
+ few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_computer_science_original = LightevalTaskConfig( + name="mmlu:high_school_computer_science", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_computer_science", + hf_repo="cais/mmlu", + hf_subset="high_school_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_computer_science_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_computer_science", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_computer_science_helm = LightevalTaskConfig( + name="mmlu:high_school_computer_science", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_computer_science", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +mmlu_high_school_european_history_original = LightevalTaskConfig( + name="mmlu:high_school_european_history", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_european_history", + hf_repo="cais/mmlu", + hf_subset="high_school_european_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_european_history_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_european_history", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_european_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_european_history_helm = LightevalTaskConfig( + name="mmlu:high_school_european_history", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_european_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_geography_original = LightevalTaskConfig( + name="mmlu:high_school_geography", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_geography", + hf_repo="cais/mmlu", + 
hf_subset="high_school_geography", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_geography_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_geography", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_geography", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_geography_helm = LightevalTaskConfig( + name="mmlu:high_school_geography", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_geography", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_government_and_politics_original = LightevalTaskConfig( + name="mmlu:high_school_government_and_politics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_government_and_politics", + hf_repo="cais/mmlu", + hf_subset="high_school_government_and_politics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_government_and_politics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_government_and_politics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_government_and_politics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_government_and_politics_helm = LightevalTaskConfig( + name="mmlu:high_school_government_and_politics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_government_and_politics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_macroeconomics_original = LightevalTaskConfig( + name="mmlu:high_school_macroeconomics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_macroeconomics", + hf_repo="cais/mmlu", + hf_subset="high_school_macroeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_macroeconomics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_macroeconomics", + 
suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_macroeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_macroeconomics_helm = LightevalTaskConfig( + name="mmlu:high_school_macroeconomics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_macroeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_mathematics_original = LightevalTaskConfig( + name="mmlu:high_school_mathematics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_mathematics", + hf_repo="cais/mmlu", + hf_subset="high_school_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_mathematics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_mathematics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + 
few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_mathematics_helm = LightevalTaskConfig( + name="mmlu:high_school_mathematics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_mathematics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_microeconomics_original = LightevalTaskConfig( + name="mmlu:high_school_microeconomics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_microeconomics", + hf_repo="cais/mmlu", + hf_subset="high_school_microeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_microeconomics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_microeconomics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_microeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_microeconomics_helm = LightevalTaskConfig( + 
name="mmlu:high_school_microeconomics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_microeconomics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_physics_original = LightevalTaskConfig( + name="mmlu:high_school_physics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_physics", + hf_repo="cais/mmlu", + hf_subset="high_school_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_physics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_physics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_physics_helm = LightevalTaskConfig( + name="mmlu:high_school_physics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_physics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + 
generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_psychology_original = LightevalTaskConfig( + name="mmlu:high_school_psychology", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_psychology", + hf_repo="cais/mmlu", + hf_subset="high_school_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_psychology_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_psychology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_psychology_helm = LightevalTaskConfig( + name="mmlu:high_school_psychology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_statistics_original = LightevalTaskConfig( + 
name="mmlu:high_school_statistics", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_statistics", + hf_repo="cais/mmlu", + hf_subset="high_school_statistics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_statistics_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_statistics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_statistics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_statistics_helm = LightevalTaskConfig( + name="mmlu:high_school_statistics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_statistics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_us_history_original = LightevalTaskConfig( + name="mmlu:high_school_us_history", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_us_history", + hf_repo="cais/mmlu", + hf_subset="high_school_us_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + 
few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_us_history_leaderboard = LightevalTaskConfig( + name="mmlu:high_school_us_history", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_us_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_us_history_helm = LightevalTaskConfig( + name="mmlu:high_school_us_history", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_us_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_world_history_original = LightevalTaskConfig( + name="mmlu:high_school_world_history", + suite=["original", "mmlu"], + prompt_function="mmlu_high_school_world_history", + hf_repo="cais/mmlu", + hf_subset="high_school_world_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_world_history_leaderboard = LightevalTaskConfig( + 
name="mmlu:high_school_world_history", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="high_school_world_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_high_school_world_history_helm = LightevalTaskConfig( + name="mmlu:high_school_world_history", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="high_school_world_history", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_aging_original = LightevalTaskConfig( + name="mmlu:human_aging", + suite=["original", "mmlu"], + prompt_function="mmlu_human_aging", + hf_repo="cais/mmlu", + hf_subset="human_aging", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_aging_leaderboard = LightevalTaskConfig( + name="mmlu:human_aging", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="human_aging", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_aging_helm = LightevalTaskConfig( + name="mmlu:human_aging", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="human_aging", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_sexuality_original = LightevalTaskConfig( + name="mmlu:human_sexuality", + suite=["original", "mmlu"], + prompt_function="mmlu_human_sexuality", + hf_repo="cais/mmlu", + hf_subset="human_sexuality", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_sexuality_leaderboard = LightevalTaskConfig( + name="mmlu:human_sexuality", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="human_sexuality", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_human_sexuality_helm = LightevalTaskConfig( + name="mmlu:human_sexuality", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="human_sexuality", + hf_avail_splits=["auxiliary_train", "test", 
"validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_international_law_original = LightevalTaskConfig( + name="mmlu:international_law", + suite=["original", "mmlu"], + prompt_function="mmlu_international_law", + hf_repo="cais/mmlu", + hf_subset="international_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_international_law_leaderboard = LightevalTaskConfig( + name="mmlu:international_law", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="international_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_international_law_helm = LightevalTaskConfig( + name="mmlu:international_law", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="international_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_jurisprudence_original = 
LightevalTaskConfig( + name="mmlu:jurisprudence", + suite=["original", "mmlu"], + prompt_function="mmlu_jurisprudence", + hf_repo="cais/mmlu", + hf_subset="jurisprudence", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_jurisprudence_leaderboard = LightevalTaskConfig( + name="mmlu:jurisprudence", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="jurisprudence", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_jurisprudence_helm = LightevalTaskConfig( + name="mmlu:jurisprudence", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="jurisprudence", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_logical_fallacies_original = LightevalTaskConfig( + name="mmlu:logical_fallacies", + suite=["original", "mmlu"], + prompt_function="mmlu_logical_fallacies", + hf_repo="cais/mmlu", + hf_subset="logical_fallacies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + 
metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_logical_fallacies_leaderboard = LightevalTaskConfig( + name="mmlu:logical_fallacies", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="logical_fallacies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_logical_fallacies_helm = LightevalTaskConfig( + name="mmlu:logical_fallacies", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="logical_fallacies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_machine_learning_original = LightevalTaskConfig( + name="mmlu:machine_learning", + suite=["original", "mmlu"], + prompt_function="mmlu_machine_learning", + hf_repo="cais/mmlu", + hf_subset="machine_learning", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_machine_learning_leaderboard = LightevalTaskConfig( + name="mmlu:machine_learning", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="machine_learning", 
+ hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_machine_learning_helm = LightevalTaskConfig( + name="mmlu:machine_learning", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="machine_learning", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_management_original = LightevalTaskConfig( + name="mmlu:management", + suite=["original", "mmlu"], + prompt_function="mmlu_management", + hf_repo="cais/mmlu", + hf_subset="management", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_management_leaderboard = LightevalTaskConfig( + name="mmlu:management", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="management", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_management_helm = LightevalTaskConfig( + name="mmlu:management", + suite=["helm", 
"helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="management", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_marketing_original = LightevalTaskConfig( + name="mmlu:marketing", + suite=["original", "mmlu"], + prompt_function="mmlu_marketing", + hf_repo="cais/mmlu", + hf_subset="marketing", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_marketing_leaderboard = LightevalTaskConfig( + name="mmlu:marketing", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="marketing", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_marketing_helm = LightevalTaskConfig( + name="mmlu:marketing", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="marketing", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +mmlu_medical_genetics_original = LightevalTaskConfig( + name="mmlu:medical_genetics", + suite=["original", "mmlu"], + prompt_function="mmlu_medical_genetics", + hf_repo="cais/mmlu", + hf_subset="medical_genetics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_medical_genetics_leaderboard = LightevalTaskConfig( + name="mmlu:medical_genetics", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="medical_genetics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_medical_genetics_helm = LightevalTaskConfig( + name="mmlu:medical_genetics", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="medical_genetics", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_miscellaneous_original = LightevalTaskConfig( + name="mmlu:miscellaneous", + suite=["original", "mmlu"], + prompt_function="mmlu_miscellaneous", + hf_repo="cais/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + 
few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_miscellaneous_leaderboard = LightevalTaskConfig( + name="mmlu:miscellaneous", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_miscellaneous_helm = LightevalTaskConfig( + name="mmlu:miscellaneous", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="miscellaneous", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_disputes_original = LightevalTaskConfig( + name="mmlu:moral_disputes", + suite=["original", "mmlu"], + prompt_function="mmlu_moral_disputes", + hf_repo="cais/mmlu", + hf_subset="moral_disputes", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_disputes_leaderboard = LightevalTaskConfig( + name="mmlu:moral_disputes", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + 
hf_subset="moral_disputes", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_disputes_helm = LightevalTaskConfig( + name="mmlu:moral_disputes", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="moral_disputes", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_scenarios_original = LightevalTaskConfig( + name="mmlu:moral_scenarios", + suite=["original", "mmlu"], + prompt_function="mmlu_moral_scenarios", + hf_repo="cais/mmlu", + hf_subset="moral_scenarios", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_scenarios_leaderboard = LightevalTaskConfig( + name="mmlu:moral_scenarios", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="moral_scenarios", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_moral_scenarios_helm = 
LightevalTaskConfig( + name="mmlu:moral_scenarios", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="moral_scenarios", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_nutrition_original = LightevalTaskConfig( + name="mmlu:nutrition", + suite=["original", "mmlu"], + prompt_function="mmlu_nutrition", + hf_repo="cais/mmlu", + hf_subset="nutrition", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_nutrition_leaderboard = LightevalTaskConfig( + name="mmlu:nutrition", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="nutrition", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_nutrition_helm = LightevalTaskConfig( + name="mmlu:nutrition", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="nutrition", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", 
"prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_philosophy_original = LightevalTaskConfig( + name="mmlu:philosophy", + suite=["original", "mmlu"], + prompt_function="mmlu_philosophy", + hf_repo="cais/mmlu", + hf_subset="philosophy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_philosophy_leaderboard = LightevalTaskConfig( + name="mmlu:philosophy", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="philosophy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_philosophy_helm = LightevalTaskConfig( + name="mmlu:philosophy", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="philosophy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_prehistory_original = LightevalTaskConfig( + name="mmlu:prehistory", + suite=["original", "mmlu"], + prompt_function="mmlu_prehistory", + hf_repo="cais/mmlu", + hf_subset="prehistory", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + 
few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_prehistory_leaderboard = LightevalTaskConfig( + name="mmlu:prehistory", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="prehistory", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_prehistory_helm = LightevalTaskConfig( + name="mmlu:prehistory", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="prehistory", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_accounting_original = LightevalTaskConfig( + name="mmlu:professional_accounting", + suite=["original", "mmlu"], + prompt_function="mmlu_professional_accounting", + hf_repo="cais/mmlu", + hf_subset="professional_accounting", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_accounting_leaderboard = LightevalTaskConfig( + name="mmlu:professional_accounting", + suite=["leaderboard", "mmlu"], + 
prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="professional_accounting", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_accounting_helm = LightevalTaskConfig( + name="mmlu:professional_accounting", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="professional_accounting", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_law_original = LightevalTaskConfig( + name="mmlu:professional_law", + suite=["original", "mmlu"], + prompt_function="mmlu_professional_law", + hf_repo="cais/mmlu", + hf_subset="professional_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_law_leaderboard = LightevalTaskConfig( + name="mmlu:professional_law", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="professional_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_law_helm = LightevalTaskConfig( + name="mmlu:professional_law", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="professional_law", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_medicine_original = LightevalTaskConfig( + name="mmlu:professional_medicine", + suite=["original", "mmlu"], + prompt_function="mmlu_professional_medicine", + hf_repo="cais/mmlu", + hf_subset="professional_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_medicine_leaderboard = LightevalTaskConfig( + name="mmlu:professional_medicine", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="professional_medicine", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_medicine_helm = LightevalTaskConfig( + name="mmlu:professional_medicine", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="professional_medicine", + 
hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_psychology_original = LightevalTaskConfig( + name="mmlu:professional_psychology", + suite=["original", "mmlu"], + prompt_function="mmlu_professional_psychology", + hf_repo="cais/mmlu", + hf_subset="professional_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_psychology_leaderboard = LightevalTaskConfig( + name="mmlu:professional_psychology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="professional_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_professional_psychology_helm = LightevalTaskConfig( + name="mmlu:professional_psychology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="professional_psychology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_public_relations_original = LightevalTaskConfig( + name="mmlu:public_relations", + suite=["original", "mmlu"], + prompt_function="mmlu_public_relations", + hf_repo="cais/mmlu", + hf_subset="public_relations", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_public_relations_leaderboard = LightevalTaskConfig( + name="mmlu:public_relations", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="public_relations", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_public_relations_helm = LightevalTaskConfig( + name="mmlu:public_relations", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="public_relations", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_security_studies_original = LightevalTaskConfig( + name="mmlu:security_studies", + suite=["original", "mmlu"], + prompt_function="mmlu_security_studies", + hf_repo="cais/mmlu", + hf_subset="security_studies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + 
evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_security_studies_leaderboard = LightevalTaskConfig( + name="mmlu:security_studies", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="security_studies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_security_studies_helm = LightevalTaskConfig( + name="mmlu:security_studies", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="security_studies", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_sociology_original = LightevalTaskConfig( + name="mmlu:sociology", + suite=["original", "mmlu"], + prompt_function="mmlu_sociology", + hf_repo="cais/mmlu", + hf_subset="sociology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_sociology_leaderboard = LightevalTaskConfig( + name="mmlu:sociology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", 
+ hf_repo="lighteval/mmlu", + hf_subset="sociology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_sociology_helm = LightevalTaskConfig( + name="mmlu:sociology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="sociology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_us_foreign_policy_original = LightevalTaskConfig( + name="mmlu:us_foreign_policy", + suite=["original", "mmlu"], + prompt_function="mmlu_us_foreign_policy", + hf_repo="cais/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_us_foreign_policy_leaderboard = LightevalTaskConfig( + name="mmlu:us_foreign_policy", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+mmlu_us_foreign_policy_helm = LightevalTaskConfig( + name="mmlu:us_foreign_policy", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="us_foreign_policy", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_virology_original = LightevalTaskConfig( + name="mmlu:virology", + suite=["original", "mmlu"], + prompt_function="mmlu_virology", + hf_repo="cais/mmlu", + hf_subset="virology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_virology_leaderboard = LightevalTaskConfig( + name="mmlu:virology", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="virology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_virology_helm = LightevalTaskConfig( + name="mmlu:virology", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="virology", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", 
"prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_world_religions_original = LightevalTaskConfig( + name="mmlu:world_religions", + suite=["original", "mmlu"], + prompt_function="mmlu_world_religions", + hf_repo="cais/mmlu", + hf_subset="world_religions", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_world_religions_leaderboard = LightevalTaskConfig( + name="mmlu:world_religions", + suite=["leaderboard", "mmlu"], + prompt_function="mmlu_harness", + hf_repo="lighteval/mmlu", + hf_subset="world_religions", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select="sequential", + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mmlu_world_religions_helm = LightevalTaskConfig( + name="mmlu:world_religions", + suite=["helm", "helm_general"], + prompt_function="mmlu_helm", + hf_repo="lighteval/mmlu", + hf_subset="world_religions", + hf_avail_splits=["auxiliary_train", "test", "validation", "dev"], + evaluation_splits=["test"], + few_shots_split="dev", + few_shots_select=None, + generation_size=5, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mnist_ascii_bigbench = LightevalTaskConfig( + name="mnist_ascii", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="mnist_ascii", + hf_avail_splits=["default", "train", 
"validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +modified_arithmetic_bigbench = LightevalTaskConfig( + name="modified_arithmetic", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="modified_arithmetic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +moral_permissibility_bigbench = LightevalTaskConfig( + name="moral_permissibility", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="moral_permissibility", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +movie_dialog_same_or_different_bigbench = LightevalTaskConfig( + name="movie_dialog_same_or_different", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="movie_dialog_same_or_different", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +movie_recommendation_bigbench = LightevalTaskConfig( + name="movie_recommendation", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="movie_recommendation", + 
hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mtnt2019_en_fr_lighteval = LightevalTaskConfig( + name="mtnt2019:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mtnt2019_en_ja_lighteval = LightevalTaskConfig( + name="mtnt2019:en-ja", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mtnt2019_fr_en_lighteval = LightevalTaskConfig( + name="mtnt2019:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mtnt2019_ja_en_lighteval = LightevalTaskConfig( + name="mtnt2019:ja-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="mtnt2019_ja-en", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mult_data_wrangling_bigbench = LightevalTaskConfig( + name="mult_data_wrangling", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="mult_data_wrangling", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +multiemo_bigbench = LightevalTaskConfig( + name="multiemo", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="multiemo", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mutual_lighteval = LightevalTaskConfig( + name="mutual", + suite=["lighteval"], + prompt_function="mutual", + hf_repo="lighteval/mutual_harness", + hf_subset="mutual", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["recall_at_1", "recall_at_2", "mrr"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +mutual_plus_lighteval = LightevalTaskConfig( + name="mutual_plus", + suite=["lighteval"], + prompt_function="mutual", + hf_repo="lighteval/mutual_harness", + hf_subset="mutual_plus", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + 
metric=["recall_at_1", "recall_at_2", "mrr"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +narrativeqa_helm = LightevalTaskConfig( + name="narrativeqa", + suite=["helm", "helm_general"], + prompt_function="narrativeqa", + hf_repo="lighteval/narrative_qa_helm", + hf_subset="default", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "f1_score", "rougeL", "bleu_1", "bleu_4"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +natural_instructions_bigbench = LightevalTaskConfig( + name="natural_instructions", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="natural_instructions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +navigate_bigbench = LightevalTaskConfig( + name="navigate", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="navigate", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +nonsense_words_grammar_bigbench = LightevalTaskConfig( + name="nonsense_words_grammar", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="nonsense_words_grammar", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +novel_concepts_bigbench_lite = LightevalTaskConfig( + name="novel_concepts", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="novel_concepts", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_linear_example_helm = LightevalTaskConfig( + name="numeracy:linear_example", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="linear_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_linear_standard_helm = LightevalTaskConfig( + name="numeracy:linear_standard", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="linear_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_parabola_example_helm = LightevalTaskConfig( + name="numeracy:parabola_example", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="parabola_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", 
"quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_parabola_standard_helm = LightevalTaskConfig( + name="numeracy:parabola_standard", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="parabola_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_paraboloid_example_helm = LightevalTaskConfig( + name="numeracy:paraboloid_example", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="paraboloid_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_paraboloid_standard_helm = LightevalTaskConfig( + name="numeracy:paraboloid_standard", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="paraboloid_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_plane_example_helm = LightevalTaskConfig( + name="numeracy:plane_example", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="plane_example", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +numeracy_plane_standard_helm = LightevalTaskConfig( + name="numeracy:plane_standard", + suite=["helm"], + prompt_function="numeracy", + hf_repo="lighteval/numeracy", + hf_subset="plane_standard", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +object_counting_bigbench = LightevalTaskConfig( + name="object_counting", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="object_counting", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +odd_one_out_bigbench = LightevalTaskConfig( + name="odd_one_out", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="odd_one_out", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +openbookqa_helm = LightevalTaskConfig( + name="openbookqa", + suite=["helm", "commonsense_scenario", "helm_general"], + prompt_function="openbookqa_helm", + hf_repo="openbookqa", + hf_subset="main", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], 
+ stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +openbookqa_lighteval = LightevalTaskConfig( + name="openbookqa", + suite=["lighteval"], + prompt_function="openbookqa", + hf_repo="openbookqa", + hf_subset="main", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +operators_bigbench_lite = LightevalTaskConfig( + name="operators", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="operators", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex="([-+]?\\d+)[.]{0,1}$", + trust_dataset=True, + version=0, +) +paragraph_segmentation_bigbench = LightevalTaskConfig( + name="paragraph_segmentation", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="paragraph_segmentation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +parsinlu_qa_bigbench = LightevalTaskConfig( + name="parsinlu_qa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="parsinlu_qa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], +
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +parsinlu_reading_comprehension_bigbench_lite = LightevalTaskConfig( + name="parsinlu_reading_comprehension", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="parsinlu_reading_comprehension", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["perfect_exact_match"], + stop_sequence=None, + output_regex="[^\\.\\?\\!\\;\\n]+", + trust_dataset=True, + version=0, +) +penguins_in_a_table_bigbench = LightevalTaskConfig( + name="penguins_in_a_table", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="penguins_in_a_table", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +periodic_elements_bigbench = LightevalTaskConfig( + name="periodic_elements", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="periodic_elements", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +persian_idioms_bigbench = LightevalTaskConfig( + name="persian_idioms", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="persian_idioms", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +phrase_relatedness_bigbench = LightevalTaskConfig( + name="phrase_relatedness", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="phrase_relatedness", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +physical_intuition_bigbench = LightevalTaskConfig( + name="physical_intuition", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="physical_intuition", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +physics_bigbench = LightevalTaskConfig( + name="physics", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="physics", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +physics_questions_bigbench = LightevalTaskConfig( + name="physics_questions", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="physics_questions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=1, + metric=["bleu", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +piqa_lighteval = LightevalTaskConfig( + name="piqa", + suite=["lighteval"], + prompt_function="piqa_harness", + hf_repo="piqa", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +piqa_helm = LightevalTaskConfig( + name="piqa", + suite=["helm", "commonsense_scenario"], + prompt_function="piqa_helm", + hf_repo="piqa", + hf_subset="plain_text", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +play_dialog_same_or_different_bigbench_lite = LightevalTaskConfig( + name="play_dialog_same_or_different", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="play_dialog_same_or_different", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +polish_sequence_labeling_bigbench = LightevalTaskConfig( + name="polish_sequence_labeling", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="polish_sequence_labeling", + 
hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +presuppositions_as_nli_bigbench = LightevalTaskConfig( + name="presuppositions_as_nli", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="presuppositions_as_nli", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +prost_lighteval = LightevalTaskConfig( + name="prost", + suite=["lighteval"], + prompt_function="prost", + hf_repo="corypaik/prost", + hf_subset="default", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +pubmedqa_lighteval = LightevalTaskConfig( + name="pubmedqa", + suite=["lighteval"], + prompt_function="pubmed_qa", + hf_repo="pubmed_qa", + hf_subset="pqa_labeled", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +pubmedqa_helm = LightevalTaskConfig( + name="pubmedqa", + suite=["helm"], + prompt_function="pubmed_qa_helm", + hf_repo="pubmed_qa", + hf_subset="pqa_labeled", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["exact_match", 
"quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qa4mre_2011_lighteval = LightevalTaskConfig( + name="qa4mre:2011", + suite=["lighteval"], + prompt_function="qa4mre", + hf_repo="qa4mre", + hf_subset="2011.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qa4mre_2012_lighteval = LightevalTaskConfig( + name="qa4mre:2012", + suite=["lighteval"], + prompt_function="qa4mre", + hf_repo="qa4mre", + hf_subset="2012.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qa4mre_2013_lighteval = LightevalTaskConfig( + name="qa4mre:2013", + suite=["lighteval"], + prompt_function="qa4mre", + hf_repo="qa4mre", + hf_subset="2013.main.EN", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qa_wikidata_bigbench = LightevalTaskConfig( + name="qa_wikidata", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="qa_wikidata", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleurt", "bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qasper_lighteval = LightevalTaskConfig( + name="qasper", + suite=["lighteval"], + prompt_function="qasper", + hf_repo="qasper", + hf_subset="qasper", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["f1_score_quasi"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +qasper_ll_lighteval = LightevalTaskConfig( + name="qasper_ll", + suite=["lighteval"], + prompt_function="qasper_ll", + hf_repo="qasper", + hf_subset="qasper", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +quac_helm = LightevalTaskConfig( + name="quac", + suite=["helm"], + prompt_function="quac", + hf_repo="lighteval/quac_helm", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["exact_match", "quasi_exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +question_selection_bigbench = LightevalTaskConfig( + name="question_selection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="question_selection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +race_high_lighteval = LightevalTaskConfig( + name="race:high", + suite=["lighteval", "race"], + 
prompt_function="race", + hf_repo="EleutherAI/race", + hf_subset="high", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_ade_corpus_v2_helm = LightevalTaskConfig( + name="raft:ade_corpus_v2", + suite=["helm", "helm_general"], + prompt_function="raft_ade_corpus_v2", + hf_repo="ought/raft", + hf_subset="ade_corpus_v2", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_banking_77_helm = LightevalTaskConfig( + name="raft:banking_77", + suite=["helm", "helm_general"], + prompt_function="raft_banking_77", + hf_repo="ought/raft", + hf_subset="banking_77", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_neurips_impact_statement_risks_helm = LightevalTaskConfig( + name="raft:neurips_impact_statement_risks", + suite=["helm", "helm_general"], + prompt_function="raft_neurips_impact_statement_risks", + hf_repo="ought/raft", + hf_subset="neurips_impact_statement_risks", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + 
"prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_one_stop_english_helm = LightevalTaskConfig( + name="raft:one_stop_english", + suite=["helm", "helm_general"], + prompt_function="raft_one_stop_english", + hf_repo="ought/raft", + hf_subset="one_stop_english", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_overruling_helm = LightevalTaskConfig( + name="raft:overruling", + suite=["helm", "helm_general"], + prompt_function="raft_overruling", + hf_repo="ought/raft", + hf_subset="overruling", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_semiconductor_org_types_helm = LightevalTaskConfig( + name="raft:semiconductor_org_types", + suite=["helm", "helm_general"], + prompt_function="raft_semiconductor_org_types", + hf_repo="ought/raft", + hf_subset="semiconductor_org_types", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+raft_systematic_review_inclusion_helm = LightevalTaskConfig( + name="raft:systematic_review_inclusion", + suite=["helm", "helm_general"], + prompt_function="raft_systematic_review_inclusion", + hf_repo="ought/raft", + hf_subset="systematic_review_inclusion", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_tai_safety_research_helm = LightevalTaskConfig( + name="raft:tai_safety_research", + suite=["helm", "helm_general"], + prompt_function="raft_tai_safety_research", + hf_repo="ought/raft", + hf_subset="tai_safety_research", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_terms_of_service_helm = LightevalTaskConfig( + name="raft:terms_of_service", + suite=["helm", "helm_general"], + prompt_function="raft_terms_of_service", + hf_repo="ought/raft", + hf_subset="terms_of_service", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_tweet_eval_hate_helm = LightevalTaskConfig( + name="raft:tweet_eval_hate", + suite=["helm", "helm_general"], + 
prompt_function="raft_tweet_eval_hate", + hf_repo="ought/raft", + hf_subset="tweet_eval_hate", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +raft_twitter_complaints_helm = LightevalTaskConfig( + name="raft:twitter_complaints", + suite=["helm", "helm_general"], + prompt_function="raft_twitter_complaints", + hf_repo="ought/raft", + hf_subset="twitter_complaints", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=30, + metric=[ + "exact_match", + "quasi_exact_match", + "prefix_exact_match", + "prefix_quasi_exact_match", + "f1_score_macro", + "f1_score_micro", + ], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +real_or_fake_text_bigbench = LightevalTaskConfig( + name="real_or_fake_text", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="real_or_fake_text", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +real_toxicity_prompts_helm = LightevalTaskConfig( + name="real_toxicity_prompts", + suite=["helm"], + prompt_function="real_toxicity_prompts", + hf_repo="allenai/real-toxicity-prompts", + hf_subset="default", + hf_avail_splits=["train"], + evaluation_splits=["train"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["prediction_perplexity"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +reasoning_about_colored_objects_bigbench = LightevalTaskConfig( + name="reasoning_about_colored_objects", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="reasoning_about_colored_objects", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +repeat_copy_logic_bigbench_lite = LightevalTaskConfig( + name="repeat_copy_logic", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="repeat_copy_logic", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +rephrase_bigbench = LightevalTaskConfig( + name="rephrase", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="rephrase", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["rouge_t5", "bleu", "loglikelihood_acc", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +rhyming_bigbench = LightevalTaskConfig( + name="rhyming", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="rhyming", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + 
metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +riddle_sense_bigbench = LightevalTaskConfig( + name="riddle_sense", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="riddle_sense", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +ruin_names_bigbench = LightevalTaskConfig( + name="ruin_names", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="ruin_names", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +salient_translation_error_detection_bigbench = LightevalTaskConfig( + name="salient_translation_error_detection", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="salient_translation_error_detection", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +scientific_press_release_bigbench = LightevalTaskConfig( + name="scientific_press_release", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="scientific_press_release", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=1, + metric=["bleu", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +sciq_lighteval = LightevalTaskConfig( + name="sciq", + suite=["lighteval"], + prompt_function="sciq", + hf_repo="sciq", + hf_subset="default", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +semantic_parsing_in_context_sparc_bigbench = LightevalTaskConfig( + name="semantic_parsing_in_context_sparc", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="semantic_parsing_in_context_sparc", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +semantic_parsing_spider_bigbench = LightevalTaskConfig( + name="semantic_parsing_spider", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="semantic_parsing_spider", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +sentence_ambiguity_bigbench = LightevalTaskConfig( + name="sentence_ambiguity", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="sentence_ambiguity", + hf_avail_splits=["default", "train", "validation"], + 
evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +similarities_abstraction_bigbench = LightevalTaskConfig( + name="similarities_abstraction", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="similarities_abstraction", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simp_turing_concept_bigbench = LightevalTaskConfig( + name="simp_turing_concept", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simp_turing_concept", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_arithmetic_json_bigbench = LightevalTaskConfig( + name="simple_arithmetic_json", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_arithmetic_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_arithmetic_json_multiple_choice_bigbench = LightevalTaskConfig( + name="simple_arithmetic_json_multiple_choice", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + 
hf_subset="simple_arithmetic_json_multiple_choice", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_arithmetic_json_subtasks_bigbench = LightevalTaskConfig( + name="simple_arithmetic_json_subtasks", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_arithmetic_json_subtasks", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_arithmetic_multiple_targets_json_bigbench = LightevalTaskConfig( + name="simple_arithmetic_multiple_targets_json", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_arithmetic_multiple_targets_json", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_ethical_questions_bigbench = LightevalTaskConfig( + name="simple_ethical_questions", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_ethical_questions", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +simple_text_editing_bigbench = 
LightevalTaskConfig( + name="simple_text_editing", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="simple_text_editing", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +siqa_helm = LightevalTaskConfig( + name="siqa", + suite=["helm", "commonsense_scenario"], + prompt_function="siqa", + hf_repo="social_i_qa", + hf_subset="default", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +snarks_bigbench = LightevalTaskConfig( + name="snarks", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="snarks", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +social_iqa_bigbench = LightevalTaskConfig( + name="social_iqa", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="social_iqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +social_support_bigbench = LightevalTaskConfig( + name="social_support", + suite=["bigbench", 
"bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="social_support", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["f1_score_macro"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +sports_understanding_bigbench = LightevalTaskConfig( + name="sports_understanding", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="sports_understanding", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +storycloze_2016_lighteval = LightevalTaskConfig( + name="storycloze:2016", + suite=["lighteval", "storycloze"], + prompt_function="storycloze", + hf_repo="story_cloze", + hf_subset="2016", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +storycloze_2018_lighteval = LightevalTaskConfig( + name="storycloze:2018", + suite=["lighteval", "storycloze"], + prompt_function="storycloze", + hf_repo="story_cloze", + hf_subset="2018", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +strange_stories_bigbench_lite = LightevalTaskConfig( + name="strange_stories", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + 
hf_repo="bigbench", + hf_subset="strange_stories", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +strategyqa_bigbench_lite = LightevalTaskConfig( + name="strategyqa", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="strategyqa", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +sufficient_information_bigbench = LightevalTaskConfig( + name="sufficient_information", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="sufficient_information", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +suicide_risk_bigbench = LightevalTaskConfig( + name="suicide_risk", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="suicide_risk", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +summarization_cnn_dm_helm = LightevalTaskConfig( + name="summarization:cnn-dm", + suite=["helm", "helm_general"], + 
prompt_function="cnn_dm", + hf_repo="lighteval/summarization", + hf_subset="cnn-dm", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=128, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +summarization_xsum_helm = LightevalTaskConfig( + name="summarization:xsum", + suite=["helm", "helm_general"], + prompt_function="xsum", + hf_repo="lighteval/summarization", + hf_subset="xsum", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=64, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +summarization_xsum_sampled_helm = LightevalTaskConfig( + name="summarization:xsum-sampled", + suite=["helm"], + prompt_function="xsum", + hf_repo="lighteval/summarization", + hf_subset="xsum-sampled", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=64, + metric=["rouge1", "rouge2", "rougeL", "faithfulness", "extractiveness", "bert_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_boolq_lighteval = LightevalTaskConfig( + name="super_glue:boolq", + suite=["lighteval", "superglue"], + prompt_function="boolq_harness", + hf_repo="super_glue", + hf_subset="boolq", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +super_glue_cb_lighteval = LightevalTaskConfig( + name="super_glue:cb", + suite=["lighteval", "superglue"], + prompt_function="cb", + hf_repo="super_glue", + hf_subset="cb", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc_single_token", "multi_f1_numeric"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_copa_lighteval = LightevalTaskConfig( + name="super_glue:copa", + suite=["lighteval", "superglue"], + prompt_function="copa", + hf_repo="super_glue", + hf_subset="copa", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_rte_lighteval = LightevalTaskConfig( + name="super_glue:rte", + suite=["lighteval", "superglue"], + prompt_function="rte", + hf_repo="super_glue", + hf_subset="rte", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_multirc_lighteval = LightevalTaskConfig( + name="super_glue:multirc", + suite=["lighteval", "superglue"], + prompt_function="multirc", + hf_repo="super_glue", + hf_subset="multirc", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_wic_lighteval = 
LightevalTaskConfig( + name="super_glue:wic", + suite=["lighteval", "superglue"], + prompt_function="wic", + hf_repo="super_glue", + hf_subset="wic", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +super_glue_wsc_lighteval = LightevalTaskConfig( + name="super_glue:wsc", + suite=["lighteval", "superglue"], + prompt_function="wsc", + hf_repo="super_glue", + hf_subset="wsc", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc_single_token"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +swahili_english_proverbs_bigbench = LightevalTaskConfig( + name="swahili_english_proverbs", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="swahili_english_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +swag_lighteval = LightevalTaskConfig( + name="swag", + suite=["lighteval"], + prompt_function="swag", + hf_repo="swag", + hf_subset="regular", + hf_avail_splits=["train", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +swedish_to_german_proverbs_bigbench = LightevalTaskConfig( + name="swedish_to_german_proverbs", 
+ suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="swedish_to_german_proverbs", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +symbol_interpretation_bigbench_lite = LightevalTaskConfig( + name="symbol_interpretation", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_linefeed_before_whitespace_after_query", + hf_repo="bigbench", + hf_subset="symbol_interpretation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_induction_helm = LightevalTaskConfig( + name="synthetic_reasoning:induction", + suite=["helm"], + prompt_function="synthetic_reasoning", + hf_repo="lighteval/synthetic_reasoning", + hf_subset="induction", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_natural_easy_helm = LightevalTaskConfig( + name="synthetic_reasoning:natural_easy", + suite=["helm"], + prompt_function="synthetic_reasoning_natural", + hf_repo="lighteval/synthetic_reasoning_natural", + hf_subset="easy", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + 
metric=["exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_natural_hard_helm = LightevalTaskConfig( + name="synthetic_reasoning:natural_hard", + suite=["helm"], + prompt_function="synthetic_reasoning_natural", + hf_repo="lighteval/synthetic_reasoning_natural", + hf_subset="hard", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["exact_match", "f1_score"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_pattern_match_helm = LightevalTaskConfig( + name="synthetic_reasoning:pattern_match", + suite=["helm"], + prompt_function="synthetic_reasoning", + hf_repo="lighteval/synthetic_reasoning", + hf_subset="pattern_match", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +synthetic_reasoning_variable_substitution_helm = LightevalTaskConfig( + name="synthetic_reasoning:variable_substitution", + suite=["helm"], + prompt_function="synthetic_reasoning", + hf_repo="lighteval/synthetic_reasoning", + hf_subset="variable_substitution", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=50, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +tellmewhy_bigbench = LightevalTaskConfig( + name="tellmewhy", + suite=["bigbench", 
"bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="tellmewhy", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +temporal_sequences_bigbench = LightevalTaskConfig( + name="temporal_sequences", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="temporal_sequences", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +tense_bigbench = LightevalTaskConfig( + name="tense", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="tense", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_arxiv_lighteval = LightevalTaskConfig( + name="the_pile:arxiv", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_arxiv", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_arxiv_helm = LightevalTaskConfig( + name="the_pile:arxiv", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + 
hf_subset="arxiv", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_bibliotik_helm = LightevalTaskConfig( + name="the_pile:bibliotik", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="bibliotik", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_bookcorpus2_lighteval = LightevalTaskConfig( + name="the_pile:bookcorpus2", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_bookcorpus2", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_books3_lighteval = LightevalTaskConfig( + name="the_pile:books3", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_books3", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_commoncrawl_helm = LightevalTaskConfig( + name="the_pile:commoncrawl", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="commoncrawl", + 
hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_dm_mathematics_lighteval = LightevalTaskConfig( + name="the_pile:dm-mathematics", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_dm-mathematics", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_dm_mathematics_helm = LightevalTaskConfig( + name="the_pile:dm-mathematics", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="dm-mathematics", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_enron_lighteval = LightevalTaskConfig( + name="the_pile:enron", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_enron", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_enron_helm = LightevalTaskConfig( + name="the_pile:enron", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="enron", + 
hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_europarl_lighteval = LightevalTaskConfig( + name="the_pile:europarl", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_europarl", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_europarl_helm = LightevalTaskConfig( + name="the_pile:europarl", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="europarl", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_freelaw_lighteval = LightevalTaskConfig( + name="the_pile:freelaw", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_freelaw", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_freelaw_helm = LightevalTaskConfig( + name="the_pile:freelaw", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="freelaw", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_github_lighteval = LightevalTaskConfig( + name="the_pile:github", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_github", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_github_helm = LightevalTaskConfig( + name="the_pile:github", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="github", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_gutenberg_lighteval = LightevalTaskConfig( + name="the_pile:gutenberg", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_gutenberg", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_gutenberg_helm = LightevalTaskConfig( + name="the_pile:gutenberg", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="gutenberg", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_hackernews_lighteval = LightevalTaskConfig( + name="the_pile:hackernews", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_hackernews", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_hackernews_helm = LightevalTaskConfig( + name="the_pile:hackernews", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="hackernews", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_nih_exporter_lighteval = LightevalTaskConfig( + name="the_pile:nih-exporter", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_nih-exporter", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_nih_exporter_helm = LightevalTaskConfig( + name="the_pile:nih-exporter", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="nih-exporter", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_opensubtitles_lighteval = LightevalTaskConfig( + name="the_pile:opensubtitles", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_opensubtitles", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_opensubtitles_helm = LightevalTaskConfig( + name="the_pile:opensubtitles", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="opensubtitles", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_openwebtext2_lighteval = LightevalTaskConfig( + name="the_pile:openwebtext2", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_openwebtext2", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_openwebtext2_helm = LightevalTaskConfig( + name="the_pile:openwebtext2", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="openwebtext2", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_philpapers_lighteval = LightevalTaskConfig( + name="the_pile:philpapers", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_philpapers", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pile_cc_lighteval = LightevalTaskConfig( + name="the_pile:pile-cc", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_pile-cc", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pubmed_abstracts_lighteval = LightevalTaskConfig( + name="the_pile:pubmed-abstracts", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_pubmed-abstracts", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pubmed_abstracts_helm = LightevalTaskConfig( + name="the_pile:pubmed-abstracts", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + 
hf_subset="pubmed-abstracts", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pubmed_central_lighteval = LightevalTaskConfig( + name="the_pile:pubmed-central", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_pubmed-central", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_pubmed_central_helm = LightevalTaskConfig( + name="the_pile:pubmed-central", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="pubmed-central", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_stackexchange_lighteval = LightevalTaskConfig( + name="the_pile:stackexchange", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_stackexchange", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_stackexchange_helm = LightevalTaskConfig( + name="the_pile:stackexchange", + suite=["helm"], + prompt_function="the_pile", + 
hf_repo="lighteval/pile_helm", + hf_subset="stackexchange", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_ubuntu_irc_lighteval = LightevalTaskConfig( + name="the_pile:ubuntu-irc", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_ubuntu-irc", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_uspto_lighteval = LightevalTaskConfig( + name="the_pile:uspto", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_upsto", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_upsto_helm = LightevalTaskConfig( + name="the_pile:upsto", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="uspto", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_wikipedia_lighteval = LightevalTaskConfig( + name="the_pile:wikipedia", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", 
+ hf_subset="pile_wikipedia", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_wikipedia_helm = LightevalTaskConfig( + name="the_pile:wikipedia", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="wikipedia", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_youtubesubtitles_lighteval = LightevalTaskConfig( + name="the_pile:youtubesubtitles", + suite=["lighteval", "pile"], + prompt_function="the_pile", + hf_repo="lighteval/pile", + hf_subset="pile_youtubesubtitles", + hf_avail_splits=["validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +the_pile_youtubesubtitles_helm = LightevalTaskConfig( + name="the_pile:youtubesubtitles", + suite=["helm"], + prompt_function="the_pile", + hf_repo="lighteval/pile_helm", + hf_subset="youtubesubtitles", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +timedial_bigbench = LightevalTaskConfig( + name="timedial", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + 
hf_subset="timedial", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +toxigen_lighteval = LightevalTaskConfig( + name="toxigen", + suite=["lighteval"], + prompt_function="toxigen", + hf_repo="skg/toxigen-data", + hf_subset="annotated", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc", "loglikelihood_acc_norm"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +topical_chat_bigbench = LightevalTaskConfig( + name="topical_chat", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="topical_chat", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["bleu", "rouge_t5", "loglikelihood_acc", "bleurt"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +tracking_shuffled_objects_bigbench = LightevalTaskConfig( + name="tracking_shuffled_objects", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="tracking_shuffled_objects", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +triviaqa_lighteval = LightevalTaskConfig( + name="triviaqa", + suite=["lighteval"], + prompt_function="triviaqa", + hf_repo="trivia_qa", + hf_subset="rc.nocontext", + hf_avail_splits=["train", "test", 
"validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=20, + metric=["quasi_exact_match_triviaqa"], + stop_sequence=["\n", ".", ","], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +truthfulqa_gen_lighteval = LightevalTaskConfig( + name="truthfulqa:gen", + suite=["lighteval"], + prompt_function="truthful_qa_generative", + hf_repo="truthful_qa", + hf_subset="generation", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=200, + metric=["bleu", "rouge_t5"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +truthfulqa_mc_leaderboard = LightevalTaskConfig( + name="truthfulqa:mc", + suite=["leaderboard"], + prompt_function="truthful_qa_multiple_choice", + hf_repo="truthful_qa", + hf_subset="multiple_choice", + hf_avail_splits=["validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["truthfulqa_mc_metrics"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +truthfulqa_helm = LightevalTaskConfig( + name="truthfulqa", + suite=["helm", "helm_general"], + prompt_function="truthful_qa_helm", + hf_repo="lighteval/truthfulqa_helm", + hf_subset="default", + hf_avail_splits=["train", "valid"], + evaluation_splits=["valid"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["loglikelihood_acc", "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +twitterAAE_aa_helm = LightevalTaskConfig( + name="twitterAAE:aa", + suite=["helm"], + prompt_function="twitter_aae", + hf_repo="lighteval/twitterAAE", + hf_subset="aa", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +twitterAAE_white_helm = LightevalTaskConfig( + name="twitterAAE:white", + suite=["helm"], + prompt_function="twitter_aae", + hf_repo="lighteval/twitterAAE", + hf_subset="white", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +understanding_fables_bigbench = LightevalTaskConfig( + name="understanding_fables", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="understanding_fables", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +undo_permutation_bigbench = LightevalTaskConfig( + name="undo_permutation", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="undo_permutation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unit_conversion_bigbench = LightevalTaskConfig( + name="unit_conversion", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="unit_conversion", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unit_interpretation_bigbench = LightevalTaskConfig( + name="unit_interpretation", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="unit_interpretation", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unnatural_in_context_learning_bigbench = LightevalTaskConfig( + name="unnatural_in_context_learning", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="unnatural_in_context_learning", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_anagrams1_lighteval = LightevalTaskConfig( + name="unscramble:anagrams1", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["mid_word_1_anagrams"], + evaluation_splits=["mid_word_1_anagrams"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_anagrams2_lighteval = LightevalTaskConfig( + name="unscramble:anagrams2", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["mid_word_2_anagrams"], + 
evaluation_splits=["mid_word_2_anagrams"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_cycle_letters_lighteval = LightevalTaskConfig( + name="unscramble:cycle_letters", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["cycle_letters_in_word"], + evaluation_splits=["cycle_letters_in_word"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_random_insertion_lighteval = LightevalTaskConfig( + name="unscramble:random_insertion", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["random_insertion_in_word"], + evaluation_splits=["random_insertion_in_word"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +unscramble_reversed_words_lighteval = LightevalTaskConfig( + name="unscramble:reversed_words", + suite=["lighteval", "unscramble"], + prompt_function="unscramble", + hf_repo="lighteval/GPT3_unscramble", + hf_subset="default", + hf_avail_splits=["reversed_words"], + evaluation_splits=["reversed_words"], + few_shots_split=None, + few_shots_select=None, + generation_size=5, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +vitaminc_fact_verification_bigbench_lite = LightevalTaskConfig( + name="vitaminc_fact_verification", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + 
prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="vitaminc_fact_verification", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +webqs_lighteval = LightevalTaskConfig( + name="webqs", + suite=["lighteval"], + prompt_function="webqs", + hf_repo="web_questions", + hf_subset="default", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["acc_golds_likelihood"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +what_is_the_tao_bigbench = LightevalTaskConfig( + name="what_is_the_tao", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="what_is_the_tao", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +which_wiki_edit_bigbench = LightevalTaskConfig( + name="which_wiki_edit", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="which_wiki_edit", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_applies_to_jurisdiction_helm = LightevalTaskConfig( + name="wikifact:applies_to_jurisdiction", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + 
hf_subset="applies_to_jurisdiction", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_atomic_number_helm = LightevalTaskConfig( + name="wikifact:atomic_number", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="atomic_number", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_author_helm = LightevalTaskConfig( + name="wikifact:author", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="author", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_award_received_helm = LightevalTaskConfig( + name="wikifact:award_received", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="award_received", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_basic_form_of_government_helm = LightevalTaskConfig( + 
name="wikifact:basic_form_of_government", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="basic_form_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_capital_helm = LightevalTaskConfig( + name="wikifact:capital", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="capital", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_capital_of_helm = LightevalTaskConfig( + name="wikifact:capital_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="capital_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_central_bank_helm = LightevalTaskConfig( + name="wikifact:central_bank", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="central_bank", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + 
frozen=False, + trust_dataset=True, + version=0, +) +wikifact_composer_helm = LightevalTaskConfig( + name="wikifact:composer", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="composer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_continent_helm = LightevalTaskConfig( + name="wikifact:continent", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="continent", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_country_helm = LightevalTaskConfig( + name="wikifact:country", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="country", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_country_of_citizenship_helm = LightevalTaskConfig( + name="wikifact:country_of_citizenship", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="country_of_citizenship", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", 
"prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_country_of_origin_helm = LightevalTaskConfig( + name="wikifact:country_of_origin", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="country_of_origin", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_creator_helm = LightevalTaskConfig( + name="wikifact:creator", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="creator", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_currency_helm = LightevalTaskConfig( + name="wikifact:currency", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="currency", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_defendant_helm = LightevalTaskConfig( + name="wikifact:defendant", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="defendant", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_developer_helm = LightevalTaskConfig( + name="wikifact:developer", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="developer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_diplomatic_relation_helm = LightevalTaskConfig( + name="wikifact:diplomatic_relation", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="diplomatic_relation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_director_helm = LightevalTaskConfig( + name="wikifact:director", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="director", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_discoverer_or_inventor_helm = LightevalTaskConfig( + name="wikifact:discoverer_or_inventor", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + 
hf_subset="discoverer_or_inventor", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_drug_or_therapy_used_for_treatment_helm = LightevalTaskConfig( + name="wikifact:drug_or_therapy_used_for_treatment", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="drug_or_therapy_used_for_treatment", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_educated_at_helm = LightevalTaskConfig( + name="wikifact:educated_at", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="educated_at", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_electron_configuration_helm = LightevalTaskConfig( + name="wikifact:electron_configuration", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="electron_configuration", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +wikifact_employer_helm = LightevalTaskConfig( + name="wikifact:employer", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="employer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_field_of_work_helm = LightevalTaskConfig( + name="wikifact:field_of_work", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="field_of_work", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_file_extension_helm = LightevalTaskConfig( + name="wikifact:file_extension", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="file_extension", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_genetic_association_helm = LightevalTaskConfig( + name="wikifact:genetic_association", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="genetic_association", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", 
"prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_genre_helm = LightevalTaskConfig( + name="wikifact:genre", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="genre", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_has_part_helm = LightevalTaskConfig( + name="wikifact:has_part", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="has_part", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_head_of_government_helm = LightevalTaskConfig( + name="wikifact:head_of_government", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="head_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_head_of_state_helm = LightevalTaskConfig( + name="wikifact:head_of_state", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="head_of_state", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_headquarters_location_helm = LightevalTaskConfig( + name="wikifact:headquarters_location", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="headquarters_location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_industry_helm = LightevalTaskConfig( + name="wikifact:industry", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="industry", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_influenced_by_helm = LightevalTaskConfig( + name="wikifact:influenced_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="influenced_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_instance_of_helm = LightevalTaskConfig( + name="wikifact:instance_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + 
hf_subset="instance_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_instrument_helm = LightevalTaskConfig( + name="wikifact:instrument", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="instrument", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_language_of_work_or_name_helm = LightevalTaskConfig( + name="wikifact:language_of_work_or_name", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="language_of_work_or_name", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_languages_spoken_written_or_signed_helm = LightevalTaskConfig( + name="wikifact:languages_spoken_written_or_signed", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="languages_spoken_written_or_signed", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +wikifact_laws_applied_helm = LightevalTaskConfig( + name="wikifact:laws_applied", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="laws_applied", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_located_in_the_administrative_territorial_entity_helm = LightevalTaskConfig( + name="wikifact:located_in_the_administrative_territorial_entity", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="located_in_the_administrative_territorial_entity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_location_helm = LightevalTaskConfig( + name="wikifact:location", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_location_of_discovery_helm = LightevalTaskConfig( + name="wikifact:location_of_discovery", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="location_of_discovery", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_location_of_formation_helm = LightevalTaskConfig( + name="wikifact:location_of_formation", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="location_of_formation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_majority_opinion_by_helm = LightevalTaskConfig( + name="wikifact:majority_opinion_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="majority_opinion_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_manufacturer_helm = LightevalTaskConfig( + name="wikifact:manufacturer", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="manufacturer", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_measured_physical_quantity_helm = LightevalTaskConfig( + name="wikifact:measured_physical_quantity", + 
suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="measured_physical_quantity", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_medical_condition_treated_helm = LightevalTaskConfig( + name="wikifact:medical_condition_treated", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="medical_condition_treated", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_member_of_helm = LightevalTaskConfig( + name="wikifact:member_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="member_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_member_of_political_party_helm = LightevalTaskConfig( + name="wikifact:member_of_political_party", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="member_of_political_party", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_member_of_sports_team_helm = LightevalTaskConfig( + name="wikifact:member_of_sports_team", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="member_of_sports_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_movement_helm = LightevalTaskConfig( + name="wikifact:movement", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="movement", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_named_after_helm = LightevalTaskConfig( + name="wikifact:named_after", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="named_after", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_native_language_helm = LightevalTaskConfig( + name="wikifact:native_language", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="native_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_number_of_processor_cores_helm = LightevalTaskConfig( + name="wikifact:number_of_processor_cores", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="number_of_processor_cores", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_occupation_helm = LightevalTaskConfig( + name="wikifact:occupation", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="occupation", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_office_held_by_head_of_government_helm = LightevalTaskConfig( + name="wikifact:office_held_by_head_of_government", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="office_held_by_head_of_government", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_office_held_by_head_of_state_helm = LightevalTaskConfig( + name="wikifact:office_held_by_head_of_state", + 
suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="office_held_by_head_of_state", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_official_language_helm = LightevalTaskConfig( + name="wikifact:official_language", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="official_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_operating_system_helm = LightevalTaskConfig( + name="wikifact:operating_system", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="operating_system", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_original_language_of_film_or_TV_show_helm = LightevalTaskConfig( + name="wikifact:original_language_of_film_or_TV_show", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="original_language_of_film_or_TV_show", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", 
"prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_original_network_helm = LightevalTaskConfig( + name="wikifact:original_network", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="original_network", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_overrules_helm = LightevalTaskConfig( + name="wikifact:overrules", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="overrules", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_owned_by_helm = LightevalTaskConfig( + name="wikifact:owned_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="owned_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_part_of_helm = LightevalTaskConfig( + name="wikifact:part_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="part_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_participating_team_helm = LightevalTaskConfig( + name="wikifact:participating_team", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="participating_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_place_of_birth_helm = LightevalTaskConfig( + name="wikifact:place_of_birth", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="place_of_birth", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_place_of_death_helm = LightevalTaskConfig( + name="wikifact:place_of_death", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="place_of_death", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_plaintiff_helm = LightevalTaskConfig( + name="wikifact:plaintiff", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="plaintiff", 
+ hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_position_held_helm = LightevalTaskConfig( + name="wikifact:position_held", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="position_held", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_position_played_on_team_helm = LightevalTaskConfig( + name="wikifact:position_played_on_team", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="position_played_on_team", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_programming_language_helm = LightevalTaskConfig( + name="wikifact:programming_language", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="programming_language", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+wikifact_recommended_unit_of_measurement_helm = LightevalTaskConfig( + name="wikifact:recommended_unit_of_measurement", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="recommended_unit_of_measurement", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_record_label_helm = LightevalTaskConfig( + name="wikifact:record_label", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="record_label", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_religion_helm = LightevalTaskConfig( + name="wikifact:religion", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="religion", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_repealed_by_helm = LightevalTaskConfig( + name="wikifact:repealed_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="repealed_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", 
"prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_shares_border_with_helm = LightevalTaskConfig( + name="wikifact:shares_border_with", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="shares_border_with", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_solved_by_helm = LightevalTaskConfig( + name="wikifact:solved_by", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="solved_by", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_statement_describes_helm = LightevalTaskConfig( + name="wikifact:statement_describes", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="statement_describes", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_stock_exchange_helm = LightevalTaskConfig( + name="wikifact:stock_exchange", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="stock_exchange", + hf_avail_splits=["train", "test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_subclass_of_helm = LightevalTaskConfig( + name="wikifact:subclass_of", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="subclass_of", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_subsidiary_helm = LightevalTaskConfig( + name="wikifact:subsidiary", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="subsidiary", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_symptoms_and_signs_helm = LightevalTaskConfig( + name="wikifact:symptoms_and_signs", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="symptoms_and_signs", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_therapeutic_area_helm = LightevalTaskConfig( + name="wikifact:therapeutic_area", + suite=["helm"], + 
prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="therapeutic_area", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_time_of_discovery_or_invention_helm = LightevalTaskConfig( + name="wikifact:time_of_discovery_or_invention", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="time_of_discovery_or_invention", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_twinned_administrative_body_helm = LightevalTaskConfig( + name="wikifact:twinned_administrative_body", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="twinned_administrative_body", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikifact_work_location_helm = LightevalTaskConfig( + name="wikifact:work_location", + suite=["helm"], + prompt_function="wikifact", + hf_repo="lighteval/wikifact", + hf_subset="work_location", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=8, + metric=["exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match"], 
+ stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikitext_2_lighteval = LightevalTaskConfig( + name="wikitext:2", + suite=["lighteval"], + prompt_function="wikitext", + hf_repo="wikitext", + hf_subset="wikitext-2-raw-v1", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikitext_103_document_level_harness = LightevalTaskConfig( + name="wikitext:103:document_level", + suite=["harness"], + prompt_function="wikitext_harness", + hf_repo="EleutherAI/wikitext_document_level", + hf_subset="wikitext-103-raw-v1", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wikitext_103_document_level_helm = LightevalTaskConfig( + name="wikitext:103:document_level", + suite=["helm"], + prompt_function="wikitext_helm", + hf_repo="EleutherAI/wikitext_document_level", + hf_subset="wikitext-103-raw-v1", + hf_avail_splits=["train", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["word_perplexity", "byte_perplexity", "bits_per_byte"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wino_x_german_bigbench = LightevalTaskConfig( + name="wino_x_german", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="wino_x_german", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +winogrande_leaderboard = LightevalTaskConfig( + name="winogrande", + suite=["leaderboard"], + prompt_function="winogrande", + hf_repo="winogrande", + hf_subset="winogrande_xl", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation"], + few_shots_split=None, + few_shots_select="random_sampling", + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +winowhy_bigbench_lite = LightevalTaskConfig( + name="winowhy", + suite=["bigbench_lite", "bigbench", "bigbench_json"], + prompt_function="bigbench_whitespace_after_query", + hf_repo="bigbench", + hf_subset="winowhy", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_cs_en_lighteval = LightevalTaskConfig( + name="wmt08:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_de_en_lighteval = LightevalTaskConfig( + name="wmt08:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_cs_lighteval = LightevalTaskConfig( + name="wmt08:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_de_lighteval = LightevalTaskConfig( + name="wmt08:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_es_lighteval = LightevalTaskConfig( + name="wmt08:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_en_fr_lighteval = LightevalTaskConfig( + name="wmt08:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + 
version=0, +) +wmt08_en_hu_lighteval = LightevalTaskConfig( + name="wmt08:en-hu", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_en-hu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_es_en_lighteval = LightevalTaskConfig( + name="wmt08:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_fr_en_lighteval = LightevalTaskConfig( + name="wmt08:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt08_hu_en_lighteval = LightevalTaskConfig( + name="wmt08:hu-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt08_hu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_cs_en_lighteval = LightevalTaskConfig( + name="wmt09:cs-en", + 
suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_de_en_lighteval = LightevalTaskConfig( + name="wmt09:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_cs_lighteval = LightevalTaskConfig( + name="wmt09:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_de_lighteval = LightevalTaskConfig( + name="wmt09:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_es_lighteval = LightevalTaskConfig( + name="wmt09:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + 
hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_fr_lighteval = LightevalTaskConfig( + name="wmt09:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_hu_lighteval = LightevalTaskConfig( + name="wmt09:en-hu", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-hu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_en_it_lighteval = LightevalTaskConfig( + name="wmt09:en-it", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_en-it", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_es_en_lighteval = LightevalTaskConfig( + name="wmt09:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_es-en", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_fr_en_lighteval = LightevalTaskConfig( + name="wmt09:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_hu_en_lighteval = LightevalTaskConfig( + name="wmt09:hu-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_hu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt09_it_en_lighteval = LightevalTaskConfig( + name="wmt09:it-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt09_it-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_cs_en_lighteval = LightevalTaskConfig( + name="wmt10:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_de_en_lighteval = LightevalTaskConfig( + name="wmt10:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_en_cs_lighteval = LightevalTaskConfig( + name="wmt10:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_en_de_lighteval = LightevalTaskConfig( + name="wmt10:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_en_es_lighteval = LightevalTaskConfig( + name="wmt10:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_en_fr_lighteval = LightevalTaskConfig( + name="wmt10:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_es_en_lighteval = LightevalTaskConfig( + name="wmt10:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt10_fr_en_lighteval = LightevalTaskConfig( + name="wmt10:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt10_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_cs_en_lighteval = LightevalTaskConfig( + name="wmt11:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+wmt11_de_en_lighteval = LightevalTaskConfig( + name="wmt11:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_en_cs_lighteval = LightevalTaskConfig( + name="wmt11:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_en_de_lighteval = LightevalTaskConfig( + name="wmt11:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_en_es_lighteval = LightevalTaskConfig( + name="wmt11:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_en_fr_lighteval = LightevalTaskConfig( + name="wmt11:en-fr", + suite=["lighteval", 
"sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_es_en_lighteval = LightevalTaskConfig( + name="wmt11:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt11_fr_en_lighteval = LightevalTaskConfig( + name="wmt11:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt11_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_cs_en_lighteval = LightevalTaskConfig( + name="wmt12:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_de_en_lighteval = LightevalTaskConfig( + name="wmt12:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", 
+ hf_subset="wmt12_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_en_cs_lighteval = LightevalTaskConfig( + name="wmt12:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_en_de_lighteval = LightevalTaskConfig( + name="wmt12:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_en_es_lighteval = LightevalTaskConfig( + name="wmt12:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_en_fr_lighteval = LightevalTaskConfig( + name="wmt12:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_es_en_lighteval = LightevalTaskConfig( + name="wmt12:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt12_fr_en_lighteval = LightevalTaskConfig( + name="wmt12:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt12_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_cs_en_lighteval = LightevalTaskConfig( + name="wmt13:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_de_en_lighteval = LightevalTaskConfig( + name="wmt13:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", 
"chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_cs_lighteval = LightevalTaskConfig( + name="wmt13:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_de_lighteval = LightevalTaskConfig( + name="wmt13:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_es_lighteval = LightevalTaskConfig( + name="wmt13:en-es", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-es", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_en_fr_lighteval = LightevalTaskConfig( + name="wmt13:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +wmt13_en_ru_lighteval = LightevalTaskConfig( + name="wmt13:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_es_en_lighteval = LightevalTaskConfig( + name="wmt13:es-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_es-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_fr_en_lighteval = LightevalTaskConfig( + name="wmt13:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt13_ru_en_lighteval = LightevalTaskConfig( + name="wmt13:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt13_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_cs_en_lighteval = LightevalTaskConfig( + 
name="wmt14:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_de_en_lighteval = LightevalTaskConfig( + name="wmt14:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_cs_lighteval = LightevalTaskConfig( + name="wmt14:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_de_lighteval = LightevalTaskConfig( + name="wmt14:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_fr_lighteval = LightevalTaskConfig( + name="wmt14:en-fr", + suite=["lighteval", "gpt3_benchmarks"], + 
prompt_function="wmt_alphabetical", + hf_repo="wmt14", + hf_subset="fr-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_fr_lighteval = LightevalTaskConfig( + name="wmt14:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_hi_lighteval = LightevalTaskConfig( + name="wmt14:en-hi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-hi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_en_ru_lighteval = LightevalTaskConfig( + name="wmt14:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_fr_en_lighteval = LightevalTaskConfig( + name="wmt14:fr-en", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="wmt14", + hf_subset="fr-en", + 
hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_fr_en_lighteval = LightevalTaskConfig( + name="wmt14:fr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_hi_en_lighteval = LightevalTaskConfig( + name="wmt14:hi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_hi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_ru_en_lighteval = LightevalTaskConfig( + name="wmt14:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt14_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_cs_en_helm = LightevalTaskConfig( + name="wmt14:cs-en", + suite=["helm"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="cs-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + 
few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_de_en_helm = LightevalTaskConfig( + name="wmt14:de-en", + suite=["helm"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="de-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_fr_en_helm = LightevalTaskConfig( + name="wmt14:fr-en", + suite=["helm"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="fr-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_hi_en_helm = LightevalTaskConfig( + name="wmt14:hi-en", + suite=["helm"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="hi-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt14_ru_en_helm = LightevalTaskConfig( + name="wmt14:ru-en", + suite=["helm"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/wmt14", + hf_subset="ru-en", + hf_avail_splits=["train", "test", "validation"], + evaluation_splits=["validation", "test"], + few_shots_split=None, + few_shots_select=None, + generation_size=100, + metric=["bleu"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +wmt15_cs_en_lighteval = LightevalTaskConfig( + name="wmt15:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_de_en_lighteval = LightevalTaskConfig( + name="wmt15:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_cs_lighteval = LightevalTaskConfig( + name="wmt15:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_de_lighteval = LightevalTaskConfig( + name="wmt15:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_fi_lighteval = LightevalTaskConfig( + 
name="wmt15:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_fr_lighteval = LightevalTaskConfig( + name="wmt15:en-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_en_ru_lighteval = LightevalTaskConfig( + name="wmt15:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_fi_en_lighteval = LightevalTaskConfig( + name="wmt15:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_fr_en_lighteval = LightevalTaskConfig( + name="wmt15:fr-en", + suite=["lighteval", "sacrebleu"], + 
prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_fr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt15_ru_en_lighteval = LightevalTaskConfig( + name="wmt15:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt15_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_cs_en_lighteval = LightevalTaskConfig( + name="wmt16:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_de_en_lighteval = LightevalTaskConfig( + name="wmt16:de-en", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_alphabetical", + hf_repo="wmt16", + hf_subset="de-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_de_en_lighteval = LightevalTaskConfig( + name="wmt16:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + 
hf_subset="wmt16_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_cs_lighteval = LightevalTaskConfig( + name="wmt16:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_de_lighteval = LightevalTaskConfig( + name="wmt16:en-de", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="wmt16", + hf_subset="de-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_de_lighteval = LightevalTaskConfig( + name="wmt16:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_fi_lighteval = LightevalTaskConfig( + name="wmt16:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-fi", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_ro_lighteval = LightevalTaskConfig( + name="wmt16:en-ro", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_alphabetical", + hf_repo="wmt16", + hf_subset="ro-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_ro_lighteval = LightevalTaskConfig( + name="wmt16:en-ro", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-ro", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_ru_lighteval = LightevalTaskConfig( + name="wmt16:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_en_tr_lighteval = LightevalTaskConfig( + name="wmt16:en-tr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_en-tr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + 
metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_fi_en_lighteval = LightevalTaskConfig( + name="wmt16:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_ro_en_lighteval = LightevalTaskConfig( + name="wmt16:ro-en", + suite=["lighteval", "gpt3_benchmarks"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="wmt16", + hf_subset="ro-en", + hf_avail_splits=["train", "validation", "test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_ro_en_lighteval = LightevalTaskConfig( + name="wmt16:ro-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_ro-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_ru_en_lighteval = LightevalTaskConfig( + name="wmt16:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt16_tr_en_lighteval = LightevalTaskConfig( + name="wmt16:tr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt16_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_cs_en_lighteval = LightevalTaskConfig( + name="wmt17:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_de_en_lighteval = LightevalTaskConfig( + name="wmt17:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_cs_lighteval = LightevalTaskConfig( + name="wmt17:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) 
+wmt17_en_de_lighteval = LightevalTaskConfig( + name="wmt17:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_fi_lighteval = LightevalTaskConfig( + name="wmt17:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_lv_lighteval = LightevalTaskConfig( + name="wmt17:en-lv", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-lv", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_ru_lighteval = LightevalTaskConfig( + name="wmt17:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_tr_lighteval = LightevalTaskConfig( + name="wmt17:en-tr", + suite=["lighteval", 
"sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-tr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_en_zh_lighteval = LightevalTaskConfig( + name="wmt17:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_fi_en_lighteval = LightevalTaskConfig( + name="wmt17:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_lv_en_lighteval = LightevalTaskConfig( + name="wmt17:lv-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_lv-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_ru_en_lighteval = LightevalTaskConfig( + name="wmt17:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + 
hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_tr_en_lighteval = LightevalTaskConfig( + name="wmt17:tr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt17_zh_en_lighteval = LightevalTaskConfig( + name="wmt17:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt17_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_cs_en_lighteval = LightevalTaskConfig( + name="wmt18:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_de_en_lighteval = LightevalTaskConfig( + name="wmt18:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_de-en", + 
hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_cs_lighteval = LightevalTaskConfig( + name="wmt18:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_de_lighteval = LightevalTaskConfig( + name="wmt18:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_et_lighteval = LightevalTaskConfig( + name="wmt18:en-et", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-et", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_fi_lighteval = LightevalTaskConfig( + name="wmt18:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + 
few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_ru_lighteval = LightevalTaskConfig( + name="wmt18:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_tr_lighteval = LightevalTaskConfig( + name="wmt18:en-tr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-tr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_en_zh_lighteval = LightevalTaskConfig( + name="wmt18:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_et_en_lighteval = LightevalTaskConfig( + name="wmt18:et-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_et-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_fi_en_lighteval = LightevalTaskConfig( + name="wmt18:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_ru_en_lighteval = LightevalTaskConfig( + name="wmt18:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_tr_en_lighteval = LightevalTaskConfig( + name="wmt18:tr-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_tr-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt18_zh_en_lighteval = LightevalTaskConfig( + name="wmt18:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt18_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +wmt19_cs_de_lighteval = LightevalTaskConfig( + name="wmt19:cs-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_cs-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_de_cs_lighteval = LightevalTaskConfig( + name="wmt19:de-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_de_en_lighteval = LightevalTaskConfig( + name="wmt19:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_de_fr_lighteval = LightevalTaskConfig( + name="wmt19:de-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_de-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_cs_lighteval = LightevalTaskConfig( + 
name="wmt19:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_de_lighteval = LightevalTaskConfig( + name="wmt19:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_fi_lighteval = LightevalTaskConfig( + name="wmt19:en-fi", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-fi", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_gu_lighteval = LightevalTaskConfig( + name="wmt19:en-gu", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-gu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_kk_lighteval = LightevalTaskConfig( + name="wmt19:en-kk", + suite=["lighteval", "sacrebleu"], + 
prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-kk", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_lt_lighteval = LightevalTaskConfig( + name="wmt19:en-lt", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-lt", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_ru_lighteval = LightevalTaskConfig( + name="wmt19:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_en_zh_lighteval = LightevalTaskConfig( + name="wmt19:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_fi_en_lighteval = LightevalTaskConfig( + name="wmt19:fi-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + 
hf_subset="wmt19_fi-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_fr_de_lighteval = LightevalTaskConfig( + name="wmt19:fr-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_fr-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_gu_en_lighteval = LightevalTaskConfig( + name="wmt19:gu-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_gu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_kk_en_lighteval = LightevalTaskConfig( + name="wmt19:kk-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_kk-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_lt_en_lighteval = LightevalTaskConfig( + name="wmt19:lt-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_lt-en", + hf_avail_splits=["test"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_ru_en_lighteval = LightevalTaskConfig( + name="wmt19:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt19_zh_en_lighteval = LightevalTaskConfig( + name="wmt19:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt19_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_cs_en_lighteval = LightevalTaskConfig( + name="wmt20:cs-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_cs-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_de_en_lighteval = LightevalTaskConfig( + name="wmt20:de-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_de-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + 
generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_de_fr_lighteval = LightevalTaskConfig( + name="wmt20:de-fr", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_de-fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_cs_lighteval = LightevalTaskConfig( + name="wmt20:en-cs", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-cs", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_de_lighteval = LightevalTaskConfig( + name="wmt20:en-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_iu_lighteval = LightevalTaskConfig( + name="wmt20:en-iu", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-iu", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_ja_lighteval = LightevalTaskConfig( + name="wmt20:en-ja", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ja", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_km_lighteval = LightevalTaskConfig( + name="wmt20:en-km", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-km", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_pl_lighteval = LightevalTaskConfig( + name="wmt20:en-pl", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-pl", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_ps_lighteval = LightevalTaskConfig( + name="wmt20:en-ps", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ps", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_ru_lighteval = 
LightevalTaskConfig( + name="wmt20:en-ru", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_ta_lighteval = LightevalTaskConfig( + name="wmt20:en-ta", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-ta", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_en_zh_lighteval = LightevalTaskConfig( + name="wmt20:en-zh", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_en-zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_fr_de_lighteval = LightevalTaskConfig( + name="wmt20:fr-de", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_fr-de", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_iu_en_lighteval = LightevalTaskConfig( + name="wmt20:iu-en", + suite=["lighteval", "sacrebleu"], + 
prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_iu-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_ja_en_lighteval = LightevalTaskConfig( + name="wmt20:ja-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ja-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_km_en_lighteval = LightevalTaskConfig( + name="wmt20:km-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_km-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_pl_en_lighteval = LightevalTaskConfig( + name="wmt20:pl-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_pl-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_ps_en_lighteval = LightevalTaskConfig( + name="wmt20:ps-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + 
hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ps-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_ru_en_lighteval = LightevalTaskConfig( + name="wmt20:ru-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ru-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_ta_en_lighteval = LightevalTaskConfig( + name="wmt20:ta-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_ta-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wmt20_zh_en_lighteval = LightevalTaskConfig( + name="wmt20:zh-en", + suite=["lighteval", "sacrebleu"], + prompt_function="wmt_reverse_alphabetical", + hf_repo="lighteval/sacrebleu_manual", + hf_subset="wmt20_zh-en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=None, + metric=["bleu", "chrf", "ter"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +word_sorting_bigbench = LightevalTaskConfig( + name="word_sorting", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="word_sorting", + hf_avail_splits=["default", "train", 
"validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +word_unscrambling_bigbench = LightevalTaskConfig( + name="word_unscrambling", + suite=["bigbench", "bigbench_json"], + prompt_function="bigbench", + hf_repo="bigbench", + hf_subset="word_unscrambling", + hf_avail_splits=["default", "train", "validation"], + evaluation_splits=["default"], + few_shots_split=None, + few_shots_select=None, + generation_size=1, + metric=["perfect_exact_match"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +wsc273_lighteval = LightevalTaskConfig( + name="wsc273", + suite=["lighteval"], + prompt_function="wsc273", + hf_repo="winograd_wsc", + hf_subset="wsc273", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_en_lighteval = LightevalTaskConfig( + name="xcopa:en", + suite=["lighteval"], + prompt_function="xcopa_en", + hf_repo="xcopa", + hf_subset="default", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_et_lighteval = LightevalTaskConfig( + name="xcopa:et", + suite=["lighteval"], + prompt_function="xcopa_et", + hf_repo="xcopa", + hf_subset="et", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + 
trust_dataset=True, + version=0, +) +xcopa_ht_lighteval = LightevalTaskConfig( + name="xcopa:ht", + suite=["lighteval"], + prompt_function="xcopa_ht", + hf_repo="xcopa", + hf_subset="ht", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_it_lighteval = LightevalTaskConfig( + name="xcopa:it", + suite=["lighteval"], + prompt_function="xcopa_it", + hf_repo="xcopa", + hf_subset="it", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_id_lighteval = LightevalTaskConfig( + name="xcopa:id", + suite=["lighteval"], + prompt_function="xcopa_id", + hf_repo="xcopa", + hf_subset="id", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_qu_lighteval = LightevalTaskConfig( + name="xcopa:qu", + suite=["lighteval"], + prompt_function="xcopa_qu", + hf_repo="xcopa", + hf_subset="qu", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_sw_lighteval = LightevalTaskConfig( + name="xcopa:sw", + suite=["lighteval"], + prompt_function="xcopa_sw", + hf_repo="xcopa", + hf_subset="sw", + hf_avail_splits=["test", "train", "validation"], + 
evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_zh_lighteval = LightevalTaskConfig( + name="xcopa:zh", + suite=["lighteval"], + prompt_function="xcopa_zh", + hf_repo="xcopa", + hf_subset="zh", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_ta_lighteval = LightevalTaskConfig( + name="xcopa:ta", + suite=["lighteval"], + prompt_function="xcopa_ta", + hf_repo="xcopa", + hf_subset="ta", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_th_lighteval = LightevalTaskConfig( + name="xcopa:th", + suite=["lighteval"], + prompt_function="xcopa_th", + hf_repo="xcopa", + hf_subset="th", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_tr_lighteval = LightevalTaskConfig( + name="xcopa:tr", + suite=["lighteval"], + prompt_function="xcopa_tr", + hf_repo="xcopa", + hf_subset="tr", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xcopa_vi_lighteval = 
LightevalTaskConfig( + name="xcopa:vi", + suite=["lighteval"], + prompt_function="xcopa_vi", + hf_repo="xcopa", + hf_subset="vi", + hf_avail_splits=["test", "train", "validation"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_en_lighteval = LightevalTaskConfig( + name="xstory_cloze:en", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="en", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_ru_lighteval = LightevalTaskConfig( + name="xstory_cloze:ru", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="ru", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_zh_lighteval = LightevalTaskConfig( + name="xstory_cloze:zh", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="zh", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_es_lighteval = LightevalTaskConfig( + name="xstory_cloze:es", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="es", + hf_avail_splits=["training", 
"eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_ar_lighteval = LightevalTaskConfig( + name="xstory_cloze:ar", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="ar", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_hi_lighteval = LightevalTaskConfig( + name="xstory_cloze:hi", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="hi", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_id_lighteval = LightevalTaskConfig( + name="xstory_cloze:id", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="id", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_te_lighteval = LightevalTaskConfig( + name="xstory_cloze:te", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="te", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + 
output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_sw_lighteval = LightevalTaskConfig( + name="xstory_cloze:sw", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="sw", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_eu_lighteval = LightevalTaskConfig( + name="xstory_cloze:eu", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="eu", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xstory_cloze_my_lighteval = LightevalTaskConfig( + name="xstory_cloze:my", + suite=["lighteval"], + prompt_function="storycloze", + hf_repo="juletxara/xstory_cloze", + hf_subset="my", + hf_avail_splits=["training", "eval"], + evaluation_splits=["eval"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_en_lighteval = LightevalTaskConfig( + name="xwinograd:en", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="en", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_fr_lighteval = LightevalTaskConfig( + name="xwinograd:fr", + suite=["lighteval"], + 
prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="fr", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_jp_lighteval = LightevalTaskConfig( + name="xwinograd:jp", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="jp", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_pt_lighteval = LightevalTaskConfig( + name="xwinograd:pt", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="pt", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_ru_lighteval = LightevalTaskConfig( + name="xwinograd:ru", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="ru", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) +xwinograd_zh_lighteval = LightevalTaskConfig( + name="xwinograd:zh", + suite=["lighteval"], + prompt_function="winogrande", + hf_repo="Muennighoff/xwinograd", + hf_subset="zh", + hf_avail_splits=["test"], + evaluation_splits=["test"], + few_shots_split=None, + few_shots_select=None, + generation_size=-1, + metric=["loglikelihood_acc"], + 
stop_sequence=["\n"], + output_regex=None, + frozen=False, + trust_dataset=True, + version=0, +) diff --git a/src/lighteval/tasks/extended/ifeval/main.py b/src/lighteval/tasks/extended/ifeval/main.py index 1faf55ee7..c7290c3f1 100644 --- a/src/lighteval/tasks/extended/ifeval/main.py +++ b/src/lighteval/tasks/extended/ifeval/main.py @@ -157,10 +157,8 @@ def agg_inst_level_acc(items): ) -_TASKS = [ifeval] +TASKS_TABLE = [ifeval] -# Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] extend_enum(Metrics, "ifeval_metric", ifeval_metrics) if __name__ == "__main__": diff --git a/src/lighteval/tasks/extended/mt_bench/main.py b/src/lighteval/tasks/extended/mt_bench/main.py index a0ce741ca..4dfdeb41e 100644 --- a/src/lighteval/tasks/extended/mt_bench/main.py +++ b/src/lighteval/tasks/extended/mt_bench/main.py @@ -71,9 +71,7 @@ def mt_bench_prompt(line, task_name: str = None): ) -_TASKS = [task] - -TASKS_TABLE = [task.as_dict() for task in _TASKS] +TASKS_TABLE = [task] if __name__ == "__main__": print(t["name"] for t in TASKS_TABLE) diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index a637f1a61..a8ce41a34 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -237,7 +237,7 @@ def aggregate(self, y_input): # }, ] -_TASKS = [] +TASKS_TABLE = [] for task in task_params: name = task["name"] generation_size = None @@ -259,7 +259,7 @@ def aggregate(self, y_input): generation_size=generation_size, stop_sequence=stop_sequence, ) - _TASKS.append(task) + TASKS_TABLE.append(task) # CUSTOM METRIC for task_param in task_params: @@ -288,8 +288,6 @@ def aggregate(self, y_input): # MODULE LOGIC # You should not need to touch this # Convert to dict for lighteval -TASKS_TABLE = [task.as_dict() for task in _TASKS] - if __name__ == "__main__": print(t["name"] for t in TASKS_TABLE) print(len(TASKS_TABLE)) diff --git 
a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index fa70b61da..fa1b1d5af 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -30,6 +30,7 @@ from datasets import load_dataset +import lighteval.tasks.tasks_prompt_formatting as tasks_prompt_formatting from lighteval.few_shot_manager import FewShotSampler from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics import ( @@ -56,8 +57,6 @@ ) from lighteval.utils import NO_OPENAI_ERROR_MSG, as_list, is_openai_available -from . import tasks_prompt_formatting - if TYPE_CHECKING: from lighteval.logging.evaluation_tracker import EvaluationTracker @@ -115,25 +114,6 @@ class LightevalTaskConfig: version: int = 0 - def as_dict(self): - return { - "name": self.name, - "prompt_function": self.prompt_function, - "hf_repo": self.hf_repo, - "hf_subset": self.hf_subset, - "metric": tuple(str(m) for m in self.metric), - "hf_avail_splits": self.hf_avail_splits, - "evaluation_splits": self.evaluation_splits, - "few_shots_split": self.few_shots_split, - "few_shots_select": self.few_shots_select, - "generation_size": self.generation_size, - "stop_sequence": self.stop_sequence, - "output_regex": self.output_regex, - "frozen": self.frozen, - "suite": self.suite, - "version": self.version, - } - def __post_init__(self): if self.suite is None: self.suite = ["custom"] diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index df5e4da6a..ef575b7e1 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -28,9 +28,9 @@ from types import ModuleType from typing import Dict, List, Optional, Tuple, Union -from datasets import Dataset from datasets.load import dataset_module_factory +import lighteval.tasks.default_tasks as default_tasks from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES from 
lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig @@ -57,8 +57,6 @@ TRUNCATE_FEW_SHOTS_DEFAULTS = True -TABLE_PATH = os.path.join(os.path.dirname(__file__), "tasks_table.jsonl") - class Registry: """ @@ -110,7 +108,7 @@ def get_task_class( ) def get_task_dict( - self, task_name_list: List[str], custom_tasks: Optional[Union[str, ModuleType]] = None + self, task_name_list: List[str], custom_tasks: Optional[Union[str, Path, ModuleType]] = None ) -> Dict[str, LightevalTask]: """ Get a dictionary of tasks based on the task name list. @@ -155,7 +153,7 @@ def get_task_dict( return tasks_dict -def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleType: +def create_custom_tasks_module(custom_tasks: Union[str, Path, ModuleType]) -> ModuleType: """Creates a custom task module to load tasks defined by the user in their own file. Args: @@ -234,15 +232,15 @@ def taskinfo_selector( def create_config_tasks( - meta_table: Optional[Dataset] = None, cache_dir: Optional[str] = None + meta_table: Optional[List[LightevalTaskConfig]] = None, cache_dir: Optional[str] = None ) -> Dict[str, LightevalTask]: """ Create configuration tasks based on the provided meta_table. Args: - meta_table (Optional[Dataset]): meta_table containing task + meta_table: meta_table containing tasks configurations. If not provided, it will be loaded from TABLE_PATH. - cache_dir (Optional[str]): Directory to store cached data. If not + cache_dir: Directory to store cached data. If not provided, the default cache directory will be used. 
Returns: @@ -257,18 +255,18 @@ def __init__(self, custom_tasks_module=None): return LightevalTaskFromConfig if meta_table is None: - meta_table = Dataset.from_json(TABLE_PATH) + meta_table = [config for config in vars(default_tasks).values() if isinstance(config, LightevalTaskConfig)] tasks_with_config = {} # Every task is renamed suite|task, if the suite is in DEFAULT_SUITE - for line in meta_table: - if not any(suite in line["suite"] for suite in DEFAULT_SUITES): + for config in meta_table: + if not any(suite in config.suite for suite in DEFAULT_SUITES): hlog_warn( - f"This evaluation is not in any known suite: {line['name']} is in {line['suite']}, not in {DEFAULT_SUITES}. Skipping." + f"This evaluation is not in any known suite: {config.name} is in {config.suite}, not in {DEFAULT_SUITES}. Skipping." ) continue - for suite in line["suite"]: + for suite in config.suite: if suite in DEFAULT_SUITES: - tasks_with_config[f"{suite}|{line['name']}"] = LightevalTaskConfig(**line) + tasks_with_config[f"{suite}|{config.name}"] = config return {task: create_task(task, cfg, cache_dir=cache_dir) for task, cfg in tasks_with_config.items()} diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl deleted file mode 100644 index 0047ad5db..000000000 --- a/src/lighteval/tasks/tasks_table.jsonl +++ /dev/null @@ -1,1235 +0,0 @@ -{"name":"abstract_narrative_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"abstract_narrative_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"agieval:aqua-rat","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-aqua-rat","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-biology","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-biology","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-chemistry","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chemistry","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-chinese","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-chinese","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"agieval:gaokao-english","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-english","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-geography","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-geography","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-history","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-history","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:gaokao-mathqa","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-mathqa","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"agieval:gaokao-physics","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-gaokao-physics","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:logiqa-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:logiqa-zh","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-logiqa-zh","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:lsat-ar","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-ar","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"agieval:lsat-lr","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-lr","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:lsat-rc","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-lsat-rc","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:sat-en","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"agieval:sat-en-without-passage","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-en-without-passage","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"agieval:sat-math","suite":["lighteval"],"prompt_function":"agieval","hf_repo":"dmayhem93/agieval-sat-math","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":null,"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"anachronisms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"anachronisms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"analogical_similarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analogical_similarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"analytic_entailment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"analytic_entailment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"anli","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r1","dev_r1","train_r2","dev_r2","train_r3","dev_r3","test_r1","test_r2","test_r3"],"evaluation_splits":["test_r1","test_r2","test_r3"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"anli:r1","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r1","dev_r1","test_r1"],"evaluation_splits":["test_r1"],"few_shots_split":"train_r1","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"anli:r2","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r2","dev_r2","test_r2"],"evaluation_splits":["test_r2"],"few_shots_split":"train_r2","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"anli:r3","suite":["lighteval","anli"],"prompt_function":"anli","hf_repo":"anli","hf_subset":"plain_text","hf_avail_splits":["train_r3","dev_r3","test_r3"],"evaluation_splits":["test_r3"],"few_shots_split":"train_r3","few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"arc:c:letters","suite":["original","arc"],"prompt_function":"arc_with_options_letters_predict","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arc:c:options","suite":["original","arc"],"prompt_function":"arc_with_options","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arc:c:simple","suite":["original","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arc:challenge","suite":["leaderboard","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Challenge","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"arc:easy","suite":["lighteval","arc"],"prompt_function":"arc","hf_repo":"ai2_arc","hf_subset":"ARC-Easy","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:1dc","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_1dc","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:2da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:2dm","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2dm","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"arithmetic:2ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_2ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:3da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_3da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:3ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_3ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:4da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_4da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"arithmetic:4ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_4ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:5da","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_5da","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic:5ds","suite":["lighteval","arithmetic"],"prompt_function":"arithmetic","hf_repo":"EleutherAI\/arithmetic","hf_subset":"arithmetic_5ds","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"arithmetic_bb","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"ascii_word_recognition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"ascii_word_recognition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"asdiv","suite":["lighteval"],"prompt_function":"asdiv","hf_repo":"EleutherAI\/asdiv","hf_subset":"asdiv","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"authorship_verification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"authorship_verification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"auto_categorization","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"auto_categorization","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"auto_debugging","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"auto_debugging","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} -{"name":"babi_qa","suite":["helm"],"prompt_function":"babi_qa","hf_repo":"facebook\/babi_qa","hf_subset":"en-valid-qa1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:causal_judgment","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:date_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bigbench:disambiguation_qa","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:geometric_shapes","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bigbench:logical_deduction_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:movie_recommendation","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:navigate","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:reasoning_about_colored_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bigbench:ruin_names","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:salient_translation_error_detection","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:snarks","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:sports_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bigbench:temporal_sequences","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bigbench:causal_judgment","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:date_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:geometric_shapes","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
-{"name":"bigbench:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:movie_recommendation","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
-{"name":"bigbench:navigate","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:ruin_names","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
-{"name":"bigbench:snarks","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:sports_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:temporal_sequences","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} 
-{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true,"version":0} -{"name":"bbh:boolean_expressions","suite":["harness"],"prompt_function":"bbh_boolean_expressions","hf_repo":"lukaemon/bbh","hf_subset":"boolean_expressions","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:causal_judgment","suite":["harness"],"prompt_function":"bbh_causal_judgment","hf_repo":"lukaemon/bbh","hf_subset":"causal_judgement","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, 
"trust_dataset":true,"version":0} -{"name":"bbh:date_understanding","suite":["harness"],"prompt_function":"bbh_date_understanding","hf_repo":"lukaemon/bbh","hf_subset":"date_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_disambiguation_qa","hf_repo":"lukaemon/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:dyck_languages","suite":["harness"],"prompt_function":"bbh_dyck_languages","hf_repo":"lukaemon/bbh","hf_subset":"dyck_languages","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:formal_fallacies","suite":["harness"],"prompt_function":"bbh_formal_fallacies","hf_repo":"lukaemon/bbh","hf_subset":"formal_fallacies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, 
"trust_dataset":true,"version":0} -{"name":"bbh:geometric_shapes","suite":["harness"],"prompt_function":"bbh_geometric_shapes","hf_repo":"lukaemon/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:hyperbaton","suite":["harness"],"prompt_function":"bbh_hyperbaton","hf_repo":"lukaemon/bbh","hf_subset":"hyperbaton","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", 
"\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:movie_recommendation","suite":["harness"],"prompt_function":"bbh_movie_recommendation","hf_repo":"lukaemon/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:multistep_arithmetic_two","suite":["harness"],"prompt_function":"bbh_multistep_arithmetic_two","hf_repo":"lukaemon/bbh","hf_subset":"multistep_arithmetic_two","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bbh:navigate","suite":["harness"],"prompt_function":"bbh_navigate","hf_repo":"lukaemon/bbh","hf_subset":"navigate","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:object_counting","suite":["harness"],"prompt_function":"bbh_object_counting","hf_repo":"lukaemon/bbh","hf_subset":"object_counting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:penguins_in_a_table","suite":["harness"],"prompt_function":"bbh_penguins_in_a_table","hf_repo":"lukaemon/bbh","hf_subset":"penguins_in_a_table","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_reasoning_about_colored_objects","hf_repo":"lukaemon/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bbh:ruin_names","suite":["harness"],"prompt_function":"bbh_ruin_names","hf_repo":"lukaemon/bbh","hf_subset":"ruin_names","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_salient_translation_error_detection","hf_repo":"lukaemon/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:snarks","suite":["harness"],"prompt_function":"bbh_snarks","hf_repo":"lukaemon/bbh","hf_subset":"snarks","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:sports_understanding","suite":["harness"],"prompt_function":"bbh_sports_understanding","hf_repo":"lukaemon/bbh","hf_subset":"sports_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bbh:temporal_sequences","suite":["harness"],"prompt_function":"bbh_temporal_sequences","hf_repo":"lukaemon/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} 
-{"name":"bbh:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:web_of_lies","suite":["harness"],"prompt_function":"bbh_web_of_lies","hf_repo":"lukaemon/bbh","hf_subset":"web_of_lies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbh:word_sorting","suite":["harness"],"prompt_function":"bbh_word_sorting","hf_repo":"lukaemon/bbh","hf_subset":"word_sorting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true,"version":0} -{"name":"bbq","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bbq:Age","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Age","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Disability_status","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Disability_status","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Gender_identity","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Gender_identity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Nationality","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Nationality","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bbq:Physical_appearance","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Physical_appearance","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Race_ethnicity","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_ethnicity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Race_x_SES","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_x_SES","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Race_x_gender","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Race_x_gender","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bbq:Religion","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Religion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:SES","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"SES","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq:Sexual_orientation","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Sexual_orientation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bbq_lite_json","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"bbq_lite_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:auto_debugging","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"auto_debugging","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:age_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:age_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-age_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:disability_status_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-disability_status_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:disability_status_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-disability_status_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:gender_identity_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-gender_identity_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:gender_identity_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-gender_identity_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:nationality_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-nationality_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:nationality_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-nationality_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:physical_appearance_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-physical_appearance_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:physical_appearance_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-physical_appearance_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:race_ethnicity_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-race_ethnicity_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:race_ethnicity_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-race_ethnicity_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:religion_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-religion_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:religion_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-religion_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:ses_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-ses_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:ses_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-ses_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:bbq_lite_json:sexual_orientation_ambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-sexual_orientation_ambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:bbq_lite_json:sexual_orientation_disambig","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"bbq_lite_json-sexual_orientation_disambig","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:code_line_description","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"code_line_description","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conceptual_combinations:contradictions","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-contradictions","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conceptual_combinations:emergent_properties","suite":["helm"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-emergent_properties","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conceptual_combinations:fanciful_fictional_combinations","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-fanciful_fictional_combinations","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conceptual_combinations:homonyms","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-homonyms","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conceptual_combinations:invented_words","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conceptual_combinations-invented_words","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:adna_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-adna_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:adna_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-adna_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:atikampe_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-atikampe_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conlang_translation:atikampe_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-atikampe_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:gornam_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-gornam_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:gornam_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-gornam_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:holuan_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-holuan_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conlang_translation:holuan_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-holuan_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:mkafala_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-mkafala_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:mkafala_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-mkafala_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:postpositive_english_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-postpositive_english_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conlang_translation:postpositive_english_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-postpositive_english_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:unapuri_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-unapuri_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:unapuri_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-unapuri_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:conlang_translation:vaomi_from","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-vaomi_from","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:conlang_translation:vaomi_to","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"conlang_translation-vaomi_to","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge1","rouge2","rougeL"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:emoji_movie","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"emoji_movie","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:formal_fallacies_syllogisms_negation","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"formal_fallacies_syllogisms_negation","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:hindu_knowledge","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"hindu_knowledge","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"bigbench:known_unknowns","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"known_unknowns","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:language_identification","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"language_identification","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:linguistics_puzzles","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"linguistics_puzzles","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:logic_grid_puzzle","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logic_grid_puzzle","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:logical_deduction-five_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-five_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:logical_deduction-seven_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-seven_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:logical_deduction-three_objects","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"logical_deduction-three_objects","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:misconceptions_russian","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"misconceptions_russian","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:novel_concepts","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"novel_concepts","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:operators","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"operators","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} -{"name":"bigbench:parsinlu_reading_comprehension","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"parsinlu_reading_comprehension","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:play_dialog_same_or_different","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"play_dialog_same_or_different","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:repeat_copy_logic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"repeat_copy_logic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:strange_stories-boolean","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strange_stories-boolean","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"bigbench:strange_stories-multiple_choice","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strange_stories-multiple_choice","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:strategyqa","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"strategyqa","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:symbol_interpretation-adversarial","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-adversarial","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:symbol_interpretation-emoji_agnostic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-emoji_agnostic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:symbol_interpretation-name_agnostic","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-name_agnostic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:symbol_interpretation-plain","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-plain","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bigbench:symbol_interpretation-tricky","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"symbol_interpretation-tricky","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:vitaminc_fact_verification","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"vitaminc_fact_verification","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bigbench:winowhy","suite":["helm","bigbench_scenario"],"prompt_function":"bigbench_helm","hf_repo":"lighteval\/bigbench_helm","hf_subset":"winowhy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:adjunct_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"adjunct_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:adjunct_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"adjunct_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:anaphor_gender_agreement","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"anaphor_gender_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:anaphor_gender_agreement","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"anaphor_gender_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:anaphor_number_agreement","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"anaphor_number_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:anaphor_number_agreement","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"anaphor_number_agreement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:animate_subject_passive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"animate_subject_passive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:animate_subject_passive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"animate_subject_passive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:animate_subject_trans","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"animate_subject_trans","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:animate_subject_trans","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"animate_subject_trans","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:causative","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"causative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:causative","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"causative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:complex_NP_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"complex_NP_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:complex_NP_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"complex_NP_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} -{"name":"blimp:coordinate_structure_constraint_complex_left_branch","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_complex_left_branch","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:coordinate_structure_constraint_complex_left_branch","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_complex_left_branch","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:coordinate_structure_constraint_object_extraction","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_object_extraction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:coordinate_structure_constraint_object_extraction","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"coordinate_structure_constraint_object_extraction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:determiner_noun_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:determiner_noun_agreement_irregular_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_irregular_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_irregular_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_irregular_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:determiner_noun_agreement_with_adj_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:determiner_noun_agreement_with_adj_irregular_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adj_irregular_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adj_irregular_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adjective_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adjective_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:determiner_noun_agreement_with_adjective_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"determiner_noun_agreement_with_adjective_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:distractor_agreement_relational_noun","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"distractor_agreement_relational_noun","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:distractor_agreement_relational_noun","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"distractor_agreement_relational_noun","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:distractor_agreement_relative_clause","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"distractor_agreement_relative_clause","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:distractor_agreement_relative_clause","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"distractor_agreement_relative_clause","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:drop_argument","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"drop_argument","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:drop_argument","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"drop_argument","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:ellipsis_n_bar_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:ellipsis_n_bar_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:ellipsis_n_bar_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:ellipsis_n_bar_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"ellipsis_n_bar_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_object_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_object_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_quantifiers_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:existential_there_quantifiers_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_quantifiers_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_quantifiers_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:existential_there_subject_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"existential_there_subject_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:existential_there_subject_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"existential_there_subject_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:expletive_it_object_raising","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"expletive_it_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:expletive_it_object_raising","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"expletive_it_object_raising","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:inchoative","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"inchoative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:inchoative","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"inchoative","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:intransitive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"intransitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:intransitive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"intransitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_past_participle_adjectives","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_past_participle_adjectives","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:irregular_past_participle_adjectives","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_past_participle_adjectives","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_past_participle_verbs","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_past_participle_verbs","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_past_participle_verbs","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_past_participle_verbs","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_plural_subject_verb_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:irregular_plural_subject_verb_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_plural_subject_verb_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:irregular_plural_subject_verb_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"irregular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:left_branch_island_echo_question","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"left_branch_island_echo_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:left_branch_island_echo_question","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"left_branch_island_echo_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:left_branch_island_simple_question","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"left_branch_island_simple_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:left_branch_island_simple_question","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"left_branch_island_simple_question","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:matrix_question_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"matrix_question_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:matrix_question_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"matrix_question_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:npi_present_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"npi_present_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:npi_present_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"npi_present_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:npi_present_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"npi_present_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:npi_present_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"npi_present_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:only_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"only_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:only_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"only_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:only_npi_scope","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"only_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:only_npi_scope","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"only_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:passive_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"passive_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:passive_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"passive_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:passive_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"passive_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:passive_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"passive_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:principle_A_c_command","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_c_command","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_c_command","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_c_command","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_case_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_case_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_case_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_case_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_case_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_case_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"blimp:principle_A_case_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_case_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:principle_A_domain_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_3","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_domain_3","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_domain_3","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_domain_3","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:principle_A_reconstruction","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"principle_A_reconstruction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:principle_A_reconstruction","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"principle_A_reconstruction","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:regular_plural_subject_verb_agreement_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:regular_plural_subject_verb_agreement_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:regular_plural_subject_verb_agreement_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:regular_plural_subject_verb_agreement_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"regular_plural_subject_verb_agreement_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_negation_npi_licensor_present","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_negation_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_negation_npi_licensor_present","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_negation_npi_licensor_present","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_negation_npi_scope","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_negation_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:sentential_negation_npi_scope","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_negation_npi_scope","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_subject_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"sentential_subject_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:sentential_subject_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"sentential_subject_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:superlative_quantifiers_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"superlative_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:superlative_quantifiers_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"superlative_quantifiers_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:superlative_quantifiers_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"superlative_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:superlative_quantifiers_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"superlative_quantifiers_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:tough_vs_raising_1","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"tough_vs_raising_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:tough_vs_raising_1","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"tough_vs_raising_1","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:tough_vs_raising_2","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"tough_vs_raising_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:tough_vs_raising_2","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"tough_vs_raising_2","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:transitive","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"transitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:transitive","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"transitive","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} -{"name":"blimp:wh_island","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_island","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_island","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_object_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_object_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_object_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_object_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_subject_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_subject_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_subject_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_questions_subject_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_questions_subject_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_no_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:wh_vs_that_no_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_no_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_no_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_no_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_with_gap","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"blimp:wh_vs_that_with_gap","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_with_gap_long_distance","suite":["lighteval","blimp"],"prompt_function":"blimp","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"blimp:wh_vs_that_with_gap_long_distance","suite":["helm","blimp"],"prompt_function":"blimp_helm","hf_repo":"blimp","hf_subset":"wh_vs_that_with_gap_long_distance","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"bold:gender","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"gender","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:political_ideology","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"political_ideology","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:profession","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"profession","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:race","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"race","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bold:religious_ideology","suite":["helm"],"prompt_function":"bold","hf_repo":"lighteval\/bold_helm","hf_subset":"religious_ideology","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"boolq","suite":["helm","helm_general"],"prompt_function":"boolq_helm","hf_repo":"lighteval\/boolq_helm","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"boolq:contrastset","suite":["helm"],"prompt_function":"boolq_helm_contrastset","hf_repo":"lighteval\/boolq_helm","hf_subset":"default","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"bridging_anaphora_resolution_barqa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"bridging_anaphora_resolution_barqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"causal_judgment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"causal_judgment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"cause_and_effect","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cause_and_effect","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"checkmate_in_one","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"checkmate_in_one","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"chess_state_tracking","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"chess_state_tracking","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"chinese_remainder_theorem","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"chinese_remainder_theorem","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"cifar10_classification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cifar10_classification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments","suite":["helm","helm_general"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:LGBTQ","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"LGBTQ","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:black","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"black","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"civil_comments:christian","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"christian","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:female","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"female","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:male","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"male","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:muslim","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"muslim","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} -{"name":"civil_comments:other_religions","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"other_religions","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"civil_comments:white","suite":["helm"],"prompt_function":"civil_comments","hf_repo":"lighteval\/civil_comments_helm","hf_subset":"white","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"code_line_description","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"code_line_description","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"codenames","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"codenames","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"color","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"color","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"common_morpheme","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"common_morpheme","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"commonsenseqa","suite":["helm","commonsense_scenario"],"prompt_function":"commonsense_qa","hf_repo":"commonsense_qa","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"conceptual_combinations","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"conceptual_combinations","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"conlang_translation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"conlang_translation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["rouge_t5","bleu","perfect_exact_match"],"stop_sequence":[".",";","!","?"],"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} -{"name":"contextual_parametric_knowledge_conflicts","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"contextual_parametric_knowledge_conflicts","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"copyright:n_books_1000-extractions_per_book_1-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_1-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:n_books_1000-extractions_per_book_3-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"n_books_1000-extractions_per_book_3-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"copyright:oh_the_places","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"oh_the_places","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:pilot","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"pilot","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_10","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_10","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_125","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_125","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"copyright:popular_books-prefix_length_25","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_25","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_250","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_250","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_5","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_5","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:popular_books-prefix_length_50","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"popular_books-prefix_length_50","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"copyright:prompt_num_line_1-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_1-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:prompt_num_line_10-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_10-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"copyright:prompt_num_line_5-min_lines_20","suite":["helm","copyright_scenario"],"prompt_function":"copyright","hf_repo":"lighteval\/copyright_helm","hf_subset":"prompt_num_line_5-min_lines_20","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["copyright"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"coqa","suite":["lighteval"],"prompt_function":"coqa","hf_repo":"coqa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["perfect_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"coqa_bb","suite":["lighteval","bigbench_programmatic","bigbench"],"prompt_function":"coqa","hf_repo":"coqa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["perfect_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"covid_dialogue","suite":["helm"],"prompt_function":"covid_dialogue","hf_repo":"lighteval\/covid_dialogue","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"crash_blossom","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"crash_blossom","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"crass_ai","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"crass_ai","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"cryobiology_spanish","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cryobiology_spanish","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"cryptonite","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cryptonite","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"cs_algorithms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"cs_algorithms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dark_humor_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"dark_humor_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"date_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"date_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"disambiguation_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"disambiguation_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"discourse_marker_prediction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"discourse_marker_prediction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"disfl_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"disfl_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"drop","suite":["lighteval"],"prompt_function":"drop","hf_repo":"lighteval/drop_harness","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":"train","few_shots_select":"random_sampling_from_train","generation_size":null,"metric":["drop"],"stop_sequence":["."],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dyck_language:2","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"2","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dyck_language:3","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"3","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dyck_language:4","suite":["helm"],"prompt_function":"dyck_language","hf_repo":"lighteval\/DyckLanguage","hf_subset":"4","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"dyck_languages","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"dyck_languages","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"elementary_math_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"elementary_math_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"emoji_movie","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"emoji_movie","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"emojis_emotion_prediction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"emojis_emotion_prediction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"empirical_judgments","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"empirical_judgments","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"english_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"english_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"english_russian_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"english_russian_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entailed_polarity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"entailed_polarity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entailed_polarity_hindi","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"entailed_polarity_hindi","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"entity_data_imputation:Buy","suite":["helm"],"prompt_function":"entity_data_imputation","hf_repo":"lighteval\/Buy","hf_subset":"default","hf_avail_splits":["train","test","valid"],"evaluation_splits":["valid","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_data_imputation:Restaurant","suite":["helm"],"prompt_function":"entity_data_imputation","hf_repo":"lighteval\/Restaurant","hf_subset":"default","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Abt_Buy","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Abt_Buy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Amazon_Google","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Amazon_Google","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"entity_matching:Beer","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Beer","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Company","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Company","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:DBLP_ACM","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"DBLP_ACM","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:DBLP_GoogleScholar","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"DBLP_GoogleScholar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"entity_matching:Dirty_DBLP_ACM","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_DBLP_ACM","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Dirty_DBLP_GoogleScholar","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_DBLP_GoogleScholar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Dirty_Walmart_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_Walmart_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"entity_matching:Dirty_iTunes_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Dirty_iTunes_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Fodors_Zagats","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Fodors_Zagats","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:Walmart_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"Walmart_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"entity_matching:iTunes_Amazon","suite":["helm"],"prompt_function":"entity_matching","hf_repo":"lighteval\/EntityMatching","hf_subset":"iTunes_Amazon","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"epistemic_reasoning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"epistemic_reasoning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:commonsense","suite":["lighteval","ethics"],"prompt_function":"ethics_commonsense","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"commonsense","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:deontology","suite":["lighteval","ethics"],"prompt_function":"ethics_deontology","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"deontology","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:justice","suite":["lighteval","ethics"],"prompt_function":"ethics_justice","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"justice","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"ethics:utilitarianism","suite":["lighteval","ethics"],"prompt_function":"ethics_utilitarianism","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"utilitarianism","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ethics:virtue","suite":["lighteval","ethics"],"prompt_function":"ethics_virtue","hf_repo":"lighteval\/hendrycks_ethics","hf_subset":"virtue","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"evaluating_information_essentiality","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"evaluating_information_essentiality","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"fact_checker","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"fact_checker","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"fantasy_reasoning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"fantasy_reasoning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"few_shot_nlg","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"few_shot_nlg","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","bleurt"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"figure_of_speech_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"figure_of_speech_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"formal_fallacies_syllogisms_negation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"formal_fallacies_syllogisms_negation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"gem","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gem","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gender_inclusive_sentences_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gender_inclusive_sentences_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"general_knowledge","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"general_knowledge","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"geometric_shapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"geometric_shapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"glue:cola","suite":["lighteval","glue"],"prompt_function":"cola","hf_repo":"glue","hf_subset":"cola","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token", "mcc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:mnli","suite":["lighteval","glue"],"prompt_function":"mnli","hf_repo":"glue","hf_subset":"mnli_matched","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:mnli_mismatched","suite":["lighteval","glue"],"prompt_function":"mnli","hf_repo":"glue","hf_subset":"mnli_mismatched","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:mrpc","suite":["lighteval","glue"],"prompt_function":"mrpc","hf_repo":"glue","hf_subset":"mrpc","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_f1"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:qnli","suite":["lighteval","glue"],"prompt_function":"qnli","hf_repo":"glue","hf_subset":"qnli","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} -{"name":"glue:qqp","suite":["lighteval","glue"],"prompt_function":"qqp","hf_repo":"glue","hf_subset":"qqp","hf_avail_splits":["train","validation","test"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc", "loglikelihood_f1"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:rte","suite":["lighteval","glue"],"prompt_function":"rte","hf_repo":"glue","hf_subset":"rte","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:sst2","suite":["lighteval","glue"],"prompt_function":"sst","hf_repo":"glue","hf_subset":"sst2","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:stsb","suite":["lighteval","glue"],"prompt_function":"stsb","hf_repo":"glue","hf_subset":"stsb","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"glue:wnli","suite":["lighteval","glue"],"prompt_function":"wnli","hf_repo":"glue","hf_subset":"wnli","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": 
true,"version":0} -{"name":"goal_step_wikihow","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"goal_step_wikihow","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gpqa","suite":["lighteval"],"prompt_function":"gpqa","hf_repo":"Idavidrein/gpqa","hf_subset":"gpqa_main","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gre_reading_comprehension","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"gre_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"gsm8k","suite":["leaderboard"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k"],"stop_sequence":["Question:","Question",":"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"gsm8k","suite":["lighteval"],"prompt_function":"gsm8k","hf_repo":"gsm8k","hf_subset":"main","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":256,"metric":["quasi_exact_match_gsm8k","maj_at_8_gsm8k"],"stop_sequence":["Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"headqa:en","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"headqa:es","suite":["lighteval","headqa"],"prompt_function":"headqa","hf_repo":"lighteval/headqa_harness","hf_subset":"es","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hellaswag","suite":["leaderboard"],"prompt_function":"hellaswag_harness","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling_from_train","generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"hellaswag","suite":["helm","helm_general"],"prompt_function":"hellaswag_helm","hf_repo":"hellaswag","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hhh_alignment","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hhh_alignment","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hindi_question_answering","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hindi_question_answering","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hindu_knowledge","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"hindu_knowledge","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"hinglish_toxicity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hinglish_toxicity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"human_organs_senses","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"human_organs_senses","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"humaneval","suite":["helm","code_scenario"],"prompt_function":"humaneval","hf_repo":"openai_humaneval","hf_subset":"openai_humaneval","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":600,"metric":["code_humaneval"],"stop_sequence":["\nclass","\ndef","\nif","\nprint"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"hyperbaton","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"hyperbaton","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"identify_math_theorems","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"identify_math_theorems","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"identify_odd_metaphor","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"identify_odd_metaphor","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"imdb","suite":["helm","helm_general"],"prompt_function":"imdb","hf_repo":"lighteval\/IMDB_helm","hf_subset":"default","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"imdb:contrastset","suite":["helm"],"prompt_function":"imdb_contrastset","hf_repo":"lighteval\/IMDB_helm","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"implicatures","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"implicatures","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"implicit_relations","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"implicit_relations","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"intent_recognition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"intent_recognition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:abstract_algebra","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_abstract_algebra","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"interactive_qa_mmlu:college_chemistry","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_college_chemistry","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:global_facts","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_global_facts","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:miscellaneous","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_miscellaneous","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:nutrition","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_nutrition","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"interactive_qa_mmlu:us_foreign_policy","suite":["helm","interactive_qa_mmlu_scenario"],"prompt_function":"mmlu_qa_us_foreign_policy","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["dev","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"international_phonetic_alphabet_nli","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"international_phonetic_alphabet_nli","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"international_phonetic_alphabet_transliterate","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"international_phonetic_alphabet_transliterate","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"intersect_geometry","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"intersect_geometry","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"irony_identification","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"irony_identification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:ar-en","suite":["lighteval","harness_selection"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ar-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-ar","suite":["lighteval","harness_selection"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ar-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"iwslt17:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:en-ko","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-ko","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"iwslt17:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"iwslt17:ko-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_ko-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"iwslt17:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"iwslt17_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"kanji_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"kanji_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"kannada","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"kannada","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"key_value_maps","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"key_value_maps","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"known_unknowns","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"known_unknowns","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:standard","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"lambada","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:standard_cloze","suite":["lighteval","lambada"],"prompt_function":"lambada_cloze","hf_repo":"lambada","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lambada:openai:de","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai:en","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai:es","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai:fr","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lambada:openai:it","suite":["lighteval","lambada"],"prompt_function":"lambada","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"it","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lambada:openai_cloze","suite":["lighteval","lambada"],"prompt_function":"lambada_cloze","hf_repo":"EleutherAI\/lambada_openai","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["target_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"language_games","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"language_games","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"language_identification","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"language_identification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"legal_summarization:billsum","suite":["helm"],"prompt_function":"legal_summarization","hf_repo":"lighteval\/legal_summarization","hf_subset":"BillSum","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1024,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"legal_summarization:eurlexsum","suite":["helm"],"prompt_function":"legal_summarization","hf_repo":"lighteval\/legal_summarization","hf_subset":"EurLexSum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"legal_summarization:multilexsum","suite":["helm"],"prompt_function":"multilexsum","hf_repo":"lighteval\/legal_summarization","hf_subset":"MultiLexSum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":256,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"legalsupport","suite":["helm"],"prompt_function":"legal_support","hf_repo":"lighteval\/LegalSupport","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:case_hold","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_case_hold","hf_repo":"lighteval\/lexglue","hf_subset":"case_hold","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lexglue:ecthr_a","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ecthr_a","hf_repo":"lighteval\/lexglue","hf_subset":"ecthr_a","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:ecthr_b","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ecthr_b","hf_repo":"lighteval\/lexglue","hf_subset":"ecthr_b","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:eurlex","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_eurlex","hf_repo":"lighteval\/lexglue","hf_subset":"eurlex","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:ledgar","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_ledgar","hf_repo":"lighteval\/lexglue","hf_subset":"ledgar","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lexglue:scotus","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_scotus","hf_repo":"lighteval\/lexglue","hf_subset":"scotus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lexglue:unfair_tos","suite":["helm","lex_glue_scenario"],"prompt_function":"lex_glue_unfair_tos","hf_repo":"lighteval\/lexglue","hf_subset":"unfair_tos","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:brazilian_court_decisions_judgment","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_brazilian_court_decisions_judgment","hf_repo":"lighteval\/lextreme","hf_subset":"brazilian_court_decisions_judgment","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:brazilian_court_decisions_unanimity","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_brazilian_court_decisions_unanimity","hf_repo":"lighteval\/lextreme","hf_subset":"brazilian_court_decisions_unanimity","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:covid19_emergency_event","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_covid19_emergency_event","hf_repo":"lighteval\/lextreme","hf_subset":"covid19_emergency_event","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:german_argument_mining","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_german_argument_mining","hf_repo":"lighteval\/lextreme","hf_subset":"german_argument_mining","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:greek_legal_code_chapter","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_chapter","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_chapter","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:greek_legal_code_subject","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_subject","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_subject","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:greek_legal_code_volume","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_code_volume","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_code_volume","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:greek_legal_ner","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_greek_legal_ner","hf_repo":"lighteval\/lextreme","hf_subset":"greek_legal_ner","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":430,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:legalnero","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_legalnero","hf_repo":"lighteval\/lextreme","hf_subset":"legalnero","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":788,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:lener_br","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_lener_br","hf_repo":"lighteval\/lextreme","hf_subset":"lener_br","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":338,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:mapa_coarse","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_mapa_coarse","hf_repo":"lighteval\/lextreme","hf_subset":"mapa_coarse","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":274,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:mapa_fine","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_mapa_fine","hf_repo":"lighteval\/lextreme","hf_subset":"mapa_fine","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":274,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:multi_eurlex_level_1","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_1","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:multi_eurlex_level_2","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_2","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_2","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:multi_eurlex_level_3","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_multi_eurlex_level_3","hf_repo":"lighteval\/lextreme","hf_subset":"multi_eurlex_level_3","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:online_terms_of_service_clause_topics","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_online_terms_of_service_clause_topics","hf_repo":"lighteval\/lextreme","hf_subset":"online_terms_of_service_clause_topics","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lextreme:online_terms_of_service_unfairness_levels","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_online_terms_of_service_unfairness_levels","hf_repo":"lighteval\/lextreme","hf_subset":"online_terms_of_service_unfairness_levels","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":10,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lextreme:swiss_judgment_prediction","suite":["helm","lextreme_scenario"],"prompt_function":"lextreme_swiss_judgment_prediction","hf_repo":"lighteval\/lextreme","hf_subset":"swiss_judgment_prediction","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","f1_score","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"linguistic_mappings","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"linguistic_mappings","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"linguistics_puzzles","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"linguistics_puzzles","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} -{"name":"logic_grid_puzzle","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logic_grid_puzzle","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"logical_args","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_args","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"logical_deduction","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"logical_deduction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"logical_fallacy_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_fallacy_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"logical_sequence","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"logical_sequence","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"logiqa","suite":["lighteval"],"prompt_function":"logiqa","hf_repo":"lighteval/logiqa_harness","hf_subset":"logiqa","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lsat_qa","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"all","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"lsat_qa:assignment","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"assignment","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lsat_qa:grouping","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"grouping","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lsat_qa:miscellaneous","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"miscellaneous","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"lsat_qa:ordering","suite":["helm","lsat_qa_scenario"],"prompt_function":"lsat_qa","hf_repo":"lighteval\/lsat_qa","hf_subset":"ordering","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"math:algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:geometry","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} 
-{"name":"math:number_theory","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:prealgebra","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math:precalculus","suite":["lighteval","math"],"prompt_function":"math","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":1} -{"name":"math_cot:algebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"math_cot:counting_and_probability","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"counting_and_probability","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:geometry","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"geometry","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:intermediate_algebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"intermediate_algebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:number_theory","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"number_theory","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"math_cot:prealgebra","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"prealgebra","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"math_cot:precalculus","suite":["lighteval","math"],"prompt_function":"math_cot","hf_repo":"lighteval\/MATH","hf_subset":"precalculus","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":2048,"metric":["quasi_exact_match_math","maj_at_4_math"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mathematical_induction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mathematical_induction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mathqa","suite":["lighteval"],"prompt_function":"mathqa","hf_repo":"math_qa","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"matrixshapes","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"matrixshapes","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"me_q_sum","suite":["helm"],"prompt_function":"me_q_sum","hf_repo":"lighteval\/me_q_sum","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"med_dialog:healthcaremagic","suite":["helm"],"prompt_function":"med_dialog","hf_repo":"lighteval\/med_dialog","hf_subset":"healthcaremagic","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"med_dialog:icliniq","suite":["helm"],"prompt_function":"med_dialog","hf_repo":"lighteval\/med_dialog","hf_subset":"icliniq","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"med_mcqa","suite":["helm"],"prompt_function":"med_mcqa","hf_repo":"lighteval\/med_mcqa","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"med_paragraph_simplification","suite":["helm"],"prompt_function":"med_paragraph_simplification","hf_repo":"lighteval\/med_paragraph_simplification","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":512,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"med_qa","suite":["helm"],"prompt_function":"med_qa","hf_repo":"bigbio\/med_qa","hf_subset":"med_qa_en_source","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"metaphor_boolean","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"metaphor_boolean","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"metaphor_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"metaphor_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:en","suite":["lighteval"],"prompt_function":"mgsm_en","hf_repo":"juletxara/mgsm","hf_subset":"en","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:es","suite":["lighteval"],"prompt_function":"mgsm_es","hf_repo":"juletxara/mgsm","hf_subset":"es","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Pregunta:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:fr","suite":["lighteval"],"prompt_function":"mgsm_fr","hf_repo":"juletxara/mgsm","hf_subset":"fr","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Question:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:de","suite":["lighteval"],"prompt_function":"mgsm_de","hf_repo":"juletxara/mgsm","hf_subset":"de","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", 
"Frage:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:ru","suite":["lighteval"],"prompt_function":"mgsm_ru","hf_repo":"juletxara/mgsm","hf_subset":"ru","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0417\u0430\u0434\u0430\u0447\u0430:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:zh","suite":["lighteval"],"prompt_function":"mgsm_zh","hf_repo":"juletxara/mgsm","hf_subset":"zh","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u95ee\u9898:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:ja","suite":["lighteval"],"prompt_function":"mgsm_ja","hf_repo":"juletxara/mgsm","hf_subset":"ja","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u554f\u984c:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:th","suite":["lighteval"],"prompt_function":"mgsm_th","hf_repo":"juletxara/mgsm","hf_subset":"th","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0e42\u0e08\u0e17\u0e22\u0e4c:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mgsm:sw","suite":["lighteval"],"prompt_function":"mgsm_sw","hf_repo":"juletxara/mgsm","hf_subset":"sw","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "Swali:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:bn","suite":["lighteval"],"prompt_function":"mgsm_bn","hf_repo":"juletxara/mgsm","hf_subset":"bn","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mgsm:te","suite":["lighteval"],"prompt_function":"mgsm_te","hf_repo":"juletxara/mgsm","hf_subset":"te","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match", "quasi_exact_match"],"stop_sequence":["\n", ":", "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"minute_mysteries_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"minute_mysteries_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"misconceptions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"misconceptions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"misconceptions_russian","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"misconceptions_russian","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu","suite":["original"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"all","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":5,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:abstract_algebra","suite":["original","mmlu"],"prompt_function":"mmlu_abstract_algebra","hf_repo":"cais\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:abstract_algebra","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:abstract_algebra","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"abstract_algebra","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:anatomy","suite":["original","mmlu"],"prompt_function":"mmlu_anatomy","hf_repo":"cais\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:anatomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:anatomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"anatomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:astronomy","suite":["original","mmlu"],"prompt_function":"mmlu_astronomy","hf_repo":"cais\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:astronomy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:astronomy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"astronomy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:business_ethics","suite":["original","mmlu"],"prompt_function":"mmlu_business_ethics","hf_repo":"cais\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:business_ethics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:business_ethics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"business_ethics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:clinical_knowledge","suite":["original","mmlu"],"prompt_function":"mmlu_clinical_knowledge","hf_repo":"cais\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:clinical_knowledge","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:clinical_knowledge","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"clinical_knowledge","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_biology","suite":["original","mmlu"],"prompt_function":"mmlu_college_biology","hf_repo":"cais\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:college_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_college_chemistry","hf_repo":"cais\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:college_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_college_computer_science","hf_repo":"cais\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:college_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_college_mathematics","hf_repo":"cais\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_college_medicine","hf_repo":"cais\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:college_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_physics","suite":["original","mmlu"],"prompt_function":"mmlu_college_physics","hf_repo":"cais\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:college_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:college_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"college_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:computer_security","suite":["original","mmlu"],"prompt_function":"mmlu_computer_security","hf_repo":"cais\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:computer_security","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:computer_security","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"computer_security","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:conceptual_physics","suite":["original","mmlu"],"prompt_function":"mmlu_conceptual_physics","hf_repo":"cais\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:conceptual_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:conceptual_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"conceptual_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:econometrics","suite":["original","mmlu"],"prompt_function":"mmlu_econometrics","hf_repo":"cais\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:econometrics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:econometrics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"econometrics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:electrical_engineering","suite":["original","mmlu"],"prompt_function":"mmlu_electrical_engineering","hf_repo":"cais\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:electrical_engineering","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:electrical_engineering","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"electrical_engineering","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:elementary_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_elementary_mathematics","hf_repo":"cais\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:elementary_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:elementary_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"elementary_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:formal_logic","suite":["original","mmlu"],"prompt_function":"mmlu_formal_logic","hf_repo":"cais\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:formal_logic","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:formal_logic","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"formal_logic","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:global_facts","suite":["original","mmlu"],"prompt_function":"mmlu_global_facts","hf_repo":"cais\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:global_facts","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:global_facts","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"global_facts","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_biology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_biology","hf_repo":"cais\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_biology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_biology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_biology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_chemistry","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_chemistry","hf_repo":"cais\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_chemistry","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_chemistry","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_chemistry","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_computer_science","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_computer_science","hf_repo":"cais\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_computer_science","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_computer_science","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_computer_science","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_european_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_european_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_european_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_european_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_european_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_geography","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_geography","hf_repo":"cais\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_geography","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_geography","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_geography","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_government_and_politics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_government_and_politics","hf_repo":"cais\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_government_and_politics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_government_and_politics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_government_and_politics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_macroeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_macroeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_macroeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_macroeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_macroeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"mmlu:high_school_mathematics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_mathematics","hf_repo":"cais\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_mathematics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_mathematics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_mathematics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_microeconomics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_microeconomics","hf_repo":"cais\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_microeconomics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_microeconomics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_microeconomics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_physics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_physics","hf_repo":"cais\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_physics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_physics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_physics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_psychology","hf_repo":"cais\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_statistics","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_statistics","hf_repo":"cais\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_statistics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_statistics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_statistics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_us_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_us_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_us_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_us_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_us_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_world_history","suite":["original","mmlu"],"prompt_function":"mmlu_high_school_world_history","hf_repo":"cais\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:high_school_world_history","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:high_school_world_history","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"high_school_world_history","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_aging","suite":["original","mmlu"],"prompt_function":"mmlu_human_aging","hf_repo":"cais\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_aging","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_aging","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_aging","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:human_sexuality","suite":["original","mmlu"],"prompt_function":"mmlu_human_sexuality","hf_repo":"cais\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_sexuality","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:human_sexuality","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"human_sexuality","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:international_law","suite":["original","mmlu"],"prompt_function":"mmlu_international_law","hf_repo":"cais\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:international_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:international_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"international_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:jurisprudence","suite":["original","mmlu"],"prompt_function":"mmlu_jurisprudence","hf_repo":"cais\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:jurisprudence","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:jurisprudence","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"jurisprudence","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:logical_fallacies","suite":["original","mmlu"],"prompt_function":"mmlu_logical_fallacies","hf_repo":"cais\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:logical_fallacies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:logical_fallacies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"logical_fallacies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:machine_learning","suite":["original","mmlu"],"prompt_function":"mmlu_machine_learning","hf_repo":"cais\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:machine_learning","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:machine_learning","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"machine_learning","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:management","suite":["original","mmlu"],"prompt_function":"mmlu_management","hf_repo":"cais\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:management","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:management","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"management","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:marketing","suite":["original","mmlu"],"prompt_function":"mmlu_marketing","hf_repo":"cais\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:marketing","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:marketing","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"marketing","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:medical_genetics","suite":["original","mmlu"],"prompt_function":"mmlu_medical_genetics","hf_repo":"cais\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:medical_genetics","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:medical_genetics","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"medical_genetics","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:miscellaneous","suite":["original","mmlu"],"prompt_function":"mmlu_miscellaneous","hf_repo":"cais\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:miscellaneous","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:miscellaneous","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"miscellaneous","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_disputes","suite":["original","mmlu"],"prompt_function":"mmlu_moral_disputes","hf_repo":"cais\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:moral_disputes","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_disputes","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_disputes","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_scenarios","suite":["original","mmlu"],"prompt_function":"mmlu_moral_scenarios","hf_repo":"cais\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:moral_scenarios","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:moral_scenarios","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"moral_scenarios","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:nutrition","suite":["original","mmlu"],"prompt_function":"mmlu_nutrition","hf_repo":"cais\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:nutrition","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:nutrition","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"nutrition","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:philosophy","suite":["original","mmlu"],"prompt_function":"mmlu_philosophy","hf_repo":"cais\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:philosophy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:philosophy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"philosophy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:prehistory","suite":["original","mmlu"],"prompt_function":"mmlu_prehistory","hf_repo":"cais\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:prehistory","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:prehistory","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"prehistory","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_accounting","suite":["original","mmlu"],"prompt_function":"mmlu_professional_accounting","hf_repo":"cais\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_accounting","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:professional_accounting","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_accounting","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_law","suite":["original","mmlu"],"prompt_function":"mmlu_professional_law","hf_repo":"cais\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_law","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_law","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_law","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:professional_medicine","suite":["original","mmlu"],"prompt_function":"mmlu_professional_medicine","hf_repo":"cais\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_medicine","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_medicine","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_medicine","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_psychology","suite":["original","mmlu"],"prompt_function":"mmlu_professional_psychology","hf_repo":"cais\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:professional_psychology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:professional_psychology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"professional_psychology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:public_relations","suite":["original","mmlu"],"prompt_function":"mmlu_public_relations","hf_repo":"cais\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:public_relations","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:public_relations","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"public_relations","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:security_studies","suite":["original","mmlu"],"prompt_function":"mmlu_security_studies","hf_repo":"cais\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:security_studies","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:security_studies","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"security_studies","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:sociology","suite":["original","mmlu"],"prompt_function":"mmlu_sociology","hf_repo":"cais\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:sociology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:sociology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"sociology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:us_foreign_policy","suite":["original","mmlu"],"prompt_function":"mmlu_us_foreign_policy","hf_repo":"cais\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:us_foreign_policy","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:us_foreign_policy","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"us_foreign_policy","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:virology","suite":["original","mmlu"],"prompt_function":"mmlu_virology","hf_repo":"cais\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:virology","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mmlu:virology","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"virology","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:world_religions","suite":["original","mmlu"],"prompt_function":"mmlu_world_religions","hf_repo":"cais\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:world_religions","suite":["leaderboard","mmlu"],"prompt_function":"mmlu_harness","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":"sequential","generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mmlu:world_religions","suite":["helm","helm_general"],"prompt_function":"mmlu_helm","hf_repo":"lighteval\/mmlu","hf_subset":"world_religions","hf_avail_splits":["auxiliary_train","test","validation","dev"],"evaluation_splits":["test"],"few_shots_split":"dev","few_shots_select":null,"generation_size":5,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mnist_ascii","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mnist_ascii","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"modified_arithmetic","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"modified_arithmetic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"moral_permissibility","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"moral_permissibility","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"movie_dialog_same_or_different","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"movie_dialog_same_or_different","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"movie_recommendation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"movie_recommendation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mtnt2019:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mtnt2019:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mtnt2019:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mtnt2019:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"mtnt2019_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mult_data_wrangling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"mult_data_wrangling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"multiemo","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"multiemo","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"mutual","suite":["lighteval"],"prompt_function":"mutual","hf_repo":"lighteval\/mutual_harness","hf_subset":"mutual","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["recall_at_1","recall_at_2","mrr"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"mutual_plus","suite":["lighteval"],"prompt_function":"mutual","hf_repo":"lighteval\/mutual_harness","hf_subset":"mutual_plus","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["recall_at_1","recall_at_2","mrr"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"narrativeqa","suite":["helm","helm_general"],"prompt_function":"narrativeqa","hf_repo":"lighteval/narrative_qa_helm","hf_subset":"default","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match","quasi_exact_match","f1_score","rougeL","bleu_1","bleu_4"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"natural_instructions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"natural_instructions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"navigate","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"navigate","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"nonsense_words_grammar","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"nonsense_words_grammar","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"novel_concepts","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"novel_concepts","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:linear_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"linear_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:linear_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"linear_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"numeracy:parabola_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"parabola_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:parabola_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"parabola_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:paraboloid_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"paraboloid_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:paraboloid_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"paraboloid_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"numeracy:plane_example","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"plane_example","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"numeracy:plane_standard","suite":["helm"],"prompt_function":"numeracy","hf_repo":"lighteval\/numeracy","hf_subset":"plane_standard","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"object_counting","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"object_counting","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"odd_one_out","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"odd_one_out","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"openbookqa","suite":["helm","commonsense_scenario","helm_general"],"prompt_function":"openbookqa_helm","hf_repo":"openbookqa","hf_subset":"main","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"openbookqa","suite":["lighteval"],"prompt_function":"openbookqa","hf_repo":"openbookqa","hf_subset":"main","hf_avail_splits":["train","test","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"operators","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"operators","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":"([-+]?\\d+)[.]{0,1}$", "trust_dataset": true,"version":0} -{"name":"paragraph_segmentation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"paragraph_segmentation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"parsinlu_qa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"parsinlu_qa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"parsinlu_reading_comprehension","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"parsinlu_reading_comprehension","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true,"version":0} -{"name":"penguins_in_a_table","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"penguins_in_a_table","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"periodic_elements","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"periodic_elements","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"persian_idioms","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"persian_idioms","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"phrase_relatedness","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"phrase_relatedness","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"physical_intuition","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physical_intuition","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"physics","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physics","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"physics_questions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"physics_questions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"piqa","suite":["lighteval"],"prompt_function":"piqa_harness","hf_repo":"piqa","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"piqa","suite":["helm","commonsense_scenario"],"prompt_function":"piqa_helm","hf_repo":"piqa","hf_subset":"plain_text","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"play_dialog_same_or_different","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"play_dialog_same_or_different","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"polish_sequence_labeling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"polish_sequence_labeling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"presuppositions_as_nli","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"presuppositions_as_nli","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"prost","suite":["lighteval"],"prompt_function":"prost","hf_repo":"corypaik\/prost","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"pubmedqa","suite":["lighteval"],"prompt_function":"pubmed_qa","hf_repo":"pubmed_qa","hf_subset":"pqa_labeled","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"pubmedqa","suite":["helm"],"prompt_function":"pubmed_qa_helm","hf_repo":"pubmed_qa","hf_subset":"pqa_labeled","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qa4mre:2011","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2011.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qa4mre:2012","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2012.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qa4mre:2013","suite":["lighteval"],"prompt_function":"qa4mre","hf_repo":"qa4mre","hf_subset":"2013.main.EN","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"qa_wikidata","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"qa_wikidata","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleurt","bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qasper","suite":["lighteval"],"prompt_function":"qasper","hf_repo":"qasper","hf_subset":"qasper","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["f1_score_quasi"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"qasper_ll","suite":["lighteval"],"prompt_function":"qasper_ll","hf_repo":"qasper","hf_subset":"qasper","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"quac","suite":["helm"],"prompt_function":"quac","hf_repo":"lighteval/quac_helm","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["exact_match", "quasi_exact_match", "f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"question_selection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"question_selection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, 
"trust_dataset": true,"version":0} -{"name":"race:high","suite":["lighteval","race"],"prompt_function":"race","hf_repo":"EleutherAI/race","hf_subset":"high","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:ade_corpus_v2","suite":["helm","helm_general"],"prompt_function":"raft_ade_corpus_v2","hf_repo":"ought\/raft","hf_subset":"ade_corpus_v2","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:banking_77","suite":["helm","helm_general"],"prompt_function":"raft_banking_77","hf_repo":"ought\/raft","hf_subset":"banking_77","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:neurips_impact_statement_risks","suite":["helm","helm_general"],"prompt_function":"raft_neurips_impact_statement_risks","hf_repo":"ought\/raft","hf_subset":"neurips_impact_statement_risks","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"raft:one_stop_english","suite":["helm","helm_general"],"prompt_function":"raft_one_stop_english","hf_repo":"ought\/raft","hf_subset":"one_stop_english","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:overruling","suite":["helm","helm_general"],"prompt_function":"raft_overruling","hf_repo":"ought\/raft","hf_subset":"overruling","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:semiconductor_org_types","suite":["helm","helm_general"],"prompt_function":"raft_semiconductor_org_types","hf_repo":"ought\/raft","hf_subset":"semiconductor_org_types","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"raft:systematic_review_inclusion","suite":["helm","helm_general"],"prompt_function":"raft_systematic_review_inclusion","hf_repo":"ought\/raft","hf_subset":"systematic_review_inclusion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:tai_safety_research","suite":["helm","helm_general"],"prompt_function":"raft_tai_safety_research","hf_repo":"ought\/raft","hf_subset":"tai_safety_research","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:terms_of_service","suite":["helm","helm_general"],"prompt_function":"raft_terms_of_service","hf_repo":"ought\/raft","hf_subset":"terms_of_service","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"raft:tweet_eval_hate","suite":["helm","helm_general"],"prompt_function":"raft_tweet_eval_hate","hf_repo":"ought\/raft","hf_subset":"tweet_eval_hate","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"raft:twitter_complaints","suite":["helm","helm_general"],"prompt_function":"raft_twitter_complaints","hf_repo":"ought\/raft","hf_subset":"twitter_complaints","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":30,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","f1_score_macro","f1_score_micro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"real_or_fake_text","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"real_or_fake_text","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"real_toxicity_prompts","suite":["helm"],"prompt_function":"real_toxicity_prompts","hf_repo":"allenai\/real-toxicity-prompts","hf_subset":"default","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["prediction_perplexity"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"reasoning_about_colored_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"repeat_copy_logic","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"repeat_copy_logic","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"rephrase","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"rephrase","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["rouge_t5","bleu","loglikelihood_acc","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"rhyming","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"rhyming","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"riddle_sense","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"riddle_sense","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"ruin_names","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"ruin_names","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"salient_translation_error_detection","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"salient_translation_error_detection","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"scientific_press_release","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"scientific_press_release","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"sciq","suite":["lighteval"],"prompt_function":"sciq","hf_repo":"sciq","hf_subset":"default","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"semantic_parsing_in_context_sparc","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"semantic_parsing_in_context_sparc","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"semantic_parsing_spider","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"semantic_parsing_spider","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"sentence_ambiguity","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sentence_ambiguity","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"similarities_abstraction","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"similarities_abstraction","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simp_turing_concept","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simp_turing_concept","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_arithmetic_json","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_arithmetic_json_multiple_choice","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json_multiple_choice","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"simple_arithmetic_json_subtasks","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_json_subtasks","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_arithmetic_multiple_targets_json","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_arithmetic_multiple_targets_json","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_ethical_questions","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_ethical_questions","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"simple_text_editing","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"simple_text_editing","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"siqa","suite":["helm","commonsense_scenario"],"prompt_function":"siqa","hf_repo":"social_i_qa","hf_subset":"default","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"snarks","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"snarks","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"social_iqa","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"social_iqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"social_support","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"social_support","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["f1_score_macro"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"sports_understanding","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sports_understanding","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"storycloze:2016","suite":["lighteval","storycloze"],"prompt_function":"storycloze","hf_repo":"story_cloze","hf_subset":"2016","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"storycloze:2018","suite":["lighteval","storycloze"],"prompt_function":"storycloze","hf_repo":"story_cloze","hf_subset":"2018","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"strange_stories","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"strange_stories","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"strategyqa","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"strategyqa","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"sufficient_information","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"sufficient_information","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"suicide_risk","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"suicide_risk","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"summarization:cnn-dm","suite":["helm","helm_general"],"prompt_function":"cnn_dm","hf_repo":"lighteval\/summarization","hf_subset":"cnn-dm","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":128,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"summarization:xsum","suite":["helm","helm_general"],"prompt_function":"xsum","hf_repo":"lighteval\/summarization","hf_subset":"xsum","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":64,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"summarization:xsum-sampled","suite":["helm"],"prompt_function":"xsum","hf_repo":"lighteval\/summarization","hf_subset":"xsum-sampled","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":64,"metric":["rouge1","rouge2","rougeL","faithfulness","extractiveness","bert_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:boolq","suite":["lighteval","superglue"],"prompt_function":"boolq_harness","hf_repo":"super_glue","hf_subset":"boolq","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:cb","suite":["lighteval","superglue"],"prompt_function":"cb","hf_repo":"super_glue","hf_subset":"cb","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc_single_token", "multi_f1_numeric"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"super_glue:copa","suite":["lighteval","superglue"],"prompt_function":"copa","hf_repo":"super_glue","hf_subset":"copa","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:rte","suite":["lighteval","superglue"],"prompt_function":"rte","hf_repo":"super_glue","hf_subset":"rte","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:multirc","suite":["lighteval","superglue"],"prompt_function":"multirc","hf_repo":"super_glue","hf_subset":"multirc","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"super_glue:wic","suite":["lighteval","superglue"],"prompt_function":"wic","hf_repo":"super_glue","hf_subset":"wic","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"super_glue:wsc","suite":["lighteval","superglue"],"prompt_function":"wsc","hf_repo":"super_glue","hf_subset":"wsc","hf_avail_splits":["test","train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"swahili_english_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"swahili_english_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"swag","suite":["lighteval"],"prompt_function":"swag","hf_repo":"swag","hf_subset":"regular","hf_avail_splits":["train","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm_nospace"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"swedish_to_german_proverbs","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"swedish_to_german_proverbs","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"symbol_interpretation","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_whitespace_after_query","hf_repo":"bigbench","hf_subset":"symbol_interpretation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"synthetic_reasoning:induction","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"induction","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"synthetic_reasoning:natural_easy","suite":["helm"],"prompt_function":"synthetic_reasoning_natural","hf_repo":"lighteval\/synthetic_reasoning_natural","hf_subset":"easy","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"synthetic_reasoning:natural_hard","suite":["helm"],"prompt_function":"synthetic_reasoning_natural","hf_repo":"lighteval\/synthetic_reasoning_natural","hf_subset":"hard","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","f1_score"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"synthetic_reasoning:pattern_match","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"pattern_match","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"synthetic_reasoning:variable_substitution","suite":["helm"],"prompt_function":"synthetic_reasoning","hf_repo":"lighteval\/synthetic_reasoning","hf_subset":"variable_substitution","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":50,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"tellmewhy","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tellmewhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"temporal_sequences","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"temporal_sequences","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"tense","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tense","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:arxiv","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_arxiv","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:arxiv","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"arxiv","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:bibliotik","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"bibliotik","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:bookcorpus2","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_bookcorpus2","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:books3","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_books3","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:commoncrawl","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"commoncrawl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:dm-mathematics","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_dm-mathematics","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:dm-mathematics","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"dm-mathematics","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:enron","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_enron","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:enron","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"enron","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:europarl","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_europarl","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:europarl","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"europarl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:freelaw","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_freelaw","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:freelaw","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"freelaw","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:github","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_github","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:github","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"github","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:gutenberg","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_gutenberg","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:gutenberg","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"gutenberg","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:hackernews","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_hackernews","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:hackernews","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"hackernews","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:nih-exporter","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_nih-exporter","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:nih-exporter","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"nih-exporter","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:opensubtitles","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_opensubtitles","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:opensubtitles","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"opensubtitles","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:openwebtext2","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_openwebtext2","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:openwebtext2","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"openwebtext2","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:philpapers","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_philpapers","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:pile-cc","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pile-cc","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:pubmed-abstracts","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pubmed-abstracts","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:pubmed-abstracts","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"pubmed-abstracts","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:pubmed-central","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_pubmed-central","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:pubmed-central","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"pubmed-central","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:stackexchange","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_stackexchange","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:stackexchange","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"stackexchange","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:ubuntu-irc","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_ubuntu-irc","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:uspto","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_upsto","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:upsto","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"uspto","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:wikipedia","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_wikipedia","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:wikipedia","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"wikipedia","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"the_pile:youtubesubtitles","suite":["lighteval","pile"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile","hf_subset":"pile_youtubesubtitles","hf_avail_splits":["validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"the_pile:youtubesubtitles","suite":["helm"],"prompt_function":"the_pile","hf_repo":"lighteval\/pile_helm","hf_subset":"youtubesubtitles","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"timedial","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"timedial","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"toxigen","suite":["lighteval"],"prompt_function":"toxigen","hf_repo":"skg/toxigen-data","hf_subset":"annotated","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"topical_chat","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"topical_chat","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu","rouge_t5","loglikelihood_acc","bleurt"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"tracking_shuffled_objects","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"tracking_shuffled_objects","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"triviaqa","suite":["lighteval"],"prompt_function":"triviaqa","hf_repo":"trivia_qa","hf_subset":"rc.nocontext","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["quasi_exact_match_triviaqa"],"stop_sequence":["\n", ".", ","],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"truthfulqa:gen","suite":["lighteval"],"prompt_function":"truthful_qa_generative","hf_repo":"truthful_qa","hf_subset":"generation","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":200,"metric":["bleu","rouge_t5"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"truthfulqa:mc","suite":["leaderboard"],"prompt_function":"truthful_qa_multiple_choice","hf_repo":"truthful_qa","hf_subset":"multiple_choice","hf_avail_splits":["validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["truthfulqa_mc_metrics"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"truthfulqa","suite":["helm","helm_general"],"prompt_function":"truthful_qa_helm","hf_repo":"lighteval\/truthfulqa_helm","hf_subset":"default","hf_avail_splits":["train","valid"],"evaluation_splits":["valid"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["loglikelihood_acc","exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"twitterAAE:aa","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"aa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"twitterAAE:white","suite":["helm"],"prompt_function":"twitter_aae","hf_repo":"lighteval\/twitterAAE","hf_subset":"white","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"understanding_fables","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"understanding_fables","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"undo_permutation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"undo_permutation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unit_conversion","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unit_conversion","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unit_interpretation","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unit_interpretation","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"unnatural_in_context_learning","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"unnatural_in_context_learning","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unscramble:anagrams1","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["mid_word_1_anagrams"],"evaluation_splits":["mid_word_1_anagrams"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unscramble:anagrams2","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["mid_word_2_anagrams"],"evaluation_splits":["mid_word_2_anagrams"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unscramble:cycle_letters","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["cycle_letters_in_word"],"evaluation_splits":["cycle_letters_in_word"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"unscramble:random_insertion","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["random_insertion_in_word"],"evaluation_splits":["random_insertion_in_word"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"unscramble:reversed_words","suite":["lighteval","unscramble"],"prompt_function":"unscramble","hf_repo":"lighteval\/GPT3_unscramble","hf_subset":"default","hf_avail_splits":["reversed_words"],"evaluation_splits":["reversed_words"],"few_shots_split":null,"few_shots_select":null,"generation_size":5,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"vitaminc_fact_verification","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"vitaminc_fact_verification","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"webqs","suite":["lighteval"],"prompt_function":"webqs","hf_repo":"web_questions","hf_subset":"default","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["acc_golds_likelihood"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"what_is_the_tao","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"what_is_the_tao","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"which_wiki_edit","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"which_wiki_edit","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:applies_to_jurisdiction","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"applies_to_jurisdiction","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:atomic_number","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"atomic_number","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:author","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"author","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:award_received","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"award_received","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:basic_form_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"basic_form_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:capital","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"capital","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:capital_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"capital_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:central_bank","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"central_bank","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:composer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"composer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:continent","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"continent","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:country","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:country_of_citizenship","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country_of_citizenship","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:country_of_origin","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"country_of_origin","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:creator","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"creator","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:currency","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"currency","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:defendant","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"defendant","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:developer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"developer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:diplomatic_relation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"diplomatic_relation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:director","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"director","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:discoverer_or_inventor","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"discoverer_or_inventor","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:drug_or_therapy_used_for_treatment","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"drug_or_therapy_used_for_treatment","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:educated_at","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"educated_at","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:electron_configuration","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"electron_configuration","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:employer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"employer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:field_of_work","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"field_of_work","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:file_extension","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"file_extension","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:genetic_association","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"genetic_association","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:genre","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"genre","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:has_part","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"has_part","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:head_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"head_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:head_of_state","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"head_of_state","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:headquarters_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"headquarters_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:industry","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"industry","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:influenced_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"influenced_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:instance_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"instance_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:instrument","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"instrument","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:language_of_work_or_name","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"language_of_work_or_name","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:languages_spoken_written_or_signed","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"languages_spoken_written_or_signed","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:laws_applied","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"laws_applied","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:located_in_the_administrative_territorial_entity","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"located_in_the_administrative_territorial_entity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:location_of_discovery","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location_of_discovery","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:location_of_formation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"location_of_formation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:majority_opinion_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"majority_opinion_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:manufacturer","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"manufacturer","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:measured_physical_quantity","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"measured_physical_quantity","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:medical_condition_treated","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"medical_condition_treated","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:member_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:member_of_political_party","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of_political_party","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:member_of_sports_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"member_of_sports_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:movement","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"movement","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:named_after","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"named_after","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:native_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"native_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:number_of_processor_cores","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"number_of_processor_cores","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:occupation","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"occupation","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:office_held_by_head_of_government","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"office_held_by_head_of_government","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:office_held_by_head_of_state","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"office_held_by_head_of_state","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:official_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"official_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:operating_system","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"operating_system","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:original_language_of_film_or_TV_show","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"original_language_of_film_or_TV_show","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:original_network","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"original_network","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:overrules","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"overrules","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:owned_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"owned_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:part_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"part_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:participating_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"participating_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:place_of_birth","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"place_of_birth","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:place_of_death","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"place_of_death","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:plaintiff","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"plaintiff","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:position_held","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"position_held","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:position_played_on_team","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"position_played_on_team","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:programming_language","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"programming_language","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:recommended_unit_of_measurement","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"recommended_unit_of_measurement","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:record_label","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"record_label","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:religion","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"religion","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:repealed_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"repealed_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:shares_border_with","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"shares_border_with","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:solved_by","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"solved_by","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:statement_describes","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"statement_describes","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:stock_exchange","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"stock_exchange","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:subclass_of","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"subclass_of","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:subsidiary","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"subsidiary","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:symptoms_and_signs","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"symptoms_and_signs","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikifact:therapeutic_area","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"therapeutic_area","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:time_of_discovery_or_invention","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"time_of_discovery_or_invention","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:twinned_administrative_body","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"twinned_administrative_body","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikifact:work_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"work_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wikitext:2","suite":["lighteval"],"prompt_function":"wikitext","hf_repo":"wikitext","hf_subset":"wikitext-2-raw-v1","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikitext:103:document_level","suite":["harness"],"prompt_function":"wikitext_harness","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wikitext:103:document_level","suite":["helm"],"prompt_function":"wikitext_helm","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wino_x_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"wino_x_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"winogrande","suite":["leaderboard"],"prompt_function":"winogrande","hf_repo":"winogrande","hf_subset":"winogrande_xl","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"winowhy","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"winowhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt08:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt08:en-hu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_en-hu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt08:hu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt08_hu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt09:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt09:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-hu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-hu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:en-it","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_en-it","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt09:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:hu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_hu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt09:it-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt09_it-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt10:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt10:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt10:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt10_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt11:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt11:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt11:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt11_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt12:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt12:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt12:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt12_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt13:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt13:en-es","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-es","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:es-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_es-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt13:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt13:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt13_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt14:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-fr","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt14","hf_subset":"fr-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt14:en-hi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-hi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:fr-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt14","hf_subset":"fr-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt14:hi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_hi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt14_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:cs-en","suite":["helm"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"cs-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:de-en","suite":["helm"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"de-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt14:fr-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"fr-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:hi-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"hi-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt14:ru-en","suite":["helm"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/wmt14","hf_subset":"ru-en","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt15:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt15:en-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt15:fr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_fr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt15:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt15_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:de-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt16","hf_subset":"de-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt16:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-de","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt16","hf_subset":"de-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt16:en-ro","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_alphabetical","hf_repo":"wmt16","hf_subset":"ro-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-ro","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-ro","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt16:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:ro-en","suite":["lighteval","gpt3_benchmarks"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"wmt16","hf_subset":"ro-en","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:ro-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_ro-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt16:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt16:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt16_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt17:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-lv","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-lv","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt17:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:lv-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_lv-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt17:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt17:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt17_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt18:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-et","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-et","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt18:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-tr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-tr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt18:et-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_et-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt18:tr-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_tr-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt18:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt18_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:cs-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_cs-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:de-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt19:de-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_de-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-fi","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-fi","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt19:en-gu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-gu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-kk","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-kk","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-lt","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-lt","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt19:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:fi-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_fi-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:fr-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_fr-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:gu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_gu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt19:kk-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_kk-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:lt-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_lt-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt19:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt19_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt20:cs-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_cs-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:de-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_de-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:de-fr","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_de-fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-cs","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-cs","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt20:en-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-iu","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-iu","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-ja","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ja","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-km","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-km","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt20:en-pl","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-pl","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-ps","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ps","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-ru","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:en-ta","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-ta","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt20:en-zh","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_en-zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:fr-de","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_fr-de","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:iu-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_iu-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:ja-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ja-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt20:km-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_km-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:pl-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_pl-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:ps-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ps-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:ru-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ru-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wmt20:ta-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_ta-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"wmt20:zh-en","suite":["lighteval","sacrebleu"],"prompt_function":"wmt_reverse_alphabetical","hf_repo":"lighteval\/sacrebleu_manual","hf_subset":"wmt20_zh-en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["bleu","chrf","ter"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"word_sorting","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"word_sorting","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"word_unscrambling","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"word_unscrambling","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"wsc273","suite":["lighteval"],"prompt_function":"wsc273","hf_repo":"winograd_wsc","hf_subset":"wsc273","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:en","suite":["lighteval"],"prompt_function":"xcopa_en","hf_repo":"xcopa","hf_subset":"default","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:et","suite":["lighteval"],"prompt_function":"xcopa_et","hf_repo":"xcopa","hf_subset":"et","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:ht","suite":["lighteval"],"prompt_function":"xcopa_ht","hf_repo":"xcopa","hf_subset":"ht","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:it","suite":["lighteval"],"prompt_function":"xcopa_it","hf_repo":"xcopa","hf_subset":"it","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"xcopa:id","suite":["lighteval"],"prompt_function":"xcopa_id","hf_repo":"xcopa","hf_subset":"id","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:qu","suite":["lighteval"],"prompt_function":"xcopa_qu","hf_repo":"xcopa","hf_subset":"qu","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:sw","suite":["lighteval"],"prompt_function":"xcopa_sw","hf_repo":"xcopa","hf_subset":"sw","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:zh","suite":["lighteval"],"prompt_function":"xcopa_zh","hf_repo":"xcopa","hf_subset":"zh","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:ta","suite":["lighteval"],"prompt_function":"xcopa_ta","hf_repo":"xcopa","hf_subset":"ta","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"xcopa:th","suite":["lighteval"],"prompt_function":"xcopa_th","hf_repo":"xcopa","hf_subset":"th","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:tr","suite":["lighteval"],"prompt_function":"xcopa_tr","hf_repo":"xcopa","hf_subset":"tr","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xcopa:vi","suite":["lighteval"],"prompt_function":"xcopa_vi","hf_repo":"xcopa","hf_subset":"vi","hf_avail_splits":["test","train","validation"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:en","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"en","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:ru","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"ru","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"xstory_cloze:zh","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"zh","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:es","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"es","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:ar","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"ar","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:hi","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"hi","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:id","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"id","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"xstory_cloze:te","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"te","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:sw","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"sw","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:eu","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"eu","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xstory_cloze:my","suite":["lighteval"],"prompt_function":"storycloze","hf_repo":"juletxara/xstory_cloze","hf_subset":"my","hf_avail_splits":["training", "eval"],"evaluation_splits":["eval"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:en","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"en","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} 
-{"name":"xwinograd:fr","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"fr","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:jp","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"jp","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:pt","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"pt","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:ru","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"ru","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} -{"name":"xwinograd:zh","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"Muennighoff/xwinograd","hf_subset":"zh","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true,"version":0} diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index 162357858..3e032d1f4 100644 --- 
a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -209,7 +209,7 @@ def is_openai_available() -> bool: def can_load_extended_tasks() -> bool: imports = [] - for package in ["langdetect"]: + for package in ["langdetect", "openai"]: imports.append(importlib.util.find_spec(package)) return all(cur_import is not None for cur_import in imports)