diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 1989584a..ea918606 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -14,7 +14,7 @@ # original is the reimplementation of original evals # custom is to play around -DEFAULT_SUITES = ["helm", "bigbench", "lighteval", "original", "custom"] +DEFAULT_SUITES = ["helm", "harness", "bigbench", "lighteval", "original", "custom"] TRUNCATE_FEW_SHOTS_DEFAULTS = True diff --git a/src/lighteval/tasks/tasks_prompt_formatting.py b/src/lighteval/tasks/tasks_prompt_formatting.py index c3fd88b1..cf3677bc 100644 --- a/src/lighteval/tasks/tasks_prompt_formatting.py +++ b/src/lighteval/tasks/tasks_prompt_formatting.py @@ -6,6 +6,7 @@ import pycountry +from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import Doc from lighteval.utils import as_list @@ -157,7 +158,7 @@ def bbh_formal_fallacies(line, task_name: str = None): def bbh_geometric_shapes(line, task_name: str = None): instruction = "Name geometric shapes from their SVG paths.\n\n" - choices = [f"({c})" for c in LETTER_INDICES[:9]] + choices = [f"({c})" for c in LETTER_INDICES[:11]] return bbh(line, instruction, choices, task_name) def bbh_hyperbaton(line, task_name: str = None): @@ -181,6 +182,9 @@ def bbh_logical_deduction_three_objects(line, task_name: str = None): return bbh(line, instruction, choices, task_name) def bbh_movie_recommendation(line, task_name: str = None): + if line["target"] == "Monsters, Inc": # this line is not correctly formatted + hlog_warn("One sample removed from task bbh:movie_recommendation because its line is incorrectly formatted.") + return [] instruction = "Recommend movies similar to the given list of movies.\n\n" choices = [f"({c})" for c in LETTER_INDICES[:6]] return bbh(line, instruction, choices, task_name) @@ -197,7 +201,7 @@ def bbh_navigate(line, task_name: str = None): def bbh_object_counting(line, task_name: str = None): 
instruction = "Questions that involve enumerating objects and asking the model to count them.\n\n" - choices = [i for i in range(18)] + choices = [str(i) for i in range(1, 19)] return bbh(line, instruction, choices, task_name) def bbh_penguins_in_a_table(line, task_name: str = None): @@ -211,6 +215,9 @@ def bbh_reasoning_about_colored_objects(line, task_name: str = None): return bbh(line, instruction, choices, task_name) def bbh_ruin_names(line, task_name: str = None): + if line["target"] in ["dearth, wind, & fire", "rita, sue and bob poo"]: # line not correctly formatted + hlog_warn("One sample removed from task bbh:ruin_names because its line is incorrectly formatted.") + return [] instruction = "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" choices = [f"({c})" for c in LETTER_INDICES[:6]] return bbh(line, instruction, choices, task_name) diff --git a/src/lighteval/tasks/tasks_table.jsonl b/src/lighteval/tasks/tasks_table.jsonl index b3c1bf79..260bf5ac 100644 --- a/src/lighteval/tasks/tasks_table.jsonl +++ b/src/lighteval/tasks/tasks_table.jsonl @@ -28,33 +28,33 @@ {"name":"auto_categorization","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"auto_categorization","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false} {"name":"auto_debugging","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"auto_debugging","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+"} 
{"name":"babi_qa","suite":["helm"],"prompt_function":"babi_qa","hf_repo":"facebook\/babi_qa","hf_subset":"en-valid-qa1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false} -{"name":"bbh:boolean_expressions","suite":["harness"],"prompt_function":"bbh_boolean_expressions","hf_repo":"lukaemon/bbh","hf_subset":"boolean_expressions","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:causal_judgement","suite":["harness"],"prompt_function":"bbh_causal_judgement","hf_repo":"lukaemon/bbh","hf_subset":"causal_judgement","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:date_understanding","suite":["harness"],"prompt_function":"bbh_date_understanding","hf_repo":"lukaemon/bbh","hf_subset":"date_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
-{"name":"bbh:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_disambiguation_qa","hf_repo":"lukaemon/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:dyck_languages","suite":["harness"],"prompt_function":"bbh_dyck_languages","hf_repo":"lukaemon/bbh","hf_subset":"dyck_languages","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:formal_fallacies","suite":["harness"],"prompt_function":"bbh_formal_fallacies","hf_repo":"lukaemon/bbh","hf_subset":"formal_fallacies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:geometric_shapes","suite":["harness"],"prompt_function":"bbh_geometric_shapes","hf_repo":"lukaemon/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
-{"name":"bbh:hyperbaton","suite":["harness"],"prompt_function":"bbh_hyperbaton","hf_repo":"lukaemon/bbh","hf_subset":"hyperbaton","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
-{"name":"bbh:movie_recommendation","suite":["harness"],"prompt_function":"bbh_movie_recommendation","hf_repo":"lukaemon/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:multistep_arithmetic_two","suite":["harness"],"prompt_function":"bbh_multistep_arithmetic_two","hf_repo":"lukaemon/bbh","hf_subset":"multistep_arithmetic_two","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:navigate","suite":["harness"],"prompt_function":"bbh_navigate","hf_repo":"lukaemon/bbh","hf_subset":"navigate","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:object_counting","suite":["harness"],"prompt_function":"bbh_object_counting","hf_repo":"lukaemon/bbh","hf_subset":"object_counting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
-{"name":"bbh:penguins_in_a_table","suite":["harness"],"prompt_function":"bbh_penguins_in_a_table","hf_repo":"lukaemon/bbh","hf_subset":"penguins_in_a_table","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_reasoning_about_colored_objects","hf_repo":"lukaemon/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:ruin_names","suite":["harness"],"prompt_function":"bbh_ruin_names","hf_repo":"lukaemon/bbh","hf_subset":"ruin_names","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_salient_translation_error_detection","hf_repo":"lukaemon/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
-{"name":"bbh:snarks","suite":["harness"],"prompt_function":"bbh_snarks","hf_repo":"lukaemon/bbh","hf_subset":"snarks","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:sports_understanding","suite":["harness"],"prompt_function":"bbh_sports_understanding","hf_repo":"lukaemon/bbh","hf_subset":"sports_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:temporal_sequences","suite":["harness"],"prompt_function":"bbh_temporal_sequences","hf_repo":"lukaemon/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
-{"name":"bbh:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:web_of_lies","suite":["harness"],"prompt_function":"bbh_web_of_lies","hf_repo":"lukaemon/bbh","hf_subset":"web_of_lies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} -{"name":"bbh:word_sorting","suite":["harness"],"prompt_function":"bbh_word_sorting","hf_repo":"lukaemon/bbh","hf_subset":"word_sorting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":null,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
+{"name":"bbh:boolean_expressions","suite":["harness"],"prompt_function":"bbh_boolean_expressions","hf_repo":"lukaemon/bbh","hf_subset":"boolean_expressions","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:causal_judgement","suite":["harness"],"prompt_function":"bbh_causal_judgement","hf_repo":"lukaemon/bbh","hf_subset":"causal_judgement","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:date_understanding","suite":["harness"],"prompt_function":"bbh_date_understanding","hf_repo":"lukaemon/bbh","hf_subset":"date_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_disambiguation_qa","hf_repo":"lukaemon/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
+{"name":"bbh:dyck_languages","suite":["harness"],"prompt_function":"bbh_dyck_languages","hf_repo":"lukaemon/bbh","hf_subset":"dyck_languages","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:formal_fallacies","suite":["harness"],"prompt_function":"bbh_formal_fallacies","hf_repo":"lukaemon/bbh","hf_subset":"formal_fallacies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:geometric_shapes","suite":["harness"],"prompt_function":"bbh_geometric_shapes","hf_repo":"lukaemon/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:hyperbaton","suite":["harness"],"prompt_function":"bbh_hyperbaton","hf_repo":"lukaemon/bbh","hf_subset":"hyperbaton","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
+{"name":"bbh:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_logical_deduction_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:movie_recommendation","suite":["harness"],"prompt_function":"bbh_movie_recommendation","hf_repo":"lukaemon/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
+{"name":"bbh:multistep_arithmetic_two","suite":["harness"],"prompt_function":"bbh_multistep_arithmetic_two","hf_repo":"lukaemon/bbh","hf_subset":"multistep_arithmetic_two","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:navigate","suite":["harness"],"prompt_function":"bbh_navigate","hf_repo":"lukaemon/bbh","hf_subset":"navigate","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:object_counting","suite":["harness"],"prompt_function":"bbh_object_counting","hf_repo":"lukaemon/bbh","hf_subset":"object_counting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:penguins_in_a_table","suite":["harness"],"prompt_function":"bbh_penguins_in_a_table","hf_repo":"lukaemon/bbh","hf_subset":"penguins_in_a_table","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
+{"name":"bbh:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_reasoning_about_colored_objects","hf_repo":"lukaemon/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:ruin_names","suite":["harness"],"prompt_function":"bbh_ruin_names","hf_repo":"lukaemon/bbh","hf_subset":"ruin_names","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_salient_translation_error_detection","hf_repo":"lukaemon/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:snarks","suite":["harness"],"prompt_function":"bbh_snarks","hf_repo":"lukaemon/bbh","hf_subset":"snarks","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
+{"name":"bbh:sports_understanding","suite":["harness"],"prompt_function":"bbh_sports_understanding","hf_repo":"lukaemon/bbh","hf_subset":"sports_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:temporal_sequences","suite":["harness"],"prompt_function":"bbh_temporal_sequences","hf_repo":"lukaemon/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_five_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_seven_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} 
+{"name":"bbh:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_tracking_shuffled_objects_three_objects","hf_repo":"lukaemon/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:web_of_lies","suite":["harness"],"prompt_function":"bbh_web_of_lies","hf_repo":"lukaemon/bbh","hf_subset":"web_of_lies","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} +{"name":"bbh:word_sorting","suite":["harness"],"prompt_function":"bbh_word_sorting","hf_repo":"lukaemon/bbh","hf_subset":"word_sorting","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["", "Q:", "\n\n"],"output_regex":null,"frozen":false} {"name":"bbq","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"all","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false} 
{"name":"bbq:Age","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Age","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false} {"name":"bbq:Disability_status","suite":["helm"],"prompt_function":"bbq","hf_repo":"lighteval\/bbq_helm","hf_subset":"Disability_status","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}