diff --git a/scripts/eval/yamls/eval_gauntlet_v0.2.1.yaml b/scripts/eval/yamls/eval_gauntlet_v0.2.1.yaml
new file mode 100644
index 0000000000..5959d2fde0
--- /dev/null
+++ b/scripts/eval/yamls/eval_gauntlet_v0.2.1.yaml
@@ -0,0 +1,109 @@
+eval_gauntlet:
+  weighting: EQUAL
+  subtract_random_baseline: true
+  rescale_accuracy: true
+  averages:
+    core_average:
+    - world_knowledge
+    - commonsense_reasoning
+    - language_understanding
+    - symbolic_problem_solving
+    - reading_comprehension
+  categories:
+  - name: world_knowledge
+    benchmarks:
+    - name: jeopardy
+      num_fewshot: 3
+      random_baseline: 0
+    - name: bigbench_qa_wikidata
+      num_fewshot: 3
+      random_baseline: 0
+    - name: arc_easy
+      num_fewshot: 3
+      random_baseline: 0.25
+    - name: arc_challenge
+      num_fewshot: 3
+      random_baseline: 0.25
+    - name: mmlu
+      num_fewshot: 5
+      random_baseline: 0.25
+    - name: triviaqa_sm_sub
+      num_fewshot: 3
+      random_baseline: 0.0
+  - name: commonsense_reasoning
+    benchmarks:
+    - name: copa
+      num_fewshot: 0
+      random_baseline: 0.5
+    - name: siqa
+      num_fewshot: 3
+      random_baseline: 0.5
+    - name: commonsense_qa
+      num_fewshot: 0
+      random_baseline: 0.25
+    - name: piqa
+      num_fewshot: 0
+      random_baseline: 0.5
+    - name: openbook_qa
+      num_fewshot: 10
+      random_baseline: 0.25
+    - name: bigbench_strange_stories
+      num_fewshot: 0
+      random_baseline: 0.5
+    - name: bigbench_strategy_qa
+      num_fewshot: 0
+      random_baseline: 0.5
+  - name: language_understanding
+    benchmarks:
+    - name: lambada_openai
+      num_fewshot: 0
+      random_baseline: 0.0
+    - name: hellaswag
+      num_fewshot: 0
+      random_baseline: 0.25
+    - name: winograd
+      num_fewshot: 3
+      random_baseline: 0.5
+    - name: winogrande
+      num_fewshot: 5
+      random_baseline: 0.5
+  - name: symbolic_problem_solving
+    benchmarks:
+    - name: bigbench_elementary_math_qa
+      num_fewshot: 1
+      random_baseline: 0.25
+    - name: bigbench_dyck_languages
+      num_fewshot: 5
+      random_baseline: 0
+    - name: bigbench_operators
+      num_fewshot: 3
+      random_baseline: 0.0
+    - name: simple_arithmetic_withspaces
+      num_fewshot: 5
+      random_baseline: 0.0
+    - name: simple_arithmetic_nospaces
+      num_fewshot: 5
+      random_baseline: 0.0
+    - name: agi_eval_lsat_ar
+      num_fewshot: 5
+      random_baseline: 0.25
+  - name: reading_comprehension
+    benchmarks:
+    - name: squad
+      num_fewshot: 3
+      random_baseline: 0
+    - name: boolq
+      num_fewshot: 0
+      random_baseline: 0.5
+    - name: coqa
+      num_fewshot: 0
+      random_baseline: 0.0
+    - name: agi_eval_lsat_rc
+      num_fewshot: 5
+      random_baseline: 0.25
+    - name: agi_eval_lsat_lr
+      num_fewshot: 5
+      random_baseline: 0.25
+    - name: agi_eval_sat_en
+      num_fewshot: 5
+      random_baseline: 0.25
diff --git a/scripts/eval/yamls/tasks_v0.2.1.yaml b/scripts/eval/yamls/tasks_v0.2.1.yaml
new file mode 100644
index 0000000000..a1efad2fe2
--- /dev/null
+++ b/scripts/eval/yamls/tasks_v0.2.1.yaml
@@ -0,0 +1,164 @@
+icl_tasks:
+-
+  label: jeopardy
+  dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl
+  num_fewshot: [3]
+  icl_task_type: language_modeling
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+  has_categories: true
+-
+  label: triviaqa_sm_sub
+  dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl
+  num_fewshot: [3]
+  icl_task_type: question_answering
+-
+  label: bigbench_qa_wikidata
+  dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl
+  num_fewshot: [3]
+  icl_task_type: language_modeling
+-
+  label: arc_easy
+  dataset_uri: eval/local_data/world_knowledge/arc_easy.jsonl
+  num_fewshot: [3]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+-
+  label: arc_challenge
+  dataset_uri: eval/local_data/world_knowledge/arc_challenge.jsonl
+  num_fewshot: [3, 25]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+-
+  label: mmlu
+  dataset_uri: eval/local_data/world_knowledge/mmlu.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+  has_categories: true
+-
+  label: copa
+  dataset_uri: eval/local_data/commonsense_reasoning/copa.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+-
+  label: siqa
+  dataset_uri: eval/local_data/commonsense_reasoning/siqa.jsonl
+  num_fewshot: [3]
+  icl_task_type: multiple_choice
+-
+  label: commonsense_qa
+  dataset_uri: eval/local_data/commonsense_reasoning/commonsense_qa.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+-
+  label: piqa
+  dataset_uri: eval/local_data/commonsense_reasoning/piqa.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+-
+  label: openbook_qa
+  dataset_uri: eval/local_data/commonsense_reasoning/openbook_qa.jsonl
+  num_fewshot: [10]
+  icl_task_type: multiple_choice
+-
+  label: bigbench_strange_stories
+  dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strange_stories.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+-
+  label: bigbench_strategy_qa
+  dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strategy_qa.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+-
+  label: bigbench_dyck_languages
+  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl
+  num_fewshot: [5]
+  icl_task_type: language_modeling
+-
+  label: lambada_openai
+  dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl
+  num_fewshot: [0]
+  icl_task_type: language_modeling
+-
+  label: hellaswag
+  dataset_uri: eval/local_data/language_understanding/hellaswag.jsonl
+  num_fewshot: [0, 10]
+  icl_task_type: multiple_choice
+-
+  label: winograd
+  dataset_uri: eval/local_data/language_understanding/winograd_wsc.jsonl
+  num_fewshot: [3]
+  icl_task_type: schema
+-
+  label: winogrande
+  dataset_uri: eval/local_data/language_understanding/winogrande.jsonl
+  num_fewshot: [5]
+  icl_task_type: schema
+-
+  label: bigbench_elementary_math_qa
+  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_elementary_math_qa.jsonl
+  num_fewshot: [1]
+  icl_task_type: multiple_choice
+-
+  label: agi_eval_lsat_ar
+  dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_lsat_ar.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
+-
+  label: bigbench_cs_algorithms
+  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_cs_algorithms.jsonl
+  num_fewshot: [10]
+  icl_task_type: language_modeling
+-
+  label: bigbench_operators
+  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_operators.jsonl
+  num_fewshot: [3]
+  icl_task_type: language_modeling
+-
+  label: simple_arithmetic_nospaces
+  dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_nospaces.jsonl
+  num_fewshot: [5]
+  icl_task_type: language_modeling
+-
+  label: simple_arithmetic_withspaces
+  dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_withspaces.jsonl
+  num_fewshot: [5]
+  icl_task_type: language_modeling
+-
+  label: pubmed_qa_labeled
+  dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl
+  num_fewshot: [10]
+  icl_task_type: language_modeling
+-
+  label: squad
+  dataset_uri: eval/local_data/reading_comprehension/squad.jsonl
+  num_fewshot: [3]
+  icl_task_type: language_modeling
+-
+  label: agi_eval_lsat_rc
+  dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_rc.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
+-
+  label: agi_eval_lsat_lr
+  dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_lr.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
+-
+  label: coqa
+  dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl
+  num_fewshot: [0]
+  icl_task_type: language_modeling
+-
+  label: boolq
+  dataset_uri: eval/local_data/reading_comprehension/boolq.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+-
+  label: agi_eval_sat_en
+  dataset_uri: eval/local_data/reading_comprehension/agi_eval_sat_en.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
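
For context, neither file is run directly: a run config selects them by path via the icl_tasks and eval_gauntlet keys. Below is a minimal sketch of such a config wired to the new v0.2.1 files, following the pattern of the existing eval YAMLs in scripts/eval/yamls/; the model and tokenizer entries are illustrative placeholders, not part of this PR.

max_seq_len: 2048
device_eval_batch_size: 4
models:
-
  model_name: mosaicml/mpt-7b  # placeholder; any HF causal LM works here
  model:
    name: hf_causal_lm
    pretrained_model_name_or_path: mosaicml/mpt-7b
    pretrained: true
  tokenizer:
    name: mosaicml/mpt-7b
    kwargs:
      model_max_length: ${max_seq_len}
# Point the harness at the task list and the gauntlet weighting added above:
icl_tasks: 'eval/yamls/tasks_v0.2.1.yaml'
eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.1.yaml'

Assuming the repo's usual entrypoint, such a config would be launched from scripts/ with something like `composer eval/eval.py <config>.yaml`.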