
Commit

remove cot
bmosaicml committed Jan 15, 2024
1 parent b40b805 commit cc0934a
Showing 2 changed files with 273 additions and 0 deletions.
109 changes: 109 additions & 0 deletions scripts/eval/yamls/eval_gauntlet_v0.2.1.yaml
@@ -0,0 +1,109 @@
eval_gauntlet:
  weighting: EQUAL
  subtract_random_baseline: true
  rescale_accuracy: true
  averages:
    core_average:
    - world_knowledge
    - commonsense_reasoning
    - language_understanding
    - symbolic_problem_solving
    - reading_comprehension
  categories:
  - name: world_knowledge
    benchmarks:
    - name: jeopardy
      num_fewshot: 3
      random_baseline: 0
    - name: bigbench_qa_wikidata
      num_fewshot: 3
      random_baseline: 0
    - name: arc_easy
      num_fewshot: 3
      random_baseline: 0.25
    - name: arc_challenge
      num_fewshot: 3
      random_baseline: 0.25
    - name: mmlu
      num_fewshot: 5
      random_baseline: 0.25
    - name: triviaqa_sm_sub
      num_fewshot: 3
      random_baseline: 0.0
  - name: commonsense_reasoning
    benchmarks:
    - name: copa
      num_fewshot: 0
      random_baseline: 0.5
    - name: siqa
      num_fewshot: 3
      random_baseline: 0.5
    - name: commonsense_qa
      num_fewshot: 0
      random_baseline: 0.25
    - name: piqa
      num_fewshot: 0
      random_baseline: 0.5
    - name: openbook_qa
      num_fewshot: 10
      random_baseline: 0.25
    - name: bigbench_strange_stories
      num_fewshot: 0
      random_baseline: 0.5
    - name: bigbench_strategy_qa
      num_fewshot: 0
      random_baseline: 0.5
  - name: language_understanding
    benchmarks:
    - name: lambada_openai
      num_fewshot: 0
      random_baseline: 0.0
    - name: hellaswag
      num_fewshot: 0
      random_baseline: 0.25
    - name: winograd
      num_fewshot: 3
      random_baseline: 0.5
    - name: winogrande
      num_fewshot: 5
      random_baseline: 0.5
  - name: symbolic_problem_solving
    benchmarks:
    - name: bigbench_elementary_math_qa
      num_fewshot: 1
      random_baseline: 0.25
    - name: bigbench_dyck_languages
      num_fewshot: 5
      random_baseline: 0
    - name: bigbench_operators
      num_fewshot: 3
      random_baseline: 0.0
    - name: simple_arithmetic_withspaces
      num_fewshot: 5
      random_baseline: 0.0
    - name: simple_arithmetic_nospaces
      num_fewshot: 5
      random_baseline: 0.0
    - name: agi_eval_lsat_ar
      num_fewshot: 5
      random_baseline: 0.25
  - name: reading_comprehension
    benchmarks:
    - name: squad
      num_fewshot: 3
      random_baseline: 0
    - name: boolq
      num_fewshot: 0
      random_baseline: 0.5
    - name: coqa
      num_fewshot: 0
      random_baseline: 0.0
    - name: agi_eval_lsat_rc
      num_fewshot: 5
      random_baseline: 0.25
    - name: agi_eval_lsat_lr
      num_fewshot: 5
      random_baseline: 0.25
    - name: agi_eval_sat_en
      num_fewshot: 5
      random_baseline: 0.25
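
Note on the flags at the top of this file: subtract_random_baseline, rescale_accuracy, and weighting: EQUAL control how raw benchmark accuracies become category scores and the core_average. The sketch below is illustrative only, not the repository's gauntlet code. It assumes a hypothetical raw_accuracies dict keyed by benchmark name and applies one plausible reading of the flags: score = (accuracy - random_baseline) / (1 - random_baseline), averaged uniformly within each category, with named averages such as core_average taken over the listed category scores.

# Illustrative sketch only; not the repository's gauntlet implementation.
# Assumption: `raw_accuracies` maps each benchmark name above to a raw accuracy in [0, 1].
from statistics import mean

import yaml  # pip install pyyaml


def gauntlet_scores(config_path: str, raw_accuracies: dict[str, float]) -> dict[str, float]:
    with open(config_path) as f:
        gauntlet = yaml.safe_load(f)["eval_gauntlet"]

    scores: dict[str, float] = {}
    for category in gauntlet["categories"]:
        per_benchmark = []
        for bench in category["benchmarks"]:
            score = raw_accuracies[bench["name"]]
            baseline = bench["random_baseline"]
            if gauntlet.get("subtract_random_baseline", False):
                score -= baseline  # remove the chance-level floor
                if gauntlet.get("rescale_accuracy", False):
                    score /= 1.0 - baseline  # stretch the remainder back toward [0, 1]
            per_benchmark.append(score)
        # weighting: EQUAL -> every benchmark counts the same within its category.
        scores[category["name"]] = mean(per_benchmark)

    # Named averages (e.g. core_average) are plain means over the listed category scores.
    for avg_name, members in gauntlet.get("averages", {}).items():
        scores[avg_name] = mean(scores[c] for c in members)
    return scores
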
164 changes: 164 additions & 0 deletions scripts/eval/yamls/tasks_v0.2.1.yaml
@@ -0,0 +1,164 @@
icl_tasks:
-
  label: jeopardy
  dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl
  num_fewshot: [3]
  icl_task_type: language_modeling
  continuation_delimiter: "\nAnswer: " # this separates questions from answers
  has_categories: true
-
  label: triviaqa_sm_sub
  dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl
  num_fewshot: [3]
  icl_task_type: question_answering
-
  label: bigbench_qa_wikidata
  dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl
  num_fewshot: [3]
  icl_task_type: language_modeling
-
  label: arc_easy
  dataset_uri: eval/local_data/world_knowledge/arc_easy.jsonl
  num_fewshot: [3]
  icl_task_type: multiple_choice
  continuation_delimiter: "\nAnswer: " # this separates questions from answers
-
  label: arc_challenge
  dataset_uri: eval/local_data/world_knowledge/arc_challenge.jsonl
  num_fewshot: [3, 25]
  icl_task_type: multiple_choice
  continuation_delimiter: "\nAnswer: " # this separates questions from answers
-
  label: mmlu
  dataset_uri: eval/local_data/world_knowledge/mmlu.jsonl
  num_fewshot: [5]
  icl_task_type: multiple_choice
  continuation_delimiter: "\nAnswer: " # this separates questions from answers
  has_categories: true
-
  label: copa
  dataset_uri: eval/local_data/commonsense_reasoning/copa.jsonl
  num_fewshot: [0]
  icl_task_type: multiple_choice
-
  label: siqa
  dataset_uri: eval/local_data/commonsense_reasoning/siqa.jsonl
  num_fewshot: [3]
  icl_task_type: multiple_choice
-
  label: commonsense_qa
  dataset_uri: eval/local_data/commonsense_reasoning/commonsense_qa.jsonl
  num_fewshot: [0]
  icl_task_type: multiple_choice
-
  label: piqa
  dataset_uri: eval/local_data/commonsense_reasoning/piqa.jsonl
  num_fewshot: [0]
  icl_task_type: multiple_choice
  continuation_delimiter: "\nAnswer: " # this separates questions from answers
-
  label: openbook_qa
  dataset_uri: eval/local_data/commonsense_reasoning/openbook_qa.jsonl
  num_fewshot: [10]
  icl_task_type: multiple_choice
-
  label: bigbench_strange_stories
  dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strange_stories.jsonl
  num_fewshot: [0]
  icl_task_type: multiple_choice
-
  label: bigbench_strategy_qa
  dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strategy_qa.jsonl
  num_fewshot: [0]
  icl_task_type: multiple_choice
-
  label: bigbench_dyck_languages
  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl
  num_fewshot: [5]
  icl_task_type: language_modeling
-
  label: lambada_openai
  dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl
  num_fewshot: [0]
  icl_task_type: language_modeling
-
  label: hellaswag
  dataset_uri: eval/local_data/language_understanding/hellaswag.jsonl
  num_fewshot: [0, 10]
  icl_task_type: multiple_choice
-
  label: winograd
  dataset_uri: eval/local_data/language_understanding/winograd_wsc.jsonl
  num_fewshot: [3]
  icl_task_type: schema
-
  label: winogrande
  dataset_uri: eval/local_data/language_understanding/winogrande.jsonl
  num_fewshot: [5]
  icl_task_type: schema
-
  label: bigbench_elementary_math_qa
  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_elementary_math_qa.jsonl
  num_fewshot: [1]
  icl_task_type: multiple_choice
-
  label: agi_eval_lsat_ar
  dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_lsat_ar.jsonl
  num_fewshot: [5]
  icl_task_type: multiple_choice
-
  label: bigbench_cs_algorithms
  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_cs_algorithms.jsonl
  num_fewshot: [10]
  icl_task_type: language_modeling
-
  label: bigbench_operators
  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_operators.jsonl
  num_fewshot: [3]
  icl_task_type: language_modeling
-
  label: simple_arithmetic_nospaces
  dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_nospaces.jsonl
  num_fewshot: [5]
  icl_task_type: language_modeling
-
  label: simple_arithmetic_withspaces
  dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_withspaces.jsonl
  num_fewshot: [5]
  icl_task_type: language_modeling
-
  label: pubmed_qa_labeled
  dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl
  num_fewshot: [10]
  icl_task_type: language_modeling
-
  label: squad
  dataset_uri: eval/local_data/reading_comprehension/squad.jsonl
  num_fewshot: [3]
  icl_task_type: language_modeling
-
  label: agi_eval_lsat_rc
  dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_rc.jsonl
  num_fewshot: [5]
  icl_task_type: multiple_choice
-
  label: agi_eval_lsat_lr
  dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_lr.jsonl
  num_fewshot: [5]
  icl_task_type: multiple_choice
-
  label: coqa
  dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl
  num_fewshot: [0]
  icl_task_type: language_modeling
-
  label: boolq
  dataset_uri: eval/local_data/reading_comprehension/boolq.jsonl
  num_fewshot: [0]
  icl_task_type: multiple_choice
  continuation_delimiter: "\nAnswer: " # this separates questions from answers
-
  label: agi_eval_sat_en
  dataset_uri: eval/local_data/reading_comprehension/agi_eval_sat_en.jsonl
  num_fewshot: [5]
  icl_task_type: multiple_choice
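
In practice these two YAMLs are referenced from an eval config (through its icl_tasks and eval_gauntlet fields) rather than loaded by hand. As a quick way to sanity-check the task list above, here is a small standalone sketch; the default path and the summarize_tasks helper are illustrative, and the only assumption is that the file is plain YAML with a top-level icl_tasks list.

# Illustrative helper, not part of the repository.
import yaml  # pip install pyyaml


def summarize_tasks(path: str = "scripts/eval/yamls/tasks_v0.2.1.yaml") -> None:
    """Print one line per ICL task: label, task type, and few-shot settings."""
    with open(path) as f:
        tasks = yaml.safe_load(f)["icl_tasks"]

    for task in tasks:
        fewshots = ", ".join(str(n) for n in task["num_fewshot"])
        print(f"{task['label']:<32} {task['icl_task_type']:<22} fewshot: [{fewshots}]")


if __name__ == "__main__":
    summarize_tasks()
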
