diff --git a/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml b/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml index 8e95c94ddb..0aafff42a2 100644 --- a/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml +++ b/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml @@ -26,7 +26,7 @@ eval_gauntlet: - name: wikiqa_2k num_fewshot: 0 random_baseline: 0 - - name: 4k + - name: 4k benchmarks: - name: hotpotqa_beginning_4k num_fewshot: 0 @@ -49,7 +49,7 @@ eval_gauntlet: - name: wikiqa_4k num_fewshot: 0 random_baseline: 0 - - name: 8k + - name: 8k benchmarks: - name: hotpotqa_beginning_8k num_fewshot: 0 @@ -131,4 +131,4 @@ eval_gauntlet: random_baseline: 0 - name: kv_pairs_end_64k num_fewshot: 0 - random_baseline: 0 \ No newline at end of file + random_baseline: 0 diff --git a/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml b/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml index ca33ab39ae..f776047c38 100644 --- a/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml +++ b/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml @@ -3,7 +3,7 @@ eval_gauntlet: subtract_random_baseline: true rescale_accuracy: true categories: - - name: beginning + - name: beginning benchmarks: - name: hotpotqa_beginning_2k num_fewshot: 0 @@ -127,4 +127,4 @@ eval_gauntlet: random_baseline: 0 - name: wikiqa_8k num_fewshot: 0 - random_baseline: 0 \ No newline at end of file + random_baseline: 0