From a84dda03f7c2f5bd90c451bc753e14a19a26ba7a Mon Sep 17 00:00:00 2001
From: Jeremy Dohmann
Date: Tue, 3 Oct 2023 17:05:24 -0400
Subject: [PATCH] add simple human_eval

Rename the programming data files split-{0.25,0.5,0.75}.jsonl to
human_eval-{0.25,0.5,0.75}.jsonl, move the five tasks from
coding_tasks_simple.yaml (which is deleted) into coding_tasks.yaml and
tasks.yaml, and list the same five labels as benchmarks in
eval_gauntlet.yaml with num_fewshot 0 and random_baseline 0.0.

Every task that reads a renamed data file points at its new
human_eval-*.jsonl path.

---
Review notes (not part of the commit message):
- Line structure was restored; leading indentation inside the hunks was
  reconstructed as 2-space YAML and should be checked against the tree.
- coding_tasks.yaml: human_eval_25/50/75 now use the renamed
  human_eval-*.jsonl paths instead of the removed split-*.jsonl paths.
- coding_tasks.yaml, tasks.yaml: human_eval_return_simple gained
  icl_task_type: code_evaluation, matching its sibling tasks.
- Added lines use "pass_at_k: 1" like the surrounding context; removed
  lines are unchanged so they still match the pre-image.
- The commit id above and the post-image blob ids on the "index" lines of
  coding_tasks.yaml and tasks.yaml predate these amendments.

 ...split-0.25.jsonl => human_eval-0.25.jsonl} |  0
 .../{split-0.5.jsonl => human_eval-0.5.jsonl} |  0
 ...split-0.75.jsonl => human_eval-0.75.jsonl} |  0
 scripts/eval/yamls/coding_tasks.yaml          | 35 +++++++++++++++++++
 scripts/eval/yamls/coding_tasks_simple.yaml   | 35 -------------------
 scripts/eval/yamls/eval_gauntlet.yaml         | 15 ++++++++
 scripts/eval/yamls/tasks.yaml                 | 35 +++++++++++++++++++
 7 files changed, 85 insertions(+), 35 deletions(-)
 rename scripts/eval/local_data/programming/{split-0.25.jsonl => human_eval-0.25.jsonl} (100%)
 rename scripts/eval/local_data/programming/{split-0.5.jsonl => human_eval-0.5.jsonl} (100%)
 rename scripts/eval/local_data/programming/{split-0.75.jsonl => human_eval-0.75.jsonl} (100%)
 delete mode 100644 scripts/eval/yamls/coding_tasks_simple.yaml

diff --git a/scripts/eval/local_data/programming/split-0.25.jsonl b/scripts/eval/local_data/programming/human_eval-0.25.jsonl
similarity index 100%
rename from scripts/eval/local_data/programming/split-0.25.jsonl
rename to scripts/eval/local_data/programming/human_eval-0.25.jsonl
diff --git a/scripts/eval/local_data/programming/split-0.5.jsonl b/scripts/eval/local_data/programming/human_eval-0.5.jsonl
similarity index 100%
rename from scripts/eval/local_data/programming/split-0.5.jsonl
rename to scripts/eval/local_data/programming/human_eval-0.5.jsonl
diff --git a/scripts/eval/local_data/programming/split-0.75.jsonl b/scripts/eval/local_data/programming/human_eval-0.75.jsonl
similarity index 100%
rename from scripts/eval/local_data/programming/split-0.75.jsonl
rename to scripts/eval/local_data/programming/human_eval-0.75.jsonl
diff --git a/scripts/eval/yamls/coding_tasks.yaml b/scripts/eval/yamls/coding_tasks.yaml
index 3a19dc640d..065f291ea0 100644
--- a/scripts/eval/yamls/coding_tasks.yaml
+++ b/scripts/eval/yamls/coding_tasks.yaml
@@ -20,3 +20,38 @@ icl_tasks:
   pass_at_k: 1
   num_beams: 20
   icl_task_type: code_evaluation
+-
+  label: human_eval_return_simple
+  dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
+-
+  label: human_eval_return_complex
+  dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
+-
+  label: human_eval_25
+  dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
+-
+  label: human_eval_50
+  dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
+-
+  label: human_eval_75
+  dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
diff --git a/scripts/eval/yamls/coding_tasks_simple.yaml b/scripts/eval/yamls/coding_tasks_simple.yaml
deleted file mode 100644
index 88f795ea44..0000000000
--- a/scripts/eval/yamls/coding_tasks_simple.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-icl_tasks:
--
-  label: human_eval_return_simple
-  dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
-  num_fewshot: [0]
-  pass_at_k : 1
-  num_beams: 20
--
-  label: human_eval_return_complex
-  dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
-  num_fewshot: [0]
-  pass_at_k : 1
-  num_beams: 20
-  icl_task_type: code_evaluation
--
-  label: human_eval_25
-  dataset_uri: eval/local_data/programming/split-0.25.jsonl # ADD YOUR OWN DATASET URI
-  num_fewshot: [0]
-  pass_at_k : 1
-  num_beams: 20
-  icl_task_type: code_evaluation
--
-  label: human_eval_50
-  dataset_uri: eval/local_data/programming/split-0.5.jsonl # ADD YOUR OWN DATASET URI
-  num_fewshot: [0]
-  pass_at_k : 1
-  num_beams: 20
-  icl_task_type: code_evaluation
--
-  label: human_eval_75
-  dataset_uri: eval/local_data/programming/split-0.75.jsonl # ADD YOUR OWN DATASET URI
-  num_fewshot: [0]
-  pass_at_k : 1
-  num_beams: 20
-  icl_task_type: code_evaluation
diff --git a/scripts/eval/yamls/eval_gauntlet.yaml b/scripts/eval/yamls/eval_gauntlet.yaml
index 7e65334874..87e01fd44c 100644
--- a/scripts/eval/yamls/eval_gauntlet.yaml
+++ b/scripts/eval/yamls/eval_gauntlet.yaml
@@ -123,6 +123,21 @@ eval_gauntlet:
     - name: human_eval_js
       num_fewshot: 0
       random_baseline: 0.0
+    - name: human_eval_return_simple
+      num_fewshot: 0
+      random_baseline: 0.0
+    - name: human_eval_return_complex
+      num_fewshot: 0
+      random_baseline: 0.0
+    - name: human_eval_25
+      num_fewshot: 0
+      random_baseline: 0.0
+    - name: human_eval_50
+      num_fewshot: 0
+      random_baseline: 0.0
+    - name: human_eval_75
+      num_fewshot: 0
+      random_baseline: 0.0
   - name: world_knowledge_lm_task_subscore
     benchmarks:
     - name: jeopardy
diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml
index 54d14e34ab..6f082e5893 100644
--- a/scripts/eval/yamls/tasks.yaml
+++ b/scripts/eval/yamls/tasks.yaml
@@ -194,3 +194,38 @@ icl_tasks:
   pass_at_k: 1
   num_beams: 20
   icl_task_type: code_evaluation
+-
+  label: human_eval_return_simple
+  dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
+-
+  label: human_eval_return_complex
+  dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
+-
+  label: human_eval_25
+  dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
+-
+  label: human_eval_50
+  dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation
+-
+  label: human_eval_75
+  dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI
+  num_fewshot: [0]
+  pass_at_k: 1
+  num_beams: 20
+  icl_task_type: code_evaluation