Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new ICL kwargs in eval.py and long_context yamls #925

Merged
merged 56 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
340b79e
add yamls w/ old links
maxisawesome Oct 30, 2023
26dc067
load from max's public hf and parse hf datasets
maxisawesome Nov 1, 2023
31851a5
update rest of tasks
maxisawesome Nov 1, 2023
203be47
add better logging
maxisawesome Nov 1, 2023
33b6513
implemented leval tasks
maxisawesome Nov 1, 2023
089c392
move level
maxisawesome Nov 1, 2023
b644df1
add level yaml
maxisawesome Nov 1, 2023
5adf77e
add str parsing to hf
maxisawesome Nov 3, 2023
79810f3
wip
maxisawesome Nov 14, 2023
44a209a
llm-foundry working with new parser
maxisawesome Nov 14, 2023
657fa13
working w/ new parsing
maxisawesome Nov 14, 2023
2629f75
fix old long context tasks
maxisawesome Nov 14, 2023
c019ea1
wip
maxisawesome Nov 20, 2023
0608ea2
wip
maxisawesome Nov 20, 2023
cebb487
wip
maxisawesome Nov 28, 2023
fcbeba8
wip
maxisawesome Nov 28, 2023
56ae289
update to hf_parsing_map
maxisawesome Dec 5, 2023
4aee1ec
rm defaults
maxisawesome Dec 7, 2023
3440348
Merge branch 'main' into hf_parsing_with_icl_refactor
maxisawesome Dec 7, 2023
23ca0ba
fix parsing vars
maxisawesome Dec 7, 2023
c10698f
update defaults again
maxisawesome Dec 7, 2023
4e05385
rm merge conflict
maxisawesome Dec 7, 2023
6b7d13f
fix gen_kwargs
maxisawesome Jan 19, 2024
871bd9a
Merge branch 'mosaicml:main' into hf_parsing_with_icl_refactor
maxisawesome Jan 19, 2024
d9c6a28
rm old code path
maxisawesome Jan 19, 2024
eda47f2
Merge branch 'mosaicml:main' into hf_parsing_with_icl_refactor
maxisawesome Jan 26, 2024
d9b284c
fixups
maxisawesome Jan 27, 2024
393adfb
wip
maxisawesome Jan 27, 2024
9d12917
Merge branch 'hf_parsing_with_icl_refactor' of github.com:maxisawesom…
maxisawesome Jan 29, 2024
7b23a93
Merge branch 'main' into hf_parsing_with_icl_refactor
maxisawesome Jan 29, 2024
662af67
rm leval from pr
maxisawesome Jan 30, 2024
c9e0ef5
fix comments in yamls
maxisawesome Jan 30, 2024
09ffafd
add cot params
maxisawesome Jan 30, 2024
fb782db
add fewshot_random_seed
maxisawesome Jan 30, 2024
e735ae7
fix early_stopping_criteria, fewshot_num_seed default
maxisawesome Jan 30, 2024
35641df
undo rm hf_eval
maxisawesome Jan 30, 2024
f1282bc
add fewshot_random_seed to test
maxisawesome Jan 30, 2024
4a9a8b0
add 64k tasks
maxisawesome Feb 6, 2024
65ee617
add longer context, update composer versin
maxisawesome Feb 6, 2024
5ba5e30
address comments
maxisawesome Feb 6, 2024
b7884de
mixed
maxisawesome Feb 6, 2024
ff31e72
use seed by default
maxisawesome Feb 6, 2024
fca3d35
rm long_context_eval_8k.yaml
maxisawesome Feb 7, 2024
f1b65f7
add longer context evals
maxisawesome Feb 7, 2024
0b494bb
mv yamls
maxisawesome Feb 7, 2024
bd6048b
eval gauntlet wip
maxisawesome Feb 8, 2024
51c3ea8
update niah and wikiqa
maxisawesome Feb 8, 2024
3c1e344
Merge branch 'main' into hf_parsing_with_icl_refactor
maxisawesome Feb 8, 2024
6ce8cc6
Merge branch 'main' into hf_parsing_with_icl_refactor
dakinggg Feb 12, 2024
7849528
fix linting
maxisawesome Feb 12, 2024
0ffab21
Merge branch 'hf_parsing_with_icl_refactor' of github.com:maxisawesom…
maxisawesome Feb 12, 2024
124b60a
add default option
maxisawesome Feb 12, 2024
6b37a8c
change defaults
maxisawesome Feb 12, 2024
3f08d92
fix linting
maxisawesome Feb 12, 2024
cee8256
fix linting 2
maxisawesome Feb 12, 2024
c2810ef
Merge branch 'main' into hf_parsing_with_icl_refactor
maxisawesome Feb 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llmfoundry/utils/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def build_evaluators(
device_eval_batch_size: int,
icl_seq_len: int,
icl_subset_num_batches: Optional[int],
fewshot_random_seed: Optional[int] = 1234,
) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]:

evaluators = []
Expand All @@ -72,6 +73,7 @@ def build_evaluators(
tokenizer,
device_eval_batch_size,
icl_seq_len,
fewshot_random_seed,
icl_subset_num_batches,
)
evaluators.extend(icl_evaluators)
Expand Down Expand Up @@ -128,13 +130,15 @@ def build_icl_data_and_gauntlet(
tokenizer: PreTrainedTokenizerBase,
device_eval_batch_size: int,
icl_seq_len: int,
fewshot_random_seed: Optional[int] = 1234,
icl_subset_num_batches: Optional[int] = None
) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]:
icl_evaluators, logger_keys = build_icl_evaluators(
icl_tasks_config,
tokenizer,
icl_seq_len,
device_eval_batch_size,
fewshot_random_seed=fewshot_random_seed,
icl_subset_num_batches=icl_subset_num_batches)
eval_gauntlet_cb = None
if eval_gauntlet_config is not None:
Expand Down Expand Up @@ -442,6 +446,7 @@ def build_icl_evaluators(
default_max_seq_len: int,
default_batch_size: int,
destination_dir: Optional[str] = None,
fewshot_random_seed: Optional[int] = 1234,
icl_subset_num_batches: Optional[int] = None,
) -> Tuple[List[Evaluator], List[str]]:
if destination_dir is None:
Expand Down Expand Up @@ -516,6 +521,10 @@ def _validate_cfg(icl_cfg: DictConfig):
if dist.get_local_rank() == 0 and os.path.exists(destination_path):
os.remove(destination_path)
dist.barrier()

hf_parsing_map = icl_cfg.get('hf_parsing_map', {})
hf_loading_vars = icl_cfg.get('hf_loading_vars', {})

early_stopping_criteria = icl_cfg.get('early_stopping_criteria',
None)
if isinstance(early_stopping_criteria, ListConfig):
Expand All @@ -533,13 +542,18 @@ def _validate_cfg(icl_cfg: DictConfig):
num_fewshot=num_fewshot,
prompt_string=icl_cfg.prompt_string,
example_delimiter=icl_cfg.example_delimiter,
hf_loading_vars=hf_loading_vars,
hf_parsing_map=hf_parsing_map,
continuation_delimiter=icl_cfg.continuation_delimiter,
question_prelimiter=icl_cfg.get('question_prelimiter', ''),
destination_path=destination_path,
fewshot_random_seed=icl_cfg.get('fewshot_random_seed',
fewshot_random_seed),
pass_at_k=icl_cfg.pass_at_k,
generations_per_sample=icl_cfg.num_beams,
has_categories=icl_cfg.get('has_categories', False),
cot_delimiter=icl_cfg.get('cot_delimiter', ''),
generation_kwargs=icl_cfg.get('generation_kwargs', {}),
early_stopping_criteria=early_stopping_criteria,
do_normalization=icl_cfg.get('do_normalization', True))
if hasattr(
Expand Down
1 change: 1 addition & 0 deletions scripts/eval/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def evaluate_model(
tokenizer=tokenizer,
device_eval_batch_size=device_eval_batch_size,
icl_seq_len=max_seq_len,
fewshot_random_seed=seed,
icl_subset_num_batches=icl_subset_num_batches,
)

Expand Down
42 changes: 42 additions & 0 deletions scripts/eval/local_data/EVAL_GAUNTLET.md
Original file line number Diff line number Diff line change
Expand Up @@ -253,3 +253,45 @@ Programming tasks evaluate the model's ability to understand code, write functio
- Year released: 2022
- Number of few shot examples: 0
- Random baseline accuracy: 0%
54. HumanEval Python 25% code generation
- Description: HumanEval Python 25% is an easier variant of HumanEval Python in which, in addition to the original method signature, the model is also provided 25% of the lines in the canonical solution and is expected to complete the remainder of the program. It consists of 164 samples.
- Year released: 2023
- Number of few shot examples: 0
- Random baseline accuracy: 0%
55. HumanEval Python 50% code generation
- Description: HumanEval Python 50% is an easier variant of HumanEval Python in which, in addition to the original method signature, the model is also provided 50% of the lines in the canonical solution and is expected to complete the remainder of the program. It consists of 164 samples.
- Year released: 2023
- Number of few shot examples: 0
- Random baseline accuracy: 0%
56. HumanEval Python 75% code generation
- Description: HumanEval Python 75% is an easier variant of HumanEval Python in which, in addition to the original method signature, the model is also provided 75% of the lines in the canonical solution and is expected to complete the remainder of the program. It consists of 164 samples.
- Year released: 2023
- Number of few shot examples: 0
- Random baseline accuracy: 0%
57. HumanEval Python simple return statement code generation
- Description: HumanEval Python simple return statement is an easier variant of HumanEval Python in which the model is provided all of the canonical solution with the exception of the return statement and is expected to complete the return statement. Additionally, this set contains only the problems for which the canonical solution has a "simple" return statement consisting only of a line of the form `return VARIABLE_NAME`. There are 37 samples.
- Year released: 2023
- Number of few shot examples: 0
- Random baseline accuracy: 0%
58. HumanEval Python complex return statement code generation
- Description: HumanEval Python complex return statement is an easier variant of HumanEval Python in which the model is provided all of the canonical solution with the exception of the return statement and is expected to complete the return statement. Additionally, this set contains only the problems for which the canonical solution does not have a "simple" return statement as defined above. There are 127 samples.
- Year released: 2023
- Number of few shot examples: 0
- Random baseline accuracy: 0%

### Long Context Gauntlet

We've included three different tasks for long (> 4000 tokens) context length evals. They are meant as litmus tests for a model's ability to properly utilize its longer context length, which is often the result of fine-tuning after pre-training. For some of these datasets, we explicitly create sets where the required information is located in different sections of the input context: the beginning, middle, or end.

1. HotPotQAXL
- Description: [HotPotQA](https://hotpotqa.github.io/) is originally a dataset of ten documents and a question requiring comprehension of one or more of the supplied documents. The non-related documents are completely unrelated and called "distractor" documents. To extend this to longer context lengths, we randomly sample documents from the full set of documents across the dataset, adding them to the current datapoint until the set of documents and its question fills the current context length. We insert the "gold" document(s) (the document(s) containing the information that answers the question) within the first third, second third, or last third of the context length.
- Lengths: 2k, 4k, 8k, 16k, 32k, 64k
- Locations: beginning, middle, end
2. Key Value Pairs (Needle In a Haystack)
- Description: We construct a `.json` of key value pairs, where both the key and value are random hashes, in the style of [Lost in the Middle](https://github.com/nelson-liu/lost-in-the-middle). We ask the model to produce a value given a key from a specific key value pair found in the json. The pair is correspondingly located in the first third, second third, or last third of the json.
- Lengths: 2k, 4k, 8k, 16k, 32k, 64k
- Locations: beginning, middle, end
3. WikiQA Numeric
- Description: [WikiQA Numeric](https://huggingface.co/datasets/abacusai/WikiQA-Altered_Numeric_QA) is a Wikipedia Question Answering dataset with a focus on questions with numeric answers. We preprocess the data only to easily parse it for our framework.
- Lengths: 2k, 4k, 8k, 16k
- Locations: N/A
134 changes: 134 additions & 0 deletions scripts/eval/yamls/eval_gauntlet_long_context_length.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
eval_gauntlet:
weighting: EQUAL
subtract_random_baseline: true
rescale_accuracy: true
categories:
- name: 2k
benchmarks:
- name: hotpotqa_beginning_2k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_2k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_2k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_2k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_2k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_2k
num_fewshot: 0
random_baseline: 0
- name: wikiqa_2k
num_fewshot: 0
random_baseline: 0
- name: 4k
benchmarks:
- name: hotpotqa_beginning_4k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_4k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_4k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_4k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_4k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_4k
num_fewshot: 0
random_baseline: 0
- name: wikiqa_4k
num_fewshot: 0
random_baseline: 0
- name: 8k
benchmarks:
- name: hotpotqa_beginning_8k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_8k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_8k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_8k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_8k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_8k
num_fewshot: 0
random_baseline: 0
- name: wikiqa_8k
num_fewshot: 0
random_baseline: 0
- name: 16k
benchmarks:
- name: hotpotqa_beginning_16k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_16k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_16k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_16k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_16k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_16k
num_fewshot: 0
random_baseline: 0
- name: 32k
benchmarks:
- name: hotpotqa_beginning_32k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_32k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_32k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_32k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_32k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_32k
num_fewshot: 0
random_baseline: 0
- name: 64k
benchmarks:
- name: hotpotqa_beginning_64k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_64k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_64k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_64k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_64k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_64k
num_fewshot: 0
random_baseline: 0
130 changes: 130 additions & 0 deletions scripts/eval/yamls/eval_gauntlet_long_context_section.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
eval_gauntlet:
weighting: EQUAL
subtract_random_baseline: true
rescale_accuracy: true
categories:
- name: beginning
benchmarks:
- name: hotpotqa_beginning_2k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_2k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_beginning_4k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_4k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_beginning_8k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_8k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_beginning_16k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_16k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_beginning_32k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_32k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_beginning_64k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_beginning_64k
num_fewshot: 0
random_baseline: 0
- name: middle
benchmarks:
- name: hotpotqa_middle_2k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_2k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_4k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_4k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_8k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_8k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_16k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_16k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_32k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_32k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_middle_64k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_middle_64k
num_fewshot: 0
random_baseline: 0
- name: end
benchmarks:
- name: hotpotqa_end_2k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_2k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_4k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_4k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_8k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_8k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_16k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_16k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_32k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_32k
num_fewshot: 0
random_baseline: 0
- name: hotpotqa_end_64k
num_fewshot: 0
random_baseline: 0
- name: kv_pairs_end_64k
num_fewshot: 0
random_baseline: 0
- name: full
benchmarks:
- name: wikiqa_2k
num_fewshot: 0
random_baseline: 0
- name: wikiqa_4k
num_fewshot: 0
random_baseline: 0
- name: wikiqa_8k
num_fewshot: 0
random_baseline: 0
Loading
Loading