From 340b79e98f39199281f66645563e26461d7e80de Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 30 Oct 2023 23:04:10 +0000 Subject: [PATCH 01/47] add yamls w/ old links --- .../eval/yamls/eval_gauntlet_8k_length.yaml | 74 ++++++++++++ .../eval/yamls/eval_gauntlet_8k_section.yaml | 76 +++++++++++++ scripts/eval/yamls/long_context_eval_8k.yaml | 27 +++++ scripts/eval/yamls/long_context_tasks.yaml | 106 ++++++++++++++++++ 4 files changed, 283 insertions(+) create mode 100644 scripts/eval/yamls/eval_gauntlet_8k_length.yaml create mode 100644 scripts/eval/yamls/eval_gauntlet_8k_section.yaml create mode 100644 scripts/eval/yamls/long_context_eval_8k.yaml create mode 100644 scripts/eval/yamls/long_context_tasks.yaml diff --git a/scripts/eval/yamls/eval_gauntlet_8k_length.yaml b/scripts/eval/yamls/eval_gauntlet_8k_length.yaml new file mode 100644 index 0000000000..e17df6f3c4 --- /dev/null +++ b/scripts/eval/yamls/eval_gauntlet_8k_length.yaml @@ -0,0 +1,74 @@ +eval_gauntlet: + weighting: EQUAL + subtract_random_baseline: true + rescale_accuracy: true + categories: + - name: 2k + benchmarks: + - name: hotpotqa_beginning_2k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_2k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_2k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_2k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_2k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_2k + num_fewshot: 0 + random_baseline: 0 + - name: wikiqa_2k + num_fewshot: 0 + random_baseline: 0 + - name: 4k + benchmarks: + - name: hotpotqa_beginning_4k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_4k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_4k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_4k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_4k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_4k + num_fewshot: 0 + random_baseline: 0 + - 
name: wikiqa_4k + num_fewshot: 0 + random_baseline: 0 + - name: 8k + benchmarks: + - name: hotpotqa_beginning_8k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_8k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_8k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_8k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_8k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_8k + num_fewshot: 0 + random_baseline: 0 + - name: wikiqa_8k + num_fewshot: 0 + random_baseline: 0 \ No newline at end of file diff --git a/scripts/eval/yamls/eval_gauntlet_8k_section.yaml b/scripts/eval/yamls/eval_gauntlet_8k_section.yaml new file mode 100644 index 0000000000..711a9b6ee4 --- /dev/null +++ b/scripts/eval/yamls/eval_gauntlet_8k_section.yaml @@ -0,0 +1,76 @@ +eval_gauntlet: + weighting: EQUAL + subtract_random_baseline: true + rescale_accuracy: true + categories: + - name: beginning + benchmarks: + - name: hotpotqa_beginning_2k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_2k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_beginning_4k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_4k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_beginning_8k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_8k + num_fewshot: 0 + random_baseline: 0 + - name: middle + benchmarks: + - name: hotpotqa_middle_2k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_2k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_4k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_4k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_8k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_8k + num_fewshot: 0 + random_baseline: 0 + - name: end + benchmarks: + - name: hotpotqa_end_2k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_2k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_4k + 
num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_4k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_8k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_8k + num_fewshot: 0 + random_baseline: 0 + - name: full + benchmarks: + - name: wikiqa_2k + num_fewshot: 0 + random_baseline: 0 + - name: wikiqa_4k + num_fewshot: 0 + random_baseline: 0 + - name: wikiqa_8k + num_fewshot: 0 + random_baseline: 0 \ No newline at end of file diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml new file mode 100644 index 0000000000..10d7987fa7 --- /dev/null +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -0,0 +1,27 @@ +max_seq_len: 8192 +seed: 1 +precision: amp_bf16 + +models: +- + model_name: mosaicml/mpt-7b-chat-8k + model: + name: hf_causal_lm + pretrained_model_name_or_path: mosaicml/mpt-7b-chat-8k + pretrained: true + attn_config: + attn_impl: triton + tokenizer: + name: mosaicml/mpt-7b-chat-8k + kwargs: + model_max_length: ${max_seq_len} + +device_eval_batch_size: 1 + +# FSDP config for model sharding +fsdp_config: + sharding_strategy: FULL_SHARD + mixed_precision: FULL + +icl_tasks: 'eval/yamls/long_context_tasks.yaml' +eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' \ No newline at end of file diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml new file mode 100644 index 0000000000..af78691649 --- /dev/null +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -0,0 +1,106 @@ +icl_tasks: +- + label: kv_pairs_beginning_2k + dataset_uri: eval/local_data/long_context/kv_pairs_beginning_len_2048.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: kv_pairs_middle_2k + dataset_uri: eval/local_data/long_context/kv_pairs_middle_len_2048.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: kv_pairs_end_2k + dataset_uri: eval/local_data/long_context/kv_pairs_end_len_2048.jsonl + num_fewshot: [0] + 
icl_task_type: question_answering +- + label: kv_pairs_beginning_4k + dataset_uri: eval/local_data/long_context/kv_pairs_beginning_len_4096.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: kv_pairs_middle_4k + dataset_uri: eval/local_data/long_context/kv_pairs_middle_len_4096.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: kv_pairs_end_4k + dataset_uri: eval/local_data/long_context/kv_pairs_end_len_4096.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: kv_pairs_beginning_8k + dataset_uri: eval/local_data/long_context/kv_pairs_beginning_len_8192.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: kv_pairs_middle_8k + dataset_uri: eval/local_data/long_context/kv_pairs_middle_len_8192.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: kv_pairs_end_8k + dataset_uri: eval/local_data/long_context/kv_pairs_end_len_8192.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: wikiqa_2k + dataset_uri: eval/local_data/long_context/wikiqa_2k.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: wikiqa_4k + dataset_uri: eval/local_data/long_context/wikiqa_4k.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: wikiqa_8k + dataset_uri: eval/local_data/long_context/wikiqa_8k.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_beginning_2k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_beginning_context_len_2048_tokenizer_gpt-4_total_examples_500.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_middle_2k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_middle_context_len_2048_tokenizer_gpt-4_total_examples_500.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_end_2k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_end_context_len_2048_tokenizer_gpt-4_total_examples_500.jsonl + 
num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_beginning_4k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_beginning_context_len_4096_tokenizer_gpt-4_total_examples_500.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_middle_4k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_middle_context_len_4096_tokenizer_gpt-4_total_examples_500.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_end_4k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_end_context_len_4096_tokenizer_gpt-4_total_examples_500.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_beginning_8k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_beginning_context_len_8192_tokenizer_gpt-4_total_examples_500.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_middle_8k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_middle_context_len_8192_tokenizer_gpt-4_total_examples_500.jsonl + num_fewshot: [0] + icl_task_type: question_answering +- + label: hotpotqa_end_8k + dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_end_context_len_8192_tokenizer_gpt-4_total_examples_500.jsonl + num_fewshot: [0] + icl_task_type: question_answering \ No newline at end of file From 26dc06762254254240a065d077281f512da6842f Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 1 Nov 2023 05:45:55 +0000 Subject: [PATCH 02/47] load from max's public hf and parse hf datasets --- llmfoundry/utils/builders.py | 30 ++ scripts/eval/yamls/long_context_tasks.yaml | 337 +++++++++++++++------ 2 files changed, 272 insertions(+), 95 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index f027afb0ce..1cd29149d4 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -7,6 +7,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch +import 
datasets as hf_datasets +import json from composer import algorithms from composer.callbacks import (EarlyStopper, Generate, LRMonitor, MemoryMonitor, OptimizerMonitor, @@ -204,6 +206,28 @@ def build_tokenizer( return tokenizer +def prep_hf_dataset(icl_cfg: ListConfig): + hf_dataset_uri = icl_cfg.dataset_uri.replace("hf://", "") + dataset_args = icl_cfg.hf_vars + if "split" not in dataset_args: + dataset_args["split"] = "test" + + # TODO: should I use tmp here? + output_filepath = icl_cfg.dataset_uri.replace("hf://", "/tmp/").replace("/", "_") + '_'.join([str(dataset_arg) for dataset_arg in dataset_args.values()]) + '.jsonl' + if os.path.isfile(output_filepath): + print("Output file already exists, skipping dataset processing and saving") + else: + dataset = hf_datasets.load_dataset(hf_dataset_uri, **dataset_args) + dataset = dataset.map(lambda example: { + "context": ''.join([example[col] for col in icl_cfg.hf_cols['inputs']]), + "answer": ''.join([example[col] for col in icl_cfg.hf_cols['outputs']])} + ) + with open(output_filepath, 'w') as outfile: + for entry in dataset: + json.dump(entry, outfile) + outfile.write('\n') + return output_filepath + def build_icl_evaluators( icl_tasks: Union[str, ListConfig], @@ -269,6 +293,7 @@ def _validate_cfg(icl_cfg: DictConfig): if 'num_beams' not in icl_cfg: icl_cfg.num_beams = 20 + for icl_cfg in icl_tasks_list: assert isinstance(icl_cfg, DictConfig) _validate_cfg(icl_cfg) @@ -286,6 +311,11 @@ def _validate_cfg(icl_cfg: DictConfig): os.remove(destination_path) dist.barrier() + if "hf://" in icl_cfg.dataset_uri: + new_uri = prep_hf_dataset(icl_cfg) + icl_cfg.dataset_uri = new_uri + + dataloaders = get_icl_task_dataloader( icl_cfg.icl_task_type, icl_cfg.dataset_uri, diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index af78691649..60ca1329a1 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -1,106 +1,253 @@ 
icl_tasks: - label: kv_pairs_beginning_2k - dataset_uri: eval/local_data/long_context/kv_pairs_beginning_len_2048.jsonl + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 2048 + section: beginning + hf_cols: + inputs: ["context"] + outputs: ["answer"] - label: kv_pairs_middle_2k - dataset_uri: eval/local_data/long_context/kv_pairs_middle_len_2048.jsonl + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 2048 + section: middle + hf_cols: + inputs: ["context"] + outputs: ["answer"] - label: kv_pairs_end_2k - dataset_uri: eval/local_data/long_context/kv_pairs_end_len_2048.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: kv_pairs_beginning_4k - dataset_uri: eval/local_data/long_context/kv_pairs_beginning_len_4096.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: kv_pairs_middle_4k - dataset_uri: eval/local_data/long_context/kv_pairs_middle_len_4096.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: kv_pairs_end_4k - dataset_uri: eval/local_data/long_context/kv_pairs_end_len_4096.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: kv_pairs_beginning_8k - dataset_uri: eval/local_data/long_context/kv_pairs_beginning_len_8192.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: kv_pairs_middle_8k - dataset_uri: eval/local_data/long_context/kv_pairs_middle_len_8192.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: kv_pairs_end_8k - dataset_uri: eval/local_data/long_context/kv_pairs_end_len_8192.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: wikiqa_2k - dataset_uri: eval/local_data/long_context/wikiqa_2k.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: wikiqa_4k - dataset_uri: 
eval/local_data/long_context/wikiqa_4k.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: wikiqa_8k - dataset_uri: eval/local_data/long_context/wikiqa_8k.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_beginning_2k - dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_beginning_context_len_2048_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_middle_2k - dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_middle_context_len_2048_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_end_2k - dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_end_context_len_2048_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_beginning_4k - dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_beginning_context_len_4096_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_middle_4k - dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_middle_context_len_4096_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_end_4k - dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_end_context_len_4096_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_beginning_8k - dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_beginning_context_len_8192_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_middle_8k - dataset_uri: eval/local_data/long_context/hotpot_train_v1.1_middle_context_len_8192_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering -- - label: hotpotqa_end_8k - dataset_uri: 
eval/local_data/long_context/hotpot_train_v1.1_end_context_len_8192_tokenizer_gpt-4_total_examples_500.jsonl - num_fewshot: [0] - icl_task_type: question_answering \ No newline at end of file + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 2048 + section: end + hf_cols: + inputs: ["context"] + outputs: ["answer"] +# - +# label: kv_pairs_beginning_4k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 4096 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: kv_pairs_middle_4k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 4096 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: kv_pairs_end_4k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 4096 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: kv_pairs_beginning_8k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 8192 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: kv_pairs_middle_8k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 8192 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: kv_pairs_end_8k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 8192 +# section: 
middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: wikiqa_2k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: wikiqa_4k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: wikiqa_8k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_beginning_2k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_middle_2k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_end_2k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_beginning_4k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_middle_4k +# 
dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_end_4k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_beginning_8k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_middle_8k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] +# - +# label: hotpotqa_end_8k +# dataset_uri: hf://maxisawesome/long_context_eval +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# name: kv_pairs +# context_length: 2048 +# section: middle +# hf_cols: +# inputs: ["context"] +# outputs: ["answer"] \ No newline at end of file From 31851a5510e9769b3b3f34c4a6a138a41b3fc4d5 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 1 Nov 2023 06:17:41 +0000 Subject: [PATCH 03/47] update rest of tasks --- scripts/eval/yamls/long_context_eval_8k.yaml | 2 +- scripts/eval/yamls/long_context_tasks.yaml | 362 +++++++++---------- 2 files changed, 182 insertions(+), 182 deletions(-) diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml index 10d7987fa7..0e674bfc4c 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -1,4 +1,4 @@ -max_seq_len: 8192 +max_seq_len: 
8192 seed: 1 precision: amp_bf16 diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index 60ca1329a1..2e47b451b2 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -5,7 +5,7 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_vars: - name: kv_pairs + name: kv_pairs context_length: 2048 section: beginning hf_cols: @@ -35,79 +35,79 @@ icl_tasks: hf_cols: inputs: ["context"] outputs: ["answer"] -# - -# label: kv_pairs_beginning_4k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 4096 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: kv_pairs_middle_4k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 4096 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: kv_pairs_end_4k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 4096 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: kv_pairs_beginning_8k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 8192 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: kv_pairs_middle_8k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 8192 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: kv_pairs_end_8k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# 
icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 8192 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - +- + label: kv_pairs_beginning_4k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 4096 + section: beginning + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: kv_pairs_middle_4k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 4096 + section: middle + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: kv_pairs_end_4k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 4096 + section: end + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: kv_pairs_beginning_8k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 8192 + section: beginning + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: kv_pairs_middle_8k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 8192 + section: middle + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: kv_pairs_end_8k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: kv_pairs + context_length: 8192 + section: end + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- # label: wikiqa_2k # dataset_uri: hf://maxisawesome/long_context_eval # num_fewshot: [0] @@ -144,110 +144,110 @@ icl_tasks: # inputs: ["context"] # outputs: ["answer"] # - -# label: hotpotqa_beginning_2k -# dataset_uri: hf://maxisawesome/long_context_eval -# 
num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: hotpotqa_middle_2k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: hotpotqa_end_2k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: hotpotqa_beginning_4k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: hotpotqa_middle_4k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: hotpotqa_end_4k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: hotpotqa_beginning_8k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: hotpotqa_middle_8k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: 
kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: hotpotqa_end_8k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] \ No newline at end of file + label: hotpotqa_beginning_2k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 2048 + section: beginning + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: hotpotqa_middle_2k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 2048 + section: middle + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: hotpotqa_end_2k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 2048 + section: end + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: hotpotqa_beginning_4k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 4096 + section: beginning + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: hotpotqa_middle_4k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 4096 + section: middle + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: hotpotqa_end_4k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 4096 + section: end + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: 
hotpotqa_beginning_8k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 8192 + section: beginning + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: hotpotqa_middle_8k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 8192 + section: middle + hf_cols: + inputs: ["context"] + outputs: ["answer"] +- + label: hotpotqa_end_8k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: hotpotqa + context_length: 8192 + section: end + hf_cols: + inputs: ["context"] + outputs: ["answer"] \ No newline at end of file From 203be4798507578fb7bf6d00e868b57542aee340 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 1 Nov 2023 07:06:51 +0000 Subject: [PATCH 04/47] add better logging --- llmfoundry/utils/builders.py | 7 ++++-- .../eval/yamls/eval_gauntlet_8k_section.yaml | 22 +++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 1cd29149d4..40e99cd97f 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -207,6 +207,9 @@ def build_tokenizer( return tokenizer def prep_hf_dataset(icl_cfg: ListConfig): + """ + Temporary hack to read HF datasets while the composer PR is still WIP + """ hf_dataset_uri = icl_cfg.dataset_uri.replace("hf://", "") dataset_args = icl_cfg.hf_vars if "split" not in dataset_args: @@ -215,8 +218,9 @@ def prep_hf_dataset(icl_cfg: ListConfig): # TODO: should I use tmp here? 
output_filepath = icl_cfg.dataset_uri.replace("hf://", "/tmp/").replace("/", "_") + '_'.join([str(dataset_arg) for dataset_arg in dataset_args.values()]) + '.jsonl' if os.path.isfile(output_filepath): - print("Output file already exists, skipping dataset processing and saving") + print(f"Output file already exists for {icl_cfg.label}, skipping dataset processing and saving") else: + print(f"Processing {icl_cfg.label}") dataset = hf_datasets.load_dataset(hf_dataset_uri, **dataset_args) dataset = dataset.map(lambda example: { "context": ''.join([example[col] for col in icl_cfg.hf_cols['inputs']]), @@ -315,7 +319,6 @@ def _validate_cfg(icl_cfg: DictConfig): new_uri = prep_hf_dataset(icl_cfg) icl_cfg.dataset_uri = new_uri - dataloaders = get_icl_task_dataloader( icl_cfg.icl_task_type, icl_cfg.dataset_uri, diff --git a/scripts/eval/yamls/eval_gauntlet_8k_section.yaml b/scripts/eval/yamls/eval_gauntlet_8k_section.yaml index 711a9b6ee4..2e668c3cc2 100644 --- a/scripts/eval/yamls/eval_gauntlet_8k_section.yaml +++ b/scripts/eval/yamls/eval_gauntlet_8k_section.yaml @@ -63,14 +63,14 @@ eval_gauntlet: - name: kv_pairs_end_8k num_fewshot: 0 random_baseline: 0 - - name: full - benchmarks: - - name: wikiqa_2k - num_fewshot: 0 - random_baseline: 0 - - name: wikiqa_4k - num_fewshot: 0 - random_baseline: 0 - - name: wikiqa_8k - num_fewshot: 0 - random_baseline: 0 \ No newline at end of file + # - name: full + # benchmarks: + # - name: wikiqa_2k + # num_fewshot: 0 + # random_baseline: 0 + # - name: wikiqa_4k + # num_fewshot: 0 + # random_baseline: 0 + # - name: wikiqa_8k + # num_fewshot: 0 + # random_baseline: 0 \ No newline at end of file From 33b651325647aaf6b8403e22e142f62c3a4122b5 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 1 Nov 2023 21:15:53 +0000 Subject: [PATCH 05/47] implemented leval tasks --- llmfoundry/utils/builders.py | 24 ++++++++-- scripts/eval/yamls/leval.yaml | 90 +++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 3 deletions(-) create 
mode 100644 scripts/eval/yamls/leval.yaml diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 40e99cd97f..9246b27bb3 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -222,10 +222,28 @@ def prep_hf_dataset(icl_cfg: ListConfig): else: print(f"Processing {icl_cfg.label}") dataset = hf_datasets.load_dataset(hf_dataset_uri, **dataset_args) - dataset = dataset.map(lambda example: { - "context": ''.join([example[col] for col in icl_cfg.hf_cols['inputs']]), - "answer": ''.join([example[col] for col in icl_cfg.hf_cols['outputs']])} + if "pivot_col" in icl_cfg.hf_cols: + def _augment_data(examples): + outputs = [] + contexts = [] + for i, doc in enumerate(examples[icl_cfg.hf_cols["pivot_col"]]): + for j in range(len(examples[icl_cfg.hf_cols["inputs"][0]][i])): + instruction = ''.join([examples[input_col][i][j] for input_col in icl_cfg.hf_cols["inputs"]]) + contexts.append(doc + "\n" + instruction) + outputs.append(''.join([examples[output_col][i][j] for output_col in icl_cfg.hf_cols['outputs']])) + return {"context": contexts, "answer": outputs} + dataset = dataset.map( + _augment_data, + batched=True, + remove_columns=dataset.column_names, + batch_size=1000 ) + else: + dataset = dataset.map(lambda example: { + "context": ''.join([example[col] for col in icl_cfg.hf_cols['inputs']]), + "answer": ''.join([example[col] for col in icl_cfg.hf_cols['outputs']]) + } + ) with open(output_filepath, 'w') as outfile: for entry in dataset: json.dump(entry, outfile) diff --git a/scripts/eval/yamls/leval.yaml b/scripts/eval/yamls/leval.yaml new file mode 100644 index 0000000000..9314633ba5 --- /dev/null +++ b/scripts/eval/yamls/leval.yaml @@ -0,0 +1,90 @@ +icl_tasks: +# Unimplemented LEval tasks: +# 'coursera', 'quality', 'tpo', 'sci_fi', 'codeU', 'gov_report_summ', 'meeting_summ', 'news_summ', 'paper_assistant', 'patent_summ', 'review_summ', 'tv_show_summ' +- + label: gsm100 + dataset_uri: hf://L4NLP/LEval + num_fewshot: [0] + 
icl_task_type: question_answering + hf_vars: + name: gsm100 + hf_cols: + inputs: ["inputs", "instructions"] + outputs: ["answer"] +- + label: legal_contract_qa + dataset_uri: hf://L4NLP/LEval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: legal_contract_qa + hf_cols: + pivot_col: input + inputs: ["instructions"] + outputs: ["outputs"] +- + label: financial_qa + dataset_uri: hf://L4NLP/LEval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: financial_qa + hf_cols: + pivot_col: input + inputs: ["instructions"] + outputs: ["outputs"] +- + label: multidoc_qa + dataset_uri: hf://L4NLP/LEval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: multidoc_qa + hf_cols: + pivot_col: input + inputs: ["instructions"] + outputs: ["outputs"] +- + label: scientific_qa + dataset_uri: hf://L4NLP/LEval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: scientific_qa + hf_cols: + pivot_col: input + inputs: ["instructions"] + outputs: ["outputs"] +- + label: narrative_qa + dataset_uri: hf://L4NLP/LEval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: scientific_qa + hf_cols: + pivot_col: input + inputs: ["instructions"] + outputs: ["outputs"] +- + label: natural_question + dataset_uri: hf://L4NLP/LEval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: natural_question + hf_cols: + pivot_col: input + inputs: ["instructions"] + outputs: ["outputs"] +- + label: topic_retrieval_longchat + dataset_uri: hf://L4NLP/LEval + num_fewshot: [0] + icl_task_type: question_answering + hf_vars: + name: topic_retrieval_longchat + hf_cols: + pivot_col: input + inputs: ["instructions"] + outputs: ["outputs"] \ No newline at end of file From 089c392931aa2e385b31419090b25a6b5a854af8 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 1 Nov 2023 21:18:57 +0000 Subject: [PATCH 06/47] move level --- scripts/eval/yamls/{leval.yaml => leval_tasks.yaml} | 0 1 file 
changed, 0 insertions(+), 0 deletions(-) rename scripts/eval/yamls/{leval.yaml => leval_tasks.yaml} (100%) diff --git a/scripts/eval/yamls/leval.yaml b/scripts/eval/yamls/leval_tasks.yaml similarity index 100% rename from scripts/eval/yamls/leval.yaml rename to scripts/eval/yamls/leval_tasks.yaml From b644df10f15e592460489ae1749b1b8f69dcc9e5 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 1 Nov 2023 21:22:19 +0000 Subject: [PATCH 07/47] add level yaml --- scripts/eval/yamls/leval.yaml | 28 ++++++++++++++++++++ scripts/eval/yamls/leval_tasks.yaml | 2 +- scripts/eval/yamls/long_context_eval_8k.yaml | 6 +++-- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 scripts/eval/yamls/leval.yaml diff --git a/scripts/eval/yamls/leval.yaml b/scripts/eval/yamls/leval.yaml new file mode 100644 index 0000000000..28494e484c --- /dev/null +++ b/scripts/eval/yamls/leval.yaml @@ -0,0 +1,28 @@ +eval_gauntlet: + weighting: EQUAL + subtract_random_baseline: true + rescale_accuracy: true + categories: + - name: leval_qa + benchmarks: + - name: gsm100 + num_fewshot: 0 + random_baseline: 0 + - name: legal_contract_qa + num_fewshot: 0 + random_baseline: 0 + - name: financial_qa + num_fewshot: 0 + random_baseline: 0 + - name: multidoc_qa + num_fewshot: 0 + random_baseline: 0 + - name: scientific_qa + num_fewshot: 0 + random_baseline: 0 + - name: natural_question + num_fewshot: 0 + random_baseline: 0 + - name: topic_retrieval_longchat + num_fewshot: 0 + random_baseline: 0 \ No newline at end of file diff --git a/scripts/eval/yamls/leval_tasks.yaml b/scripts/eval/yamls/leval_tasks.yaml index 9314633ba5..5d214e61bb 100644 --- a/scripts/eval/yamls/leval_tasks.yaml +++ b/scripts/eval/yamls/leval_tasks.yaml @@ -61,7 +61,7 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_vars: - name: scientific_qa + name: narrative_qa hf_cols: pivot_col: input inputs: ["instructions"] diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml 
b/scripts/eval/yamls/long_context_eval_8k.yaml index 0e674bfc4c..bbc2699aab 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -23,5 +23,7 @@ fsdp_config: sharding_strategy: FULL_SHARD mixed_precision: FULL -icl_tasks: 'eval/yamls/long_context_tasks.yaml' -eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' \ No newline at end of file +icl_tasks: 'eval/yamls/leval.yaml' +eval_gauntlet: 'eval/yamls/leval_test.yaml' +# icl_tasks: 'eval/yamls/long_context_tasks.yaml' +# eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' \ No newline at end of file From 5adf77e1d821443c40d925bcfd699d83e274fea4 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 3 Nov 2023 22:27:58 +0000 Subject: [PATCH 08/47] add str parsing to hf --- llmfoundry/utils/builders.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 9246b27bb3..b82e2581c7 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -239,10 +239,11 @@ def _augment_data(examples): batch_size=1000 ) else: - dataset = dataset.map(lambda example: { - "context": ''.join([example[col] for col in icl_cfg.hf_cols['inputs']]), - "answer": ''.join([example[col] for col in icl_cfg.hf_cols['outputs']]) - } + dataset = dataset.map( + lambda example: { + "context": ''.join([str(example[col]) for col in icl_cfg.hf_cols['inputs']]), + "answer": ''.join([str(example[col]) for col in icl_cfg.hf_cols['outputs']]) + } ) with open(output_filepath, 'w') as outfile: for entry in dataset: From 79810f37bb7ca38beff82f4583c90886f118840d Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 14 Nov 2023 04:27:02 +0000 Subject: [PATCH 09/47] wip --- llmfoundry/utils/builders.py | 49 ----------------------- llmfoundry/utils/data_parsing_registry.py | 5 +++ llmfoundry/utils/data_prep_utils.py | 9 +++++ scripts/eval/yamls/leval_tasks.yaml | 36 ++++++++--------- 4 files 
changed, 32 insertions(+), 67 deletions(-) create mode 100644 llmfoundry/utils/data_parsing_registry.py diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index b82e2581c7..14b17e70ce 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -206,51 +206,6 @@ def build_tokenizer( return tokenizer -def prep_hf_dataset(icl_cfg: ListConfig): - """ - Temporary hack to read HF datasets while the composer PR is still WIP - """ - hf_dataset_uri = icl_cfg.dataset_uri.replace("hf://", "") - dataset_args = icl_cfg.hf_vars - if "split" not in dataset_args: - dataset_args["split"] = "test" - - # TODO: should I use tmp here? - output_filepath = icl_cfg.dataset_uri.replace("hf://", "/tmp/").replace("/", "_") + '_'.join([str(dataset_arg) for dataset_arg in dataset_args.values()]) + '.jsonl' - if os.path.isfile(output_filepath): - print(f"Output file already exists for {icl_cfg.label}, skipping dataset processing and saving") - else: - print(f"Processing {icl_cfg.label}") - dataset = hf_datasets.load_dataset(hf_dataset_uri, **dataset_args) - if "pivot_col" in icl_cfg.hf_cols: - def _augment_data(examples): - outputs = [] - contexts = [] - for i, doc in enumerate(examples[icl_cfg.hf_cols["pivot_col"]]): - for j in range(len(examples[icl_cfg.hf_cols["inputs"][0]][i])): - instruction = ''.join([examples[input_col][i][j] for input_col in icl_cfg.hf_cols["inputs"]]) - contexts.append(doc + "\n" + instruction) - outputs.append(''.join([examples[output_col][i][j] for output_col in icl_cfg.hf_cols['outputs']])) - return {"context": contexts, "answer": outputs} - dataset = dataset.map( - _augment_data, - batched=True, - remove_columns=dataset.column_names, - batch_size=1000 - ) - else: - dataset = dataset.map( - lambda example: { - "context": ''.join([str(example[col]) for col in icl_cfg.hf_cols['inputs']]), - "answer": ''.join([str(example[col]) for col in icl_cfg.hf_cols['outputs']]) - } - ) - with open(output_filepath, 'w') as outfile: - for 
entry in dataset: - json.dump(entry, outfile) - outfile.write('\n') - return output_filepath - def build_icl_evaluators( icl_tasks: Union[str, ListConfig], @@ -334,10 +289,6 @@ def _validate_cfg(icl_cfg: DictConfig): os.remove(destination_path) dist.barrier() - if "hf://" in icl_cfg.dataset_uri: - new_uri = prep_hf_dataset(icl_cfg) - icl_cfg.dataset_uri = new_uri - dataloaders = get_icl_task_dataloader( icl_cfg.icl_task_type, icl_cfg.dataset_uri, diff --git a/llmfoundry/utils/data_parsing_registry.py b/llmfoundry/utils/data_parsing_registry.py new file mode 100644 index 0000000000..4fa618567f --- /dev/null +++ b/llmfoundry/utils/data_parsing_registry.py @@ -0,0 +1,5 @@ +from llmfoundry.utils.data_prep_utils import leval_hf_parsing_func + +EVAL_HF_PARSING_FUNCTION_REGISTRY = { + 'leval_hf_parsing_func': leval_hf_parsing_func, +} diff --git a/llmfoundry/utils/data_prep_utils.py b/llmfoundry/utils/data_prep_utils.py index 75e27b504f..f5c45e3b6a 100644 --- a/llmfoundry/utils/data_prep_utils.py +++ b/llmfoundry/utils/data_prep_utils.py @@ -72,6 +72,15 @@ def merge_shard_groups(root: str) -> None: with open(index_filename, 'w') as out: out.write(text) +def leval_hf_parsing_func(examples: dict): + outputs = [] + contexts = [] + for i, doc in enumerate(examples['input']): + for j in range(len(examples['instructions'][i])): + instruction = examples['instructions'][i][j] + contexts.append(doc + "\n" + instruction) + outputs.append(examples['outputs'][i][j]) + return {"context": contexts, "answer": outputs} class DownloadingIterable: diff --git a/scripts/eval/yamls/leval_tasks.yaml b/scripts/eval/yamls/leval_tasks.yaml index 5d214e61bb..a2ae7eefaa 100644 --- a/scripts/eval/yamls/leval_tasks.yaml +++ b/scripts/eval/yamls/leval_tasks.yaml @@ -6,19 +6,19 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: gsm100 - hf_cols: - inputs: ["inputs", "instructions"] - outputs: ["answer"] + hf_parsing_vars: + 
context: ["input", "instructions"] + labels: ["outputs"] - label: legal_contract_qa dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: legal_contract_qa - hf_cols: + hf_parsing_vars: pivot_col: input inputs: ["instructions"] outputs: ["outputs"] @@ -27,9 +27,9 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: financial_qa - hf_cols: + hf_parsing_vars: pivot_col: input inputs: ["instructions"] outputs: ["outputs"] @@ -38,9 +38,9 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: multidoc_qa - hf_cols: + hf_parsing_vars: pivot_col: input inputs: ["instructions"] outputs: ["outputs"] @@ -49,9 +49,9 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: scientific_qa - hf_cols: + hf_parsing_vars: pivot_col: input inputs: ["instructions"] outputs: ["outputs"] @@ -60,9 +60,9 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: narrative_qa - hf_cols: + hf_parsing_vars: pivot_col: input inputs: ["instructions"] outputs: ["outputs"] @@ -71,9 +71,9 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: natural_question - hf_cols: + hf_parsing_vars: pivot_col: input inputs: ["instructions"] outputs: ["outputs"] @@ -82,9 +82,9 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: topic_retrieval_longchat - hf_cols: + hf_parsing_vars: pivot_col: input inputs: ["instructions"] outputs: ["outputs"] \ No newline at end of file From 44a209a7a4d10075f19bccf0ee89fa9920b1975d Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 14 Nov 2023 06:13:26 +0000 
Subject: [PATCH 10/47] llm-foundry working with new parser --- llmfoundry/utils/builders.py | 11 ++++++-- llmfoundry/utils/data_prep_utils.py | 12 ++++---- scripts/eval/yamls/leval_tasks.yaml | 44 +++++++++++------------------ 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 14b17e70ce..9f0735dc54 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -7,8 +7,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch -import datasets as hf_datasets -import json from composer import algorithms from composer.callbacks import (EarlyStopper, Generate, LRMonitor, MemoryMonitor, OptimizerMonitor, @@ -37,6 +35,7 @@ DecoupledLionW, DecoupledLionW_8bit) from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper +from llmfoundry.utils.data_parsing_registry import EVAL_HF_PARSING_FUNCTION_REGISTRY log = logging.getLogger(__name__) @@ -289,6 +288,11 @@ def _validate_cfg(icl_cfg: DictConfig): os.remove(destination_path) dist.barrier() + hf_parsing_vars = icl_cfg.get('hf_parsing_vars', {}) + hf_loading_vars = icl_cfg.get('hf_loading_vars', {}) + hf_parsing_func_name = icl_cfg.get('hf_parsing_func', '') + hf_parsing_func = EVAL_HF_PARSING_FUNCTION_REGISTRY.get(hf_parsing_func_name, None) + dataloaders = get_icl_task_dataloader( icl_cfg.icl_task_type, icl_cfg.dataset_uri, @@ -299,6 +303,9 @@ def _validate_cfg(icl_cfg: DictConfig): num_fewshot=num_fewshot, prompt_string=icl_cfg.prompt_string, example_delimiter=icl_cfg.example_delimiter, + hf_parsing_vars=hf_parsing_vars, + hf_loading_vars=hf_loading_vars, + hf_parsing_func=hf_parsing_func, continuation_delimiter=icl_cfg.continuation_delimiter, destination_path=destination_path, pass_at_k=icl_cfg.pass_at_k, diff --git a/llmfoundry/utils/data_prep_utils.py b/llmfoundry/utils/data_prep_utils.py index f5c45e3b6a..634a97a417 100644 
--- a/llmfoundry/utils/data_prep_utils.py +++ b/llmfoundry/utils/data_prep_utils.py @@ -73,14 +73,12 @@ def merge_shard_groups(root: str) -> None: out.write(text) def leval_hf_parsing_func(examples: dict): - outputs = [] - contexts = [] + batch = {'context':[], 'answer':[]} for i, doc in enumerate(examples['input']): - for j in range(len(examples['instructions'][i])): - instruction = examples['instructions'][i][j] - contexts.append(doc + "\n" + instruction) - outputs.append(examples['outputs'][i][j]) - return {"context": contexts, "answer": outputs} + for j, instruction in enumerate(examples['instructions'][i]): + batch['context'].append(doc + "\n" + instruction) + batch['answer'].append(examples['outputs'][i][j]) + return batch class DownloadingIterable: diff --git a/scripts/eval/yamls/leval_tasks.yaml b/scripts/eval/yamls/leval_tasks.yaml index a2ae7eefaa..4de664f06d 100644 --- a/scripts/eval/yamls/leval_tasks.yaml +++ b/scripts/eval/yamls/leval_tasks.yaml @@ -6,8 +6,10 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering + hf_parsing_func: leval_hf_parsing_func hf_loading_vars: name: gsm100 + split: test hf_parsing_vars: context: ["input", "instructions"] labels: ["outputs"] @@ -16,75 +18,61 @@ icl_tasks: dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering + hf_parsing_func: leval_hf_parsing_func hf_loading_vars: name: legal_contract_qa - hf_parsing_vars: - pivot_col: input - inputs: ["instructions"] - outputs: ["outputs"] + split: test - label: financial_qa dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering + hf_parsing_func: leval_hf_parsing_func hf_loading_vars: name: financial_qa - hf_parsing_vars: - pivot_col: input - inputs: ["instructions"] - outputs: ["outputs"] + split: test - label: multidoc_qa dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering + hf_parsing_func: leval_hf_parsing_func hf_loading_vars: name: multidoc_qa - 
hf_parsing_vars: - pivot_col: input - inputs: ["instructions"] - outputs: ["outputs"] + split: test - label: scientific_qa dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering + hf_parsing_func: leval_hf_parsing_func hf_loading_vars: name: scientific_qa - hf_parsing_vars: - pivot_col: input - inputs: ["instructions"] - outputs: ["outputs"] + split: test - label: narrative_qa dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering + hf_parsing_func: leval_hf_parsing_func hf_loading_vars: name: narrative_qa - hf_parsing_vars: - pivot_col: input - inputs: ["instructions"] - outputs: ["outputs"] + split: test - label: natural_question dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering + hf_parsing_func: leval_hf_parsing_func hf_loading_vars: name: natural_question - hf_parsing_vars: - pivot_col: input - inputs: ["instructions"] - outputs: ["outputs"] + split: test - label: topic_retrieval_longchat dataset_uri: hf://L4NLP/LEval num_fewshot: [0] icl_task_type: question_answering + hf_parsing_func: leval_hf_parsing_func hf_loading_vars: name: topic_retrieval_longchat - hf_parsing_vars: - pivot_col: input - inputs: ["instructions"] - outputs: ["outputs"] \ No newline at end of file + split: test \ No newline at end of file From 657fa13934b65417e601e43d96c522521116e706 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 14 Nov 2023 18:28:59 +0000 Subject: [PATCH 11/47] working w/ new parsing --- llmfoundry/utils/data_prep_utils.py | 2 +- scripts/eval/yamls/leval_tasks.yaml | 15 +++++++++++ scripts/eval/yamls/long_context_eval_8k.yaml | 27 +++++++++++++++++--- 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/data_prep_utils.py b/llmfoundry/utils/data_prep_utils.py index 634a97a417..80c29e55f5 100644 --- a/llmfoundry/utils/data_prep_utils.py +++ b/llmfoundry/utils/data_prep_utils.py @@ -72,7 +72,7 @@ def merge_shard_groups(root: str) -> None: with 
open(index_filename, 'w') as out: out.write(text) -def leval_hf_parsing_func(examples: dict): +def leval_hf_parsing_func(examples: dict, **kwargs): batch = {'context':[], 'answer':[]} for i, doc in enumerate(examples['input']): for j, instruction in enumerate(examples['instructions'][i]): diff --git a/scripts/eval/yamls/leval_tasks.yaml b/scripts/eval/yamls/leval_tasks.yaml index 4de664f06d..dde1226807 100644 --- a/scripts/eval/yamls/leval_tasks.yaml +++ b/scripts/eval/yamls/leval_tasks.yaml @@ -11,6 +11,7 @@ icl_tasks: name: gsm100 split: test hf_parsing_vars: + batched: True context: ["input", "instructions"] labels: ["outputs"] - @@ -19,6 +20,8 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_parsing_func: leval_hf_parsing_func + hf_parsing_vars: + batched: True hf_loading_vars: name: legal_contract_qa split: test @@ -28,6 +31,8 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_parsing_func: leval_hf_parsing_func + hf_parsing_vars: + batched: True hf_loading_vars: name: financial_qa split: test @@ -37,6 +42,8 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_parsing_func: leval_hf_parsing_func + hf_parsing_vars: + batched: True hf_loading_vars: name: multidoc_qa split: test @@ -46,6 +53,8 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_parsing_func: leval_hf_parsing_func + hf_parsing_vars: + batched: True hf_loading_vars: name: scientific_qa split: test @@ -55,6 +64,8 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_parsing_func: leval_hf_parsing_func + hf_parsing_vars: + batched: True hf_loading_vars: name: narrative_qa split: test @@ -64,6 +75,8 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_parsing_func: leval_hf_parsing_func + hf_parsing_vars: + batched: True hf_loading_vars: name: natural_question split: test @@ -73,6 +86,8 @@ icl_tasks: num_fewshot: [0] icl_task_type: question_answering hf_parsing_func: leval_hf_parsing_func + hf_parsing_vars: 
+ batched: True hf_loading_vars: name: topic_retrieval_longchat split: test \ No newline at end of file diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml index bbc2699aab..35a143427f 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -23,7 +23,28 @@ fsdp_config: sharding_strategy: FULL_SHARD mixed_precision: FULL -icl_tasks: 'eval/yamls/leval.yaml' -eval_gauntlet: 'eval/yamls/leval_test.yaml' +icl_tasks: 'eval/yamls/leval_tasks.yaml' +eval_gauntlet: 'eval/yamls/leval.yaml' # icl_tasks: 'eval/yamls/long_context_tasks.yaml' -# eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' \ No newline at end of file +# eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' +# icl_tasks: +# - label: longchat-lines +# dataset_uri: hf://abacusai/LongChat-Lines +# num_fewshot: [0] +# icl_task_type: question_answering +# hf_vars: +# split: 100 +# hf_cols: +# inputs: ["prompt"] +# outputs: ["expected_number"] + +# eval_gauntlet: +# weighting: EQUAL +# subtract_random_baseline: true +# rescale_accuracy: true +# categories: +# - name: longchat-lines +# benchmarks: +# - name: longchat-lines +# num_fewshot: 0 +# random_baseline: 0.0 \ No newline at end of file From 2629f75df23eea925469174a180119ff9d1d6858 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 14 Nov 2023 18:36:40 +0000 Subject: [PATCH 12/47] fix old long context tasks --- scripts/eval/yamls/long_context_tasks.yaml | 147 ++++++++++----------- 1 file changed, 72 insertions(+), 75 deletions(-) diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index 2e47b451b2..85ca7305e2 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -4,11 +4,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs 
context_length: 2048 section: beginning - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -16,11 +16,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs context_length: 2048 section: middle - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -28,11 +28,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs context_length: 2048 section: end - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -40,11 +40,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs context_length: 4096 section: beginning - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -52,11 +52,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs context_length: 4096 section: middle - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -64,11 +64,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs context_length: 4096 section: end - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -76,11 +76,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs context_length: 8192 section: beginning - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -88,11 +88,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs 
context_length: 8192 section: middle - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -100,59 +100,56 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: kv_pairs context_length: 8192 section: end - hf_cols: - inputs: ["context"] - outputs: ["answer"] -- -# label: wikiqa_2k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: wikiqa_4k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - -# label: wikiqa_8k -# dataset_uri: hf://maxisawesome/long_context_eval -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# name: kv_pairs -# context_length: 2048 -# section: middle -# hf_cols: -# inputs: ["context"] -# outputs: ["answer"] -# - + hf_parsing_vars: + inputs: ["context"] + outputs: ["answer"] +- + label: wikiqa_2k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: wikiqa_2k + context_length: 2048 + hf_parsing_vars: + inputs: ["context"] + outputs: ["answer"] +- + label: wikiqa_4k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: wikiqa_4k + context_length: 2048 + hf_parsing_vars: + inputs: ["context"] + outputs: ["answer"] +- + label: wikiqa_8k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: wikiqa_8k + context_length: 2048 + hf_parsing_vars: + inputs: ["context"] + outputs: ["answer"] +- label: 
hotpotqa_beginning_2k dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 2048 section: beginning - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -160,11 +157,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 2048 section: middle - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -172,11 +169,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 2048 section: end - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -184,11 +181,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 4096 section: beginning - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -196,11 +193,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 4096 section: middle - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -208,11 +205,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 4096 section: end - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -220,11 +217,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 8192 section: beginning - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] 
- @@ -232,11 +229,11 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 8192 section: middle - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] - @@ -244,10 +241,10 @@ icl_tasks: dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_vars: + hf_loading_vars: name: hotpotqa context_length: 8192 section: end - hf_cols: + hf_parsing_vars: inputs: ["context"] outputs: ["answer"] \ No newline at end of file From c019ea1222d5965d75b949e2e27b51135ce4b8dc Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 20 Nov 2023 19:26:52 +0000 Subject: [PATCH 13/47] wip --- scripts/eval/yamls/long_context_tasks.yaml | 148 ++++++++++++--------- 1 file changed, 83 insertions(+), 65 deletions(-) diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index 85ca7305e2..69fe216a87 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -8,9 +8,10 @@ icl_tasks: name: kv_pairs context_length: 2048 section: beginning - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: kv_pairs_middle_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -20,9 +21,10 @@ icl_tasks: name: kv_pairs context_length: 2048 section: middle - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: kv_pairs_end_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -32,9 +34,10 @@ icl_tasks: name: kv_pairs context_length: 2048 section: end - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: kv_pairs_beginning_4k dataset_uri: 
hf://maxisawesome/long_context_eval @@ -44,9 +47,10 @@ icl_tasks: name: kv_pairs context_length: 4096 section: beginning - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: kv_pairs_middle_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -56,9 +60,10 @@ icl_tasks: name: kv_pairs context_length: 4096 section: middle - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: kv_pairs_end_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -68,9 +73,10 @@ icl_tasks: name: kv_pairs context_length: 4096 section: end - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: kv_pairs_beginning_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -80,9 +86,10 @@ icl_tasks: name: kv_pairs context_length: 8192 section: beginning - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: kv_pairs_middle_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -92,9 +99,10 @@ icl_tasks: name: kv_pairs context_length: 8192 section: middle - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: kv_pairs_end_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -104,41 +112,42 @@ icl_tasks: name: kv_pairs context_length: 8192 section: end - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: wikiqa_2k dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: - name: wikiqa_2k + name: wikiqa context_length: 2048 - hf_parsing_vars: - 
inputs: ["context"] - outputs: ["answer"] + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: wikiqa_4k dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: - name: wikiqa_4k + name: wikiqa context_length: 2048 - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: wikiqa_8k dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: - name: wikiqa_8k + name: wikiqa context_length: 2048 - hf_parsing_vars: - inputs: ["context"] + # hf_parsing_vars: + # inputs: ["context"] outputs: ["answer"] - label: hotpotqa_beginning_2k @@ -149,9 +158,10 @@ icl_tasks: name: hotpotqa context_length: 2048 section: beginning - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: hotpotqa_middle_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -161,9 +171,10 @@ icl_tasks: name: hotpotqa context_length: 2048 section: middle - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: hotpotqa_end_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -173,9 +184,10 @@ icl_tasks: name: hotpotqa context_length: 2048 section: end - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: hotpotqa_beginning_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -185,9 +197,10 @@ icl_tasks: name: hotpotqa context_length: 4096 section: beginning - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: hotpotqa_middle_4k dataset_uri: hf://maxisawesome/long_context_eval @@ 
-197,9 +210,10 @@ icl_tasks: name: hotpotqa context_length: 4096 section: middle - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: hotpotqa_end_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -209,9 +223,10 @@ icl_tasks: name: hotpotqa context_length: 4096 section: end - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: hotpotqa_beginning_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -221,9 +236,10 @@ icl_tasks: name: hotpotqa context_length: 8192 section: beginning - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: hotpotqa_middle_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -233,9 +249,10 @@ icl_tasks: name: hotpotqa context_length: 8192 section: middle - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] - label: hotpotqa_end_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -245,6 +262,7 @@ icl_tasks: name: hotpotqa context_length: 8192 section: end - hf_parsing_vars: - inputs: ["context"] - outputs: ["answer"] \ No newline at end of file + split: test + # hf_parsing_vars: + # inputs: ["context"] + # outputs: ["answer"] \ No newline at end of file From 0608ea2314317d4e78cffd240f1904ef1ec05e7e Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 20 Nov 2023 19:27:21 +0000 Subject: [PATCH 14/47] wip --- scripts/eval/yamls/eval_gauntlet.yaml | 52 ++++---- scripts/eval/yamls/hf_eval.yaml | 2 +- scripts/eval/yamls/long_context_eval_8k.yaml | 33 +++-- scripts/eval/yamls/tasks.yaml | 128 +++++++++---------- 4 files changed, 114 insertions(+), 101 deletions(-) diff --git a/scripts/eval/yamls/eval_gauntlet.yaml 
b/scripts/eval/yamls/eval_gauntlet.yaml index 87e01fd44c..06022f902a 100644 --- a/scripts/eval/yamls/eval_gauntlet.yaml +++ b/scripts/eval/yamls/eval_gauntlet.yaml @@ -112,32 +112,32 @@ eval_gauntlet: - name: boolq num_fewshot: 10 random_baseline: 0.5 - - name: programming - benchmarks: - - name: human_eval - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_cpp - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_js - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_return_simple - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_return_complex - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_25 - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_50 - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_75 - num_fewshot: 0 - random_baseline: 0.0 + # - name: programming + # benchmarks: + # - name: human_eval + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_cpp + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_js + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_return_simple + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_return_complex + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_25 + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_50 + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_75 + # num_fewshot: 0 + # random_baseline: 0.0 - name: world_knowledge_lm_task_subscore benchmarks: - name: jeopardy diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml index 05169818d9..8eecf57c30 100644 --- a/scripts/eval/yamls/hf_eval.yaml +++ b/scripts/eval/yamls/hf_eval.yaml @@ -12,7 +12,7 @@ models: model: name: hf_causal_lm pretrained_model_name_or_path: ${model_name_or_path} - init_device: mixed + init_device: cpu pretrained: true tokenizer: name: ${model_name_or_path} diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml 
b/scripts/eval/yamls/long_context_eval_8k.yaml index 35a143427f..233b7391ed 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -1,18 +1,31 @@ -max_seq_len: 8192 +max_seq_len: 2048 seed: 1 precision: amp_bf16 +# models: +# - +# model_name: mosaicml/mpt-7b-chat-8k +# model: +# name: hf_causal_lm +# pretrained_model_name_or_path: mosaicml/mpt-7b-chat-8k +# pretrained: true +# attn_config: +# attn_impl: triton +# tokenizer: +# name: mosaicml/mpt-7b-chat-8k +# kwargs: +# model_max_length: ${max_seq_len} + models: - - model_name: mosaicml/mpt-7b-chat-8k + model_name: EleutherAI/gpt-neo-125m model: name: hf_causal_lm - pretrained_model_name_or_path: mosaicml/mpt-7b-chat-8k + pretrained_model_name_or_path: EleutherAI/gpt-neo-125m + init_device: cpu pretrained: true - attn_config: - attn_impl: triton tokenizer: - name: mosaicml/mpt-7b-chat-8k + name: EleutherAI/gpt-neo-125m kwargs: model_max_length: ${max_seq_len} @@ -23,10 +36,10 @@ fsdp_config: sharding_strategy: FULL_SHARD mixed_precision: FULL -icl_tasks: 'eval/yamls/leval_tasks.yaml' -eval_gauntlet: 'eval/yamls/leval.yaml' -# icl_tasks: 'eval/yamls/long_context_tasks.yaml' -# eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' +# icl_tasks: 'eval/yamls/leval_tasks.yaml' +# eval_gauntlet: 'eval/yamls/leval.yaml' +icl_tasks: 'eval/yamls/long_context_tasks.yaml' +eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' # icl_tasks: # - label: longchat-lines # dataset_uri: hf://abacusai/LongChat-Lines diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml index 6b66c116ea..737b08ebeb 100644 --- a/scripts/eval/yamls/tasks.yaml +++ b/scripts/eval/yamls/tasks.yaml @@ -173,67 +173,67 @@ icl_tasks: num_fewshot: [10] icl_task_type: multiple_choice continuation_delimiter: "\nAnswer: " # this separates questions from answers -- - label: human_eval - dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI - 
num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_cpp - dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_js - dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_return_simple - dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_return_complex - dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_25 - dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_50 - dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_75 - dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation +# - +# label: human_eval +# dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_cpp +# dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # 
ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_js +# dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_return_simple +# dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_return_complex +# dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_25 +# dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_50 +# dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_75 +# dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation From cebb487a61657402c4347139cf3c27127e938f9f Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 28 Nov 2023 17:33:21 +0000 Subject: [PATCH 15/47] wip --- scripts/eval/yamls/long_context_eval_8k.yaml | 38 ++++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml index 233b7391ed..064daf5807 100644 --- 
a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -1,34 +1,34 @@ -max_seq_len: 2048 +max_seq_len: 8196 seed: 1 precision: amp_bf16 -# models: -# - -# model_name: mosaicml/mpt-7b-chat-8k -# model: -# name: hf_causal_lm -# pretrained_model_name_or_path: mosaicml/mpt-7b-chat-8k -# pretrained: true -# attn_config: -# attn_impl: triton -# tokenizer: -# name: mosaicml/mpt-7b-chat-8k -# kwargs: -# model_max_length: ${max_seq_len} - models: - - model_name: EleutherAI/gpt-neo-125m + model_name: mosaicml/mpt-7b-chat-8k model: name: hf_causal_lm - pretrained_model_name_or_path: EleutherAI/gpt-neo-125m - init_device: cpu + pretrained_model_name_or_path: mosaicml/mpt-7b-chat-8k pretrained: true + attn_config: + attn_impl: triton tokenizer: - name: EleutherAI/gpt-neo-125m + name: mosaicml/mpt-7b-chat-8k kwargs: model_max_length: ${max_seq_len} +# models: +# - +# model_name: EleutherAI/gpt-neo-125m +# model: +# name: hf_causal_lm +# pretrained_model_name_or_path: EleutherAI/gpt-neo-125m +# init_device: cpu +# pretrained: true +# tokenizer: +# name: EleutherAI/gpt-neo-125m +# kwargs: +# model_max_length: ${max_seq_len} + device_eval_batch_size: 1 # FSDP config for model sharding From fcbeba8a1e6686d6d91a1679590ccc4aed4ee49d Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 28 Nov 2023 17:33:45 +0000 Subject: [PATCH 16/47] wip --- scripts/eval/yamls/hf_eval.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml index 8eecf57c30..35ad360b56 100644 --- a/scripts/eval/yamls/hf_eval.yaml +++ b/scripts/eval/yamls/hf_eval.yaml @@ -37,11 +37,11 @@ models: device_eval_batch_size: 4 # FSDP config for model sharding -# fsdp_config: -# sharding_strategy: FULL_SHARD -# mixed_precision: FULL -# forward_prefetch: True -# limit_all_gathers: True +fsdp_config: + sharding_strategy: FULL_SHARD + mixed_precision: FULL + forward_prefetch: True + 
limit_all_gathers: True icl_tasks: 'eval/yamls/tasks.yaml' eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml' From 56ae289eb962b8b7137af04bef20b925795fffe1 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 5 Dec 2023 23:42:36 +0000 Subject: [PATCH 17/47] update to hf_parsing_map --- llmfoundry/utils/builders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 9f0735dc54..8da28ca8d5 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -288,9 +288,8 @@ def _validate_cfg(icl_cfg: DictConfig): os.remove(destination_path) dist.barrier() - hf_parsing_vars = icl_cfg.get('hf_parsing_vars', {}) + hf_parsing_map = icl_cfg.get('hf_parsing_map', {}) hf_loading_vars = icl_cfg.get('hf_loading_vars', {}) - hf_parsing_func_name = icl_cfg.get('hf_parsing_func', '') hf_parsing_func = EVAL_HF_PARSING_FUNCTION_REGISTRY.get(hf_parsing_func_name, None) dataloaders = get_icl_task_dataloader( @@ -303,12 +302,13 @@ def _validate_cfg(icl_cfg: DictConfig): num_fewshot=num_fewshot, prompt_string=icl_cfg.prompt_string, example_delimiter=icl_cfg.example_delimiter, - hf_parsing_vars=hf_parsing_vars, + hf_parsing_map=hf_parsing_map, hf_loading_vars=hf_loading_vars, hf_parsing_func=hf_parsing_func, continuation_delimiter=icl_cfg.continuation_delimiter, destination_path=destination_path, pass_at_k=icl_cfg.pass_at_k, + # TODO: these variables are poorly named. 
Either composer or icl_config needs to change generations_per_sample=icl_cfg.num_beams, has_categories=icl_cfg.get('has_categories', False), ) From 4aee1ec0a0a3e8fa385455e28be74a16b9c908e8 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 19:18:29 +0000 Subject: [PATCH 18/47] rm defaults --- llmfoundry/utils/builders.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 8da28ca8d5..96bde1307a 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -255,20 +255,20 @@ def _validate_cfg(icl_cfg: DictConfig): f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.' ) - if 'prompt_string' not in icl_cfg: - icl_cfg.prompt_string = '' - if 'example_delimiter' not in icl_cfg: - icl_cfg.example_delimiter = '\n' - if 'continuation_delimiter' not in icl_cfg: - icl_cfg.continuation_delimiter = ' ' + # if 'prompt_string' not in icl_cfg: + # icl_cfg.prompt_string = '' + # if 'example_delimiter' not in icl_cfg: + # icl_cfg.example_delimiter = '\n' + # if 'continuation_delimiter' not in icl_cfg: + # icl_cfg.continuation_delimiter = ' ' if 'max_seq_len' not in icl_cfg: icl_cfg.max_seq_len = default_max_seq_len if 'batch_size' not in icl_cfg: icl_cfg.batch_size = default_batch_size - if 'pass_at_k' not in icl_cfg: - icl_cfg.pass_at_k = 1 - if 'num_beams' not in icl_cfg: - icl_cfg.num_beams = 20 + # if 'pass_at_k' not in icl_cfg: + # icl_cfg.pass_at_k = 1 + # if 'num_beams' not in icl_cfg: + # icl_cfg.num_beams = 20 for icl_cfg in icl_tasks_list: From 23ca0ba741a25fa55c4bbdc4a32a1ec607db7342 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 19:23:09 +0000 Subject: [PATCH 19/47] fix parsing vars --- scripts/eval/yamls/long_context_tasks.yaml | 65 +--------------------- 1 file changed, 1 insertion(+), 64 deletions(-) diff --git a/scripts/eval/yamls/long_context_tasks.yaml 
b/scripts/eval/yamls/long_context_tasks.yaml index 69fe216a87..da6f367417 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -9,9 +9,6 @@ icl_tasks: context_length: 2048 section: beginning split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: kv_pairs_middle_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -22,9 +19,6 @@ icl_tasks: context_length: 2048 section: middle split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: kv_pairs_end_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -35,9 +29,6 @@ icl_tasks: context_length: 2048 section: end split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: kv_pairs_beginning_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -48,9 +39,6 @@ icl_tasks: context_length: 4096 section: beginning split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: kv_pairs_middle_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -61,9 +49,6 @@ icl_tasks: context_length: 4096 section: middle split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: kv_pairs_end_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -74,9 +59,6 @@ icl_tasks: context_length: 4096 section: end split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: kv_pairs_beginning_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -87,9 +69,6 @@ icl_tasks: context_length: 8192 section: beginning split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: kv_pairs_middle_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -100,9 +79,6 @@ icl_tasks: context_length: 8192 section: middle split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: kv_pairs_end_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -113,9 +89,6 @@ icl_tasks: 
context_length: 8192 section: end split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: wikiqa_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -124,9 +97,6 @@ icl_tasks: hf_loading_vars: name: wikiqa context_length: 2048 - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: wikiqa_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -135,9 +105,6 @@ icl_tasks: hf_loading_vars: name: wikiqa context_length: 2048 - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: wikiqa_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -146,9 +113,6 @@ icl_tasks: hf_loading_vars: name: wikiqa context_length: 2048 - # hf_parsing_vars: - # inputs: ["context"] - outputs: ["answer"] - label: hotpotqa_beginning_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -159,9 +123,6 @@ icl_tasks: context_length: 2048 section: beginning split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: hotpotqa_middle_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -172,9 +133,6 @@ icl_tasks: context_length: 2048 section: middle split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: hotpotqa_end_2k dataset_uri: hf://maxisawesome/long_context_eval @@ -185,9 +143,6 @@ icl_tasks: context_length: 2048 section: end split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: hotpotqa_beginning_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -198,9 +153,6 @@ icl_tasks: context_length: 4096 section: beginning split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: hotpotqa_middle_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -211,9 +163,6 @@ icl_tasks: context_length: 4096 section: middle split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: hotpotqa_end_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -224,9 +173,6 @@ 
icl_tasks: context_length: 4096 section: end split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: hotpotqa_beginning_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -237,9 +183,6 @@ icl_tasks: context_length: 8192 section: beginning split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: hotpotqa_middle_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -250,9 +193,6 @@ icl_tasks: context_length: 8192 section: middle split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] - label: hotpotqa_end_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -262,7 +202,4 @@ icl_tasks: name: hotpotqa context_length: 8192 section: end - split: test - # hf_parsing_vars: - # inputs: ["context"] - # outputs: ["answer"] \ No newline at end of file + split: test \ No newline at end of file From c10698f9121ff90c734103fc1bf5ebf0481ee095 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 20:16:15 +0000 Subject: [PATCH 20/47] update defaults again --- llmfoundry/utils/builders.py | 24 ++++++------- scripts/eval/yamls/long_context_eval_8k.yaml | 36 ++++++++++---------- scripts/eval/yamls/long_context_tasks.yaml | 3 ++ 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 6327c42b6e..9ea70a3f91 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -464,20 +464,20 @@ def _validate_cfg(icl_cfg: DictConfig): f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.' 
) - # if 'prompt_string' not in icl_cfg: - # icl_cfg.prompt_string = '' - # if 'example_delimiter' not in icl_cfg: - # icl_cfg.example_delimiter = '\n' - # if 'continuation_delimiter' not in icl_cfg: - # icl_cfg.continuation_delimiter = ' ' + if 'prompt_string' not in icl_cfg: + icl_cfg.prompt_string = '' + if 'example_delimiter' not in icl_cfg: + icl_cfg.example_delimiter = '\n' + if 'continuation_delimiter' not in icl_cfg: + icl_cfg.continuation_delimiter = ' ' if 'max_seq_len' not in icl_cfg: icl_cfg.max_seq_len = default_max_seq_len if 'batch_size' not in icl_cfg: icl_cfg.batch_size = default_batch_size - # if 'pass_at_k' not in icl_cfg: - # icl_cfg.pass_at_k = 1 - # if 'num_beams' not in icl_cfg: - # icl_cfg.num_beams = 20 + if 'pass_at_k' not in icl_cfg: + icl_cfg.pass_at_k = 1 + if 'num_beams' not in icl_cfg: + icl_cfg.num_beams = 20 for icl_cfg in icl_tasks_list: @@ -499,7 +499,6 @@ def _validate_cfg(icl_cfg: DictConfig): hf_parsing_map = icl_cfg.get('hf_parsing_map', {}) hf_loading_vars = icl_cfg.get('hf_loading_vars', {}) - hf_parsing_func = EVAL_HF_PARSING_FUNCTION_REGISTRY.get(hf_parsing_func_name, None) dataloaders = get_icl_task_dataloader( icl_cfg.icl_task_type, @@ -511,9 +510,8 @@ def _validate_cfg(icl_cfg: DictConfig): num_fewshot=num_fewshot, prompt_string=icl_cfg.prompt_string, example_delimiter=icl_cfg.example_delimiter, - hf_parsing_map=hf_parsing_map, hf_loading_vars=hf_loading_vars, - hf_parsing_func=hf_parsing_func, + hf_parsing_map=hf_parsing_map, continuation_delimiter=icl_cfg.continuation_delimiter, question_prelimiter=icl_cfg.get('question_prelimiter', ''), destination_path=destination_path, diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml index 064daf5807..1260c17127 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -2,33 +2,33 @@ max_seq_len: 8196 seed: 1 precision: amp_bf16 -models: -- - model_name: 
mosaicml/mpt-7b-chat-8k - model: - name: hf_causal_lm - pretrained_model_name_or_path: mosaicml/mpt-7b-chat-8k - pretrained: true - attn_config: - attn_impl: triton - tokenizer: - name: mosaicml/mpt-7b-chat-8k - kwargs: - model_max_length: ${max_seq_len} - # models: # - -# model_name: EleutherAI/gpt-neo-125m +# model_name: mosaicml/mpt-7b-chat-8k # model: # name: hf_causal_lm -# pretrained_model_name_or_path: EleutherAI/gpt-neo-125m -# init_device: cpu +# pretrained_model_name_or_path: mosaicml/mpt-7b-chat-8k # pretrained: true +# attn_config: +# attn_impl: triton # tokenizer: -# name: EleutherAI/gpt-neo-125m +# name: mosaicml/mpt-7b-chat-8k # kwargs: # model_max_length: ${max_seq_len} +models: +- + model_name: EleutherAI/gpt-neo-125m + model: + name: hf_causal_lm + pretrained_model_name_or_path: EleutherAI/gpt-neo-125m + init_device: cpu + pretrained: true + tokenizer: + name: EleutherAI/gpt-neo-125m + kwargs: + model_max_length: ${max_seq_len} + device_eval_batch_size: 1 # FSDP config for model sharding diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index da6f367417..d1e09f2bd0 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -97,6 +97,7 @@ icl_tasks: hf_loading_vars: name: wikiqa context_length: 2048 + split: test - label: wikiqa_4k dataset_uri: hf://maxisawesome/long_context_eval @@ -105,6 +106,7 @@ icl_tasks: hf_loading_vars: name: wikiqa context_length: 2048 + split: test - label: wikiqa_8k dataset_uri: hf://maxisawesome/long_context_eval @@ -113,6 +115,7 @@ icl_tasks: hf_loading_vars: name: wikiqa context_length: 2048 + split: test - label: hotpotqa_beginning_2k dataset_uri: hf://maxisawesome/long_context_eval From 4e0538553a2e2a0928e765e6470a71cc7bc1125b Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 7 Dec 2023 22:59:14 +0000 Subject: [PATCH 21/47] rm merge conflict --- scripts/eval/yamls/tasks.yaml | 16 ---------------- 1 file changed, 16 
deletions(-) diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml index 4b3588bf4d..12ed2cdfa0 100644 --- a/scripts/eval/yamls/tasks.yaml +++ b/scripts/eval/yamls/tasks.yaml @@ -175,11 +175,7 @@ icl_tasks: continuation_delimiter: "\nAnswer: " # this separates questions from answers # - # label: human_eval -<<<<<<< HEAD -# dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI -======= # dataset_uri: eval/local_data/programming/human_eval.jsonl ->>>>>>> main # num_fewshot: [0] # pass_at_k: 1 # num_beams: 20 @@ -187,11 +183,7 @@ icl_tasks: # icl_task_type: code_evaluation # - # label: human_eval_cpp -<<<<<<< HEAD -# dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI -======= # dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl ->>>>>>> main # num_fewshot: [0] # pass_at_k: 1 # num_beams: 20 @@ -199,11 +191,7 @@ icl_tasks: # icl_task_type: code_evaluation # - # label: human_eval_js -<<<<<<< HEAD -# dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI -======= # dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl ->>>>>>> main # num_fewshot: [0] # pass_at_k: 1 # num_beams: 20 @@ -211,11 +199,7 @@ icl_tasks: # icl_task_type: code_evaluation # - # label: human_eval_return_simple -<<<<<<< HEAD -# dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI -======= # dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl ->>>>>>> main # num_fewshot: [0] # pass_at_k: 1 # num_beams: 20 From 6b7d13f3e9b9b7ff33340710c144320dfb7357aa Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 19 Jan 2024 00:37:07 +0000 Subject: [PATCH 22/47] fix gen_kwargs --- llmfoundry/utils/builders.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 9ea70a3f91..80871a3546 
100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -519,7 +519,9 @@ def _validate_cfg(icl_cfg: DictConfig): # TODO: these variables are poorly named. Either composer or icl_config needs to change generations_per_sample=icl_cfg.num_beams, has_categories=icl_cfg.get('has_categories', False), - cot_delimiter=icl_cfg.get('cot_delimiter', '')) + cot_delimiter=icl_cfg.get('cot_delimiter', ''), + generation_kwargs=icl_cfg.get('generation_kwargs', {}) + ) if hasattr( icl_cfg, 'has_categories') and icl_cfg.has_categories and isinstance( From d9c6a28450a6531bd190186959edcf6a9009be74 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Fri, 19 Jan 2024 00:39:57 +0000 Subject: [PATCH 23/47] rm old code path --- llmfoundry/utils/builders.py | 1 - llmfoundry/utils/data_parsing_registry.py | 5 ----- llmfoundry/utils/data_prep_utils.py | 7 ------- 3 files changed, 13 deletions(-) delete mode 100644 llmfoundry/utils/data_parsing_registry.py diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 4a519bca74..e4887b53bb 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -40,7 +40,6 @@ DecoupledLionW, DecoupledLionW_8bit) from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper -from llmfoundry.utils.data_parsing_registry import EVAL_HF_PARSING_FUNCTION_REGISTRY log = logging.getLogger(__name__) diff --git a/llmfoundry/utils/data_parsing_registry.py b/llmfoundry/utils/data_parsing_registry.py deleted file mode 100644 index 4fa618567f..0000000000 --- a/llmfoundry/utils/data_parsing_registry.py +++ /dev/null @@ -1,5 +0,0 @@ -from llmfoundry.utils.data_prep_utils import leval_hf_parsing_func - -EVAL_HF_PARSING_FUNCTION_REGISTRY = { - 'leval_hf_parsing_func': leval_hf_parsing_func, -} diff --git a/llmfoundry/utils/data_prep_utils.py b/llmfoundry/utils/data_prep_utils.py index 36d0592b01..a88e65ee94 100644 --- 
a/llmfoundry/utils/data_prep_utils.py +++ b/llmfoundry/utils/data_prep_utils.py @@ -72,13 +72,6 @@ def merge_shard_groups(root: str) -> None: with open(index_filename, 'w') as out: out.write(text) -def leval_hf_parsing_func(examples: dict, **kwargs): - batch = {'context':[], 'answer':[]} - for i, doc in enumerate(examples['input']): - for j, instruction in enumerate(examples['instructions'][i]): - batch['context'].append(doc + "\n" + instruction) - batch['answer'].append(examples['outputs'][i][j]) - return batch class DownloadingIterable: From d9b284cd9813b99659f1668c7a7ad1189e9d29bc Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sat, 27 Jan 2024 01:25:26 +0000 Subject: [PATCH 24/47] fixups --- llmfoundry/utils/builders.py | 1 - scripts/eval/yamls/long_context_eval_8k.yaml | 37 +------------------- 2 files changed, 1 insertion(+), 37 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index e4887b53bb..f1d8794617 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -527,7 +527,6 @@ def _validate_cfg(icl_cfg: DictConfig): question_prelimiter=icl_cfg.get('question_prelimiter', ''), destination_path=destination_path, pass_at_k=icl_cfg.pass_at_k, - # TODO: these variables are poorly named. 
Either composer or icl_config needs to change generations_per_sample=icl_cfg.num_beams, has_categories=icl_cfg.get('has_categories', False), cot_delimiter=icl_cfg.get('cot_delimiter', ''), diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml index 1260c17127..78b847e6d2 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -2,20 +2,6 @@ max_seq_len: 8196 seed: 1 precision: amp_bf16 -# models: -# - -# model_name: mosaicml/mpt-7b-chat-8k -# model: -# name: hf_causal_lm -# pretrained_model_name_or_path: mosaicml/mpt-7b-chat-8k -# pretrained: true -# attn_config: -# attn_impl: triton -# tokenizer: -# name: mosaicml/mpt-7b-chat-8k -# kwargs: -# model_max_length: ${max_seq_len} - models: - model_name: EleutherAI/gpt-neo-125m @@ -39,25 +25,4 @@ fsdp_config: # icl_tasks: 'eval/yamls/leval_tasks.yaml' # eval_gauntlet: 'eval/yamls/leval.yaml' icl_tasks: 'eval/yamls/long_context_tasks.yaml' -eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' -# icl_tasks: -# - label: longchat-lines -# dataset_uri: hf://abacusai/LongChat-Lines -# num_fewshot: [0] -# icl_task_type: question_answering -# hf_vars: -# split: 100 -# hf_cols: -# inputs: ["prompt"] -# outputs: ["expected_number"] - -# eval_gauntlet: -# weighting: EQUAL -# subtract_random_baseline: true -# rescale_accuracy: true -# categories: -# - name: longchat-lines -# benchmarks: -# - name: longchat-lines -# num_fewshot: 0 -# random_baseline: 0.0 \ No newline at end of file +eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' \ No newline at end of file From 393adfb1e6a6c899bb280d7c2ec8f8efafa153e4 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Sat, 27 Jan 2024 19:17:29 +0000 Subject: [PATCH 25/47] wip --- llmfoundry/utils/builders.py | 4 +++- scripts/eval/yamls/long_context_eval_8k.yaml | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/builders.py 
b/llmfoundry/utils/builders.py index f1d8794617..404a91bad7 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -530,7 +530,9 @@ def _validate_cfg(icl_cfg: DictConfig): generations_per_sample=icl_cfg.num_beams, has_categories=icl_cfg.get('has_categories', False), cot_delimiter=icl_cfg.get('cot_delimiter', ''), - generation_kwargs=icl_cfg.get('generation_kwargs', {}) + generation_kwargs=icl_cfg.get('generation_kwargs', {}), + # early_stopping=icl_cfg.get('early_stopping_criteria'), + # do_normalization=icl_cfg.get('do_normalization', True), ) if hasattr( icl_cfg, diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml index 78b847e6d2..d56098254b 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -16,6 +16,7 @@ models: model_max_length: ${max_seq_len} device_eval_batch_size: 1 +icl_subset_num_batches: 2 # FSDP config for model sharding fsdp_config: From 662af6785234dcb8ec36ed19e36fed4a5243d266 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 30 Jan 2024 17:15:17 +0000 Subject: [PATCH 26/47] rm leval from pr --- scripts/eval/yamls/leval.yaml | 28 --------- scripts/eval/yamls/leval_tasks.yaml | 93 ----------------------------- 2 files changed, 121 deletions(-) delete mode 100644 scripts/eval/yamls/leval.yaml delete mode 100644 scripts/eval/yamls/leval_tasks.yaml diff --git a/scripts/eval/yamls/leval.yaml b/scripts/eval/yamls/leval.yaml deleted file mode 100644 index 28494e484c..0000000000 --- a/scripts/eval/yamls/leval.yaml +++ /dev/null @@ -1,28 +0,0 @@ -eval_gauntlet: - weighting: EQUAL - subtract_random_baseline: true - rescale_accuracy: true - categories: - - name: leval_qa - benchmarks: - - name: gsm100 - num_fewshot: 0 - random_baseline: 0 - - name: legal_contract_qa - num_fewshot: 0 - random_baseline: 0 - - name: financial_qa - num_fewshot: 0 - random_baseline: 0 - - name: multidoc_qa - num_fewshot: 0 - random_baseline: 0 
- - name: scientific_qa - num_fewshot: 0 - random_baseline: 0 - - name: natural_question - num_fewshot: 0 - random_baseline: 0 - - name: topic_retrieval_longchat - num_fewshot: 0 - random_baseline: 0 \ No newline at end of file diff --git a/scripts/eval/yamls/leval_tasks.yaml b/scripts/eval/yamls/leval_tasks.yaml deleted file mode 100644 index dde1226807..0000000000 --- a/scripts/eval/yamls/leval_tasks.yaml +++ /dev/null @@ -1,93 +0,0 @@ -icl_tasks: -# Unimplemented LEval tasks: -# 'coursera', 'quality', 'tpo', 'sci_fi', 'codeU', 'gov_report_summ', 'meeting_summ', 'news_summ', 'paper_assistant', 'patent_summ', 'review_summ', 'tv_show_summ' -- - label: gsm100 - dataset_uri: hf://L4NLP/LEval - num_fewshot: [0] - icl_task_type: question_answering - hf_parsing_func: leval_hf_parsing_func - hf_loading_vars: - name: gsm100 - split: test - hf_parsing_vars: - batched: True - context: ["input", "instructions"] - labels: ["outputs"] -- - label: legal_contract_qa - dataset_uri: hf://L4NLP/LEval - num_fewshot: [0] - icl_task_type: question_answering - hf_parsing_func: leval_hf_parsing_func - hf_parsing_vars: - batched: True - hf_loading_vars: - name: legal_contract_qa - split: test -- - label: financial_qa - dataset_uri: hf://L4NLP/LEval - num_fewshot: [0] - icl_task_type: question_answering - hf_parsing_func: leval_hf_parsing_func - hf_parsing_vars: - batched: True - hf_loading_vars: - name: financial_qa - split: test -- - label: multidoc_qa - dataset_uri: hf://L4NLP/LEval - num_fewshot: [0] - icl_task_type: question_answering - hf_parsing_func: leval_hf_parsing_func - hf_parsing_vars: - batched: True - hf_loading_vars: - name: multidoc_qa - split: test -- - label: scientific_qa - dataset_uri: hf://L4NLP/LEval - num_fewshot: [0] - icl_task_type: question_answering - hf_parsing_func: leval_hf_parsing_func - hf_parsing_vars: - batched: True - hf_loading_vars: - name: scientific_qa - split: test -- - label: narrative_qa - dataset_uri: hf://L4NLP/LEval - num_fewshot: [0] - 
icl_task_type: question_answering - hf_parsing_func: leval_hf_parsing_func - hf_parsing_vars: - batched: True - hf_loading_vars: - name: narrative_qa - split: test -- - label: natural_question - dataset_uri: hf://L4NLP/LEval - num_fewshot: [0] - icl_task_type: question_answering - hf_parsing_func: leval_hf_parsing_func - hf_parsing_vars: - batched: True - hf_loading_vars: - name: natural_question - split: test -- - label: topic_retrieval_longchat - dataset_uri: hf://L4NLP/LEval - num_fewshot: [0] - icl_task_type: question_answering - hf_parsing_func: leval_hf_parsing_func - hf_parsing_vars: - batched: True - hf_loading_vars: - name: topic_retrieval_longchat - split: test \ No newline at end of file From c9e0ef50c9e67c822e17f1e1686f87640293019a Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 30 Jan 2024 17:17:41 +0000 Subject: [PATCH 27/47] fix comments in yamls --- .../eval/yamls/eval_gauntlet_8k_section.yaml | 22 ++++----- scripts/eval/yamls/hf_eval.yaml | 47 ------------------- scripts/eval/yamls/long_context_eval_8k.yaml | 2 - 3 files changed, 11 insertions(+), 60 deletions(-) delete mode 100644 scripts/eval/yamls/hf_eval.yaml diff --git a/scripts/eval/yamls/eval_gauntlet_8k_section.yaml b/scripts/eval/yamls/eval_gauntlet_8k_section.yaml index 2e668c3cc2..711a9b6ee4 100644 --- a/scripts/eval/yamls/eval_gauntlet_8k_section.yaml +++ b/scripts/eval/yamls/eval_gauntlet_8k_section.yaml @@ -63,14 +63,14 @@ eval_gauntlet: - name: kv_pairs_end_8k num_fewshot: 0 random_baseline: 0 - # - name: full - # benchmarks: - # - name: wikiqa_2k - # num_fewshot: 0 - # random_baseline: 0 - # - name: wikiqa_4k - # num_fewshot: 0 - # random_baseline: 0 - # - name: wikiqa_8k - # num_fewshot: 0 - # random_baseline: 0 \ No newline at end of file + - name: full + benchmarks: + - name: wikiqa_2k + num_fewshot: 0 + random_baseline: 0 + - name: wikiqa_4k + num_fewshot: 0 + random_baseline: 0 + - name: wikiqa_8k + num_fewshot: 0 + random_baseline: 0 \ No newline at end of file diff --git 
a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml deleted file mode 100644 index 1f0f8f670e..0000000000 --- a/scripts/eval/yamls/hf_eval.yaml +++ /dev/null @@ -1,47 +0,0 @@ -max_seq_len: 1024 -seed: 1 -precision: fp32 - -# If you are using one model, put it here: -model_name_or_path: EleutherAI/gpt-neo-125m -# otherwise, write a block for each model you want to test in the `models` section - -models: -- - model_name: ${model_name_or_path} - model: - name: hf_causal_lm - pretrained_model_name_or_path: ${model_name_or_path} - init_device: cpu - pretrained: true - tokenizer: - name: ${model_name_or_path} - kwargs: - model_max_length: ${max_seq_len} -# # if you are evaluating more than one model, list them all as YAML blocks without variable interpolation -# - -# model_name: mosaicml/mpt-7b -# model: -# name: hf_causal_lm -# pretrained_model_name_or_path: mosaicml/mpt-7b -# init_device: cpu -# pretrained: true -# config_overrides: -# max_seq_len: ${max_seq_len} -# tokenizer: -# name: mosaicml/mpt-7b -# kwargs: -# model_max_length: ${max_seq_len} - - -device_eval_batch_size: 4 - -# FSDP config for model sharding -fsdp_config: - sharding_strategy: FULL_SHARD - mixed_precision: FULL - forward_prefetch: True - limit_all_gathers: True - -icl_tasks: 'eval/yamls/tasks_v0.1.yaml' -eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.1.yaml' diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml index d56098254b..429c87fb77 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -23,7 +23,5 @@ fsdp_config: sharding_strategy: FULL_SHARD mixed_precision: FULL -# icl_tasks: 'eval/yamls/leval_tasks.yaml' -# eval_gauntlet: 'eval/yamls/leval.yaml' icl_tasks: 'eval/yamls/long_context_tasks.yaml' eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' \ No newline at end of file From 09ffafde5e80dd51976c4a7fbc19160e74968af3 Mon Sep 17 00:00:00 2001 From: Max Marion Date: 
Tue, 30 Jan 2024 17:18:39 +0000 Subject: [PATCH 28/47] add cot params --- llmfoundry/utils/builders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 1bb8973dbd..24ebede1ac 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -526,8 +526,8 @@ def _validate_cfg(icl_cfg: DictConfig): has_categories=icl_cfg.get('has_categories', False), cot_delimiter=icl_cfg.get('cot_delimiter', ''), generation_kwargs=icl_cfg.get('generation_kwargs', {}), - # early_stopping=icl_cfg.get('early_stopping_criteria'), - # do_normalization=icl_cfg.get('do_normalization', True), + early_stopping=icl_cfg.get('early_stopping_criteria'), + do_normalization=icl_cfg.get('do_normalization', True), ) if hasattr( icl_cfg, From fb782db2b0d1e624ec0b215b9b4c44fe01880c70 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 30 Jan 2024 18:53:16 +0000 Subject: [PATCH 29/47] add fewshot_random_seed --- llmfoundry/utils/builders.py | 6 ++++++ scripts/eval/eval.py | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 24ebede1ac..878e73fd4d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -52,6 +52,7 @@ def build_evaluators( tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, + fewshot_random_seed: Optional[int], icl_subset_num_batches: Optional[int], ) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]: @@ -72,6 +73,7 @@ def build_evaluators( tokenizer, device_eval_batch_size, icl_seq_len, + fewshot_random_seed, icl_subset_num_batches, ) evaluators.extend(icl_evaluators) @@ -129,6 +131,7 @@ def build_icl_data_and_gauntlet( tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, + fewshot_random_seed: Optional[int], icl_subset_num_batches: Optional[int] = None ) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]: 
icl_evaluators, logger_keys = build_icl_evaluators( @@ -136,6 +139,7 @@ def build_icl_data_and_gauntlet( tokenizer, icl_seq_len, device_eval_batch_size, + fewshot_random_seed=fewshot_random_seed, icl_subset_num_batches=icl_subset_num_batches) eval_gauntlet_cb = None if eval_gauntlet_config is not None: @@ -427,6 +431,7 @@ def build_icl_evaluators( default_max_seq_len: int, default_batch_size: int, destination_dir: Optional[str] = None, + fewshot_random_seed: Optional[int] = None, icl_subset_num_batches: Optional[int] = None, ) -> Tuple[List[Evaluator], List[str]]: if destination_dir is None: @@ -521,6 +526,7 @@ def _validate_cfg(icl_cfg: DictConfig): continuation_delimiter=icl_cfg.continuation_delimiter, question_prelimiter=icl_cfg.get('question_prelimiter', ''), destination_path=destination_path, + fewshot_random_seed=fewshot_random_seed, pass_at_k=icl_cfg.pass_at_k, generations_per_sample=icl_cfg.num_beams, has_categories=icl_cfg.get('has_categories', False), diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index d4ba39acfa..4dfc47c2c8 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -118,6 +118,7 @@ def evaluate_model( python_log_level: Optional[str], precision: str, eval_gauntlet_df: Optional[pd.DataFrame], + fewshot_random_seed: Optional[int], eval_subset_num_batches: int, icl_subset_num_batches: Optional[int], metadata: Optional[Dict[str, str]], @@ -141,6 +142,7 @@ def evaluate_model( tokenizer=tokenizer, device_eval_batch_size=device_eval_batch_size, icl_seq_len=max_seq_len, + fewshot_random_seed=fewshot_random_seed, icl_subset_num_batches=icl_subset_num_batches, ) @@ -301,6 +303,10 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: 'loggers', must_exist=False, default_value={}) + fewshot_random_seed: int = pop_config(cfg, + 'fewshot_random_seed', + must_exist=False, + default_value=None) eval_subset_num_batches: int = pop_config(cfg, 'eval_subset_num_batches', must_exist=False, @@ -318,6 +324,7 @@ def main(cfg: 
DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: 'log_config', must_exist=False, default_value=True) + # Pop out interpolation variables. pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None) @@ -362,6 +369,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: python_log_level=python_log_level, precision=precision, eval_gauntlet_df=eval_gauntlet_df, + fewshot_random_seed=fewshot_random_seed, eval_subset_num_batches=eval_subset_num_batches, icl_subset_num_batches=icl_subset_num_batches, metadata=metadata, From e735ae78c90934f76b4457737c92e40cb2875ca5 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 30 Jan 2024 21:24:57 +0000 Subject: [PATCH 30/47] fix early_stopping_criteria, fewshot_num_seed default --- llmfoundry/utils/builders.py | 2 +- scripts/eval/eval.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 878e73fd4d..822f6671e9 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -532,7 +532,7 @@ def _validate_cfg(icl_cfg: DictConfig): has_categories=icl_cfg.get('has_categories', False), cot_delimiter=icl_cfg.get('cot_delimiter', ''), generation_kwargs=icl_cfg.get('generation_kwargs', {}), - early_stopping=icl_cfg.get('early_stopping_criteria'), + early_stopping_criteria=icl_cfg.get('early_stopping_criteria'), do_normalization=icl_cfg.get('do_normalization', True), ) if hasattr( diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 4dfc47c2c8..ef4391b28d 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -306,7 +306,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: fewshot_random_seed: int = pop_config(cfg, 'fewshot_random_seed', must_exist=False, - default_value=None) + default_value=1234) eval_subset_num_batches: int = pop_config(cfg, 'eval_subset_num_batches', must_exist=False, From 35641df659c9f86f6dbe7956629ae3bbb860db2f Mon Sep 17 00:00:00 2001 From: Max Marion 
Date: Tue, 30 Jan 2024 21:29:32 +0000 Subject: [PATCH 31/47] undo rm hf_eval --- scripts/eval/yamls/hf_eval.yaml | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 scripts/eval/yamls/hf_eval.yaml diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml new file mode 100644 index 0000000000..1f0f8f670e --- /dev/null +++ b/scripts/eval/yamls/hf_eval.yaml @@ -0,0 +1,47 @@ +max_seq_len: 1024 +seed: 1 +precision: fp32 + +# If you are using one model, put it here: +model_name_or_path: EleutherAI/gpt-neo-125m +# otherwise, write a block for each model you want to test in the `models` section + +models: +- + model_name: ${model_name_or_path} + model: + name: hf_causal_lm + pretrained_model_name_or_path: ${model_name_or_path} + init_device: cpu + pretrained: true + tokenizer: + name: ${model_name_or_path} + kwargs: + model_max_length: ${max_seq_len} +# # if you are evaluating more than one model, list them all as YAML blocks without variable interpolation +# - +# model_name: mosaicml/mpt-7b +# model: +# name: hf_causal_lm +# pretrained_model_name_or_path: mosaicml/mpt-7b +# init_device: cpu +# pretrained: true +# config_overrides: +# max_seq_len: ${max_seq_len} +# tokenizer: +# name: mosaicml/mpt-7b +# kwargs: +# model_max_length: ${max_seq_len} + + +device_eval_batch_size: 4 + +# FSDP config for model sharding +fsdp_config: + sharding_strategy: FULL_SHARD + mixed_precision: FULL + forward_prefetch: True + limit_all_gathers: True + +icl_tasks: 'eval/yamls/tasks_v0.1.yaml' +eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.1.yaml' From f1282bc5fb91eef8e416fd4ef6bbb807fe48386b Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 30 Jan 2024 21:38:41 +0000 Subject: [PATCH 32/47] add fewshot_random_seed to test --- tests/utils/test_builders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py index 303afc9b7d..04e4fd551a 100644 --- a/tests/utils/test_builders.py 
+++ b/tests/utils/test_builders.py @@ -243,6 +243,7 @@ def test_build_evaluators_empty(): None, tokenizer=None, # type: ignore device_eval_batch_size=1, + fewshot_random_seed=1234, icl_seq_len=2, icl_subset_num_batches=3) assert evaluators == [] From 4a9a8b0b06610b5cff3ff80f5e0fa37dc019ddc5 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 6 Feb 2024 00:42:36 +0000 Subject: [PATCH 33/47] add 64k tasks --- scripts/eval/yamls/long_context_tasks.yaml | 182 ++++++++++++++++++++- 1 file changed, 181 insertions(+), 1 deletion(-) diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index d1e09f2bd0..b0c1f4565c 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -205,4 +205,184 @@ icl_tasks: name: hotpotqa context_length: 8192 section: end - split: test \ No newline at end of file + split: test +- + label: hotpotqa_beginning_16k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 16384 + section: beginning + split: test +- + label: hotpotqa_beginning_32k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 32768 + section: beginning + split: test +- + label: hotpotqa_beginning_64k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 65536 + section: beginning + split: test +- + label: hotpotqa_middle_16k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 16384 + section: middle + split: test +- + label: hotpotqa_middle_32k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: 
+ name: hotpotqa + context_length: 32768 + section: middle + split: test +- + label: hotpotqa_middle_64k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 65536 + section: middle + split: test +- + label: hotpotqa_end_16k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 16384 + section: end + split: test +- + label: hotpotqa_end_32k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 32768 + section: end + split: test +- + label: hotpotqa_end_64k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 65536 + section: end + split: test +- + label: kv_pairs_beginning_16k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 16384 + section: beginning + split: test +- + label: kv_pairs_beginning_32k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 32768 + section: beginning + split: test +- + label: kv_pairs_beginning_64k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 65536 + section: beginning + split: test +- + label: kv_pairs_middle_16k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 16384 + section: middle + split: test +- + label: kv_pairs_middle_32k + dataset_uri: hf://maxisawesome/long_context_eval + 
num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 32768 + section: middle + split: test +- + label: kv_pairs_middle_64k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 65536 + section: middle + split: test +- + label: kv_pairs_end_16k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 16384 + section: end + split: test +- + label: kv_pairs_end_32k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 32768 + section: end + split: test +- + label: kv_pairs_end_64k + dataset_uri: hf://maxisawesome/long_context_eval + num_fewshot: [0] + icl_task_type: question_answering + hf_loading_vars: + name: hotpotqa + context_length: 65536 + section: end + split: test From 65ee6176c40eb0c69c88404288a238871ca08374 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 6 Feb 2024 00:55:53 +0000 Subject: [PATCH 34/47] add longer context, update composer versin --- scripts/eval/yamls/long_context_tasks.yaml | 264 ++++++++++----------- setup.py | 2 +- 2 files changed, 133 insertions(+), 133 deletions(-) diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index b0c1f4565c..37296ea6b0 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -1,388 +1,388 @@ icl_tasks: - label: kv_pairs_beginning_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: - name: kv_pairs + hf_loading_vars: + name: kv_pairs context_length: 2048 - section: beginning + section: beginning split: test - label: 
kv_pairs_middle_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: kv_pairs context_length: 2048 - section: middle + section: middle split: test - label: kv_pairs_end_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: kv_pairs context_length: 2048 - section: end + section: end split: test - label: kv_pairs_beginning_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: kv_pairs context_length: 4096 - section: beginning + section: beginning split: test - label: kv_pairs_middle_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: kv_pairs context_length: 4096 - section: middle + section: middle split: test - label: kv_pairs_end_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: kv_pairs context_length: 4096 section: end split: test - label: kv_pairs_beginning_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: kv_pairs context_length: 8192 - section: beginning + section: beginning split: test - label: kv_pairs_middle_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + 
hf_loading_vars: name: kv_pairs context_length: 8192 - section: middle + section: middle split: test - label: kv_pairs_end_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: kv_pairs context_length: 8192 section: end split: test - label: wikiqa_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: wikiqa context_length: 2048 split: test - label: wikiqa_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: wikiqa context_length: 2048 split: test - label: wikiqa_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: wikiqa context_length: 2048 split: test - label: hotpotqa_beginning_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa context_length: 2048 - section: beginning + section: beginning split: test - label: hotpotqa_middle_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa context_length: 2048 - section: middle + section: middle split: test - label: hotpotqa_end_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa 
context_length: 2048 - section: end + section: end split: test - label: hotpotqa_beginning_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa context_length: 4096 section: beginning split: test - label: hotpotqa_middle_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa context_length: 4096 - section: middle + section: middle split: test - label: hotpotqa_end_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa context_length: 4096 section: end split: test - label: hotpotqa_beginning_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 8192 - section: beginning + context_length: 8192 + section: beginning split: test - label: hotpotqa_middle_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 8192 - section: middle + context_length: 8192 + section: middle split: test - label: hotpotqa_end_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 8192 - section: end + context_length: 8192 + section: end split: test - label: hotpotqa_beginning_16k - dataset_uri: hf://maxisawesome/long_context_eval 
+ dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 16384 - section: beginning + context_length: 16384 + section: beginning split: test - label: hotpotqa_beginning_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 32768 - section: beginning + context_length: 32768 + section: beginning split: test - label: hotpotqa_beginning_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 65536 - section: beginning + context_length: 65536 + section: beginning split: test - label: hotpotqa_middle_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 16384 - section: middle + context_length: 16384 + section: middle split: test - label: hotpotqa_middle_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 32768 - section: middle + context_length: 32768 + section: middle split: test - label: hotpotqa_middle_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 65536 - section: middle + context_length: 65536 + section: middle split: test - label: hotpotqa_end_16k - dataset_uri: 
hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 16384 - section: end + context_length: 16384 + section: end split: test - label: hotpotqa_end_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 32768 - section: end + context_length: 32768 + section: end split: test - label: hotpotqa_end_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 65536 - section: end + context_length: 65536 + section: end split: test - label: kv_pairs_beginning_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 16384 - section: beginning + context_length: 16384 + section: beginning split: test - label: kv_pairs_beginning_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 32768 - section: beginning + context_length: 32768 + section: beginning split: test - label: kv_pairs_beginning_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 65536 - section: beginning + context_length: 65536 + section: beginning split: test - label: kv_pairs_middle_16k - dataset_uri: 
hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 16384 - section: middle + context_length: 16384 + section: middle split: test - label: kv_pairs_middle_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 32768 - section: middle + context_length: 32768 + section: middle split: test - label: kv_pairs_middle_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 65536 - section: middle + context_length: 65536 + section: middle split: test - label: kv_pairs_end_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 16384 - section: end + context_length: 16384 + section: end split: test - label: kv_pairs_end_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 32768 - section: end + context_length: 32768 + section: end split: test - label: kv_pairs_end_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://maxisawesome/long_context_eval num_fewshot: [0] icl_task_type: question_answering - hf_loading_vars: + hf_loading_vars: name: hotpotqa - context_length: 65536 - section: end + context_length: 65536 + section: end split: test diff --git a/setup.py b/setup.py index e5bc7e81d2..6a0cd2af6c 100644 --- a/setup.py 
+++ b/setup.py @@ -50,7 +50,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs]>=0.17.2,<0.18', + 'mosaicml[libcloud,wandb,oci,gcs]>=0.17.2,<0.19', 'mlflow>=2.10,<3', 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.37,<4.38', From 5ba5e30f8e2f9bb5682fb7816d42d7d25aad0913 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 6 Feb 2024 14:33:15 +0000 Subject: [PATCH 35/47] address comments --- scripts/eval/yamls/long_context_eval_8k.yaml | 2 +- scripts/eval/yamls/long_context_tasks.yaml | 78 ++++++++++---------- setup.py | 2 +- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml index 429c87fb77..2b2e8c62bd 100644 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ b/scripts/eval/yamls/long_context_eval_8k.yaml @@ -8,7 +8,7 @@ models: model: name: hf_causal_lm pretrained_model_name_or_path: EleutherAI/gpt-neo-125m - init_device: cpu + init_device: mixed pretrained: true tokenizer: name: EleutherAI/gpt-neo-125m diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index 37296ea6b0..daf958a340 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -1,7 +1,7 @@ icl_tasks: - label: kv_pairs_beginning_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -11,7 +11,7 @@ icl_tasks: split: test - label: kv_pairs_middle_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -21,7 +21,7 @@ icl_tasks: split: test - label: kv_pairs_end_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ 
-31,7 +31,7 @@ icl_tasks: split: test - label: kv_pairs_beginning_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -41,7 +41,7 @@ icl_tasks: split: test - label: kv_pairs_middle_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -51,7 +51,7 @@ icl_tasks: split: test - label: kv_pairs_end_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -61,7 +61,7 @@ icl_tasks: split: test - label: kv_pairs_beginning_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -71,7 +71,7 @@ icl_tasks: split: test - label: kv_pairs_middle_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -81,7 +81,7 @@ icl_tasks: split: test - label: kv_pairs_end_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -91,7 +91,7 @@ icl_tasks: split: test - label: wikiqa_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -100,7 +100,7 @@ icl_tasks: split: test - label: wikiqa_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -109,7 +109,7 @@ icl_tasks: split: test - label: wikiqa_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: 
hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -118,7 +118,7 @@ icl_tasks: split: test - label: hotpotqa_beginning_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -128,7 +128,7 @@ icl_tasks: split: test - label: hotpotqa_middle_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -138,7 +138,7 @@ icl_tasks: split: test - label: hotpotqa_end_2k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -148,7 +148,7 @@ icl_tasks: split: test - label: hotpotqa_beginning_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -158,7 +158,7 @@ icl_tasks: split: test - label: hotpotqa_middle_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -168,7 +168,7 @@ icl_tasks: split: test - label: hotpotqa_end_4k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -178,7 +178,7 @@ icl_tasks: split: test - label: hotpotqa_beginning_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -188,7 +188,7 @@ icl_tasks: split: test - label: hotpotqa_middle_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: 
@@ -198,7 +198,7 @@ icl_tasks: split: test - label: hotpotqa_end_8k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -208,7 +208,7 @@ icl_tasks: split: test - label: hotpotqa_beginning_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -218,7 +218,7 @@ icl_tasks: split: test - label: hotpotqa_beginning_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -228,7 +228,7 @@ icl_tasks: split: test - label: hotpotqa_beginning_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -238,7 +238,7 @@ icl_tasks: split: test - label: hotpotqa_middle_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -248,7 +248,7 @@ icl_tasks: split: test - label: hotpotqa_middle_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -258,7 +258,7 @@ icl_tasks: split: test - label: hotpotqa_middle_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -268,7 +268,7 @@ icl_tasks: split: test - label: hotpotqa_end_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -278,7 +278,7 @@ icl_tasks: split: test - label: hotpotqa_end_32k - dataset_uri: 
hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -288,7 +288,7 @@ icl_tasks: split: test - label: hotpotqa_end_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -298,7 +298,7 @@ icl_tasks: split: test - label: kv_pairs_beginning_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -308,7 +308,7 @@ icl_tasks: split: test - label: kv_pairs_beginning_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -318,7 +318,7 @@ icl_tasks: split: test - label: kv_pairs_beginning_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -328,7 +328,7 @@ icl_tasks: split: test - label: kv_pairs_middle_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -338,7 +338,7 @@ icl_tasks: split: test - label: kv_pairs_middle_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -348,7 +348,7 @@ icl_tasks: split: test - label: kv_pairs_middle_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -358,7 +358,7 @@ icl_tasks: split: test - label: kv_pairs_end_16k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval 
num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -368,7 +368,7 @@ icl_tasks: split: test - label: kv_pairs_end_32k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: @@ -378,7 +378,7 @@ icl_tasks: split: test - label: kv_pairs_end_64k - dataset_uri: hf://maxisawesome/long_context_eval + dataset_uri: hf://mosaicml/long_context_eval num_fewshot: [0] icl_task_type: question_answering hf_loading_vars: diff --git a/setup.py b/setup.py index 6a0cd2af6c..e5bc7e81d2 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs]>=0.17.2,<0.19', + 'mosaicml[libcloud,wandb,oci,gcs]>=0.17.2,<0.18', 'mlflow>=2.10,<3', 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.37,<4.38', From b7884deffc6c2edff0941046a1c0eeee19634e89 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 6 Feb 2024 14:39:29 +0000 Subject: [PATCH 36/47] mixed --- scripts/eval/yamls/hf_eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml index 1f0f8f670e..4043f6676a 100644 --- a/scripts/eval/yamls/hf_eval.yaml +++ b/scripts/eval/yamls/hf_eval.yaml @@ -12,7 +12,7 @@ models: model: name: hf_causal_lm pretrained_model_name_or_path: ${model_name_or_path} - init_device: cpu + init_device: mixed pretrained: true tokenizer: name: ${model_name_or_path} From ff31e725b09bdffe3310372ec0641d445311abac Mon Sep 17 00:00:00 2001 From: Max Marion Date: Tue, 6 Feb 2024 23:55:19 +0000 Subject: [PATCH 37/47] use seed by default --- llmfoundry/utils/builders.py | 2 +- scripts/eval/eval.py | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 822f6671e9..3ce430fc6f 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ 
-526,7 +526,7 @@ def _validate_cfg(icl_cfg: DictConfig): continuation_delimiter=icl_cfg.continuation_delimiter, question_prelimiter=icl_cfg.get('question_prelimiter', ''), destination_path=destination_path, - fewshot_random_seed=fewshot_random_seed, + fewshot_random_seed=icl_cfg.get('fewshot_random_seed', fewshot_random_seed), pass_at_k=icl_cfg.pass_at_k, generations_per_sample=icl_cfg.num_beams, has_categories=icl_cfg.get('has_categories', False), diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index ef4391b28d..a3ebc76b65 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -118,7 +118,6 @@ def evaluate_model( python_log_level: Optional[str], precision: str, eval_gauntlet_df: Optional[pd.DataFrame], - fewshot_random_seed: Optional[int], eval_subset_num_batches: int, icl_subset_num_batches: Optional[int], metadata: Optional[Dict[str, str]], @@ -142,7 +141,7 @@ def evaluate_model( tokenizer=tokenizer, device_eval_batch_size=device_eval_batch_size, icl_seq_len=max_seq_len, - fewshot_random_seed=fewshot_random_seed, + fewshot_random_seed=seed, icl_subset_num_batches=icl_subset_num_batches, ) @@ -303,10 +302,6 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: 'loggers', must_exist=False, default_value={}) - fewshot_random_seed: int = pop_config(cfg, - 'fewshot_random_seed', - must_exist=False, - default_value=1234) eval_subset_num_batches: int = pop_config(cfg, 'eval_subset_num_batches', must_exist=False, @@ -369,7 +364,6 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: python_log_level=python_log_level, precision=precision, eval_gauntlet_df=eval_gauntlet_df, - fewshot_random_seed=fewshot_random_seed, eval_subset_num_batches=eval_subset_num_batches, icl_subset_num_batches=icl_subset_num_batches, metadata=metadata, From fca3d357e045be80cd502c8d4cd62a5919877a34 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 7 Feb 2024 00:05:34 +0000 Subject: [PATCH 38/47] rm long_context_eval_8k.yaml --- 
scripts/eval/yamls/long_context_eval_8k.yaml | 27 -------------------- 1 file changed, 27 deletions(-) delete mode 100644 scripts/eval/yamls/long_context_eval_8k.yaml diff --git a/scripts/eval/yamls/long_context_eval_8k.yaml b/scripts/eval/yamls/long_context_eval_8k.yaml deleted file mode 100644 index 2b2e8c62bd..0000000000 --- a/scripts/eval/yamls/long_context_eval_8k.yaml +++ /dev/null @@ -1,27 +0,0 @@ -max_seq_len: 8196 -seed: 1 -precision: amp_bf16 - -models: -- - model_name: EleutherAI/gpt-neo-125m - model: - name: hf_causal_lm - pretrained_model_name_or_path: EleutherAI/gpt-neo-125m - init_device: mixed - pretrained: true - tokenizer: - name: EleutherAI/gpt-neo-125m - kwargs: - model_max_length: ${max_seq_len} - -device_eval_batch_size: 1 -icl_subset_num_batches: 2 - -# FSDP config for model sharding -fsdp_config: - sharding_strategy: FULL_SHARD - mixed_precision: FULL - -icl_tasks: 'eval/yamls/long_context_tasks.yaml' -eval_gauntlet: 'eval/yamls/eval_gauntlet_8k_section.yaml' \ No newline at end of file From f1b65f7b8cab63c2f69685ff8bdff780b6ae3d0c Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 7 Feb 2024 00:10:50 +0000 Subject: [PATCH 39/47] add longer context evals --- .../eval/yamls/eval_gauntlet_8k_length.yaml | 60 +++++++++++++++++++ .../eval/yamls/eval_gauntlet_8k_section.yaml | 54 +++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/scripts/eval/yamls/eval_gauntlet_8k_length.yaml b/scripts/eval/yamls/eval_gauntlet_8k_length.yaml index e17df6f3c4..8e95c94ddb 100644 --- a/scripts/eval/yamls/eval_gauntlet_8k_length.yaml +++ b/scripts/eval/yamls/eval_gauntlet_8k_length.yaml @@ -70,5 +70,65 @@ eval_gauntlet: num_fewshot: 0 random_baseline: 0 - name: wikiqa_8k + num_fewshot: 0 + random_baseline: 0 + - name: 16k + benchmarks: + - name: hotpotqa_beginning_16k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_16k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_16k + num_fewshot: 0 + random_baseline: 0 + - name: 
kv_pairs_beginning_16k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_16k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_16k + num_fewshot: 0 + random_baseline: 0 + - name: 32k + benchmarks: + - name: hotpotqa_beginning_32k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_32k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_32k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_32k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_32k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_32k + num_fewshot: 0 + random_baseline: 0 + - name: 64k + benchmarks: + - name: hotpotqa_beginning_64k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_64k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_64k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_64k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_64k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_64k num_fewshot: 0 random_baseline: 0 \ No newline at end of file diff --git a/scripts/eval/yamls/eval_gauntlet_8k_section.yaml b/scripts/eval/yamls/eval_gauntlet_8k_section.yaml index 711a9b6ee4..ca33ab39ae 100644 --- a/scripts/eval/yamls/eval_gauntlet_8k_section.yaml +++ b/scripts/eval/yamls/eval_gauntlet_8k_section.yaml @@ -23,6 +23,24 @@ eval_gauntlet: - name: kv_pairs_beginning_8k num_fewshot: 0 random_baseline: 0 + - name: hotpotqa_beginning_16k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_16k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_beginning_32k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_32k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_beginning_64k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_beginning_64k + num_fewshot: 0 + random_baseline: 0 - name: middle benchmarks: - name: hotpotqa_middle_2k @@ -43,6 +61,24 @@ eval_gauntlet: - name: kv_pairs_middle_8k 
num_fewshot: 0 random_baseline: 0 + - name: hotpotqa_middle_16k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_16k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_32k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_32k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_middle_64k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_middle_64k + num_fewshot: 0 + random_baseline: 0 - name: end benchmarks: - name: hotpotqa_end_2k @@ -63,6 +99,24 @@ eval_gauntlet: - name: kv_pairs_end_8k num_fewshot: 0 random_baseline: 0 + - name: hotpotqa_end_16k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_16k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_32k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_32k + num_fewshot: 0 + random_baseline: 0 + - name: hotpotqa_end_64k + num_fewshot: 0 + random_baseline: 0 + - name: kv_pairs_end_64k + num_fewshot: 0 + random_baseline: 0 - name: full benchmarks: - name: wikiqa_2k From 0b494bb077e2a36dddef3ae889139c84d41fcdc4 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Wed, 7 Feb 2024 00:12:00 +0000 Subject: [PATCH 40/47] mv yamls --- ...tlet_8k_length.yaml => eval_gauntlet_long_context_length.yaml} | 0 ...et_8k_section.yaml => eval_gauntlet_long_context_section.yaml} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename scripts/eval/yamls/{eval_gauntlet_8k_length.yaml => eval_gauntlet_long_context_length.yaml} (100%) rename scripts/eval/yamls/{eval_gauntlet_8k_section.yaml => eval_gauntlet_long_context_section.yaml} (100%) diff --git a/scripts/eval/yamls/eval_gauntlet_8k_length.yaml b/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml similarity index 100% rename from scripts/eval/yamls/eval_gauntlet_8k_length.yaml rename to scripts/eval/yamls/eval_gauntlet_long_context_length.yaml diff --git a/scripts/eval/yamls/eval_gauntlet_8k_section.yaml b/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml similarity index 100% 
rename from scripts/eval/yamls/eval_gauntlet_8k_section.yaml rename to scripts/eval/yamls/eval_gauntlet_long_context_section.yaml From bd6048bdad5f9e1761db7e89b0cb827597f99679 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 8 Feb 2024 01:34:31 +0000 Subject: [PATCH 41/47] eval gauntlet wip --- scripts/eval/local_data/EVAL_GAUNTLET.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/eval/local_data/EVAL_GAUNTLET.md b/scripts/eval/local_data/EVAL_GAUNTLET.md index 4292d762e6..a777e1d355 100644 --- a/scripts/eval/local_data/EVAL_GAUNTLET.md +++ b/scripts/eval/local_data/EVAL_GAUNTLET.md @@ -391,3 +391,20 @@ Programming tasks evaluate the model's ability to understand code, write functio - Year released: 2023 - Number of few shot examples: 0 - Random baseline accuracy: 0% + +### Long Context Gauntlet + +We've included three different tasks for long (> 4000 tokens) context length evals. They are meant as litmus tests for a model's ability to properly utilize its longer context length, which is often the result of fine-tuning after pre-training. For some of these datasets, we explicitly create sets where the required information is located in different sections of the input context, either the beginning, middle, or end of the input context. + +1. HotPotQAXL + - Description: [HotPotQA](https://hotpotqa.github.io/) is originally a dataset of ten documents and a question requiring comprehension of one or more of the supplied documents. The non-related documents are completely unrelated and called "distractor" documents. To extend this to longer context lengths, we randomly sample documents from the full set of documents across the dataset, adding them to the current datapoint until the set of documents and its question fills the current context length. We insert the "gold" document(s) (the document(s) containing the information that answers the question) within the first third, second third, or last third of the context length.
+ - Lengths: 2k, 4k, 8k, 16k, 32k, 64k + - Locations: beginning, middle, end +2. Key Value Pairs (Needle In a Haystack) + - Description: We construct a `.json` of key value pairs, where both the key and value are random hashes. We then ask the model to produce a specific value from a key value pair. The pair is correspondingly located in the first third, second third, or last third of the json. + - Lengths: 2k, 4k, 8k, 16k, 32k, 64k + - Locations: beginning, middle, end +2. WikiQA Numeric + - Description: + - Lengths: 2k, 4k, 8k, 16k + - Locations: N/A From 51c3ea85a6422eaa2344807493b02f134117cf07 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Thu, 8 Feb 2024 09:09:59 +0000 Subject: [PATCH 42/47] update niah and wikiqa --- scripts/eval/local_data/EVAL_GAUNTLET.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval/local_data/EVAL_GAUNTLET.md b/scripts/eval/local_data/EVAL_GAUNTLET.md index a777e1d355..afd82037a8 100644 --- a/scripts/eval/local_data/EVAL_GAUNTLET.md +++ b/scripts/eval/local_data/EVAL_GAUNTLET.md @@ -401,10 +401,10 @@ We've included three different tasks for long (> 4000 tokens) context length eva - Lengths: 2k, 4k, 8k, 16k, 32k, 64k - Locations: beginning, middle, end 2. Key Value Pairs (Needle In a Haystack) - - Description: We construct a `.json` of key value pairs, where both the key and value are random hashes. We then ask the model to produce a specific value from a key value pair. The pair is correspondingly located in the first third, second third, or last third of the json. + - Description: We construct a `.json` of key value pairs, where both the key and value are random hashes, in the style of [Lost in the Middle](https://github.com/nelson-liu/lost-in-the-middle). We ask the model to produce a value given a key from a specific key value pair found in the json. The pair is correspondingly located in the first third, second third, or last third of the json.
- Lengths: 2k, 4k, 8k, 16k, 32k, 64k - Locations: beginning, middle, end 2. WikiQA Numeric - - Description: + - Description: [WikiQA Numeric](https://huggingface.co/datasets/abacusai/WikiQA-Altered_Numeric_QA) is a Wikipedia Question Answering dataset with a focus on questions with numeric answers. We preprocess the data only to easily parse it for our framework. - Lengths: 2k, 4k, 8k, 16k - Locations: N/A From 78495285aab4d488827dc5ab0b58a21b49709737 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 12 Feb 2024 16:25:51 +0000 Subject: [PATCH 43/47] fix linting --- scripts/eval/yamls/eval_gauntlet_long_context_length.yaml | 6 +++--- scripts/eval/yamls/eval_gauntlet_long_context_section.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml b/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml index 8e95c94ddb..0aafff42a2 100644 --- a/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml +++ b/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml @@ -26,7 +26,7 @@ eval_gauntlet: - name: wikiqa_2k num_fewshot: 0 random_baseline: 0 - - name: 4k + - name: 4k benchmarks: - name: hotpotqa_beginning_4k num_fewshot: 0 @@ -49,7 +49,7 @@ eval_gauntlet: - name: wikiqa_4k num_fewshot: 0 random_baseline: 0 - - name: 8k + - name: 8k benchmarks: - name: hotpotqa_beginning_8k num_fewshot: 0 @@ -131,4 +131,4 @@ eval_gauntlet: random_baseline: 0 - name: kv_pairs_end_64k num_fewshot: 0 - random_baseline: 0 \ No newline at end of file + random_baseline: 0 diff --git a/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml b/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml index ca33ab39ae..f776047c38 100644 --- a/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml +++ b/scripts/eval/yamls/eval_gauntlet_long_context_section.yaml @@ -3,7 +3,7 @@ eval_gauntlet: subtract_random_baseline: true rescale_accuracy: true categories: - - name: beginning + - name: beginning
benchmarks: - name: hotpotqa_beginning_2k num_fewshot: 0 @@ -127,4 +127,4 @@ eval_gauntlet: random_baseline: 0 - name: wikiqa_8k num_fewshot: 0 - random_baseline: 0 \ No newline at end of file + random_baseline: 0 From 124b60a5288b782cab5bb1f5314d528ef978dcb2 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 12 Feb 2024 18:43:05 +0000 Subject: [PATCH 44/47] add default option --- llmfoundry/utils/builders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index bf960603a2..391b792a1c 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -52,8 +52,8 @@ def build_evaluators( tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, - fewshot_random_seed: Optional[int], icl_subset_num_batches: Optional[int], + fewshot_random_seed: Optional[int] = None, ) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]: evaluators = [] From 6b37a8c65660cf2b1c563db6c0f86ff55f80a7dc Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 12 Feb 2024 19:44:28 +0000 Subject: [PATCH 45/47] change defaults --- llmfoundry/utils/builders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 391b792a1c..103110615d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -53,7 +53,7 @@ def build_evaluators( device_eval_batch_size: int, icl_seq_len: int, icl_subset_num_batches: Optional[int], - fewshot_random_seed: Optional[int] = None, + fewshot_random_seed: Optional[int] = 1234, ) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]: evaluators = [] @@ -130,7 +130,7 @@ def build_icl_data_and_gauntlet( tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, - fewshot_random_seed: Optional[int], + fewshot_random_seed: Optional[int] = 1234, icl_subset_num_batches: Optional[int] = None ) -> Tuple[List[Evaluator], 
List[str], Optional[EvalGauntlet]]: icl_evaluators, logger_keys = build_icl_evaluators( @@ -446,7 +446,7 @@ def build_icl_evaluators( default_max_seq_len: int, default_batch_size: int, destination_dir: Optional[str] = None, - fewshot_random_seed: Optional[int] = None, + fewshot_random_seed: Optional[int] = 1234, icl_subset_num_batches: Optional[int] = None, ) -> Tuple[List[Evaluator], List[str]]: if destination_dir is None: From 3f08d92b673a43d826c577647015fc688b7c5d74 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 12 Feb 2024 20:40:25 +0000 Subject: [PATCH 46/47] fix linting --- llmfoundry/utils/builders.py | 10 +++++----- scripts/eval/eval.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 103110615d..fb3a0d97f8 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -505,7 +505,6 @@ def _validate_cfg(icl_cfg: DictConfig): if 'num_beams' not in icl_cfg: icl_cfg.num_beams = 20 - for icl_cfg in icl_tasks_list: assert isinstance(icl_cfg, DictConfig) _validate_cfg(icl_cfg) @@ -524,7 +523,7 @@ def _validate_cfg(icl_cfg: DictConfig): dist.barrier() hf_parsing_map = icl_cfg.get('hf_parsing_map', {}) - hf_loading_vars = icl_cfg.get('hf_loading_vars', {}) + hf_loading_vars = icl_cfg.get('hf_loading_vars', {}) early_stopping_criteria = icl_cfg.get('early_stopping_criteria', None) @@ -543,12 +542,13 @@ def _validate_cfg(icl_cfg: DictConfig): num_fewshot=num_fewshot, prompt_string=icl_cfg.prompt_string, example_delimiter=icl_cfg.example_delimiter, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, continuation_delimiter=icl_cfg.continuation_delimiter, question_prelimiter=icl_cfg.get('question_prelimiter', ''), destination_path=destination_path, - fewshot_random_seed=icl_cfg.get('fewshot_random_seed', fewshot_random_seed), + fewshot_random_seed=icl_cfg.get('fewshot_random_seed', 
+ fewshot_random_seed), pass_at_k=icl_cfg.pass_at_k, generations_per_sample=icl_cfg.num_beams, has_categories=icl_cfg.get('has_categories', False), diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index a1a32d2a48..9c8dad0977 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -262,7 +262,6 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: 'log_config', must_exist=False, default_value=True) - # Pop out interpolation variables. pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None) From cee8256cbc2f6e72962ceb26376f45421888ee30 Mon Sep 17 00:00:00 2001 From: Max Marion Date: Mon, 12 Feb 2024 20:40:56 +0000 Subject: [PATCH 47/47] fix linting 2 --- scripts/eval/yamls/eval_gauntlet_long_context_length.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml b/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml index 0aafff42a2..bcb52bf658 100644 --- a/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml +++ b/scripts/eval/yamls/eval_gauntlet_long_context_length.yaml @@ -72,7 +72,7 @@ eval_gauntlet: - name: wikiqa_8k num_fewshot: 0 random_baseline: 0 - - name: 16k + - name: 16k benchmarks: - name: hotpotqa_beginning_16k num_fewshot: 0