From 3b1fc4ae5c205118901fcf1557260952fe844e2e Mon Sep 17 00:00:00 2001
From: Eitan Turok <150733043+eitanturok@users.noreply.github.com>
Date: Thu, 26 Sep 2024 17:23:34 -0400
Subject: [PATCH] Use `allenai/c4` instead of `c4` dataset (#1554)

Co-authored-by: Eitan Turok
---
 README.md                                    |  2 +-
 TUTORIAL.md                                  |  4 ++--
 .../data_prep/convert_dataset_hf.py          |  4 ++--
 .../data_prep/convert_dataset_json.py        |  2 +-
 mcli/mcli-1b-max-seq-len-8k.yaml             |  2 +-
 mcli/mcli-1b.yaml                            |  2 +-
 mcli/mcli-pretokenize-oci-upload.yaml        |  2 +-
 scripts/data_prep/README.md                  |  2 +-
 scripts/train/README.md                      |  6 ++---
 .../train/benchmarking/submit_benchmarks.py  |  2 +-
 .../data_prep/test_convert_dataset_hf.py     |  2 +-
 tests/a_scripts/eval/test_eval.py            | 11 +++++-----
 tests/a_scripts/train/test_train.py          | 22 ++++++++++---------
 tests/data/test_dataloader.py                |  6 ++---
 tests/data_utils.py                          |  2 +-
 15 files changed, 37 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 0fabb98653..bc4eff48fd 100644
--- a/README.md
+++ b/README.md
@@ -223,7 +223,7 @@ cd scripts
 
 # Convert C4 dataset to StreamingDataset format
 python data_prep/convert_dataset_hf.py \
-  --dataset c4 --data_subset en \
+  --dataset allenai/c4 --data_subset en \
   --out_root my-copy-c4 --splits train_small val_small \
   --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
 
diff --git a/TUTORIAL.md b/TUTORIAL.md
index 3be4910c4f..d1751f62e3 100644
--- a/TUTORIAL.md
+++ b/TUTORIAL.md
@@ -216,7 +216,7 @@ Output the processed data to `./my-adaptation-data`. Note that we use smaller su
 
 ```bash
 python scripts/data_prep/convert_dataset_hf.py \
-  --dataset c4 --data_subset en \
+  --dataset allenai/c4 --data_subset en \
   --out_root my-adaptation-data --splits train_small val_small \
   --concat_tokens 4096 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \
   --compression zstd
@@ -248,7 +248,7 @@ The first step to training from scratch is to get your pretraining data prepared
 
 ```bash
 python scripts/data_prep/convert_dataset_hf.py \
-  --dataset c4 --data_subset en \
+  --dataset allenai/c4 --data_subset en \
   --out_root my-copy-c4 --splits train_small val_small \
   --concat_tokens 2048 --tokenizer gpt2 \
   --eos_text '<|endoftext|>' \
diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
index 0ea94ac687..2667407110 100644
--- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
+++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -158,7 +158,7 @@ def __init__(
         truncated_samples=100,
     )
 
-CONSTS = {'c4': c4constants, 'the_pile': pileconstants}
+CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants}
 
 
 def build_hf_dataset(
@@ -335,7 +335,7 @@ def convert_dataset_hf(
         dataset_constants = CONSTS[dataset]
     except KeyError:
         raise ValueError(
-            f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "c4" are supported.',
+            f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "allenai/c4" are supported.',
         )
 
     if concat_tokens is not None and tokenizer is not None:
diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_json.py b/llmfoundry/command_utils/data_prep/convert_dataset_json.py
index 35d7e637e6..c6f7d51c02 100644
--- a/llmfoundry/command_utils/data_prep/convert_dataset_json.py
+++ b/llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -43,7 +43,7 @@ def build_hf_dataset(
         no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries
         tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use
         data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset.
-            Typically "all" (The Pile) or "en" (c4).
+            Typically "all" (The Pile) or "en" (allenai/c4).
 
     Returns:
         An IterableDataset.
diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml
index b437bc5f0d..1d48cd8105 100644
--- a/mcli/mcli-1b-max-seq-len-8k.yaml
+++ b/mcli/mcli-1b-max-seq-len-8k.yaml
@@ -13,7 +13,7 @@ integrations:
 command: |
   cd llm-foundry/scripts
   python data_prep/convert_dataset_hf.py \
-    --dataset c4 --data_subset en \
+    --dataset allenai/c4 --data_subset en \
     --out_root ./my-copy-c4 --splits train_small val_small \
     --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
   composer train/train.py /mnt/config/parameters.yaml
diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml
index 789fc4fc02..71566d4c46 100644
--- a/mcli/mcli-1b.yaml
+++ b/mcli/mcli-1b.yaml
@@ -13,7 +13,7 @@ integrations:
 command: |
   cd llm-foundry/scripts
   python data_prep/convert_dataset_hf.py \
-    --dataset c4 --data_subset en \
+    --dataset allenai/c4 --data_subset en \
     --out_root ./my-copy-c4 --splits train_small val_small \
     --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
   composer train/train.py train/yamls/pretrain/mpt-1b.yaml \
diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml
index 49fbbb08d8..a3e8c40b88 100644
--- a/mcli/mcli-pretokenize-oci-upload.yaml
+++ b/mcli/mcli-pretokenize-oci-upload.yaml
@@ -24,7 +24,7 @@ command: |
 
   # Run the dataset conversion
   python convert_dataset_hf.py \
-    --dataset c4 --data_subset en \
+    --dataset allenai/c4 --data_subset en \
     --out_root ./my-copy-c4 \
     --splits val_small val train_small train \
     --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
diff --git a/scripts/data_prep/README.md b/scripts/data_prep/README.md
index 3601cc865f..b72caeebc4 100644
--- a/scripts/data_prep/README.md
+++ b/scripts/data_prep/README.md
@@ -14,7 +14,7 @@ Currently supports `c4` and `The Pile`.
 ```bash
 # Convert C4 dataset to StreamingDataset format
 python convert_dataset_hf.py \
-  --dataset c4 --data_subset en \
+  --dataset allenai/c4 --data_subset en \
   --out_root my-copy-c4 --splits train_small val_small \
   --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \
   --compression zstd
diff --git a/scripts/train/README.md b/scripts/train/README.md
index 6730cb793b..247814d782 100644
--- a/scripts/train/README.md
+++ b/scripts/train/README.md
@@ -27,7 +27,7 @@ If you haven't already, make sure to [install the requirements](../../README.md#
 To run pretraining, you'll need to make yourself a copy of a pretraining dataset and format it for efficient streaming.
 Check out the [`llm-foundry/data_prep`](../data_prep) folder for detailed instructions on how to convert your dataset to the MosaicML [StreamingDataset](https://github.com/mosaicml/streaming) format.
 
-As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/c4) dataset here.
+As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/allenai/c4) dataset here.
 
 We first convert the dataset from its native format (a collection of zipped JSONs)
 to MosaicML's StreamingDataset format, which is a collection of binary `.mds` files.
@@ -44,13 +44,13 @@ This will take 20-60 seconds depending on your internet bandwidth.
 You should see two folders once completed: `./my-copy-c4/train_small` and `./my-copy-c4/val_small` that are ~1.0GB total.
 Note that we are using the `--concat_tokens` option to pre tokenize our samples to be of the max sequence length without padding
 ```bash
-python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
+python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
 ```
 
 Alternatively, you can download the full `train` and `val` splits if you really want to train the model (i.e. not just profile the model). This will take 1-to-many hours depending on bandwidth, number of CPUs, etc.
 The final folder `./my-copy-c4/train` will be ~800GB so make sure you have space!
 ```bash
-python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
+python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
 ```
 
 For any of the above commands, you can also choose to compress the `.mds` files.
diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py
index fd7be1fc6d..27f5c26c7d 100644
--- a/scripts/train/benchmarking/submit_benchmarks.py
+++ b/scripts/train/benchmarking/submit_benchmarks.py
@@ -479,7 +479,7 @@ def run_config(
     if args.data_remote is None:
         command += f"""
         cd llm-foundry/scripts
-        python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
+        python data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
         composer train/train.py /mnt/config/parameters.yaml
         """
     else:
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
index e09c54ca70..da1e101ae7 100644
--- a/tests/a_scripts/data_prep/test_convert_dataset_hf.py
+++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
@@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path):
     # test calling it directly
     path = os.path.join(tmp_path, 'my-copy-c4-1')
     convert_dataset_hf(
-        dataset='c4',
+        dataset='allenai/c4',
         data_subset='en',
         splits=['val_xsmall'],
         out_root=path,
diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py
index fc0dc8a882..f1b76913d1 100644
--- a/tests/a_scripts/eval/test_eval.py
+++ b/tests/a_scripts/eval/test_eval.py
@@ -121,7 +121,7 @@ def test_loader_eval(
     # Set up multiple eval dataloaders
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
     second_eval_loader.label = 'arxiv'
@@ -157,16 +157,17 @@ def test_loader_eval(
     print(inmemorylogger.data.keys())
 
     # Checks for first eval dataloader
-    assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+    assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+    )
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
         list,
     )
     assert len(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
     ) > 0
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
         tuple,
     )
 
diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py
index 9af96f9868..b1bca9ebd0 100644
--- a/tests/a_scripts/train/test_train.py
+++ b/tests/a_scripts/train/test_train.py
@@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
     # Set up multiple eval dataloaders
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
     second_eval_loader.label = 'arxiv'
@@ -154,16 +154,17 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
     assert isinstance(inmemorylogger, InMemoryLogger)
 
     # Checks for first eval dataloader
-    assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+    assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+    )
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
         list,
     )
     assert len(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
     ) > 0
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
         tuple,
     )
 
@@ -212,7 +213,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
     c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     test_cfg.eval_loader = om.create([first_eval_loader])
     test_cfg.eval_subset_num_batches = 1  # -1 to evaluate on all batches
     test_cfg.max_duration = '1ba'
@@ -226,15 +227,16 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
         0]  # pyright: ignore [reportGeneralTypeIssues]
     assert isinstance(inmemorylogger, InMemoryLogger)
 
-    assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
+    assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
+    )
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
         list,
     )
     assert len(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
     ) > 0
     assert isinstance(
-        inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
+        inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
         tuple,
     )
diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
index d215d93542..7239bfe958 100644
--- a/tests/data/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -204,7 +204,7 @@ def test_correct_padding(
     shutil.rmtree(path, ignore_errors=True)
     if pretokenize:
         convert_dataset_hf(
-            dataset='c4',
+            dataset='allenai/c4',
             data_subset='en',
             splits=[split],
             out_root=path,
@@ -219,7 +219,7 @@ def test_correct_padding(
         )
     else:
         convert_dataset_hf(
-            dataset='c4',
+            dataset='allenai/c4',
             data_subset='en',
             splits=[split],
             out_root=path,
@@ -233,7 +233,7 @@ def test_correct_padding(
             num_workers=None,
         )
     if not os.path.isdir(path):
-        raise RuntimeError(f'c4 dataset at {path} not set up as expected')
+        raise RuntimeError(f'allenai/c4 dataset at {path} not set up as expected')
 
     test_cfg = get_config(
         conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml',
diff --git a/tests/data_utils.py b/tests/data_utils.py
index 117310b0cf..1f6c26b72e 100644
--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str:
     # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188
     convert_dataset_hf(
-        dataset='c4',
+        dataset='allenai/c4',
         data_subset='en',
         splits=[downloaded_split],
         out_root=c4_dir,
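
The net effect of the `CONSTS` change in `convert_dataset_hf.py` above is that the bare `c4` id no longer resolves and now falls into the `KeyError`/`ValueError` branch, while `allenai/c4` does; every command, config, and test elsewhere in the patch is updated to match. Below is a minimal, self-contained sketch of that lookup behavior, not taken from the repository: the dictionary values are stand-ins for the real `c4constants`/`pileconstants` objects.

```python
# Stand-in for the CONSTS mapping patched in convert_dataset_hf.py; strings
# replace the real dataset-constant objects so the sketch runs on its own.
CONSTS = {'allenai/c4': 'c4constants', 'the_pile': 'pileconstants'}

for dataset in ('c4', 'allenai/c4'):
    try:
        dataset_constants = CONSTS[dataset]
        print(f'"{dataset}" is supported ({dataset_constants})')
    except KeyError:
        # Mirrors the ValueError raised in the patched code: the old "c4" id
        # now falls through to this branch.
        print(
            f'Constants for dataset "{dataset}" not found. '
            'Currently only "the_pile" and "allenai/c4" are supported.',
        )
```

The same id change propagates into evaluation metric names (for example `metrics/eval/allenai/c4/LanguageCrossEntropy`), which is why the assertions in `test_eval.py` and `test_train.py` are updated along with the dataloader labels.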