Use allenai/c4 instead of c4 dataset (#1554)
Co-authored-by: Eitan Turok <[email protected]>
eitanturok and eitanturok authored Sep 26, 2024
1 parent dc58bb7 commit 3b1fc4a
Showing 15 changed files with 37 additions and 34 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -223,7 +223,7 @@ cd scripts

# Convert C4 dataset to StreamingDataset format
python data_prep/convert_dataset_hf.py \
--dataset c4 --data_subset en \
--dataset allenai/c4 --data_subset en \
--out_root my-copy-c4 --splits train_small val_small \
--concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'

4 changes: 2 additions & 2 deletions TUTORIAL.md
@@ -216,7 +216,7 @@ Output the processed data to `./my-adaptation-data`. Note that we use smaller su
<!--pytest.mark.skip-->
```bash
python scripts/data_prep/convert_dataset_hf.py \
--dataset c4 --data_subset en \
--dataset allenai/c4 --data_subset en \
--out_root my-adaptation-data --splits train_small val_small \
--concat_tokens 4096 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \
--compression zstd
@@ -248,7 +248,7 @@ The first step to training from scratch is to get your pretraining data prepared
<!--pytest.mark.skip-->
```bash
python scripts/data_prep/convert_dataset_hf.py \
--dataset c4 --data_subset en \
--dataset allenai/c4 --data_subset en \
--out_root my-copy-c4 --splits train_small val_small \
--concat_tokens 2048 --tokenizer gpt2 \
--eos_text '<|endoftext|>' \
4 changes: 2 additions & 2 deletions llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -158,7 +158,7 @@ def __init__(
truncated_samples=100,
)

CONSTS = {'c4': c4constants, 'the_pile': pileconstants}
CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants}


def build_hf_dataset(
@@ -335,7 +335,7 @@ def convert_dataset_hf(
dataset_constants = CONSTS[dataset]
except KeyError:
raise ValueError(
f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "c4" are supported.',
f'Constants for dataset "{dataset}" not found. Currently only "the_pile" and "allenai/c4" are supported.',
)

if concat_tokens is not None and tokenizer is not None:
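One consequence of the new `CONSTS` key: anything that still passes the bare `c4` name now falls through to the `KeyError` branch and raises the updated `ValueError`. A small sketch of that lookup behavior, using placeholder constants:

```python
# Placeholder stand-ins for the c4constants / pileconstants objects in the script above.
CONSTS = {'allenai/c4': 'c4constants', 'the_pile': 'pileconstants'}

def lookup_constants(dataset: str) -> str:
    try:
        return CONSTS[dataset]
    except KeyError:
        raise ValueError(
            f'Constants for dataset "{dataset}" not found. '
            'Currently only "the_pile" and "allenai/c4" are supported.',
        )

print(lookup_constants('allenai/c4'))  # works after this change
# lookup_constants('c4')               # now raises the ValueError above
```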
2 changes: 1 addition & 1 deletion llmfoundry/command_utils/data_prep/convert_dataset_json.py
@@ -43,7 +43,7 @@ def build_hf_dataset(
no_wrap (bool): if concatenating, whether to wrap text across `max_length` boundaries
tokenizer (PreTrainedTokenizerBase): if mode is CONCAT_TOKENS, the tokenizer to use
data_subset (str): Referred to as "name" in HuggingFace datasets.load_dataset.
Typically "all" (The Pile) or "en" (c4).
Typically "all" (The Pile) or "en" (allenai/c4).
Returns:
An IterableDataset.
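As the docstring above notes, `data_subset` is forwarded as the `name` argument of HuggingFace `datasets.load_dataset`, so the CLI pair `--dataset allenai/c4 --data_subset en` corresponds roughly to the call below. This is a hedged sketch; the real script also wires up splits, streaming, and tokenization.

```python
from datasets import load_dataset

# Roughly what `--dataset allenai/c4 --data_subset en` resolves to on the
# HuggingFace side; streaming avoids downloading the full corpus up front.
ds = load_dataset('allenai/c4', name='en', split='train', streaming=True)
print(next(iter(ds))['text'][:200])
```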
2 changes: 1 addition & 1 deletion mcli/mcli-1b-max-seq-len-8k.yaml
@@ -13,7 +13,7 @@ integrations:
command: |
cd llm-foundry/scripts
python data_prep/convert_dataset_hf.py \
--dataset c4 --data_subset en \
--dataset allenai/c4 --data_subset en \
--out_root ./my-copy-c4 --splits train_small val_small \
--concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
composer train/train.py /mnt/config/parameters.yaml
2 changes: 1 addition & 1 deletion mcli/mcli-1b.yaml
@@ -13,7 +13,7 @@ integrations:
command: |
cd llm-foundry/scripts
python data_prep/convert_dataset_hf.py \
--dataset c4 --data_subset en \
--dataset allenai/c4 --data_subset en \
--out_root ./my-copy-c4 --splits train_small val_small \
--concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
composer train/train.py train/yamls/pretrain/mpt-1b.yaml \
2 changes: 1 addition & 1 deletion mcli/mcli-pretokenize-oci-upload.yaml
@@ -24,7 +24,7 @@ command: |
# Run the dataset conversion
python convert_dataset_hf.py \
--dataset c4 --data_subset en \
--dataset allenai/c4 --data_subset en \
--out_root ./my-copy-c4 \
--splits val_small val train_small train \
--concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
2 changes: 1 addition & 1 deletion scripts/data_prep/README.md
@@ -14,7 +14,7 @@ Currently supports `c4` and `The Pile`.
```bash
# Convert C4 dataset to StreamingDataset format
python convert_dataset_hf.py \
--dataset c4 --data_subset en \
--dataset allenai/c4 --data_subset en \
--out_root my-copy-c4 --splits train_small val_small \
--concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' \
--compression zstd
6 changes: 3 additions & 3 deletions scripts/train/README.md
@@ -27,7 +27,7 @@ If you haven't already, make sure to [install the requirements](../../README.md#

To run pretraining, you'll need to make yourself a copy of a pretraining dataset and format it for efficient streaming. Check out the [`llm-foundry/data_prep`](../data_prep) folder for detailed instructions on how to convert your dataset to the MosaicML [StreamingDataset](https://github.com/mosaicml/streaming) format.

As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/c4) dataset here.
As a quickstart, we elaborate on how to prepare the [C4 (Colossal, Cleaned, Common Crawl)](https://huggingface.co/datasets/allenai/c4) dataset here.

We first convert the dataset from its native format (a collection of zipped JSONs)
to MosaicML's StreamingDataset format, which is a collection of binary `.mds` files.
@@ -44,13 +44,13 @@ This will take 20-60 seconds depending on your internet bandwidth.
You should see two folders once completed: `./my-copy-c4/train_small` and `./my-copy-c4/val_small` that are ~1.0GB total. Note that we are using the `--concat_tokens` option to pre tokenize our samples to be of the max sequence length without padding
<!--pytest.mark.skip-->
```bash
python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
```
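The `--concat_tokens` note above means samples are pre-tokenized and packed into fixed-length blocks rather than padded. A minimal conceptual sketch of that packing idea (illustrative only; `pack_examples` and the tiny block size are made up here and are not the foundry implementation):

```python
from transformers import AutoTokenizer

def pack_examples(texts, tokenizer, block_size, eos_text):
    """Tokenize each text, append the EOS text, concatenate, then emit fixed-size blocks."""
    eos_ids = tokenizer(eos_text, add_special_tokens=False)['input_ids']
    buffer = []
    for text in texts:
        buffer.extend(tokenizer(text, add_special_tokens=False)['input_ids'])
        buffer.extend(eos_ids)
    # Trailing tokens that do not fill a whole block are dropped in this sketch.
    for start in range(0, len(buffer) - block_size + 1, block_size):
        yield buffer[start:start + block_size]

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
for block in pack_examples(['first document', 'second document'], tokenizer, 8, '<|endoftext|>'):
    print(len(block), block)  # every emitted block is exactly 8 token ids
```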

Alternatively, you can download the full `train` and `val` splits if you really want to train the model (i.e. not just profile the model). This will take 1-to-many hours depending on bandwidth, number of CPUs, etc. The final folder `./my-copy-c4/train` will be ~800GB so make sure you have space!
<!--pytest.mark.skip-->
```bash
python ../data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
python ../data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train val --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
```

For any of the above commands, you can also choose to compress the `.mds` files.
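Once either conversion command above finishes, the resulting `.mds` shards can be read back with the `streaming` package. A minimal sketch, assuming the `./my-copy-c4/train_small` output directory from the quickstart command:

```python
from streaming import StreamingDataset

# Load the locally converted shards written by convert_dataset_hf.py.
dataset = StreamingDataset(local='./my-copy-c4/train_small', shuffle=False)
sample = dataset[0]
print(sample.keys())  # with --concat_tokens, each sample carries packed token ids
```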
2 changes: 1 addition & 1 deletion scripts/train/benchmarking/submit_benchmarks.py
@@ -479,7 +479,7 @@ def run_config(
if args.data_remote is None:
command += f"""
cd llm-foundry/scripts
python data_prep/convert_dataset_hf.py --dataset c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
python data_prep/convert_dataset_hf.py --dataset allenai/c4 --data_subset en --out_root ./my-copy-c4 --splits train_small val_small --concat_tokens {max_seq_len} --eos_text '<|endoftext|>'
composer train/train.py /mnt/config/parameters.yaml
"""
else:
2 changes: 1 addition & 1 deletion tests/a_scripts/data_prep/test_convert_dataset_hf.py
@@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path):
# test calling it directly
path = os.path.join(tmp_path, 'my-copy-c4-1')
convert_dataset_hf(
dataset='c4',
dataset='allenai/c4',
data_subset='en',
splits=['val_xsmall'],
out_root=path,
11 changes: 6 additions & 5 deletions tests/a_scripts/eval/test_eval.py
@@ -121,7 +121,7 @@ def test_loader_eval(

# Set up multiple eval dataloaders
first_eval_loader = test_cfg.eval_loader
first_eval_loader.label = 'c4'
first_eval_loader.label = 'allenai/c4'
# Create second eval dataloader using the arxiv dataset.
second_eval_loader = copy.deepcopy(first_eval_loader)
second_eval_loader.label = 'arxiv'
@@ -157,16 +157,17 @@ def test_loader_eval(
print(inmemorylogger.data.keys())

# Checks for first eval dataloader
assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
)
assert isinstance(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
list,
)
assert len(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
) > 0
assert isinstance(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
tuple,
)

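These reworked assertions (here and in the training tests below) reflect that the eval dataloader's `label` is interpolated into the logged metric name, so the `allenai/c4` label introduces an extra path segment in the key. A small illustration of the key shape the tests now expect (the f-string is illustrative, not the logger's actual code):

```python
label = 'allenai/c4'
metric_key = f'metrics/eval/{label}/LanguageCrossEntropy'
assert metric_key == 'metrics/eval/allenai/c4/LanguageCrossEntropy'
```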
22 changes: 12 additions & 10 deletions tests/a_scripts/train/test_train.py
@@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
# Set up multiple eval dataloaders
first_eval_loader = test_cfg.eval_loader
first_eval_loader.label = 'c4'
first_eval_loader.label = 'allenai/c4'
# Create second eval dataloader using the arxiv dataset.
second_eval_loader = copy.deepcopy(first_eval_loader)
second_eval_loader.label = 'arxiv'
@@ -154,16 +154,17 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
assert isinstance(inmemorylogger, InMemoryLogger)

# Checks for first eval dataloader
assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
)
assert isinstance(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
list,
)
assert len(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
) > 0
assert isinstance(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
tuple,
)

@@ -212,7 +213,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
first_eval_loader = test_cfg.eval_loader
first_eval_loader.label = 'c4'
first_eval_loader.label = 'allenai/c4'
test_cfg.eval_loader = om.create([first_eval_loader])
test_cfg.eval_subset_num_batches = 1 # -1 to evaluate on all batches
test_cfg.max_duration = '1ba'
@@ -226,15 +227,16 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
0] # pyright: ignore [reportGeneralTypeIssues]
assert isinstance(inmemorylogger, InMemoryLogger)

assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys()
assert 'metrics/eval/allenai/c4/LanguageCrossEntropy' in inmemorylogger.data.keys(
)
assert isinstance(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'],
list,
)
assert len(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
) > 0
assert isinstance(
inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1],
inmemorylogger.data['metrics/eval/allenai/c4/LanguageCrossEntropy'][-1],
tuple,
)
6 changes: 3 additions & 3 deletions tests/data/test_dataloader.py
@@ -204,7 +204,7 @@ def test_correct_padding(
shutil.rmtree(path, ignore_errors=True)
if pretokenize:
convert_dataset_hf(
dataset='c4',
dataset='allenai/c4',
data_subset='en',
splits=[split],
out_root=path,
@@ -219,7 +219,7 @@
)
else:
convert_dataset_hf(
dataset='c4',
dataset='allenai/c4',
data_subset='en',
splits=[split],
out_root=path,
@@ -233,7 +233,7 @@
num_workers=None,
)
if not os.path.isdir(path):
raise RuntimeError(f'c4 dataset at {path} not set up as expected')
raise RuntimeError(f'allenai/c4 dataset at {path} not set up as expected')

test_cfg = get_config(
conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml',
2 changes: 1 addition & 1 deletion tests/data_utils.py
@@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str:

# Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188
convert_dataset_hf(
dataset='c4',
dataset='allenai/c4',
data_subset='en',
splits=[downloaded_split],
out_root=c4_dir,
