From e7b812d39cce052d5bc55fcd4c4547bbf9c3f6f9 Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Thu, 26 Sep 2024 18:44:16 +0000
Subject: [PATCH] use allenai/c4

---
 llmfoundry/command_utils/data_prep/convert_dataset_hf.py | 2 +-
 tests/a_scripts/data_prep/test_convert_dataset_hf.py | 2 +-
 tests/a_scripts/eval/test_eval.py | 2 +-
 tests/a_scripts/train/test_train.py | 4 ++--
 tests/data/test_dataloader.py | 6 +++---
 tests/data_utils.py | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
index 0ea94ac687..fba062d6f5 100644
--- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
+++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -158,7 +158,7 @@ def __init__(
     truncated_samples=100,
 )
 
-CONSTS = {'c4': c4constants, 'the_pile': pileconstants}
+CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants}
 
 
 def build_hf_dataset(
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
index e09c54ca70..da1e101ae7 100644
--- a/tests/a_scripts/data_prep/test_convert_dataset_hf.py
+++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
@@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path):
     # test calling it directly
     path = os.path.join(tmp_path, 'my-copy-c4-1')
     convert_dataset_hf(
-        dataset='c4',
+        dataset='allenai/c4',
         data_subset='en',
         splits=['val_xsmall'],
         out_root=path,
diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py
index fc0dc8a882..3fc7141b9a 100644
--- a/tests/a_scripts/eval/test_eval.py
+++ b/tests/a_scripts/eval/test_eval.py
@@ -121,7 +121,7 @@ def test_loader_eval(
 
     # Set up multiple eval dataloaders
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
     second_eval_loader.label = 'arxiv'
diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py
index 9af96f9868..4f6a2e2ed9 100644
--- a/tests/a_scripts/train/test_train.py
+++ b/tests/a_scripts/train/test_train.py
@@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
     # Set up multiple eval dataloaders
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
     second_eval_loader.label = 'arxiv'
@@ -212,7 +212,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
     c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     test_cfg.eval_loader = om.create([first_eval_loader])
     test_cfg.eval_subset_num_batches = 1  # -1 to evaluate on all batches
     test_cfg.max_duration = '1ba'
diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
index d215d93542..7239bfe958 100644
--- a/tests/data/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -204,7 +204,7 @@ def test_correct_padding(
     shutil.rmtree(path, ignore_errors=True)
     if pretokenize:
         convert_dataset_hf(
-            dataset='c4',
+            dataset='allenai/c4',
             data_subset='en',
             splits=[split],
             out_root=path,
@@ -219,7 +219,7 @@
         )
     else:
         convert_dataset_hf(
-            dataset='c4',
+            dataset='allenai/c4',
             data_subset='en',
             splits=[split],
             out_root=path,
@@ -233,7 +233,7 @@
             num_workers=None,
         )
     if not os.path.isdir(path):
-        raise RuntimeError(f'c4 dataset at {path} not set up as expected')
+        raise RuntimeError(f'allenai/c4 dataset at {path} not set up as expected')
 
     test_cfg = get_config(
         conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml',
diff --git a/tests/data_utils.py b/tests/data_utils.py
index 117310b0cf..1f6c26b72e 100644
--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str:
 
     # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188
     convert_dataset_hf(
-        dataset='c4',
+        dataset='allenai/c4',
         data_subset='en',
         splits=[downloaded_split],
         out_root=c4_dir,