From e7b812d39cce052d5bc55fcd4c4547bbf9c3f6f9 Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Thu, 26 Sep 2024 18:44:16 +0000
Subject: [PATCH] use allenai/c4

---
 llmfoundry/command_utils/data_prep/convert_dataset_hf.py | 2 +-
 tests/a_scripts/data_prep/test_convert_dataset_hf.py | 2 +-
 tests/a_scripts/eval/test_eval.py | 2 +-
 tests/a_scripts/train/test_train.py | 4 ++--
 tests/data/test_dataloader.py | 6 +++---
 tests/data_utils.py | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
index 0ea94ac687..fba062d6f5 100644
--- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
+++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py
@@ -158,7 +158,7 @@ def __init__(
     truncated_samples=100,
 )
 
-CONSTS = {'c4': c4constants, 'the_pile': pileconstants}
+CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants}
 
 
 def build_hf_dataset(
diff --git a/tests/a_scripts/data_prep/test_convert_dataset_hf.py b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
index e09c54ca70..da1e101ae7 100644
--- a/tests/a_scripts/data_prep/test_convert_dataset_hf.py
+++ b/tests/a_scripts/data_prep/test_convert_dataset_hf.py
@@ -11,7 +11,7 @@ def test_download_script_from_api(tmp_path: Path):
     # test calling it directly
     path = os.path.join(tmp_path, 'my-copy-c4-1')
     convert_dataset_hf(
-        dataset='c4',
+        dataset='allenai/c4',
         data_subset='en',
         splits=['val_xsmall'],
         out_root=path,
diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py
index fc0dc8a882..3fc7141b9a 100644
--- a/tests/a_scripts/eval/test_eval.py
+++ b/tests/a_scripts/eval/test_eval.py
@@ -121,7 +121,7 @@ def test_loader_eval(
 
     # Set up multiple eval dataloaders
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
     second_eval_loader.label = 'arxiv'
diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py
index 9af96f9868..4f6a2e2ed9 100644
--- a/tests/a_scripts/train/test_train.py
+++ b/tests/a_scripts/train/test_train.py
@@ -134,7 +134,7 @@ def test_train_multi_eval(tmp_path: pathlib.Path):
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
     # Set up multiple eval dataloaders
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
     second_eval_loader.label = 'arxiv'
@@ -212,7 +212,7 @@ def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path):
     c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
     test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu')
     first_eval_loader = test_cfg.eval_loader
-    first_eval_loader.label = 'c4'
+    first_eval_loader.label = 'allenai/c4'
     test_cfg.eval_loader = om.create([first_eval_loader])
     test_cfg.eval_subset_num_batches = 1  # -1 to evaluate on all batches
     test_cfg.max_duration = '1ba'
diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
index d215d93542..7239bfe958 100644
--- a/tests/data/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -204,7 +204,7 @@ def test_correct_padding(
     shutil.rmtree(path, ignore_errors=True)
     if pretokenize:
         convert_dataset_hf(
-            dataset='c4',
+            dataset='allenai/c4',
             data_subset='en',
             splits=[split],
             out_root=path,
@@ -219,7 +219,7 @@
         )
     else:
         convert_dataset_hf(
-            dataset='c4',
+            dataset='allenai/c4',
             data_subset='en',
             splits=[split],
             out_root=path,
@@ -233,7 +233,7 @@
             num_workers=None,
         )
     if not os.path.isdir(path):
-        raise RuntimeError(f'c4 dataset at {path} not set up as expected')
+        raise RuntimeError(f'allenai/c4 dataset at {path} not set up as expected')
 
     test_cfg = get_config(
         conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml',
diff --git a/tests/data_utils.py b/tests/data_utils.py
index 117310b0cf..1f6c26b72e 100644
--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str:
 
     # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188
     convert_dataset_hf(
-        dataset='c4',
+        dataset='allenai/c4',
         data_subset='en',
         splits=[downloaded_split],
         out_root=c4_dir,