From 7dc4531f4b272bba97f56974f31d12b55abce836 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 15 Nov 2023 18:46:37 -0800 Subject: [PATCH 01/34] remove test suite --- .github/workflows/pr-gpu.yaml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index ffbfac4585..84c46a1368 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -18,14 +18,9 @@ jobs: uses: ./.github/workflows/pytest-gpu.yaml strategy: matrix: - # TODO: After the PR with the flash attention 2 images goes in, add the new unit test suite include: - - name: 'gpu-latest' - container: mosaicml/pytorch:latest # mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 - markers: 'gpu' - pytest_command: 'coverage run -m pytest' - - name: 'gpu-2.0.1' - container: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04 + - name: 'gpu-1.13.1' + container: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 markers: 'gpu' pytest_command: 'coverage run -m pytest' - name: 'gpu-2.1.0' From 3592d88e352e9327321f06dc40b99f9e11a89374 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 15 Nov 2023 19:45:46 -0800 Subject: [PATCH 02/34] wip --- tests/test_hf_conversion_script.py | 207 ++++++++++++++++++++++++++--- 1 file changed, 187 insertions(+), 20 deletions(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index dcb743b536..070f6e2fa8 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -259,6 +259,183 @@ def test_callback_inits(): } +@pytest.mark.gpu +@pytest.mark.parametrize('log_to_mlflow', [True, False]) +@pytest.mark.parametrize( + 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', + [('3ba', '2ba', '7ba', 3, 4), ('1dur', '2ba', '1ep', 1, 4)]) +@patch('os.cpu_count', MagicMock(return_value=None)) +def test_huggingface_conversion_callback_interval( + tmp_path: pathlib.Path, log_to_mlflow: bool, hf_save_interval: str, + save_interval: str, max_duration: str, expected_hf_checkpoints: int, + expected_normal_checkpoints: int): + delete_transformers_cache() + + dist.initialize_dist(get_device('gpu')) + + max_seq_len = 16 + device_batch_size = 1 + dataset_size = 14 + precision_str = 'bfloat16' + precision = torch.bfloat16 + batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2)) + + checkpointer_callback = HuggingFaceCheckpointer( + save_folder=os.path.join(tmp_path, 'checkpoints'), + save_interval=hf_save_interval, + precision=precision_str, + mlflow_registered_model_name='dummy-registered-name' + if log_to_mlflow else None, + ) + + # get small version of each model + model_cfg = { + 'name': 'mpt_causal_lm', + 'init_device': 'cpu', + 'd_model': 128, + 'n_heads': 2, + 'n_layers': 2, + 'expansion_ratio': 4, + 'max_seq_len': max_seq_len, + 'vocab_size': 50368, + 'attn_config': { + 'attn_impl': 'torch', + }, + 'loss_fn': 'torch_crossentropy', + 'tie_word_embeddings': tie_word_embeddings, + } + tokenizer_name = 'EleutherAI/gpt-neox-20b' + model_cfg = om.create(model_cfg) + + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, use_auth_token=model == 'llama2') + + tiny_dataset_folder_path = os.path.join(os.getcwd(), 'test-ift-data-small') + tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl') + make_tiny_ft_dataset(path=tiny_dataset_path, size=dataset_size) + + dataloader_cfg = { + 'name': 'finetuning', + 'dataset': { + 'hf_name': tiny_dataset_folder_path, + 
'split': 'train', + 'max_seq_len': max_seq_len, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'packing_ratio': None, + 'shuffle': True, + }, + 'drop_last': False, + 'num_workers': 4, + 'pin_memory': False, + 'prefetch_factor': 2, + 'persistent_workers': False, + 'timeout': 0 + } + + dataloader_cfg = om.create(dataloader_cfg) + + tokenizer = build_tokenizer( + tokenizer_name=tokenizer_name, + tokenizer_kwargs={'model_max_length': max_seq_len}, + ) + + train_dataloader = build_finetuning_dataloader( + dataloader_cfg, + tokenizer, + device_batch_size, + ) + + original_model = COMPOSER_MODEL_REGISTRY[model_cfg['name']](model_cfg, + tokenizer) + + optimizer_config = { + 'name': 'decoupled_adamw', + 'lr': 6e-4, + 'betas': [0.9, 0.95], + 'eps': 1e-8, + 'weight_decay': 0.0, + } + optimizer_name = optimizer_config.pop('name') + optimizer = build_optimizer(original_model, optimizer_name, + optimizer_config) + + mlflow_logger_mock = MagicMock(spec=MLFlowLogger) + mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} + mlflow_logger_mock.save_model = MagicMock() + mlflow_logger_mock.register_model = MagicMock() + mlflow_logger_mock.model_registry_prefix = '' + trainer = Trainer( + model=original_model, + device='gpu', + train_dataloader=train_dataloader, + save_folder=os.path.join(tmp_path, 'checkpoints'), + save_interval=save_interval, + max_duration=max_duration, + callbacks=[checkpointer_callback], + loggers=[mlflow_logger_mock] if log_to_mlflow else [], + optimizers=optimizer, + save_latest_filename=None, + ) + trainer.fit() + + if log_to_mlflow: + assert mlflow_logger_mock.save_model.call_count == 1 + mlflow_logger_mock.save_model.assert_called_with( + flavor='transformers', + transformers_model=ANY, + path=ANY, + task='text-generation', + metadata={'task': 'llm/v1/completions'}) + assert mlflow_logger_mock.register_model.call_count == 1 + else: + assert mlflow_logger_mock.save_model.call_count == 0 + assert mlflow_logger_mock.register_model.call_count == 0 + + normal_checkpoints = [ + name for name in os.listdir(os.path.join(tmp_path, 'checkpoints')) + if name != 'huggingface' + ] + huggingface_checkpoints = [ + name for name in os.listdir( + os.path.join(tmp_path, 'checkpoints', 'huggingface')) + ] + assert len(normal_checkpoints) == expected_normal_checkpoints + assert len(huggingface_checkpoints) == expected_hf_checkpoints + + # Load the last huggingface checkpoint + loaded_model = transformers.AutoModelForCausalLM.from_pretrained( + os.path.join(tmp_path, 'checkpoints', 'huggingface', + f'ba{batches_per_epoch}'), + trust_remote_code=True, + ) + + # Check that the loaded model has the correct precision, and then set it back + # to the original for the equivalence check + assert loaded_model.config.torch_dtype == precision + loaded_model.config.torch_dtype = original_model.model.config.torch_dtype + + # Check that we have correctly set these attributes, and then set them back + # to the original for the equivalence check + assert loaded_model.config.attn_config['attn_impl'] == 'torch' + assert loaded_model.config.init_device == 'cpu' + loaded_model.config.attn_config[ + 'attn_impl'] = original_model.model.config.attn_config['attn_impl'] + loaded_model.config.init_device = original_model.model.config.init_device + + loaded_tokenizer = transformers.AutoTokenizer.from_pretrained( + os.path.join(tmp_path, 'checkpoints', 'huggingface', + f'ba{batches_per_epoch}'), + trust_remote_code=True, + ) + + check_hf_model_equivalence(trainer.state.model.module.model.to(precision), + 
loaded_model) + check_hf_tokenizer_equivalence(tokenizer, loaded_tokenizer) + + delete_transformers_cache() + + @pytest.mark.world_size(2) @pytest.mark.gpu @pytest.mark.parametrize( @@ -266,10 +443,6 @@ def test_callback_inits(): [('mpt', True), ('mpt', False), ('neo', None), ('llama2', None)], ) @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None]) -@pytest.mark.parametrize('log_to_mlflow', [True, False]) -@pytest.mark.parametrize( - 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', - [('3ba', '2ba', '7ba', 3, 4), ('1dur', '2ba', '1ep', 1, 4)]) @patch('os.cpu_count', MagicMock(return_value=None)) def test_huggingface_conversion_callback( model: str, tmp_path: pathlib.Path, tie_word_embeddings: bool, @@ -291,9 +464,7 @@ def test_huggingface_conversion_callback( save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=hf_save_interval, precision=precision_str, - mlflow_registered_model_name='dummy-registered-name' - if log_to_mlflow else None, - ) + mlflow_registered_model_name='dummy-registered-name') # get small version of each model model_cfg = None @@ -431,25 +602,21 @@ def test_huggingface_conversion_callback( save_interval=save_interval, max_duration=max_duration, callbacks=[checkpointer_callback], - loggers=[mlflow_logger_mock] if log_to_mlflow else [], + loggers=[mlflow_logger_mock], optimizers=optimizer, save_latest_filename=None, ) trainer.fit() if dist.get_global_rank() == 0: - if log_to_mlflow: - assert mlflow_logger_mock.save_model.call_count == 1 - mlflow_logger_mock.save_model.assert_called_with( - flavor='transformers', - transformers_model=ANY, - path=ANY, - task='text-generation', - metadata={'task': 'llm/v1/completions'}) - assert mlflow_logger_mock.register_model.call_count == 1 - else: - assert mlflow_logger_mock.save_model.call_count == 0 - assert mlflow_logger_mock.register_model.call_count == 0 + assert mlflow_logger_mock.save_model.call_count == 1 + mlflow_logger_mock.save_model.assert_called_with( + flavor='transformers', + transformers_model=ANY, + path=ANY, + task='text-generation', + metadata={'task': 'llm/v1/completions'}) + assert mlflow_logger_mock.register_model.call_count == 1 else: assert mlflow_logger_mock.log_model.call_count == 0 assert mlflow_logger_mock.register_model.call_count == 0 From 6b75e343845ba5f2a50be8c220e683f1ec07369c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 15 Nov 2023 20:58:48 -0800 Subject: [PATCH 03/34] fix typos --- tests/test_hf_conversion_script.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 070f6e2fa8..e69396d08b 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -302,13 +302,12 @@ def test_huggingface_conversion_callback_interval( 'attn_impl': 'torch', }, 'loss_fn': 'torch_crossentropy', - 'tie_word_embeddings': tie_word_embeddings, + 'tie_word_embeddings': True, } tokenizer_name = 'EleutherAI/gpt-neox-20b' model_cfg = om.create(model_cfg) - tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer_name, use_auth_token=model == 'llama2') + tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name) tiny_dataset_folder_path = os.path.join(os.getcwd(), 'test-ift-data-small') tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl') From f23a3f6ae14360f756dc1414ebe145b4110e7afd Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 00:05:47 -0800 
Subject: [PATCH 04/34] fix --- tests/test_hf_conversion_script.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index e69396d08b..713b13d305 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -443,11 +443,13 @@ def test_huggingface_conversion_callback_interval( ) @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None]) @patch('os.cpu_count', MagicMock(return_value=None)) -def test_huggingface_conversion_callback( - model: str, tmp_path: pathlib.Path, tie_word_embeddings: bool, - fsdp_state_dict_type: Optional[str], log_to_mlflow: bool, - hf_save_interval: str, save_interval: str, max_duration: str, - expected_hf_checkpoints: int, expected_normal_checkpoints: int): +def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, + tie_word_embeddings: bool, + fsdp_state_dict_type: Optional[str], + hf_save_interval: str, + save_interval: str, max_duration: str, + expected_hf_checkpoints: int, + expected_normal_checkpoints: int): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) From c125b3b021a85365d90334673090c3b5eda61041 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 00:23:00 +0000 Subject: [PATCH 05/34] wip --- tests/test_flash_triton_torch.py | 18 +++++++++++------ tests/test_hf_conversion_script.py | 31 +++++++++++++++--------------- tests/test_model.py | 25 ++++++------------------ 3 files changed, 34 insertions(+), 40 deletions(-) diff --git a/tests/test_flash_triton_torch.py b/tests/test_flash_triton_torch.py index 1ede36c0b5..1d1919217c 100644 --- a/tests/test_flash_triton_torch.py +++ b/tests/test_flash_triton_torch.py @@ -5,6 +5,7 @@ import torch from omegaconf import OmegaConf as om +from llmfoundry.models.layers import attention from llmfoundry.models.layers.attention import is_flash_v2_installed from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding @@ -17,8 +18,14 @@ def allclose_helper(t0: torch.Tensor, @pytest.mark.gpu -@pytest.mark.parametrize('attn_impl_0', ['flash', 'triton', 'torch']) -@pytest.mark.parametrize('attn_impl_1', ['flash', 'triton', 'torch']) +@pytest.mark.parametrize('attn_impl_0,attn_impl_1', [ + ('flash', 'flash'), + ('flash', 'triton'), + ('flash', 'torch'), + ('triton', 'triton'), + ('triton', 'torch'), + ('torch', 'torch'), +]) @pytest.mark.parametrize('clip_qkv', [True, False]) @pytest.mark.parametrize('qk_ln', [True, False]) @pytest.mark.parametrize('pos_emb_config', [{ @@ -62,11 +69,10 @@ def test_attn_impl(attn_impl_0: str, Includes testing with and without attn_clip_qkv, attn_qk_ln, alibi, and rope. 
""" - from llmfoundry.models.layers import attention alibi = pos_emb_config['alibi'] rope = pos_emb_config['rope'] if alibi and (attn_impl_0 == 'flash' or attn_impl_1 == 'flash'): - pytest.xfail('flash attn does not support alibi') + pytest.skip('flash attn does not support alibi') if rope and (pos_emb_config['rope_impl'] == 'dail') and (not is_flash_v2_installed()): @@ -81,7 +87,7 @@ def test_attn_impl(attn_impl_0: str, 'qk_ln': qk_ln, }) - n, s, f = 2, 16, cfg.d_model + n, s, f = 2, 4, cfg.d_model assert cfg.d_model % cfg.n_heads == 0 if attn_type == 'grouped_query_attention': cfg.kv_n_heads = 2 @@ -311,7 +317,7 @@ def test_grouped_attention_heads(attn_impl: str, 'kv_n_heads': kv_n_heads }) - n, s, f = 2, 16, cfg.d_model + n, s, f = 2, 4, cfg.d_model mmhsa = attention.GroupedQueryAttention(**cfg).to(device) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 713b13d305..0d4cde342d 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -263,7 +263,7 @@ def test_callback_inits(): @pytest.mark.parametrize('log_to_mlflow', [True, False]) @pytest.mark.parametrize( 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', - [('3ba', '2ba', '7ba', 3, 4), ('1dur', '2ba', '1ep', 1, 4)]) + [('3ba', '2ba', '4ba', 2, 2), ('1dur', '2ba', '1ep', 1, 2)]) @patch('os.cpu_count', MagicMock(return_value=None)) def test_huggingface_conversion_callback_interval( tmp_path: pathlib.Path, log_to_mlflow: bool, hf_save_interval: str, @@ -273,12 +273,12 @@ def test_huggingface_conversion_callback_interval( dist.initialize_dist(get_device('gpu')) - max_seq_len = 16 - device_batch_size = 1 - dataset_size = 14 + max_seq_len = 4 + device_batch_size = 2 + dataset_size = 8 precision_str = 'bfloat16' precision = torch.bfloat16 - batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2)) + batches_per_epoch = math.ceil(dataset_size / device_batch_size) checkpointer_callback = HuggingFaceCheckpointer( save_folder=os.path.join(tmp_path, 'checkpoints'), @@ -292,7 +292,7 @@ def test_huggingface_conversion_callback_interval( model_cfg = { 'name': 'mpt_causal_lm', 'init_device': 'cpu', - 'd_model': 128, + 'd_model': 64, 'n_heads': 2, 'n_layers': 2, 'expansion_ratio': 4, @@ -401,7 +401,7 @@ def test_huggingface_conversion_callback_interval( ] assert len(normal_checkpoints) == expected_normal_checkpoints assert len(huggingface_checkpoints) == expected_hf_checkpoints - + print(huggingface_checkpoints) # Load the last huggingface checkpoint loaded_model = transformers.AutoModelForCausalLM.from_pretrained( os.path.join(tmp_path, 'checkpoints', 'huggingface', @@ -428,7 +428,7 @@ def test_huggingface_conversion_callback_interval( trust_remote_code=True, ) - check_hf_model_equivalence(trainer.state.model.module.model.to(precision), + check_hf_model_equivalence(trainer.state.model.model.to(precision), loaded_model) check_hf_tokenizer_equivalence(tokenizer, loaded_tokenizer) @@ -442,14 +442,15 @@ def test_huggingface_conversion_callback_interval( [('mpt', True), ('mpt', False), ('neo', None), ('llama2', None)], ) @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None]) +@pytest.mark.parametrize( + 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', + [('3ba', '2ba', '7ba', 3, 4)]) @patch('os.cpu_count', MagicMock(return_value=None)) -def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, - tie_word_embeddings: bool, - 
fsdp_state_dict_type: Optional[str], - hf_save_interval: str, - save_interval: str, max_duration: str, - expected_hf_checkpoints: int, - expected_normal_checkpoints: int): +def test_huggingface_conversion_callback( + model: str, tmp_path: pathlib.Path, tie_word_embeddings: bool, + fsdp_state_dict_type: Optional[str], + hf_save_interval: str, save_interval: str, max_duration: str, + expected_hf_checkpoints: int, expected_normal_checkpoints: int): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) diff --git a/tests/test_model.py b/tests/test_model.py index c160c064dc..51180a6c28 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -874,11 +874,11 @@ def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, save_path = tmp_path / 'test-device-map' hf_config = MPTConfig( init_device='cpu', - d_model=128, + d_model=64, n_heads=4, n_layers=2, expansion_ratio=2, - max_seq_len=2048, + max_seq_len=4, emb_pdrop=0.1, resid_pdrop=0.2, attn_config={ @@ -914,8 +914,8 @@ def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, ) with torch.autocast('cuda', dtype=torch.bfloat16): _ = pipe( - 'The quick fox jumped over', - max_length=10, + 'The fox', + max_new_tokens=2, do_sample=True, ) @@ -1482,18 +1482,17 @@ def test_model_to(attention_impl: str, pos_emb_config: dict, hf_config = MPTConfig( init_device='cpu', - d_model=128, + d_model=64, n_heads=4, n_layers=2, expansion_ratio=2, - max_seq_len=2048, + max_seq_len=4, emb_pdrop=0.1, resid_pdrop=0.2, attn_config={ 'attn_impl': attention_impl, **pos_emb_config, }, - use_cache=True, init_config={ 'name': 'baseline_', 'init_std': 0.02, @@ -1509,11 +1508,9 @@ def test_model_to(attention_impl: str, pos_emb_config: dict, input_ids = torch.tensor([[11274, 16390, 11]]).to('cuda') attention_mask = torch.tensor([[1, 1, 1]]).bool().to('cuda') - # with get_precision_context('amp_bf16'): _ = mpt(input_ids, attention_mask=attention_mask) # move the model around using different methods - mpt = mpt.bfloat16() mpt = mpt.to('cpu') # verify the model still works @@ -1523,15 +1520,6 @@ def test_model_to(attention_impl: str, pos_emb_config: dict, _ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu')) - mpt = mpt.cuda() - mpt = mpt.bfloat16() - - # verify the model still works - if attention_impl == 'torch': - with torch.autocast('cuda', dtype=torch.bfloat16, enabled=True): - _ = mpt(input_ids, attention_mask=attention_mask) - - mpt = mpt.to('cpu') mpt = mpt.float() # verify the model still works @@ -1539,7 +1527,6 @@ def test_model_to(attention_impl: str, pos_emb_config: dict, pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'): _ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu')) - mpt = mpt.half() mpt = mpt.to(0) # move to rank0 mpt = mpt.bfloat16() From 564bfa7db4e1ba1cddab9ccace326f960d99692d Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 08:24:36 +0000 Subject: [PATCH 06/34] precommit --- tests/test_hf_conversion_script.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 0d4cde342d..f0d306f7a3 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -307,8 +307,6 @@ def test_huggingface_conversion_callback_interval( tokenizer_name = 'EleutherAI/gpt-neox-20b' model_cfg = om.create(model_cfg) - tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name) - tiny_dataset_folder_path = 
os.path.join(os.getcwd(), 'test-ift-data-small') tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl') make_tiny_ft_dataset(path=tiny_dataset_path, size=dataset_size) @@ -446,11 +444,13 @@ def test_huggingface_conversion_callback_interval( 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', [('3ba', '2ba', '7ba', 3, 4)]) @patch('os.cpu_count', MagicMock(return_value=None)) -def test_huggingface_conversion_callback( - model: str, tmp_path: pathlib.Path, tie_word_embeddings: bool, - fsdp_state_dict_type: Optional[str], - hf_save_interval: str, save_interval: str, max_duration: str, - expected_hf_checkpoints: int, expected_normal_checkpoints: int): +def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, + tie_word_embeddings: bool, + fsdp_state_dict_type: Optional[str], + hf_save_interval: str, + save_interval: str, max_duration: str, + expected_hf_checkpoints: int, + expected_normal_checkpoints: int): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) @@ -536,9 +536,6 @@ def test_huggingface_conversion_callback( 'state_dict_type': fsdp_state_dict_type, } - tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer_name, use_auth_token=model == 'llama2') - tiny_dataset_folder_path = os.path.join(os.getcwd(), 'test-ift-data-small') tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl') if dist.get_global_rank() == 0: From 275f34cd14011721e50afce986112dc564be2253 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 08:45:28 +0000 Subject: [PATCH 07/34] fix comparison tests --- tests/test_hf_v_mpt.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/tests/test_hf_v_mpt.py b/tests/test_hf_v_mpt.py index 46172faf35..bae33b088f 100644 --- a/tests/test_hf_v_mpt.py +++ b/tests/test_hf_v_mpt.py @@ -5,33 +5,20 @@ import pytest import torch +from composer.utils import reproducibility from omegaconf import OmegaConf as om from llmfoundry import COMPOSER_MODEL_REGISTRY @pytest.mark.gpu -@pytest.mark.xfail(reason='CUDA OOM expected, needs to be fixed.') +# @pytest.mark.xfail(reason='CUDA OOM expected, needs to be fixed.') @pytest.mark.parametrize('attn_impl,dropout,alibi,mask_val,no_attn_mask', [ ('flash', 0.0, False, 1, False), ('flash', 0.1, False, 1, False), ('torch', 0.0, False, 1, False), ('triton', 0.0, False, 1, False), ('triton', 0.1, False, 1, False), - pytest.param('torch', - 0.0, - True, - 1, - False, - marks=pytest.mark.xfail( - reason='hf model is not implemented with alibi')), - pytest.param('triton', - 0.1, - True, - 1, - False, - marks=pytest.mark.xfail( - reason='hf model is not implemented with alibi')), ('torch', 0.0, False, 0, False), ('triton', 0.0, False, 0, False), ('triton', 0.1, False, 0, False), @@ -58,6 +45,11 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, 'pretrained_model_name_or_path': 'gpt2', 'device': 'cpu', 'pretrained': False, + 'config_overrides': { + 'n_layer': 2, + 'n_embd': 64, + 'n_head': 8, + } }, 'tokenizer': { 'name': 'gpt2' @@ -107,6 +99,9 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, # given this, it will generate different drop idx when compared to nn.Dropout # reguradless of if rng is seeded. 
model_cfg.attn_pdrop = hf_model.model.config.attn_pdrop + model_cfg.n_layers = hf_model.model.config.n_layer + model_cfg.d_model = hf_model.model.config.n_embd + model_cfg.n_heads = hf_model.model.config.n_head # Build Model print('Initializing model...') @@ -172,8 +167,8 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, # HF keys which need to be replaced by the associated value hf_2_mosaic_key_mods = { 'model.transformer.h.': 'model.transformer.blocks.', - '.mlp.c_fc.': '.mlp.mlp_up.', - '.mlp.c_proj.': '.mlp.mlp_down.', + '.mlp.c_fc.': '.ffn.up_proj.', + '.mlp.c_proj.': '.ffn.down_proj.', '.attn.c_attn.': '.attn.Wqkv.', '.attn.c_proj.': '.attn.out_proj.', '.ln_': '.norm_', @@ -201,9 +196,11 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, model.load_state_dict(_hf_model_statedict) with torch.autocast(device_type=device, dtype=torch.float16): + reproducibility.seed_all(17) hf_model_fwd = hf_model(batch)['logits'] if kpm is not None: hf_model_fwd *= kpm + reproducibility.seed_all(17) model_fwd = model(batch).logits if kpm is not None: model_fwd *= kpm From 19135995465bb516e429d1862553b1591bf7944e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 08:52:37 +0000 Subject: [PATCH 08/34] precommit --- tests/test_huggingface_flash.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/test_huggingface_flash.py b/tests/test_huggingface_flash.py index 834488bb6a..70c08c4eb1 100644 --- a/tests/test_huggingface_flash.py +++ b/tests/test_huggingface_flash.py @@ -44,8 +44,8 @@ def test_patch_equivalence(patch_fn_name: str, explicit_mask: bool, ) device = 'cuda:0' - sequence_length = 4096 - model_dim = 4096 if '7b' in model_name else 8192 + sequence_length = 64 + model_dim = 128 if '7b' in model_name else 256 batch_size = 2 if patch_fn_name == 'torch': patch_fn = llama_attention_patch_torch @@ -64,8 +64,8 @@ def test_patch_equivalence(patch_fn_name: str, explicit_mask: bool, else: raise ValueError(f'Unknown patch_fn_name: {patch_fn_name}') - llama_config = transformers.AutoConfig.from_pretrained(model_name, - use_auth_token=True) + llama_config = transformers.AutoConfig.from_pretrained( + model_name, use_auth_token=True, hidden_size=model_dim) reproducibility.seed_all(42) attention = LlamaAttention(config=llama_config,) @@ -127,6 +127,7 @@ def test_attn_patch_integration(patch: str): 'config_overrides': { 'num_hidden_layers': 2, 'intermediate_size': 64, + 'hidden_size': 64, }, 'use_auth_token': True, 'pretrained': False, @@ -172,6 +173,7 @@ def test_flash2(model_name: str, use_flash_attention_2: bool): 'config_overrides': { 'num_hidden_layers': 2, 'intermediate_size': 64, + 'hidden_size': 64, }, 'use_auth_token': True, 'pretrained': False, @@ -191,6 +193,7 @@ def test_flash2(model_name: str, use_flash_attention_2: bool): 'config_overrides': { 'num_hidden_layers': 2, 'intermediate_size': 64, + 'hidden_size': 64, }, 'pretrained': False, 'init_device': 'cpu', From 4f21ede15cb345e8d50644509bbbb19ff0d7b526 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 09:10:56 +0000 Subject: [PATCH 09/34] attn cmp --- tests/test_flash_triton_torch.py | 41 ++++++++------------------------ 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/tests/test_flash_triton_torch.py b/tests/test_flash_triton_torch.py index 1d1919217c..2059585a35 100644 --- a/tests/test_flash_triton_torch.py +++ b/tests/test_flash_triton_torch.py @@ -19,12 +19,9 @@ def allclose_helper(t0: torch.Tensor, @pytest.mark.gpu 
@pytest.mark.parametrize('attn_impl_0,attn_impl_1', [ - ('flash', 'flash'), ('flash', 'triton'), ('flash', 'torch'), - ('triton', 'triton'), ('triton', 'torch'), - ('torch', 'torch'), ]) @pytest.mark.parametrize('clip_qkv', [True, False]) @pytest.mark.parametrize('qk_ln', [True, False]) @@ -212,7 +209,7 @@ def test_vs_mha(attn_impl: str, device: str = 'cuda'): cfg = om.create({ 'attn_impl': attn_impl, - 'd_model': 256, + 'd_model': 64, 'n_heads': 2, 'attn_pdrop': 0, 'clip_qkv': False, @@ -298,8 +295,8 @@ def gen_tca_mask(): @pytest.mark.gpu @pytest.mark.parametrize('attn_impl', ['flash', 'triton', 'torch']) -@pytest.mark.parametrize('n_heads', [32, 16, 8]) -@pytest.mark.parametrize('kv_n_heads', [8, 4, 2, 1]) +@pytest.mark.parametrize('n_heads', [16, 8]) +@pytest.mark.parametrize('kv_n_heads', [4, 2, 1]) def test_grouped_attention_heads(attn_impl: str, n_heads: int, kv_n_heads: int, @@ -338,14 +335,12 @@ def test_grouped_attention_heads(attn_impl: str, loss0.backward() -@pytest.mark.gpu -@pytest.mark.parametrize('attn_impl', ['flash', 'triton', 'torch']) -def test_grouped_query_invalid_heads(attn_impl: str, device: str = 'cuda'): +def test_grouped_query_invalid_heads(): """Check indivisble combinations of grouped_query_attention.""" from llmfoundry.models.layers import attention cfg = om.create({ - 'attn_impl': attn_impl, + 'attn_impl': 'torch', 'd_model': 256, 'n_heads': 16, 'attn_pdrop': 0, @@ -357,34 +352,18 @@ def test_grouped_query_invalid_heads(attn_impl: str, device: str = 'cuda'): expected_error = 'Each Q head should get the same number of KV heads, so n_heads must be divisible by kv_n_heads' with pytest.raises(ValueError, match=expected_error): - _ = attention.GroupedQueryAttention(**cfg).to(device) + _ = attention.GroupedQueryAttention(**cfg) - cfg = om.create({ - 'attn_impl': attn_impl, - 'd_model': 256, - 'n_heads': 16, - 'attn_pdrop': 0, - 'clip_qkv': False, - 'qk_ln': False, - 'kv_n_heads': 17 - }) + cfg.kv_n_heads = 17 expected_error = 'The number of KV heads should be less than or equal to Q heads' with pytest.raises(ValueError, match=expected_error): - _ = attention.GroupedQueryAttention(**cfg).to(device) + _ = attention.GroupedQueryAttention(**cfg) - cfg = om.create({ - 'attn_impl': attn_impl, - 'd_model': 256, - 'n_heads': 16, - 'attn_pdrop': 0, - 'clip_qkv': False, - 'qk_ln': False, - 'kv_n_heads': 0 - }) + cfg.kv_n_heads = 0 expected_error = 'kv_n_heads should be greater than zero' with pytest.raises(ValueError, match=expected_error): - _ = attention.GroupedQueryAttention(**cfg).to(device) + _ = attention.GroupedQueryAttention(**cfg) From 7ac1a1b4ffc2d7b4b0aa1b3fcea8ea62b832b789 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 09:11:16 +0000 Subject: [PATCH 10/34] lion8b --- tests/test_lion8b.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_lion8b.py b/tests/test_lion8b.py index 0c7010ce9f..7f308144da 100644 --- a/tests/test_lion8b.py +++ b/tests/test_lion8b.py @@ -78,18 +78,19 @@ def test_modifies_weights_and_momentums(N: int, D: int, dtype: torch.dtype, assert torch.std(momentum).item() > 0 -@pytest.mark.gpu @pytest.mark.parametrize('N,D', _MANY_PARAM_SHAPES) -@pytest.mark.parametrize('device,dtype', [('cpu', torch.float32), - ('cuda', torch.bfloat16), - ('cuda', torch.float16), - ('cuda', torch.float32)]) +@pytest.mark.parametrize('dtype', [torch.float32, + pytest.param(torch.bfloat16, marks=pytest.mark.gpu), + pytest.param(torch.float16, marks=pytest.mark.gpu), + 
pytest.param(torch.float32, marks=pytest.mark.gpu)]) @pytest.mark.parametrize('weight_decay', [0, .1]) @pytest.mark.parametrize('fused,use_errors', [(False, False), (True, False), (True, True)]) -def test_changes_with_zero_grads(N: int, D: int, device: str, +def test_changes_with_zero_grads(N: int, D: int, dtype: torch.dtype, weight_decay: float, fused: bool, use_errors: bool) -> None: + device = 'cuda' if torch.cuda.is_available() else 'cpu' + if (device == 'cpu') and (fused or use_errors): return @@ -121,16 +122,17 @@ def test_changes_with_zero_grads(N: int, D: int, device: str, torch.testing.assert_close(W_orig, W) # no weight modification -@pytest.mark.gpu @pytest.mark.parametrize('N,D', [(1, 8), (17, 23), (32, 32)]) -@pytest.mark.parametrize('device,dtype', [('cpu', torch.float32), - ('cuda', torch.bfloat16), - ('cuda', torch.float16), - ('cuda', torch.float32)]) +@pytest.mark.parametrize('dtype', [torch.float32, + pytest.param(torch.bfloat16, marks=pytest.mark.gpu), + pytest.param(torch.float16, marks=pytest.mark.gpu), + pytest.param(torch.float32, marks=pytest.mark.gpu)]) @pytest.mark.parametrize('fused,use_errors', [(False, False), (True, False), (True, True)]) -def test_descends(N: int, D: int, device: str, dtype: torch.dtype, fused: bool, +def test_descends(N: int, D: int, dtype: torch.dtype, fused: bool, use_errors: bool) -> None: + device = 'cuda' if torch.cuda.is_available() else 'cpu' + if (device == 'cpu') and (fused or use_errors): return torch.manual_seed(123) @@ -399,8 +401,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # type:ignore _LOCAL_STATE = fsdp.StateDictType.LOCAL_STATE_DICT -# run just this test with: -# python3 -m composer.cli.launcher -n 2 --master_port 26000 -m pytest -m gpu tests/test_lion8b.py::test_fsdp_save_load # noqa @pytest.mark.gpu @pytest.mark.world_size(2) @pytest.mark.parametrize('dtype', _FLOAT_DTYPES) From 92bdcaf2e2c9c3fe2a1e5d620824fb6a6817abfc Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 10:05:39 +0000 Subject: [PATCH 11/34] training --- scripts/data_prep/convert_dataset_hf.py | 4 ++++ tests/test_dataloader.py | 14 ++++++------- tests/test_training.py | 26 ++++++++++++++++--------- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/scripts/data_prep/convert_dataset_hf.py b/scripts/data_prep/convert_dataset_hf.py index 964b05ed09..7fa8778fd2 100644 --- a/scripts/data_prep/convert_dataset_hf.py +++ b/scripts/data_prep/convert_dataset_hf.py @@ -186,6 +186,10 @@ def __init__(self, folder_split='val_xsmall', raw_samples=3000, truncated_samples=3000) +c4constants.splits['val_xxsmall'] = DataSplitConstants(hf_split='validation', + folder_split='val_xxsmall', + raw_samples=100, + truncated_samples=100) CONSTS = {'c4': c4constants, 'the_pile': pileconstants} diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 2080ec32ec..9a382c15d8 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -253,17 +253,16 @@ def test_finetuning_dataloader(decoder_only_format: bool, allow_pad_trimming: bool, packing_ratio: Optional[Union[float, Literal['auto']]]): - # Use the datasets just built in the last test - tokenizer_name = 'gpt2' if decoder_only_format else 't5-base' - max_seq_len = 2048 if decoder_only_format else 1024 - if (decoder_only_format is False) and (packing_ratio is not None): pytest.xfail('packing_ratio only supported for decoder-only format.') + tokenizer_name = 'gpt2' if decoder_only_format else 't5-base' + max_seq_len = 2048 if decoder_only_format else 1024 + cfg = { 
'name': 'finetuning', 'dataset': { - 'hf_name': 'tatsu-lab/alpaca', + 'hf_name': 'HuggingFaceH4/databricks_dolly_15k', 'split': 'train', 'max_seq_len': max_seq_len, 'decoder_only_format': decoder_only_format, @@ -272,9 +271,9 @@ def test_finetuning_dataloader(decoder_only_format: bool, 'shuffle': True, }, 'drop_last': False, - 'num_workers': 4, + 'num_workers': 0, 'pin_memory': False, - 'prefetch_factor': 2, + 'prefetch_factor': None, 'persistent_workers': False, 'timeout': 0 } @@ -530,7 +529,6 @@ def test_malformed_data( }, 'drop_last': False, 'num_workers': 0, - # set prefetch to 2 if < torch 2, else set it to None 'prefetch_factor': None if using_torch_2() else 2, 'pin_memory': False, 'persistent_workers': False, diff --git a/tests/test_training.py b/tests/test_training.py index 214909cc28..4532fdcb74 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -26,9 +26,7 @@ def create_c4_dataset_xsmall(path: pathlib.Path) -> str: """Creates a small mocked version of the C4 dataset.""" c4_dir = os.path.join(path, f'my-copy-c4') - downloaded_split = 'val_xsmall' # very fast to convert - - # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 + downloaded_split = 'val_xxsmall' main_hf( Namespace( **{ @@ -49,7 +47,7 @@ def create_c4_dataset_xsmall(path: pathlib.Path) -> str: # copy the small downloaded_split to other c4 splits for mocking purposes mocked_splits = ['train', 'val'] for mocked_split in mocked_splits: - shutil.copytree(os.path.join(c4_dir, 'val_xsmall'), + shutil.copytree(os.path.join(c4_dir, 'val_xxsmall'), os.path.join(c4_dir, mocked_split)) assert os.path.exists(c4_dir) return c4_dir @@ -86,13 +84,16 @@ def gpt_tiny_cfg(dataset_name: str, device: str): assert isinstance(test_cfg, DictConfig) test_cfg.data_local = dataset_name - test_cfg.global_train_batch_size = 8 - test_cfg.device_eval_batch_size = 4 - test_cfg.device_train_microbatch_size = 4 + test_cfg.global_train_batch_size = 1 + test_cfg.device_eval_batch_size = 2 + test_cfg.device_train_microbatch_size = 1 test_cfg.max_duration = '4ba' test_cfg.eval_interval = '4ba' test_cfg.run_name = 'gpt-mini-integration-test' + test_cfg.model.n_layer = 2 + test_cfg.model.n_embd = 64 + if device == 'cpu': test_cfg.model.init_device = 'cpu' test_cfg.fsdp_config = None @@ -133,7 +134,14 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any, 'language_modeling' }) ]) - test_cfg.icl_subset_num_batches = 1 # -1 to evaluate on all batches + test_cfg.icl_subset_num_batches = 1 + test_cfg.eval_subset_num_batches = 2 + test_cfg.train_loader.num_workers = 0 + test_cfg.train_loader.prefetch_factor = None + test_cfg.train_loader.persistent_workers = False + test_cfg.eval_loader.num_workers = 0 + test_cfg.eval_loader.prefetch_factor = None + test_cfg.eval_loader.persistent_workers = False test_cfg.eval_gauntlet = DictConfig({ 'weighting': @@ -162,7 +170,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any, if averages is not None: test_cfg.eval_gauntlet['averages'] = averages - test_cfg.icl_seq_len = 128 + test_cfg.icl_seq_len = 16 test_cfg.max_duration = '1ba' test_cfg.eval_interval = '1ba' test_cfg.loggers = DictConfig({'inmemory': DictConfig({})}) From f8a2429d2246349fdcd9ff394a74a936a9d98313 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 10:07:46 +0000 Subject: [PATCH 12/34] precommit --- scripts/data_prep/convert_dataset_hf.py | 9 +++++---- tests/test_lion8b.py | 26 ++++++++++++++----------- 2 files 
changed, 20 insertions(+), 15 deletions(-) diff --git a/scripts/data_prep/convert_dataset_hf.py b/scripts/data_prep/convert_dataset_hf.py index 7fa8778fd2..f33f24e478 100644 --- a/scripts/data_prep/convert_dataset_hf.py +++ b/scripts/data_prep/convert_dataset_hf.py @@ -186,10 +186,11 @@ def __init__(self, folder_split='val_xsmall', raw_samples=3000, truncated_samples=3000) -c4constants.splits['val_xxsmall'] = DataSplitConstants(hf_split='validation', - folder_split='val_xxsmall', - raw_samples=100, - truncated_samples=100) +c4constants.splits['val_xxsmall'] = DataSplitConstants( + hf_split='validation', + folder_split='val_xxsmall', + raw_samples=100, + truncated_samples=100) CONSTS = {'c4': c4constants, 'the_pile': pileconstants} diff --git a/tests/test_lion8b.py b/tests/test_lion8b.py index 7f308144da..d5b284b23c 100644 --- a/tests/test_lion8b.py +++ b/tests/test_lion8b.py @@ -79,16 +79,18 @@ def test_modifies_weights_and_momentums(N: int, D: int, dtype: torch.dtype, @pytest.mark.parametrize('N,D', _MANY_PARAM_SHAPES) -@pytest.mark.parametrize('dtype', [torch.float32, - pytest.param(torch.bfloat16, marks=pytest.mark.gpu), - pytest.param(torch.float16, marks=pytest.mark.gpu), - pytest.param(torch.float32, marks=pytest.mark.gpu)]) +@pytest.mark.parametrize('dtype', [ + torch.float32, + pytest.param(torch.bfloat16, marks=pytest.mark.gpu), + pytest.param(torch.float16, marks=pytest.mark.gpu), + pytest.param(torch.float32, marks=pytest.mark.gpu) +]) @pytest.mark.parametrize('weight_decay', [0, .1]) @pytest.mark.parametrize('fused,use_errors', [(False, False), (True, False), (True, True)]) -def test_changes_with_zero_grads(N: int, D: int, - dtype: torch.dtype, weight_decay: float, - fused: bool, use_errors: bool) -> None: +def test_changes_with_zero_grads(N: int, D: int, dtype: torch.dtype, + weight_decay: float, fused: bool, + use_errors: bool) -> None: device = 'cuda' if torch.cuda.is_available() else 'cpu' if (device == 'cpu') and (fused or use_errors): @@ -123,10 +125,12 @@ def test_changes_with_zero_grads(N: int, D: int, @pytest.mark.parametrize('N,D', [(1, 8), (17, 23), (32, 32)]) -@pytest.mark.parametrize('dtype', [torch.float32, - pytest.param(torch.bfloat16, marks=pytest.mark.gpu), - pytest.param(torch.float16, marks=pytest.mark.gpu), - pytest.param(torch.float32, marks=pytest.mark.gpu)]) +@pytest.mark.parametrize('dtype', [ + torch.float32, + pytest.param(torch.bfloat16, marks=pytest.mark.gpu), + pytest.param(torch.float16, marks=pytest.mark.gpu), + pytest.param(torch.float32, marks=pytest.mark.gpu) +]) @pytest.mark.parametrize('fused,use_errors', [(False, False), (True, False), (True, True)]) def test_descends(N: int, D: int, dtype: torch.dtype, fused: bool, From d26f3b01648f888def2e0652a6e1354cfb80b62e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 23:04:16 +0000 Subject: [PATCH 13/34] remove extra cpu workflow too --- .github/workflows/pr-cpu.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index efdf8eec58..542f81857d 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -20,11 +20,7 @@ jobs: matrix: include: - name: 'cpu-latest' - container: mosaicml/pytorch:latest_cpu # mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 - markers: 'not gpu' - pytest_command: 'coverage run -m pytest' - - name: 'cpu-2.0.1' - container: mosaicml/pytorch:2.0.1_cpu-python3.10-ubuntu20.04 + container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 markers: 'not 
gpu' pytest_command: 'coverage run -m pytest' - name: 'cpu-2.1.0' From fc0d944b754302f1b935d0f47f1c320c2de96cee Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 23:05:03 +0000 Subject: [PATCH 14/34] more --- tests/test_hf_conversion_script.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index f0d306f7a3..5db9f941a4 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -442,7 +442,7 @@ def test_huggingface_conversion_callback_interval( @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None]) @pytest.mark.parametrize( 'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints', - [('3ba', '2ba', '7ba', 3, 4)]) + [('1ba', '1ba', '1ba', 1, 1)]) @patch('os.cpu_count', MagicMock(return_value=None)) def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, tie_word_embeddings: bool, @@ -457,7 +457,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, max_seq_len = 16 device_batch_size = 1 - dataset_size = 14 + dataset_size = 2 precision_str = 'bfloat16' precision = torch.bfloat16 batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2)) @@ -475,7 +475,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, model_cfg = { 'name': 'mpt_causal_lm', 'init_device': 'cpu', - 'd_model': 128, + 'd_model': 64, 'n_heads': 2, 'n_layers': 2, 'expansion_ratio': 4, @@ -553,9 +553,9 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, 'shuffle': True, }, 'drop_last': False, - 'num_workers': 4, + 'num_workers': 0, 'pin_memory': False, - 'prefetch_factor': 2, + 'prefetch_factor': None, 'persistent_workers': False, 'timeout': 0 } From b54c837d029a7649ce6fe972b88787cb065a13ea Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 23:35:20 +0000 Subject: [PATCH 15/34] rename workflow --- .github/workflows/pr-cpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 542f81857d..f57362ac82 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -19,7 +19,7 @@ jobs: strategy: matrix: include: - - name: 'cpu-latest' + - name: 'cpu-1.13.1' container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 markers: 'not gpu' pytest_command: 'coverage run -m pytest' From 6c273024f729feb80cc1351fb66f3de49a35f433 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 23:35:30 +0000 Subject: [PATCH 16/34] fix? 
--- tests/test_training.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index 4532fdcb74..8390834d1d 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -10,6 +10,7 @@ import pytest from composer.loggers import InMemoryLogger +from composer.utils import using_torch_2 from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om @@ -137,10 +138,10 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any, test_cfg.icl_subset_num_batches = 1 test_cfg.eval_subset_num_batches = 2 test_cfg.train_loader.num_workers = 0 - test_cfg.train_loader.prefetch_factor = None + test_cfg.train_loader.prefetch_factor = None if using_torch_2() else 2 test_cfg.train_loader.persistent_workers = False test_cfg.eval_loader.num_workers = 0 - test_cfg.eval_loader.prefetch_factor = None + test_cfg.eval_loader.prefetch_factor = None if using_torch_2() else 2 test_cfg.eval_loader.persistent_workers = False test_cfg.eval_gauntlet = DictConfig({ From 959a8deec7053dcf2bbf1f5e55c2818980e255f6 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 18 Nov 2023 23:57:28 +0000 Subject: [PATCH 17/34] fix --- tests/test_dataloader.py | 2 +- tests/test_packing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 9a382c15d8..4e1fd6f1f8 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -273,7 +273,7 @@ def test_finetuning_dataloader(decoder_only_format: bool, 'drop_last': False, 'num_workers': 0, 'pin_memory': False, - 'prefetch_factor': None, + 'prefetch_factor': None if using_torch_2() else 2, 'persistent_workers': False, 'timeout': 0 } diff --git a/tests/test_packing.py b/tests/test_packing.py index cbeca8b7b1..517dca944e 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -6,7 +6,7 @@ import pytest import torch -from composer.utils import dist, reproducibility +from composer.utils import dist, reproducibility, using_torch_2 from omegaconf import DictConfig from pytest import approx from torch.utils.data import DataLoader @@ -164,7 +164,7 @@ def test_packing_with_dataloader(packing_ratio: Any): # Gets copied per worker and we cannot check the waste for child processes. 
'num_workers': 0, 'pin_memory': False, - 'prefetch_factor': None, + 'prefetch_factor': None if using_torch_2() else 2, 'persistent_workers': False, 'timeout': 0, }) From b0b16370330dce9579708b6c22093aefed4a2ad4 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 00:26:54 +0000 Subject: [PATCH 18/34] fix auto packing on 1.13 --- llmfoundry/data/packing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 45322c9b2f..3fca0ade5e 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -5,6 +5,7 @@ import numpy as np import torch +from composer.utils import using_torch_2 from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase @@ -347,7 +348,7 @@ def profile_packing( dataloader_cfg.dataset.packing_ratio = None dataloader_cfg.drop_last = False dataloader_cfg.num_workers = 0 - dataloader_cfg.prefetch_factor = None + dataloader_cfg.prefetch_factor = None if using_torch_2() else 2 dataloader_cfg.persistent_workers = False # Determine the packing_ratio values we'll try From 5d975759ec60777ad1f8f2f8e5d47de63fc889a3 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 01:09:20 +0000 Subject: [PATCH 19/34] speed up packing test --- tests/test_packing.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_packing.py b/tests/test_packing.py index 517dca944e..807028dcbb 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -143,7 +143,15 @@ def test_dist_auto_packing(profile_packing: Mock): assert packing_ratio == 2 +def patched_packing_ratio(*args, **kwargs): + from llmfoundry.data.packing import auto_packing_ratio + + return auto_packing_ratio(*args, **kwargs, num_packing_ratios=4) + + @pytest.mark.parametrize('packing_ratio', ['auto', 2.0]) +@patch('llmfoundry.data.finetuning.dataloader.auto_packing_ratio', + patched_packing_ratio) def test_packing_with_dataloader(packing_ratio: Any): """Tests that packing works with a dataloader.""" reproducibility.seed_all(17) @@ -185,7 +193,7 @@ def test_packing_with_dataloader(packing_ratio: Any): padding = (1 - pack_collator.efficiency) if packing_ratio == 'auto': assert pack_collator.waste == approx(0) - assert padding == approx(0.1197916, rel=.01) + assert padding == approx(0.292019, rel=.01) else: assert pack_collator.waste == approx(0) assert padding == approx(0.873720, rel=.01) From 16e58f60394a60cbceafeaf720d9cfb5943ef6fb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 01:54:13 +0000 Subject: [PATCH 20/34] icl speedup --- tests/test_icl_datasets.py | 43 ++++++++------------------------------ tests/test_tasks.yaml | 2 +- 2 files changed, 10 insertions(+), 35 deletions(-) diff --git a/tests/test_icl_datasets.py b/tests/test_icl_datasets.py index 524aac9fd0..8da44583ad 100644 --- a/tests/test_icl_datasets.py +++ b/tests/test_icl_datasets.py @@ -20,19 +20,6 @@ def load_icl_config(conf_path: str = 'tests/test_tasks.yaml'): return test_cfg -@pytest.fixture(autouse=True, scope='function') -def tmp_dir(): - TMP_FOLDER = 'tmp_data' + str(random.randint(0, 100_000)) - dirpath = Path(TMP_FOLDER) - if dirpath.exists() and dirpath.is_dir(): - shutil.rmtree(dirpath) - os.mkdir(TMP_FOLDER) - yield TMP_FOLDER - dirpath = Path(TMP_FOLDER) - if dirpath.exists() and dirpath.is_dir(): - shutil.rmtree(dirpath) - - def run_test(dir: pathlib.Path, tokenizer: PreTrainedTokenizerBase, bos_tok: str = ''): @@ -41,7 +28,7 @@ def run_test(dir: pathlib.Path, tokenizer, 1024, 
8, - destination_dir=f'{os.getcwd()}/{dir}') + destination_dir=dir) for e in evaluators: batch = next(e.dataloader.dataloader.__iter__()) @@ -72,7 +59,7 @@ def run_test(dir: pathlib.Path, assert full_example == bos_tok + 'Question: Who was the man behind The Chipmunks?\nAnswer:' assert answer == 'David Seville' elif e.label == 'triviaqa/1-shot': - assert full_example == bos_tok + 'Question: High Willhays is the highest point of what National Park?\nAnswer: DARTMOOR\nQuestion: Who was the man behind The Chipmunks?\nAnswer:' + assert full_example == bos_tok + 'Question: Which was the only eastern bloc country to participate in the 1984 LA Olympics?\nAnswer: Rumania\nQuestion: Who was the man behind The Chipmunks?\nAnswer:' assert answer == 'David Seville' elif e.label == 'copa/0-shot': assert full_example == bos_tok + 'The man turned on the faucet, therefore the toilet filled with water' @@ -87,22 +74,10 @@ def run_test(dir: pathlib.Path, assert full_example == bos_tok + "Tom gave Ralph a lift to school so Ralph wouldn't have to walk.\nThe city councilmen refused the demonstrators a permit because the city councilmen feared violence" assert answer == ' feared violence' - -def test_icl_task_loading_gpt2_tokenizer(tmp_dir: pathlib.Path): - tokenizer = AutoTokenizer.from_pretrained('gpt2') - run_test(tmp_dir, tokenizer) - - -def test_icl_task_loading_gptj_tokenizer(tmp_dir: pathlib.Path): - tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6b') - run_test(tmp_dir, tokenizer) - - -def test_icl_task_loading_opt_tokenizer(tmp_dir: pathlib.Path): - tokenizer = AutoTokenizer.from_pretrained('facebook/opt-6.7b') - run_test(tmp_dir, tokenizer, '') - - -def test_icl_task_loading_gptneox_tokenizer(tmp_dir: pathlib.Path): - tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b') - run_test(tmp_dir, tokenizer) +@pytest.mark.parametrize('tokenizer_name,bos_token', [ + ('facebook/opt-6.7b', ''), + ('EleutherAI/gpt-neox-20b', '') +]) +def test_icl_task_tokenizer(tmp_path: pathlib.Path, tokenizer_name: str, bos_token: str): + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + run_test(tmp_path, tokenizer, bos_token) diff --git a/tests/test_tasks.yaml b/tests/test_tasks.yaml index f6e215abf7..15c4749057 100644 --- a/tests/test_tasks.yaml +++ b/tests/test_tasks.yaml @@ -18,6 +18,6 @@ icl_tasks: icl_task_type: schema - label: triviaqa - dataset_uri: scripts/eval/local_data/world_knowledge/triviaqa.jsonl # ADD YOUR OWN DATASET URI + dataset_uri: scripts/eval/local_data/world_knowledge/triviaqa_small.jsonl # ADD YOUR OWN DATASET URI num_fewshot: [0, 1] icl_task_type: question_answering From dff0fcf82993a6be32a2ac8f79d41ba383350a6e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 01:56:17 +0000 Subject: [PATCH 21/34] precommit --- tests/test_icl_datasets.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/test_icl_datasets.py b/tests/test_icl_datasets.py index 8da44583ad..4f251ed22b 100644 --- a/tests/test_icl_datasets.py +++ b/tests/test_icl_datasets.py @@ -1,11 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import os import pathlib -import random -import shutil -from pathlib import Path import pytest from omegaconf import OmegaConf as om @@ -74,10 +70,11 @@ def run_test(dir: pathlib.Path, assert full_example == bos_tok + "Tom gave Ralph a lift to school so Ralph wouldn't have to walk.\nThe city councilmen refused the demonstrators a permit because the city councilmen feared 
violence" assert answer == ' feared violence' -@pytest.mark.parametrize('tokenizer_name,bos_token', [ - ('facebook/opt-6.7b', ''), - ('EleutherAI/gpt-neox-20b', '') -]) -def test_icl_task_tokenizer(tmp_path: pathlib.Path, tokenizer_name: str, bos_token: str): + +@pytest.mark.parametrize('tokenizer_name,bos_token', + [('facebook/opt-6.7b', ''), + ('EleutherAI/gpt-neox-20b', '')]) +def test_icl_task_tokenizer(tmp_path: pathlib.Path, tokenizer_name: str, + bos_token: str): tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) run_test(tmp_path, tokenizer, bos_token) From 8cf8bdbca7a2bea82e634dc56a97975d8a8a77c5 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 02:08:12 +0000 Subject: [PATCH 22/34] precommit --- .../world_knowledge/triviaqa_small.jsonl | 32 +++++++++++++++++++ tests/test_packing.py | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 scripts/eval/local_data/world_knowledge/triviaqa_small.jsonl diff --git a/scripts/eval/local_data/world_knowledge/triviaqa_small.jsonl b/scripts/eval/local_data/world_knowledge/triviaqa_small.jsonl new file mode 100644 index 0000000000..9d7a20deec --- /dev/null +++ b/scripts/eval/local_data/world_knowledge/triviaqa_small.jsonl @@ -0,0 +1,32 @@ +{"context": "Question: Who was the man behind The Chipmunks?\nAnswer:", "answer": "David Seville", "aliases": ["David Seville"]} +{"context": "Question: What star sign is Jamie Lee Curtis?\nAnswer:", "answer": "Scorpio", "aliases": ["Scorpio", "Skorpio", "Scorpio (disambiguation)"]} +{"context": "Question: Which Lloyd Webber musical premiered in the US on 10th December 1993?\nAnswer:", "answer": "Sunset Boulevard", "aliases": ["Sunset Blvd", "West Sunset Boulevard", "Sunset Boulevard", "Sunset Bulevard", "Sunset Blvd."]} +{"context": "Question: Who was the next British Prime Minister after Arthur Balfour?\nAnswer:", "answer": "Campbell-Bannerman", "aliases": ["Sir Henry Campbell-Bannerman", "Campbell-Bannerman", "Campbell Bannerman", "Sir Henry Campbell Bannerman", "Henry Campbell Bannerman", "Henry Campbell-Bannerman"]} +{"context": "Question: Who had a 70s No 1 hit with Kiss You All Over?\nAnswer:", "answer": "Exile", "aliases": ["Internal exile", "Exiles", "Transported for life", "Exile (politics and government)", "Voluntary exile", "Sent into exile", "Exile and Banishment", "Self-exile", "Forced exile", "Exile", "Exile in Greek tragedy", "Banish", "Banishment"]} +{"context": "Question: What claimed the life of singer Kathleen Ferrier?\nAnswer:", "answer": "Cancer", "aliases": ["Cancer pathology", "Deaths by cancer", "Anti-cancer", "Cancer (disease)", "Cancerophobia", "Malignant lesion", "Cancer medication", "Malignant tumors", "Cancer signs", "Malignant neoplasm", "Invasive (cancer)", "Malignant Neoplasms", "Malignant growth", "Sporadic cancer", "Malignant cancer", "Tumour virus", "Cancer en cuirasse", "Microtumor", "Malignant neoplasms", "Malignant tumour", "Carcinophobia", "Malignacy", "Cancer patient", "Epithelial cancers", "Solid cancer", "Cancers", "Tumor medication", "Malignant neoplastic disease", "AIDS-related cancer", "Invasive cancer", "Cancer therapy", "Cancerous tumor", "Cancer", "Financial toxicity", "Cancer diagnosis", "Cancer (medicine)", "Malignant tumor", "Cancerous", "Borderline (cancer)", "Signs of cancer", "Malignancies", "Cancer aromatase"]} +{"context": "Question: Rita Coolidge sang the title song for which Bond film?\nAnswer:", "answer": "Octopussy", "aliases": ["Kamal kahn", "List of Bond girls in Octopussy", "Magda (James Bond)", "List 
of James Bond allies in Octopussy", "Vijay (James Bond)", "Bond 13", "Octopussy (character)", "Penelope Smallbone", "Octopussy", "General Orlov", "Kamal Khan", "Octopussy (film)", "List of James Bond villains in Octopussy", "Jim Fanning (James Bond)"]} +{"context": "Question: To the nearest million what is the population of Australia?\nAnswer:", "answer": "18 million", "aliases": ["18million", "18 million", "eighteen million"]} +{"context": "Question: What was the last US state to reintroduce alcohol after prohibition?\nAnswer:", "answer": "Utah", "aliases": ["Utah (State)", "Forty-Fifth State", "Sports in Utah", "Climate of Utah", "Education in Utah", "UT (state)", "Utahn", "Yutas", "Geography of Utah", "Utah", "Utah, United States", "Utah state nickname", "History of mining in Utah", "State of Utah", "Religion in Utah", "Utah (U.S. state)", "Transportation in Utah", "Beehive State", "US-UT", "Utah (state)", "Forty-fifth State", "Utahan", "Politics of Utah", "Salt Lake Seagulls", "45th State", "History of Utah (to 1847)", "The Beehive State", "Youtah", "Transport in Utah"]} +{"context": "Question: Which actress was voted Miss Greenwich Village in 1942?\nAnswer:", "answer": "Lauren Bacall", "aliases": ["Bacall", "Lauren Becal", "Lauren Bacall", "Lauren Becall", "Betty J. Perske", "Loren Bacall", "Betty Joan Perske", "Betty Perske", "Betty Joan Perski"]} +{"context": "Question: What is the Japanese share index called?\nAnswer:", "answer": "Nikkei", "aliases": ["Nikkei", "Nikkei (disambiguation)"]} +{"context": "Question: What was the name of Michael Jackson's autobiography written in 1988?\nAnswer:", "answer": "Moonwalk", "aliases": ["Walk on the Moon", "Walk on the moon", "Moonwalk (disambiguation)", "Lunar walks", "Moonwalk", "Moon Walk", "Moonwalking", "Lunar walk", "Moon walk", "Moonwalks", "Moon walks", "Lunar walking", "Moon walking"]} +{"context": "Question: In which decade did stereo records first go on sale?\nAnswer:", "answer": "1930s", "aliases": ["1930’s", "Thirties", "1930s literature", "Nineteen-thirties", "1930–1939", "1930-1939", "'30s", "1930s", "1930's", "%6030s", "1930s (decade)", "The Thirties"]} +{"context": "Question: What was golfing great Ben Hogan's famous reply when he was asked how to improve one's game?\nAnswer:", "answer": "Hit the ball closer to the hole", "aliases": ["Hit the ball closer to the hole"]} +{"context": "Question: In what year's Olympics were electric timing devices and a public-address system used for the first time?\nAnswer:", "answer": "In 1912, in Stockholm", "aliases": ["In 1912, in Stockholm"]} +{"context": "Question: Why is the site of a boxing match called a ring when it's square?\nAnswer:", "answer": "Boxing rings were originally circular", "aliases": ["Boxing rings were originally circular"]} +{"context": "Question: In the very first Boston Marathon, 15 runners competed. 
How many finished?\nAnswer:", "answer": "$85,000", "aliases": ["eighty-five thousand distance", "$85,000", "85000 distance"]} +{"context": "Question: \"How many different animal shapes are there in the \"\"Animal Crackers\"\" cookie zoo?\"\nAnswer:", "answer": "Eighteen--two bears (one walking, one seated), a bison, camel, cougar, elephant, giraffe, gorilla, hippopotamus, hyena , kangaroo, lion, monkey, rhinoceros, seal, sheep, tier, and zebra", "aliases": ["Eighteen--two bears (one walking, one seated), a bison, camel, cougar, elephant, giraffe, gorilla, hippopotamus, hyena , kangaroo, lion, monkey, rhinoceros, seal, sheep, tier, and zebra"]} +{"context": "Question: Which volcano in Tanzania is the highest mountain in Africa?\nAnswer:", "answer": "Kilimanjaro", "aliases": ["Mawensi", "Mt. Kilimanjaro", "Kibo (volcano)", "Mount killimanjaro", "Highest mountain in Africa", "Kilimanjaro Massif", "Stella Point", "Kilimandjaro", "Kilimonjaro", "Kilimanjaro", "Gilman's Point", "Killimanjaro", "Kilima-Njaro", "Kiliminjaro", "Mt Kilimanjaro", "Kilimanjaro Mountain", "Mount Kilimanjaro", "Mawenzi", "Uhuru Peak", "Kilimanjiro", "Kaiser-Wilhelm-Spitze", "Mt Kilamanjaro", "Mount Kiliminjaro", "Mount Kilimandjaro", "Mount Kilamanjaro", "Tussock Grassland (Tanzania)", "Kilamanjaro"]} +{"context": "Question: The flag of Libya is a plain rectangle of which color?\nAnswer:", "answer": "Green", "aliases": ["Greenishly", "Avacado (color)", "Green (color)", "Rgb(0, 255, 0)", "Greenishness", "The colour green", "Greenest", "List of terms associated with the color green", "The color green", "Green", "Pastel green", "(0, 255, 0)", "Green (colour)", "Greenness"]} +{"context": "Question: Of which African country is Niamey the capital?\nAnswer:", "answer": "Niger", "aliases": ["Niger Republic", "Nigerois", "Republic Of Niger", "Republic of Niger", "The Republic of Niger", "Nigerien", "Niger (country)", "République du Niger", "Republique du Niger", "ISO 3166-1:NE", "Niger", "NG-NI"]} +{"context": "Question: Who was the director of the CIA from 1976-81?\nAnswer:", "answer": "George Bush", "aliases": ["George Bush", "George bush", "Goerge Bush", "George W. 
Bush (disambiguation)", "GeorgeBush", "George Bushe", "Georgebush", "Georg bush", "G Bush", "George Bush, President", "George Bush (disambiguation)", "Bush, George", "Geroge Bush"]} +{"context": "Question: Which musical featured the song The Street Where You Live?\nAnswer:", "answer": "My Fair Lady", "aliases": ["My Fair Lady (2010 film)", "Enry Iggins", "Why Can't the English%3F", "My Fair Lady", "My Fair Lady (upcoming film)", "My Fair Lady (musical)", "My fair lady", "I'm an Ordinary Man", "My Fair Lady (2014 film)", "My Fair Lady (2012 film)", "My Fair Lady (2015 film)"]} +{"context": "Question: \"Who was the target of the failed \"\"Bomb Plot\"\" of 1944?\"\nAnswer:", "answer": "Hitler", "aliases": ["Hitlerian", "Adolph Schicklgruber", "HitlerAdolf", "Hitler's medical health", "Adolf Hitle", "Hitlar", "Adolph Hiedler", "Adolf Hiedler", "Adolph Hittler", "Day of Potsdam", "Adolpf Hitler", "Adolf Hister", "Adolf Hitlier", "Adolph Hitler's health", "Hitler's health", "Hitlers", "Aldof Hilter", "HITLER", "Hitler, Adolph", "History of Adolf Hitler", "Hitler,Adolph", "Adolph Hiter", "Adolf Hittler", "Herr Hitler", "Hitler,Adolf", "Adolf Schicklegruber", "Adolf hitler", "Adlof hitler", "Adolph Schickelgruber", "Hitler Adolf", "Hitlers medical health", "HitlerAdolph", "Adolph Schicklegruber", "Adolf Hiler", "Adolf Hitler's medical condition", "Hittler", "Adolf Schickelgruber", "Adolf Hitler", "Hitler's", "Hitler, adolf", "Nazi leader", "Hitler, Adolf", "Herr Wolf", "Adolph Hitler's medical health", "Adolph Hitler", "Adolf Hitler's health", "Adolf Schicklgruber", "AdolphHitler", "Adolf Hilter", "Health of Adolf Hitler", "Adolf Hitler's medical health", "Hitler Adolph", "AdolfHitler", "Adolf HItler", "Hitlet", "Hitler adolf", "Adoff Hitler", "Adolfus Hitler", "Hitler", "Adolph hitler"]} +{"context": "Question: Who had an 80s No 1 hit with Hold On To The Nights?\nAnswer:", "answer": "Richard Marx", "aliases": ["Richard Noel Marx", "Richard Marx"]} +{"context": "Question: Who directed the classic 30s western Stagecoach?\nAnswer:", "answer": "John Ford", "aliases": ["John Ford (1895-1973)", "Sean O'Feeney", "John Ford (film director)", "Ford, John (1895-1973)", "Argosy Pictures", "John Ford statue", "John Martin O'Feeney", "John Ford (director)", "Cavalry trilogy", "John O'Feeney", "Sean Aloysius O'Feeney", "Ford, John", "John Ford"]} +{"context": "Question: Dave Gilmore and Roger Waters were in which rock group?\nAnswer:", "answer": "Pink Floyd", "aliases": ["Grey Floyd", "Pink Floyd trivia", "The Screaming Ab Dabs", "Pink flowd", "The Meggadeaths", "The Architectural Abdabs", "PINK FLOYD", "Pink Flod", "Pink Floyd", "Pink Floyd Trivia", "The Pink Floyd", "Notable or frequent contributors to pink floyd", "The Tea Set", "Pinkfloyd", "Pi5", "Pink floid", "Pink Floyd (band)", "The T Set", "Screaming abdabs", "Notable or frequent contributors to Pink Floyd", "The Megadeaths", "Pik floyd", "The Pink Floyd Sound", "Pink floyd", "The T-Set", "The Screaming Abdabs", "Clive Metcalfe", "Meggadeaths"]} +{"context": "Question: Which highway was Revisited in a classic 60s album by Bob Dylan?\nAnswer:", "answer": "61", "aliases": ["61", "sixty-one"]} +{"context": "Question: Which was the only eastern bloc country to participate in the 1984 LA Olympics?\nAnswer:", "answer": "Rumania", "aliases": ["ISO 3166-1:RO", "Romanian state", "ROMANIA", "Roumania", "Etymology of Romania", "Romainia", "Romînia", "North Danubian region", "Carpathian Danubian space", "ROU", "România", "Romanian State", "Roumanie", "Country 
ROM", "Rromania", "Romania", "Republic of Romania", "RO (country)", "Rumänien", "Danubian-Carpathian Area", "Rumania", "Austro-Hungarian Empire (Romania)", "Rumunia"]} +{"context": "Question: Which 90s sci fi series with James Belushi was based on Bruce Wagner's comic strip of the same name?\nAnswer:", "answer": "Wild Palms", "aliases": ["Wild Palms"]} +{"context": "Question: If I Were A Rich Man Was a big hit from which stage show?\nAnswer:", "answer": "Fiddler on the Roof", "aliases": ["Fiddler on a Roof", "Fiddler on the roof", "Sprintze", "Fiddler On the Roof", "2 life", "Fiddler On The Roof", "The Fiddler on the Roof", "Fiddler on the Roof", "Fiddler on the reoof", "Anatevka"]} +{"context": "Question: Men Against the Sea and Pitcairn's Island were two sequels to what famous novel?\nAnswer:", "answer": "Mutiny On The Bounty", "aliases": ["HMS Bounty mutineers", "Mutiny on the Bounty", "Mutiny on Bounty", "Mutiny On The Bounty", "Mutiny on the Bounty (history)", "Mutiny on the bounty", "Bounty (vessel)", "Thomas Ledward"]} diff --git a/tests/test_packing.py b/tests/test_packing.py index 807028dcbb..73453b6782 100644 --- a/tests/test_packing.py +++ b/tests/test_packing.py @@ -143,7 +143,7 @@ def test_dist_auto_packing(profile_packing: Mock): assert packing_ratio == 2 -def patched_packing_ratio(*args, **kwargs): +def patched_packing_ratio(*args: Any, **kwargs: Any): from llmfoundry.data.packing import auto_packing_ratio return auto_packing_ratio(*args, **kwargs, num_packing_ratios=4) From 52b11f555f0418ac7e5849d1b632fd123f20b16b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 02:21:51 +0000 Subject: [PATCH 23/34] type --- tests/test_icl_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_icl_datasets.py b/tests/test_icl_datasets.py index 4f251ed22b..28d12df91d 100644 --- a/tests/test_icl_datasets.py +++ b/tests/test_icl_datasets.py @@ -24,7 +24,7 @@ def run_test(dir: pathlib.Path, tokenizer, 1024, 8, - destination_dir=dir) + destination_dir=str(dir)) for e in evaluators: batch = next(e.dataloader.dataloader.__iter__()) From 1a5301f878bcec4d6724a3aee3fa0727927d6e18 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 04:25:43 +0000 Subject: [PATCH 24/34] less gen --- tests/test_hf_mpt_gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_hf_mpt_gen.py b/tests/test_hf_mpt_gen.py index ea133c64fa..1df553f126 100644 --- a/tests/test_hf_mpt_gen.py +++ b/tests/test_hf_mpt_gen.py @@ -37,7 +37,7 @@ def test_init_hfhub_mpt( _ = model.generate( composer_device.tensor_to_device( mpt_tokenizer('hello', return_tensors='pt')['input_ids']), - max_new_tokens=10, + max_new_tokens=2, ) From eea448f77bec6ccf79da514a2edbce5bfce823e9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 04:35:18 +0000 Subject: [PATCH 25/34] remove verbose --- llmfoundry/models/mpt/configuration_mpt.py | 2 +- mcli/mcli-1b-max-seq-len-8k.yaml | 1 - mcli/mcli-llama2-finetune.yaml | 1 - scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml | 1 - scripts/train/yamls/finetune/1b_local_data_sft.yaml | 1 - scripts/train/yamls/finetune/7b_dolly_sft.yaml | 1 - scripts/train/yamls/finetune/mpt-30b-instruct.yaml | 1 - scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml | 1 - scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml | 1 - scripts/train/yamls/finetune/t5-small_dolly_sft.yaml | 1 - scripts/train/yamls/pretrain/gpt-neo-125m.yaml | 1 - scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml | 1 - 
scripts/train/yamls/pretrain/gpt2-small.yaml | 1 - scripts/train/yamls/pretrain/mpt-125m.yaml | 1 - scripts/train/yamls/pretrain/mpt-13b.yaml | 1 - scripts/train/yamls/pretrain/mpt-1b.yaml | 1 - scripts/train/yamls/pretrain/mpt-30b.yaml | 1 - scripts/train/yamls/pretrain/mpt-350m.yaml | 1 - scripts/train/yamls/pretrain/mpt-3b.yaml | 1 - scripts/train/yamls/pretrain/mpt-70b.yaml | 1 - scripts/train/yamls/pretrain/mpt-760m.yaml | 1 - scripts/train/yamls/pretrain/mpt-7b.yaml | 1 - scripts/train/yamls/pretrain/mpt-small-cpu.yaml | 1 - scripts/train/yamls/pretrain/opt-3b.yaml | 1 - scripts/train/yamls/pretrain/testing.yaml | 1 - 25 files changed, 1 insertion(+), 25 deletions(-) diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index c0a1e65248..f8022808bf 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -109,7 +109,7 @@ def __init__( init_device (str): The device to use for parameter initialization. logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. + verbose (int): Deprecated. embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. norm_type (str): choose type of norm to use use_cache (bool): Whether or not the model should return the last key/values attentions diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index 24af39234c..e89bc78c64 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -123,7 +123,6 @@ parameters: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 93d46f57e3..5b74d31685 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -127,7 +127,6 @@ parameters: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml index ed2e9fcac0..e5a4ce8f23 100644 --- a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml +++ b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml @@ -92,7 +92,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml index d6f72b0c8e..3ee3f9d5cd 100644 --- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml +++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml @@ -111,7 +111,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/finetune/7b_dolly_sft.yaml b/scripts/train/yamls/finetune/7b_dolly_sft.yaml index c5813235d9..ffe9fd6c10 100644 --- a/scripts/train/yamls/finetune/7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/7b_dolly_sft.yaml @@ -99,7 +99,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git 
a/scripts/train/yamls/finetune/mpt-30b-instruct.yaml b/scripts/train/yamls/finetune/mpt-30b-instruct.yaml index a4896b0b3f..fab2594847 100644 --- a/scripts/train/yamls/finetune/mpt-30b-instruct.yaml +++ b/scripts/train/yamls/finetune/mpt-30b-instruct.yaml @@ -101,7 +101,6 @@ fsdp_config: activation_cpu_offload: false limit_all_gathers: true sync_module_states: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml index 2f23d8e55a..9936575626 100644 --- a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml @@ -105,7 +105,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml b/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml index 5fdd8242f8..845b24b7e4 100644 --- a/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml +++ b/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml @@ -90,7 +90,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml index b544546239..c54f5b9db3 100644 --- a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml @@ -76,7 +76,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml index 12914e14bc..c17d0860c8 100644 --- a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml +++ b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml @@ -92,7 +92,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml index 3da239c717..3b0954db3a 100644 --- a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml +++ b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml @@ -92,7 +92,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/gpt2-small.yaml b/scripts/train/yamls/pretrain/gpt2-small.yaml index d40cff6e9e..3adec18570 100644 --- a/scripts/train/yamls/pretrain/gpt2-small.yaml +++ b/scripts/train/yamls/pretrain/gpt2-small.yaml @@ -92,7 +92,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 1d4c1d964c..1fd73c6429 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-13b.yaml b/scripts/train/yamls/pretrain/mpt-13b.yaml 
index 567942b190..f45b376c94 100644 --- a/scripts/train/yamls/pretrain/mpt-13b.yaml +++ b/scripts/train/yamls/pretrain/mpt-13b.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-1b.yaml b/scripts/train/yamls/pretrain/mpt-1b.yaml index 312a5fc368..8b9cf82ba6 100644 --- a/scripts/train/yamls/pretrain/mpt-1b.yaml +++ b/scripts/train/yamls/pretrain/mpt-1b.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-30b.yaml b/scripts/train/yamls/pretrain/mpt-30b.yaml index e4873b68a9..400aa888c4 100644 --- a/scripts/train/yamls/pretrain/mpt-30b.yaml +++ b/scripts/train/yamls/pretrain/mpt-30b.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-350m.yaml b/scripts/train/yamls/pretrain/mpt-350m.yaml index c3c00c391b..d3620140e2 100644 --- a/scripts/train/yamls/pretrain/mpt-350m.yaml +++ b/scripts/train/yamls/pretrain/mpt-350m.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-3b.yaml b/scripts/train/yamls/pretrain/mpt-3b.yaml index 7df446eede..57af3236d2 100644 --- a/scripts/train/yamls/pretrain/mpt-3b.yaml +++ b/scripts/train/yamls/pretrain/mpt-3b.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-70b.yaml b/scripts/train/yamls/pretrain/mpt-70b.yaml index 65dca2e313..ce45603d38 100644 --- a/scripts/train/yamls/pretrain/mpt-70b.yaml +++ b/scripts/train/yamls/pretrain/mpt-70b.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-760m.yaml b/scripts/train/yamls/pretrain/mpt-760m.yaml index 53a9086475..68c6edbd82 100644 --- a/scripts/train/yamls/pretrain/mpt-760m.yaml +++ b/scripts/train/yamls/pretrain/mpt-760m.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-7b.yaml b/scripts/train/yamls/pretrain/mpt-7b.yaml index d41ef009e6..eba8bf20b3 100644 --- a/scripts/train/yamls/pretrain/mpt-7b.yaml +++ b/scripts/train/yamls/pretrain/mpt-7b.yaml @@ -91,7 +91,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/mpt-small-cpu.yaml b/scripts/train/yamls/pretrain/mpt-small-cpu.yaml index cc04f11e44..c73159e3fc 100644 --- a/scripts/train/yamls/pretrain/mpt-small-cpu.yaml +++ b/scripts/train/yamls/pretrain/mpt-small-cpu.yaml @@ -93,7 +93,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging 
progress_bar: false diff --git a/scripts/train/yamls/pretrain/opt-3b.yaml b/scripts/train/yamls/pretrain/opt-3b.yaml index 4423784b54..0bb823ffac 100644 --- a/scripts/train/yamls/pretrain/opt-3b.yaml +++ b/scripts/train/yamls/pretrain/opt-3b.yaml @@ -85,7 +85,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false diff --git a/scripts/train/yamls/pretrain/testing.yaml b/scripts/train/yamls/pretrain/testing.yaml index 995461a443..d432d833c6 100644 --- a/scripts/train/yamls/pretrain/testing.yaml +++ b/scripts/train/yamls/pretrain/testing.yaml @@ -92,7 +92,6 @@ fsdp_config: activation_checkpointing_reentrant: false activation_cpu_offload: false limit_all_gathers: true - verbose: false # Logging progress_bar: false From d3d3bfef2a299a3f05d4700f5235404b7af0a10f Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 05:24:32 +0000 Subject: [PATCH 26/34] clean up test model --- tests/test_model.py | 68 ++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 51 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 51180a6c28..5e589dbd60 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import contextlib import copy -import gc import os import pathlib import warnings @@ -864,10 +863,9 @@ def test_generate(attention_impl: str, precision: str, pos_emb_config: dict, @pytest.mark.gpu @pytest.mark.parametrize('world_size', [1, 2]) -@pytest.mark.parametrize('use_cache', [False, True]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, - use_cache: bool, tie_word_embeddings: bool): + tie_word_embeddings: bool): if not torch.cuda.device_count() >= world_size: pytest.skip(f'This test requires {world_size} GPUs.') @@ -884,7 +882,7 @@ def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, attn_config={ 'attn_impl': 'torch', }, - use_cache=use_cache, + use_cache=True, tie_word_embeddings=tie_word_embeddings, ) mpt = MPTForCausalLM(hf_config) @@ -970,7 +968,6 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): 'torch', pytest.param('flash', marks=pytest.mark.gpu), pytest.param('triton', marks=pytest.mark.gpu), - pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -998,9 +995,7 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): 'factor': 1.0, }, }]) -@pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, - tie_word_embeddings: bool): +def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict): # Tests that the result is the same with or without padding when using kv caching if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -1029,7 +1024,7 @@ def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, 'name': 'baseline_', 'init_std': 0.02, }, - tie_word_embeddings=tie_word_embeddings, + tie_word_embeddings=True, ) mpt = MPTForCausalLM(hf_config) @@ -1107,7 +1102,6 @@ def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, 'torch', pytest.param('flash', marks=pytest.mark.gpu), pytest.param('triton', marks=pytest.mark.gpu), - pytest.param('torch', marks=pytest.mark.gpu), ]) 
@pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1247,7 +1241,6 @@ def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, 'torch', pytest.param('flash', marks=pytest.mark.gpu), pytest.param('triton', marks=pytest.mark.gpu), - pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1347,18 +1340,12 @@ def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, 'torch', pytest.param('flash', marks=pytest.mark.gpu), pytest.param('triton', marks=pytest.mark.gpu), - pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('generation_kwargs', [{ 'max_new_tokens': 2, - 'num_beams': 4 -}, { - 'max_new_tokens': 2, + 'num_beams': 4, 'top_k': 5, 'penalty_alpha': 0.4 -}, { - 'do_sample': True, - 'top_p': 0.95 }]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1425,7 +1412,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, with get_precision_context('amp_bf16' if composer_device.name == 'gpu' else 'fp32'): - # no padding in the input no_padding_input_ids = torch.tensor([[11274, 16390, 11]]) no_padding_input_ids = composer_device.tensor_to_device( no_padding_input_ids) @@ -1442,7 +1428,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, @pytest.mark.gpu -@pytest.mark.parametrize('attention_impl', ['torch', 'flash', 'triton']) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -1470,12 +1455,8 @@ def test_generation_kwargs_dont_crash(attn_impl: str, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_model_to(attention_impl: str, pos_emb_config: dict, - tie_word_embeddings: bool): +def test_model_to(pos_emb_config: dict, tie_word_embeddings: bool): # test that moving the model to diff devices and dtypes in diff ways does not break the model - if pos_emb_config['alibi'] and attention_impl == 'flash': - pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config[ 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip(f'dail implementation of rope requires flash attention 2.') @@ -1490,7 +1471,7 @@ def test_model_to(attention_impl: str, pos_emb_config: dict, emb_pdrop=0.1, resid_pdrop=0.2, attn_config={ - 'attn_impl': attention_impl, + 'attn_impl': 'torch', **pos_emb_config, }, init_config={ @@ -1514,8 +1495,7 @@ def test_model_to(attention_impl: str, pos_emb_config: dict, mpt = mpt.to('cpu') # verify the model still works - if attention_impl == 'torch' and not ( - pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'): + if not (pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'): with torch.autocast('cpu', dtype=torch.bfloat16, enabled=True): _ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu')) @@ -1523,8 +1503,7 @@ def test_model_to(attention_impl: str, pos_emb_config: dict, mpt = mpt.float() # verify the model still works - if attention_impl == 'torch' and not ( - pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'): + if not (pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'): _ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu')) mpt = mpt.to(0) # move to rank0 @@ -1586,16 +1565,11 @@ def test_alibi_vs_hf(): 'factor': 1.0, }, }]) -@pytest.mark.parametrize('output_attentions', [True, False]) -@pytest.mark.parametrize('output_hidden_states', [True, False]) -@pytest.mark.parametrize('tie_word_embeddings', [True, False]) def 
test_forward_with_output_attentions_and_output_hidden_states( - attn_impl: str, pos_emb_config: dict, output_attentions: bool, - output_hidden_states: bool, tie_word_embeddings: bool): - # Test that model forward with output_attentions_and_output_hidden_states + attn_impl: str, pos_emb_config: dict): if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if output_attentions and attn_impl in ['flash', 'triton']: + if attn_impl in ['flash', 'triton']: pytest.skip(f'output_attentions only implemented with torch attention.') if pos_emb_config['rope'] and pos_emb_config[ 'rope_impl'] == 'dail' and not is_flash_v2_installed(): @@ -1624,7 +1598,7 @@ def test_forward_with_output_attentions_and_output_hidden_states( 'name': 'baseline_', 'init_std': 0.02, }, - tie_word_embeddings=tie_word_embeddings, + tie_word_embeddings=True, ) mpt = MPTForCausalLM(hf_config) mpt = composer_device.module_to_device(mpt) @@ -1637,20 +1611,16 @@ def test_forward_with_output_attentions_and_output_hidden_states( attention_mask = torch.tensor([[1, 1, 1]]).bool() attention_mask = composer_device.tensor_to_device(attention_mask) - # start with passing the first three tokens through outputs = mpt( input_ids, attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=True, + output_hidden_states=True, ) - if output_attentions: - assert len(outputs.attentions) == n_layers - assert all( - attn.shape == (1, 4, 3, 3) for attn in outputs.attentions) - if output_hidden_states: - assert len(outputs.hidden_states) == n_layers + 1 + assert len(outputs.attentions) == n_layers + assert all(attn.shape == (1, 4, 3, 3) for attn in outputs.attentions) + assert len(outputs.hidden_states) == n_layers + 1 @pytest.mark.gpu @@ -1663,10 +1633,6 @@ def test_hf_init(tmp_path: pathlib.Path, if not torch.cuda.device_count() >= world_size: pytest.skip(f'This test requires {world_size} GPUs.') - torch.cuda.empty_cache() - gc.collect() #just in case - torch.cuda.synchronize() - test_cfg = get_config(conf_path='scripts/train/yamls/pretrain/testing.yaml') test_cfg.device = torch.cuda.current_device() From 870441c6fc69e38aaa715d1969896e988f72fca6 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 05:49:23 +0000 Subject: [PATCH 27/34] remove comment --- tests/test_hf_v_mpt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_hf_v_mpt.py b/tests/test_hf_v_mpt.py index bae33b088f..1319934506 100644 --- a/tests/test_hf_v_mpt.py +++ b/tests/test_hf_v_mpt.py @@ -12,7 +12,6 @@ @pytest.mark.gpu -# @pytest.mark.xfail(reason='CUDA OOM expected, needs to be fixed.') @pytest.mark.parametrize('attn_impl,dropout,alibi,mask_val,no_attn_mask', [ ('flash', 0.0, False, 1, False), ('flash', 0.1, False, 1, False), From ff25766f663b07e2517b64964539f14b84db4ce1 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 06:09:03 +0000 Subject: [PATCH 28/34] fix flash2 mistaken override --- .github/mcp/mcp_pytest.py | 5 ++++- .github/workflows/pytest-gpu.yaml | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/mcp/mcp_pytest.py b/.github/mcp/mcp_pytest.py index ba51576a2b..8bd2312d43 100644 --- a/.github/mcp/mcp_pytest.py +++ b/.github/mcp/mcp_pytest.py @@ -54,6 +54,9 @@ type=int, default=1800, help='Timeout for run (in seconds)') + parser.add_argument('--deps_group', + type=str, + help='Dependency group to install') args = parser.parse_args() name = args.name @@ -89,7 +92,7 
@@ clear_tmp_path_flag = '-o tmp_path_retention_policy=none' command += f''' - pip install --upgrade --user .[all] + pip install --upgrade --user .[{deps_group}] export COMMON_ARGS="-v --durations=20 -m '{args.pytest_markers}' {clear_tmp_path_flag}" diff --git a/.github/workflows/pytest-gpu.yaml b/.github/workflows/pytest-gpu.yaml index 45b49366c9..773fcb019c 100644 --- a/.github/workflows/pytest-gpu.yaml +++ b/.github/workflows/pytest-gpu.yaml @@ -22,6 +22,9 @@ on: required: false type: string default: 3.9 + deps-group: + require: true + type: string secrets: mcloud-api-key: required: true @@ -78,3 +81,4 @@ jobs: --pytest_markers '${{ inputs.pytest-markers }}' \ --pytest_command '${{ inputs.pytest-command }}' \ --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS} + --deps_group ${{ inputs.deps-group }} From 7a5a1f454a51288d757d76f21d58bb58f8d84728 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 06:09:54 +0000 Subject: [PATCH 29/34] fix typo --- .github/mcp/mcp_pytest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/mcp/mcp_pytest.py b/.github/mcp/mcp_pytest.py index 8bd2312d43..5f0aaa147b 100644 --- a/.github/mcp/mcp_pytest.py +++ b/.github/mcp/mcp_pytest.py @@ -92,7 +92,7 @@ clear_tmp_path_flag = '-o tmp_path_retention_policy=none' command += f''' - pip install --upgrade --user .[{deps_group}] + pip install --upgrade --user .[{args.deps_group}] export COMMON_ARGS="-v --durations=20 -m '{args.pytest_markers}' {clear_tmp_path_flag}" From edacb1662eeaf859a44ead1b965ecdc6350f3f60 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 06:15:10 +0000 Subject: [PATCH 30/34] fix --- .github/workflows/pr-gpu.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 84c46a1368..87ae173e77 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -23,14 +23,17 @@ jobs: container: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 markers: 'gpu' pytest_command: 'coverage run -m pytest' + deps_group: 'all' - name: 'gpu-2.1.0' container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 markers: 'gpu' pytest_command: 'coverage run -m pytest' + deps_group: 'all' - name: 'gpu-2.1.0-flash2' container: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest markers: 'gpu' pytest_command: 'coverage run -m pytest' + deps_group: 'all-flash2' name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: @@ -40,5 +43,6 @@ jobs: pytest-command: ${{ matrix.pytest_command }} pytest-markers: ${{ matrix.markers }} python-version: 3.9 + deps-group: ${{ matrix.deps_group }} secrets: mcloud-api-key: ${{ secrets.MCLOUD_API_KEY }} From 0a9e0f2c9bbe4488595217128795f12a060c518e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 06:30:06 +0000 Subject: [PATCH 31/34] less rope parametrization --- tests/test_rope_dail_vs_hf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_rope_dail_vs_hf.py b/tests/test_rope_dail_vs_hf.py index 598e308546..45b2ad9aa5 100644 --- a/tests/test_rope_dail_vs_hf.py +++ b/tests/test_rope_dail_vs_hf.py @@ -11,8 +11,6 @@ @pytest.mark.gpu -@pytest.mark.parametrize('clip_qkv', [True, False]) -@pytest.mark.parametrize('qk_ln', [True, False]) @pytest.mark.parametrize( 'attn_type', ['multihead_attention', 'multiquery_attention', 'grouped_query_attention']) From 7a8bc43834348b222d7ec61b36cc8ab56bb26acc Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 06:31:02 +0000 Subject: [PATCH 32/34] precommit --- 
tests/test_rope_dail_vs_hf.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/test_rope_dail_vs_hf.py b/tests/test_rope_dail_vs_hf.py index 45b2ad9aa5..70a00470f9 100644 --- a/tests/test_rope_dail_vs_hf.py +++ b/tests/test_rope_dail_vs_hf.py @@ -15,11 +15,7 @@ 'attn_type', ['multihead_attention', 'multiquery_attention', 'grouped_query_attention']) @pytest.mark.parametrize('seq_len', [1, 233, 2048]) -def test_rope_dail_vs_hf(clip_qkv: bool, - qk_ln: bool, - attn_type: str, - seq_len: int, - device: str = 'cuda'): +def test_rope_dail_vs_hf(attn_type: str, seq_len: int, device: str = 'cuda'): # compare rope rotations for the dail vs hf implementations if not is_flash_v2_installed(): pytest.skip('dail implementation of rope requires flash attention 2.') @@ -31,8 +27,8 @@ def test_rope_dail_vs_hf(clip_qkv: bool, 'd_model': 128, 'n_heads': 4, 'attn_pdrop': 0, - 'clip_qkv': clip_qkv, - 'qk_ln': qk_ln, + 'clip_qkv': True, + 'qk_ln': False, }) batch_size = 2 From 606f97529810e9bfd01daa74dce046cb1fd3e38c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 19 Nov 2023 18:53:25 +0000 Subject: [PATCH 33/34] fix --- tests/test_hf_conversion_script.py | 74 ++++-------------------------- 1 file changed, 10 insertions(+), 64 deletions(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 5db9f941a4..1b40c715de 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -5,6 +5,7 @@ import os import pathlib import sys +from typing import Callable from unittest.mock import ANY, MagicMock, patch from composer import Trainer @@ -26,6 +27,7 @@ import transformers from omegaconf import DictConfig from omegaconf import OmegaConf as om +from torch.utils.data import DataLoader from transformers import PreTrainedModel, PreTrainedTokenizerBase from llmfoundry import COMPOSER_MODEL_REGISTRY @@ -268,14 +270,14 @@ def test_callback_inits(): def test_huggingface_conversion_callback_interval( tmp_path: pathlib.Path, log_to_mlflow: bool, hf_save_interval: str, save_interval: str, max_duration: str, expected_hf_checkpoints: int, - expected_normal_checkpoints: int): + expected_normal_checkpoints: int, tiny_ft_dataloader: DataLoader, + mpt_tokenizer: PreTrainedTokenizerBase, build_tiny_mpt: Callable): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) - max_seq_len = 4 - device_batch_size = 2 - dataset_size = 8 + device_batch_size = 1 + dataset_size = 4 precision_str = 'bfloat16' precision = torch.bfloat16 batches_per_epoch = math.ceil(dataset_size / device_batch_size) @@ -288,63 +290,7 @@ def test_huggingface_conversion_callback_interval( if log_to_mlflow else None, ) - # get small version of each model - model_cfg = { - 'name': 'mpt_causal_lm', - 'init_device': 'cpu', - 'd_model': 64, - 'n_heads': 2, - 'n_layers': 2, - 'expansion_ratio': 4, - 'max_seq_len': max_seq_len, - 'vocab_size': 50368, - 'attn_config': { - 'attn_impl': 'torch', - }, - 'loss_fn': 'torch_crossentropy', - 'tie_word_embeddings': True, - } - tokenizer_name = 'EleutherAI/gpt-neox-20b' - model_cfg = om.create(model_cfg) - - tiny_dataset_folder_path = os.path.join(os.getcwd(), 'test-ift-data-small') - tiny_dataset_path = os.path.join(tiny_dataset_folder_path, 'train.jsonl') - make_tiny_ft_dataset(path=tiny_dataset_path, size=dataset_size) - - dataloader_cfg = { - 'name': 'finetuning', - 'dataset': { - 'hf_name': tiny_dataset_folder_path, - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': True, - 
'allow_pad_trimming': False, - 'packing_ratio': None, - 'shuffle': True, - }, - 'drop_last': False, - 'num_workers': 4, - 'pin_memory': False, - 'prefetch_factor': 2, - 'persistent_workers': False, - 'timeout': 0 - } - - dataloader_cfg = om.create(dataloader_cfg) - - tokenizer = build_tokenizer( - tokenizer_name=tokenizer_name, - tokenizer_kwargs={'model_max_length': max_seq_len}, - ) - - train_dataloader = build_finetuning_dataloader( - dataloader_cfg, - tokenizer, - device_batch_size, - ) - - original_model = COMPOSER_MODEL_REGISTRY[model_cfg['name']](model_cfg, - tokenizer) + original_model = build_tiny_mpt() optimizer_config = { 'name': 'decoupled_adamw', @@ -365,7 +311,7 @@ def test_huggingface_conversion_callback_interval( trainer = Trainer( model=original_model, device='gpu', - train_dataloader=train_dataloader, + train_dataloader=tiny_ft_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, max_duration=max_duration, @@ -399,7 +345,7 @@ def test_huggingface_conversion_callback_interval( ] assert len(normal_checkpoints) == expected_normal_checkpoints assert len(huggingface_checkpoints) == expected_hf_checkpoints - print(huggingface_checkpoints) + # Load the last huggingface checkpoint loaded_model = transformers.AutoModelForCausalLM.from_pretrained( os.path.join(tmp_path, 'checkpoints', 'huggingface', @@ -428,7 +374,7 @@ def test_huggingface_conversion_callback_interval( check_hf_model_equivalence(trainer.state.model.model.to(precision), loaded_model) - check_hf_tokenizer_equivalence(tokenizer, loaded_tokenizer) + check_hf_tokenizer_equivalence(mpt_tokenizer, loaded_tokenizer) delete_transformers_cache() From 3e6ab20a7f903605380b5f3ad20d9858760658c8 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 20 Nov 2023 11:53:24 -0800 Subject: [PATCH 34/34] Update .github/workflows/pytest-gpu.yaml Co-authored-by: Charles Tang --- .github/workflows/pytest-gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest-gpu.yaml b/.github/workflows/pytest-gpu.yaml index 773fcb019c..4e6699e323 100644 --- a/.github/workflows/pytest-gpu.yaml +++ b/.github/workflows/pytest-gpu.yaml @@ -80,5 +80,5 @@ jobs: --image '${{ inputs.container }}' \ --pytest_markers '${{ inputs.pytest-markers }}' \ --pytest_command '${{ inputs.pytest-command }}' \ - --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS} + --timeout ${{ inputs.mcloud-timeout }} ${REF_ARGS} \ --deps_group ${{ inputs.deps-group }}
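
The `deps-group` input introduced in PATCH 28 and wired through the test matrix in PATCH 30 lets each GPU image choose which optional-dependency extras are installed (`all` for the stock PyTorch images, `all-flash2` for the flash-attention-2 image), so the launcher's `pip install` step no longer clobbers the flash-attn 2 version pinned in that image. A minimal, illustrative sketch of that argument flow follows; the `build_install_command` helper and the `main` entry point are invented for the example and are not the repository's actual `.github/mcp/mcp_pytest.py`, which also sets up the MosaicML run, markers, and timeout.

# Illustrative sketch only: select the pip extras group from a CLI flag
# before running the GPU test suite. Assumes argparse only.
import argparse


def build_install_command(deps_group: str) -> str:
    # 'all'        -> pip install --upgrade --user .[all]
    # 'all-flash2' -> pip install --upgrade --user .[all-flash2]
    return f'pip install --upgrade --user .[{deps_group}]'


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('--deps_group', type=str, required=True,
                        help='setup.py extras group to install')
    parser.add_argument('--pytest_markers', type=str, default='gpu')
    args = parser.parse_args()

    # Build the shell command the test job would execute: install the
    # requested extras, then run pytest with the requested markers.
    command = '\n'.join([
        build_install_command(args.deps_group),
        f"python -m pytest -v -m '{args.pytest_markers}'",
    ])
    print(command)


if __name__ == '__main__':
    main()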