From 0303fdac8e0e793295e9d5117f85bdb97f7bf517 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:04:02 -0800 Subject: [PATCH 1/8] fix the tests that werent running --- tests/test_model.py | 133 ++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 80 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 3308c65fd3..c437b6fd9f 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -310,11 +310,6 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): pytest.param('flash', torch.float16, marks=pytest.mark.gpu), pytest.param('flash', torch.bfloat16, marks=pytest.mark.gpu)]) def test_determinism(attn_impl: str, precision: torch.dtype): - if not torch.cuda.is_available(): - pytest.skip( - 'This test requires CUDA to be available in order to run with bfloat16 precision.' - ) - conf_path = 'scripts/train/yamls/pretrain/testing.yaml' with open(conf_path) as f: test_cfg = om.load(f) @@ -519,10 +514,10 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): assert block.resid_ffn_dropout.p == 0.2 -@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu')]) +@pytest.mark.parametrize('attention_impl', ['torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu)]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -550,13 +545,11 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_padding(attention_impl: str, device: str, +def test_forward_with_padding(attention_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Test that different placement of padding does not affect the output. - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' - ) alibi = pos_emb_config['alibi'] if alibi and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -743,12 +736,12 @@ def test_advanced_mask_building(attention_impl: str): assert torch.equal(attn_bias, expected_attn_bias) -@pytest.mark.parametrize('attention_impl,device,precision', [ - ('torch', 'cpu', 'fp32'), - ('flash', 'gpu', 'amp_bf16'), - ('triton', 'gpu', 'amp_bf16'), - ('torch', 'gpu', 'amp_bf16'), - ('torch', 'gpu', 'fp32'), +@pytest.mark.parametrize('attention_impl,precision', [ + ('torch', 'fp32'), + pytest.param('flash', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('triton', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('torch', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('torch', 'fp32', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -777,14 +770,12 @@ def test_advanced_mask_building(attention_impl: str): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate(attention_impl: str, device: str, precision: str, +def test_generate(attention_impl: str, precision: str, pos_emb_config: dict, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Test that generate works, and produces the same output with or without # padding in the input. 
- if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' - ) if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -878,8 +869,6 @@ def test_generate(attention_impl: str, device: str, precision: str, @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, use_cache: bool, tie_word_embeddings: bool): - if not torch.cuda.is_available(): - pytest.skip(f'This test requires CUDA to be available.') if not torch.cuda.device_count() >= world_size: pytest.skip(f'This test requires {world_size} GPUs.') @@ -978,11 +967,11 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): check_hf_model_equivalence(mpt, mpt2) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1011,14 +1000,12 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache_and_padding(attn_impl: str, device: str, +def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Tests that the result is the same with or without padding when using kv caching - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( @@ -1120,11 +1107,11 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str, rtol=1e-6) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1153,14 +1140,12 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, +def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Test that model forward with and without the key-value cache produces the # same output. - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' 
- ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -1265,10 +1250,10 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, @pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), + 'torch', + pytest.param('flash', 'gpu', marks=pytest.mark.gpu), + pytest.param('triton', 'gpu', marks=pytest.mark.gpu), + pytest.param('torch', 'gpu', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1297,12 +1282,10 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate_with_past_kv(attn_impl: str, device: str, +def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) + device = 'gpu' if torch.cuda.is_available() else 'cpu' + if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( @@ -1369,10 +1352,10 @@ def test_generate_with_past_kv(attn_impl: str, device: str, @pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), + 'torch', + pytest.param('flash', 'gpu', marks=pytest.mark.gpu), + pytest.param('triton', 'gpu', marks=pytest.mark.gpu), + pytest.param('torch', 'gpu', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('generation_kwargs', [{ 'max_new_tokens': 2, @@ -1412,14 +1395,12 @@ def test_generate_with_past_kv(attn_impl: str, device: str, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generation_kwargs_dont_crash(attn_impl: str, device: str, +def test_generation_kwargs_dont_crash(attn_impl: str, generation_kwargs: Dict[str, Any], pos_emb_config: dict, tie_word_embeddings: bool): - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) + device = 'gpu' if torch.cuda.is_available() else 'cpu' + if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -1499,10 +1480,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str, def test_model_to(attention_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): # test that moving the model to diff devices and dtypes in diff ways does not break the model - if not torch.cuda.is_available(): - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' 
- ) if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -1598,10 +1575,10 @@ def test_alibi_vs_hf(): @pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), + 'torch', + pytest.param('flash', 'gpu', marks=pytest.mark.gpu), + pytest.param('triton', 'gpu', marks=pytest.mark.gpu), + pytest.param('torch', 'gpu', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1633,14 +1610,12 @@ def test_alibi_vs_hf(): @pytest.mark.parametrize('output_hidden_states', [True, False]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_output_attentions_and_output_hidden_states( - attn_impl: str, device: str, pos_emb_config: dict, + attn_impl: str, pos_emb_config: dict, output_attentions: bool, output_hidden_states: bool, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Test that model forward with output_attentions_and_output_hidden_states - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if output_attentions and attn_impl in ['flash', 'triton']: @@ -1708,8 +1683,6 @@ def test_hf_init(tmp_path: pathlib.Path, init_device: str, world_size: int, batch_size: int = 1): - if not torch.cuda.is_available(): - pytest.skip(f'This test requires CUDA to be available.') if not torch.cuda.device_count() >= world_size: pytest.skip(f'This test requires {world_size} GPUs.') From 0167cb020ab7f3d3010c27b69cbcaec329ac0140 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:19:07 -0800 Subject: [PATCH 2/8] precommit --- tests/test_model.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index c437b6fd9f..e5644712ee 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -514,10 +514,12 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): assert block.resid_ffn_dropout.p == 0.2 -@pytest.mark.parametrize('attention_impl', ['torch', - pytest.param('flash', marks=pytest.mark.gpu), - pytest.param('triton', marks=pytest.mark.gpu), - pytest.param('torch', marks=pytest.mark.gpu)]) +@pytest.mark.parametrize('attention_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu) +]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -545,8 +547,8 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_padding(attention_impl: str, - pos_emb_config: dict, tie_word_embeddings: bool): +def test_forward_with_padding(attention_impl: str, pos_emb_config: dict, + tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' # Test that different placement of padding does not affect the output. 
@@ -770,8 +772,8 @@ def test_advanced_mask_building(attention_impl: str): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate(attention_impl: str, precision: str, - pos_emb_config: dict, tie_word_embeddings: bool): +def test_generate(attention_impl: str, precision: str, pos_emb_config: dict, + tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' # Test that generate works, and produces the same output with or without @@ -1000,8 +1002,7 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache_and_padding(attn_impl: str, - pos_emb_config: dict, +def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' @@ -1282,8 +1283,8 @@ def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate_with_past_kv(attn_impl: str, - pos_emb_config: dict, tie_word_embeddings: bool): +def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, + tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' if pos_emb_config['alibi'] and attn_impl == 'flash': @@ -1610,9 +1611,8 @@ def test_alibi_vs_hf(): @pytest.mark.parametrize('output_hidden_states', [True, False]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_output_attentions_and_output_hidden_states( - attn_impl: str, pos_emb_config: dict, - output_attentions: bool, output_hidden_states: bool, - tie_word_embeddings: bool): + attn_impl: str, pos_emb_config: dict, output_attentions: bool, + output_hidden_states: bool, tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' # Test that model forward with output_attentions_and_output_hidden_states From b64166b53a3dc87a02d4e598f135131e928ed01c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:24:55 -0800 Subject: [PATCH 3/8] fix --- tests/test_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index e5644712ee..19a89dcc07 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1250,11 +1250,11 @@ def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, ) -@pytest.mark.parametrize('attn_impl,device', [ +@pytest.mark.parametrize('attn_impl', [ 'torch', - pytest.param('flash', 'gpu', marks=pytest.mark.gpu), - pytest.param('triton', 'gpu', marks=pytest.mark.gpu), - pytest.param('torch', 'gpu', marks=pytest.mark.gpu), + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, From 21031b59cb949e9b6fa5a490ec05a92f7ba32e6c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:34:06 -0800 Subject: [PATCH 4/8] fix another --- tests/test_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 19a89dcc07..e7025b2ebe 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1352,11 +1352,11 @@ def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, hf_config.d_model) -@pytest.mark.parametrize('attn_impl,device', [ +@pytest.mark.parametrize('attn_impl', [ 'torch', - pytest.param('flash', 'gpu', 
marks=pytest.mark.gpu), - pytest.param('triton', 'gpu', marks=pytest.mark.gpu), - pytest.param('torch', 'gpu', marks=pytest.mark.gpu), + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('generation_kwargs', [{ 'max_new_tokens': 2, From 1ba798e7cc66cb356845ff037351e6c1f4a161bb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:35:19 -0800 Subject: [PATCH 5/8] another --- tests/test_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index e7025b2ebe..4ed3a385cb 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1575,11 +1575,11 @@ def test_alibi_vs_hf(): torch.testing.assert_close(alibi_bias_hf, alibi_bias_m) -@pytest.mark.parametrize('attn_impl,device', [ +@pytest.mark.parametrize('attn_impl', [ 'torch', - pytest.param('flash', 'gpu', marks=pytest.mark.gpu), - pytest.param('triton', 'gpu', marks=pytest.mark.gpu), - pytest.param('torch', 'gpu', marks=pytest.mark.gpu), + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, From 3a863ca8ddabf4d4921b2752e07bfcef4def4bd3 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:55:41 -0800 Subject: [PATCH 6/8] fix --- tests/test_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_model.py b/tests/test_model.py index 4ed3a385cb..64de3a3cff 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -304,6 +304,7 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): assert not torch.equal(original_params, updated_params) +@pytest.mark.gpu @pytest.mark.parametrize( 'attn_impl,precision', [('torch', torch.float16), ('torch', torch.bfloat16), From 84220de7128fdbd295855c8f577455e5227ee3d1 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 17:08:44 -0800 Subject: [PATCH 7/8] precommit --- tests/test_model.py | 63 +++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 40 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 64de3a3cff..c38db7c9f7 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -16,7 +16,7 @@ from composer.core.precision import Precision, get_precision_context from composer.optim import DecoupledAdamW from composer.trainer.dist_strategy import prepare_fsdp_module -from composer.utils import dist, get_device, reproducibility +from composer.utils import dist, get_device from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, @@ -550,20 +550,18 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_padding(attention_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Test that different placement of padding does not affect the output. 
alibi = pos_emb_config['alibi'] if alibi and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') rope = pos_emb_config['rope'] - if rope and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if rope and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -775,21 +773,19 @@ def test_advanced_mask_building(attention_impl: str): @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate(attention_impl: str, precision: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Test that generate works, and produces the same output with or without # padding in the input. if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') if attention_impl == 'torch' and precision == 'amp_bf16' and tie_word_embeddings == False: pytest.skip(f'This test configuration has precision / sampling issues.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1005,17 +1001,15 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Tests that the result is the same with or without padding when using kv caching if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1144,19 +1138,17 @@ def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Test that model forward with and without the key-value cache produces the # same output. 
if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1286,16 +1278,14 @@ def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1401,18 +1391,15 @@ def test_generation_kwargs_dont_crash(attn_impl: str, generation_kwargs: Dict[str, Any], pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) - if device == 'gpu': # Switch deteminism off - torch.use_deterministic_algorithms(False) + composer_device = get_device(None) + hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -1446,8 +1433,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, _ = mpt.generate(input_ids=no_padding_input_ids, attention_mask=no_padding_attention_mask, **generation_kwargs) - if device == 'gpu': # Switch deteminism back on - reproducibility.configure_deterministic_mode() @pytest.mark.gpu @@ -1614,19 +1599,17 @@ def test_alibi_vs_hf(): def test_forward_with_output_attentions_and_output_hidden_states( attn_impl: str, pos_emb_config: dict, output_attentions: bool, output_hidden_states: bool, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Test that model forward with output_attentions_and_output_hidden_states if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if output_attentions and attn_impl in ['flash', 'triton']: pytest.skip(f'output_attentions only implemented with torch attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - 
composer_device = get_device(device) + composer_device = get_device(None) n_layers = 2 From ae719553116a80077e5d84a94ea780e94853b9cf Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 22:39:49 -0800 Subject: [PATCH 8/8] fix --- tests/test_model.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_model.py b/tests/test_model.py index c38db7c9f7..c160c064dc 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -16,7 +16,7 @@ from composer.core.precision import Precision, get_precision_context from composer.optim import DecoupledAdamW from composer.trainer.dist_strategy import prepare_fsdp_module -from composer.utils import dist, get_device +from composer.utils import dist, get_device, reproducibility from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, @@ -1400,6 +1400,9 @@ def test_generation_kwargs_dont_crash(attn_impl: str, f'dail implementation of rope requires gpu and flash attention 2.') composer_device = get_device(None) + if composer_device.name == 'gpu': + torch.use_deterministic_algorithms(False) + hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -1434,6 +1437,9 @@ def test_generation_kwargs_dont_crash(attn_impl: str, attention_mask=no_padding_attention_mask, **generation_kwargs) + if composer_device.name == 'gpu': + reproducibility.configure_deterministic_mode() + @pytest.mark.gpu @pytest.mark.parametrize('attention_impl', ['torch', 'flash', 'triton'])
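
Note on the resulting pattern (not part of the patches above): across the series, the runtime "if not torch.cuda.is_available(): pytest.skip(...)" guards and the explicit device parametrization are replaced by pytest.param entries carrying pytest.mark.gpu, plus composer.utils.get_device(None), which resolves to the GPU device when CUDA is available and to CPU otherwise. A minimal, self-contained sketch of that pattern follows; the test name is illustrative, and it assumes a "gpu" marker is registered in the project's pytest configuration and that CI selects it with "pytest -m gpu" on GPU machines and deselects it with "pytest -m 'not gpu'" elsewhere.

import pytest
import torch
from composer.utils import get_device


@pytest.mark.parametrize('attn_impl', [
    'torch',  # plain entry: collected everywhere, including CPU-only jobs
    pytest.param('flash', marks=pytest.mark.gpu),   # GPU-only variants carry the marker
    pytest.param('triton', marks=pytest.mark.gpu),
    pytest.param('torch', marks=pytest.mark.gpu),
])
def test_attn_impl_device_selection(attn_impl: str):
    # get_device(None) picks the GPU device when CUDA is available, else CPU,
    # so the test body no longer needs a separate `device` argument.
    composer_device = get_device(None)

    # Move a tensor onto the selected device with composer's Device helper.
    x = composer_device.tensor_to_device(torch.randn(2, 4))
    assert x.shape == (2, 4)

This also shows why the bare CUDA-availability skips could be removed: on a CPU-only runner the GPU-marked parametrizations are deselected by the marker expression rather than skipped inside the test body, while the unmarked 'torch' case still runs on CPU.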