From 0303fdac8e0e793295e9d5117f85bdb97f7bf517 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:04:02 -0800 Subject: [PATCH 1/8] fix the tests that werent running --- tests/test_model.py | 133 ++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 80 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 3308c65fd3..c437b6fd9f 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -310,11 +310,6 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): pytest.param('flash', torch.float16, marks=pytest.mark.gpu), pytest.param('flash', torch.bfloat16, marks=pytest.mark.gpu)]) def test_determinism(attn_impl: str, precision: torch.dtype): - if not torch.cuda.is_available(): - pytest.skip( - 'This test requires CUDA to be available in order to run with bfloat16 precision.' - ) - conf_path = 'scripts/train/yamls/pretrain/testing.yaml' with open(conf_path) as f: test_cfg = om.load(f) @@ -519,10 +514,10 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): assert block.resid_ffn_dropout.p == 0.2 -@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu')]) +@pytest.mark.parametrize('attention_impl', ['torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu)]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -550,13 +545,11 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_padding(attention_impl: str, device: str, +def test_forward_with_padding(attention_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Test that different placement of padding does not affect the output. - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' - ) alibi = pos_emb_config['alibi'] if alibi and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -743,12 +736,12 @@ def test_advanced_mask_building(attention_impl: str): assert torch.equal(attn_bias, expected_attn_bias) -@pytest.mark.parametrize('attention_impl,device,precision', [ - ('torch', 'cpu', 'fp32'), - ('flash', 'gpu', 'amp_bf16'), - ('triton', 'gpu', 'amp_bf16'), - ('torch', 'gpu', 'amp_bf16'), - ('torch', 'gpu', 'fp32'), +@pytest.mark.parametrize('attention_impl,precision', [ + ('torch', 'fp32'), + pytest.param('flash', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('triton', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('torch', 'amp_bf16', marks=pytest.mark.gpu), + pytest.param('torch', 'fp32', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -777,14 +770,12 @@ def test_advanced_mask_building(attention_impl: str): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate(attention_impl: str, device: str, precision: str, +def test_generate(attention_impl: str, precision: str, pos_emb_config: dict, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Test that generate works, and produces the same output with or without # padding in the input. 
- if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' - ) if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -878,8 +869,6 @@ def test_generate(attention_impl: str, device: str, precision: str, @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int, use_cache: bool, tie_word_embeddings: bool): - if not torch.cuda.is_available(): - pytest.skip(f'This test requires CUDA to be available.') if not torch.cuda.device_count() >= world_size: pytest.skip(f'This test requires {world_size} GPUs.') @@ -978,11 +967,11 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): check_hf_model_equivalence(mpt, mpt2) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1011,14 +1000,12 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache_and_padding(attn_impl: str, device: str, +def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Tests that the result is the same with or without padding when using kv caching - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( @@ -1120,11 +1107,11 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str, rtol=1e-6) -@pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), +@pytest.mark.parametrize('attn_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1153,14 +1140,12 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, +def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Test that model forward with and without the key-value cache produces the # same output. - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' 
- ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -1265,10 +1250,10 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, @pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), + 'torch', + pytest.param('flash', 'gpu', marks=pytest.mark.gpu), + pytest.param('triton', 'gpu', marks=pytest.mark.gpu), + pytest.param('torch', 'gpu', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1297,12 +1282,10 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate_with_past_kv(attn_impl: str, device: str, +def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) + device = 'gpu' if torch.cuda.is_available() else 'cpu' + if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( @@ -1369,10 +1352,10 @@ def test_generate_with_past_kv(attn_impl: str, device: str, @pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), + 'torch', + pytest.param('flash', 'gpu', marks=pytest.mark.gpu), + pytest.param('triton', 'gpu', marks=pytest.mark.gpu), + pytest.param('torch', 'gpu', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('generation_kwargs', [{ 'max_new_tokens': 2, @@ -1412,14 +1395,12 @@ def test_generate_with_past_kv(attn_impl: str, device: str, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generation_kwargs_dont_crash(attn_impl: str, device: str, +def test_generation_kwargs_dont_crash(attn_impl: str, generation_kwargs: Dict[str, Any], pos_emb_config: dict, tie_word_embeddings: bool): - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) + device = 'gpu' if torch.cuda.is_available() else 'cpu' + if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -1499,10 +1480,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str, def test_model_to(attention_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): # test that moving the model to diff devices and dtypes in diff ways does not break the model - if not torch.cuda.is_available(): - pytest.skip( - f'This test requires CUDA to be available in order to run with {attention_impl} attention.' 
- ) if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') @@ -1598,10 +1575,10 @@ def test_alibi_vs_hf(): @pytest.mark.parametrize('attn_impl,device', [ - ('torch', 'cpu'), - ('flash', 'gpu'), - ('triton', 'gpu'), - ('torch', 'gpu'), + 'torch', + pytest.param('flash', 'gpu', marks=pytest.mark.gpu), + pytest.param('triton', 'gpu', marks=pytest.mark.gpu), + pytest.param('torch', 'gpu', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, @@ -1633,14 +1610,12 @@ def test_alibi_vs_hf(): @pytest.mark.parametrize('output_hidden_states', [True, False]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_output_attentions_and_output_hidden_states( - attn_impl: str, device: str, pos_emb_config: dict, + attn_impl: str, pos_emb_config: dict, output_attentions: bool, output_hidden_states: bool, tie_word_embeddings: bool): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + # Test that model forward with output_attentions_and_output_hidden_states - if not torch.cuda.is_available() and device == 'gpu': - pytest.skip( - f'This test requires CUDA to be available in order to run with {attn_impl} attention.' - ) if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if output_attentions and attn_impl in ['flash', 'triton']: @@ -1708,8 +1683,6 @@ def test_hf_init(tmp_path: pathlib.Path, init_device: str, world_size: int, batch_size: int = 1): - if not torch.cuda.is_available(): - pytest.skip(f'This test requires CUDA to be available.') if not torch.cuda.device_count() >= world_size: pytest.skip(f'This test requires {world_size} GPUs.') From 0167cb020ab7f3d3010c27b69cbcaec329ac0140 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:19:07 -0800 Subject: [PATCH 2/8] precommit --- tests/test_model.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index c437b6fd9f..e5644712ee 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -514,10 +514,12 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): assert block.resid_ffn_dropout.p == 0.2 -@pytest.mark.parametrize('attention_impl', ['torch', - pytest.param('flash', marks=pytest.mark.gpu), - pytest.param('triton', marks=pytest.mark.gpu), - pytest.param('torch', marks=pytest.mark.gpu)]) +@pytest.mark.parametrize('attention_impl', [ + 'torch', + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu) +]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, 'rope': False @@ -545,8 +547,8 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_padding(attention_impl: str, - pos_emb_config: dict, tie_word_embeddings: bool): +def test_forward_with_padding(attention_impl: str, pos_emb_config: dict, + tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' # Test that different placement of padding does not affect the output. 
@@ -770,8 +772,8 @@ def test_advanced_mask_building(attention_impl: str): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate(attention_impl: str, precision: str, - pos_emb_config: dict, tie_word_embeddings: bool): +def test_generate(attention_impl: str, precision: str, pos_emb_config: dict, + tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' # Test that generate works, and produces the same output with or without @@ -1000,8 +1002,7 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_forward_with_cache_and_padding(attn_impl: str, - pos_emb_config: dict, +def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' @@ -1282,8 +1283,8 @@ def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, }, }]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) -def test_generate_with_past_kv(attn_impl: str, - pos_emb_config: dict, tie_word_embeddings: bool): +def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, + tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' if pos_emb_config['alibi'] and attn_impl == 'flash': @@ -1610,9 +1611,8 @@ def test_alibi_vs_hf(): @pytest.mark.parametrize('output_hidden_states', [True, False]) @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_output_attentions_and_output_hidden_states( - attn_impl: str, pos_emb_config: dict, - output_attentions: bool, output_hidden_states: bool, - tie_word_embeddings: bool): + attn_impl: str, pos_emb_config: dict, output_attentions: bool, + output_hidden_states: bool, tie_word_embeddings: bool): device = 'gpu' if torch.cuda.is_available() else 'cpu' # Test that model forward with output_attentions_and_output_hidden_states From b64166b53a3dc87a02d4e598f135131e928ed01c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:24:55 -0800 Subject: [PATCH 3/8] fix --- tests/test_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index e5644712ee..19a89dcc07 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1250,11 +1250,11 @@ def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, ) -@pytest.mark.parametrize('attn_impl,device', [ +@pytest.mark.parametrize('attn_impl', [ 'torch', - pytest.param('flash', 'gpu', marks=pytest.mark.gpu), - pytest.param('triton', 'gpu', marks=pytest.mark.gpu), - pytest.param('torch', 'gpu', marks=pytest.mark.gpu), + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, From 21031b59cb949e9b6fa5a490ec05a92f7ba32e6c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:34:06 -0800 Subject: [PATCH 4/8] fix another --- tests/test_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 19a89dcc07..e7025b2ebe 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1352,11 +1352,11 @@ def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, hf_config.d_model) -@pytest.mark.parametrize('attn_impl,device', [ +@pytest.mark.parametrize('attn_impl', [ 'torch', - pytest.param('flash', 'gpu', 
marks=pytest.mark.gpu), - pytest.param('triton', 'gpu', marks=pytest.mark.gpu), - pytest.param('torch', 'gpu', marks=pytest.mark.gpu), + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('generation_kwargs', [{ 'max_new_tokens': 2, From 1ba798e7cc66cb356845ff037351e6c1f4a161bb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:35:19 -0800 Subject: [PATCH 5/8] another --- tests/test_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index e7025b2ebe..4ed3a385cb 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1575,11 +1575,11 @@ def test_alibi_vs_hf(): torch.testing.assert_close(alibi_bias_hf, alibi_bias_m) -@pytest.mark.parametrize('attn_impl,device', [ +@pytest.mark.parametrize('attn_impl', [ 'torch', - pytest.param('flash', 'gpu', marks=pytest.mark.gpu), - pytest.param('triton', 'gpu', marks=pytest.mark.gpu), - pytest.param('torch', 'gpu', marks=pytest.mark.gpu), + pytest.param('flash', marks=pytest.mark.gpu), + pytest.param('triton', marks=pytest.mark.gpu), + pytest.param('torch', marks=pytest.mark.gpu), ]) @pytest.mark.parametrize('pos_emb_config', [{ 'alibi': False, From 3a863ca8ddabf4d4921b2752e07bfcef4def4bd3 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 16:55:41 -0800 Subject: [PATCH 6/8] fix --- tests/test_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_model.py b/tests/test_model.py index 4ed3a385cb..64de3a3cff 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -304,6 +304,7 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): assert not torch.equal(original_params, updated_params) +@pytest.mark.gpu @pytest.mark.parametrize( 'attn_impl,precision', [('torch', torch.float16), ('torch', torch.bfloat16), From 84220de7128fdbd295855c8f577455e5227ee3d1 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 17:08:44 -0800 Subject: [PATCH 7/8] precommit --- tests/test_model.py | 63 +++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 40 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 64de3a3cff..c38db7c9f7 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -16,7 +16,7 @@ from composer.core.precision import Precision, get_precision_context from composer.optim import DecoupledAdamW from composer.trainer.dist_strategy import prepare_fsdp_module -from composer.utils import dist, get_device, reproducibility +from composer.utils import dist, get_device from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, @@ -550,20 +550,18 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_padding(attention_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Test that different placement of padding does not affect the output. 
alibi = pos_emb_config['alibi'] if alibi and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') rope = pos_emb_config['rope'] - if rope and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if rope and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -775,21 +773,19 @@ def test_advanced_mask_building(attention_impl: str): @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate(attention_impl: str, precision: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Test that generate works, and produces the same output with or without # padding in the input. if pos_emb_config['alibi'] and attention_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') if attention_impl == 'torch' and precision == 'amp_bf16' and tie_word_embeddings == False: pytest.skip(f'This test configuration has precision / sampling issues.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1005,17 +1001,15 @@ def test_save_from_pretrained(tmp_path: pathlib.Path): @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Tests that the result is the same with or without padding when using kv caching if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1144,19 +1138,17 @@ def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict, @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Test that model forward with and without the key-value cache produces the # same output. 
if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1286,16 +1278,14 @@ def test_forward_with_cache(attn_impl: str, pos_emb_config: dict, @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) + composer_device = get_device(None) hf_config = MPTConfig( init_device='cpu', @@ -1401,18 +1391,15 @@ def test_generation_kwargs_dont_crash(attn_impl: str, generation_kwargs: Dict[str, Any], pos_emb_config: dict, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - composer_device = get_device(device) - if device == 'gpu': # Switch deteminism off - torch.use_deterministic_algorithms(False) + composer_device = get_device(None) + hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -1446,8 +1433,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, _ = mpt.generate(input_ids=no_padding_input_ids, attention_mask=no_padding_attention_mask, **generation_kwargs) - if device == 'gpu': # Switch deteminism back on - reproducibility.configure_deterministic_mode() @pytest.mark.gpu @@ -1614,19 +1599,17 @@ def test_alibi_vs_hf(): def test_forward_with_output_attentions_and_output_hidden_states( attn_impl: str, pos_emb_config: dict, output_attentions: bool, output_hidden_states: bool, tie_word_embeddings: bool): - device = 'gpu' if torch.cuda.is_available() else 'cpu' - # Test that model forward with output_attentions_and_output_hidden_states if pos_emb_config['alibi'] and attn_impl == 'flash': pytest.skip(f'alibi only implemented with torch and triton attention.') if output_attentions and attn_impl in ['flash', 'triton']: pytest.skip(f'output_attentions only implemented with torch attention.') - if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and ( - device != 'gpu' or not is_flash_v2_installed()): + if pos_emb_config['rope'] and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): pytest.skip( f'dail implementation of rope requires gpu and flash attention 2.') - 
composer_device = get_device(device) + composer_device = get_device(None) n_layers = 2 From ae719553116a80077e5d84a94ea780e94853b9cf Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 17 Nov 2023 22:39:49 -0800 Subject: [PATCH 8/8] fix --- tests/test_model.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_model.py b/tests/test_model.py index c38db7c9f7..c160c064dc 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -16,7 +16,7 @@ from composer.core.precision import Precision, get_precision_context from composer.optim import DecoupledAdamW from composer.trainer.dist_strategy import prepare_fsdp_module -from composer.utils import dist, get_device +from composer.utils import dist, get_device, reproducibility from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedModel, @@ -1400,6 +1400,9 @@ def test_generation_kwargs_dont_crash(attn_impl: str, f'dail implementation of rope requires gpu and flash attention 2.') composer_device = get_device(None) + if composer_device.name == 'gpu': + torch.use_deterministic_algorithms(False) + hf_config = MPTConfig( init_device='cpu', d_model=128, @@ -1434,6 +1437,9 @@ def test_generation_kwargs_dont_crash(attn_impl: str, attention_mask=no_padding_attention_mask, **generation_kwargs) + if composer_device.name == 'gpu': + reproducibility.configure_deterministic_mode() + @pytest.mark.gpu @pytest.mark.parametrize('attention_impl', ['torch', 'flash', 'triton'])
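
Note on the resulting pattern (not part of the patches above): across the series, the runtime "if not torch.cuda.is_available(): pytest.skip(...)" guards and the explicit device parametrization are replaced by pytest.param entries carrying pytest.mark.gpu, plus composer.utils.get_device(None), which resolves to the GPU device when CUDA is available and to CPU otherwise. A minimal, self-contained sketch of that pattern follows; the test name is illustrative, and it assumes a "gpu" marker is registered in the project's pytest configuration and that CI selects it with "pytest -m gpu" on GPU machines and deselects it with "pytest -m 'not gpu'" elsewhere.

import pytest
import torch
from composer.utils import get_device


@pytest.mark.parametrize('attn_impl', [
    'torch',  # plain entry: collected everywhere, including CPU-only jobs
    pytest.param('flash', marks=pytest.mark.gpu),   # GPU-only variants carry the marker
    pytest.param('triton', marks=pytest.mark.gpu),
    pytest.param('torch', marks=pytest.mark.gpu),
])
def test_attn_impl_device_selection(attn_impl: str):
    # get_device(None) picks the GPU device when CUDA is available, else CPU,
    # so the test body no longer needs a separate `device` argument.
    composer_device = get_device(None)

    # Move a tensor onto the selected device with composer's Device helper.
    x = composer_device.tensor_to_device(torch.randn(2, 4))
    assert x.shape == (2, 4)

This also shows why the bare CUDA-availability skips could be removed: on a CPU-only runner the GPU-marked parametrizations are deselected by the marker expression rather than skipped inside the test body, while the unmarked 'torch' case still runs on CPU.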