diff --git a/tests/test_model.py b/tests/test_model.py
index 51180a6c28..5e589dbd60 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import contextlib
 import copy
-import gc
 import os
 import pathlib
 import warnings
@@ -864,10 +863,9 @@ def test_generate(attention_impl: str, precision: str, pos_emb_config: dict,
 
 @pytest.mark.gpu
 @pytest.mark.parametrize('world_size', [1, 2])
-@pytest.mark.parametrize('use_cache', [False, True])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
 def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
-                                  use_cache: bool, tie_word_embeddings: bool):
+                                  tie_word_embeddings: bool):
     if not torch.cuda.device_count() >= world_size:
         pytest.skip(f'This test requires {world_size} GPUs.')
 
@@ -884,7 +882,7 @@ def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
         attn_config={
             'attn_impl': 'torch',
         },
-        use_cache=use_cache,
+        use_cache=True,
         tie_word_embeddings=tie_word_embeddings,
     )
     mpt = MPTForCausalLM(hf_config)
@@ -970,7 +968,6 @@ def test_save_from_pretrained(tmp_path: pathlib.Path):
     'torch',
     pytest.param('flash', marks=pytest.mark.gpu),
     pytest.param('triton', marks=pytest.mark.gpu),
-    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
     'rope': False
 }, {
     'alibi': True,
     'rope': False
 }, {
     'alibi': False,
     'rope': True,
     'rope_theta': 10000,
     'rope_impl': 'dail',
     'rope_dail_config': {
         'type': 'original',
         'pos_idx_in_fp32': True,
         'xpos_scale_base': 512,
     },
 }, {
     'alibi': False,
     'rope': True,
     'rope_theta': 10000,
     'rope_impl': 'hf',
     'rope_hf_config': {
         'type': 'no_scaling',
@@ -998,9 +995,7 @@
         'factor': 1.0,
     },
 }])
-@pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict,
-                                        tie_word_embeddings: bool):
+def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict):
     # Tests that the result is the same with or without padding when using kv caching
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
@@ -1029,7 +1024,7 @@ def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict,
             'name': 'baseline_',
             'init_std': 0.02,
         },
-        tie_word_embeddings=tie_word_embeddings,
+        tie_word_embeddings=True,
     )
     mpt = MPTForCausalLM(hf_config)
@@ -1107,7 +1102,6 @@ def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict,
     'torch',
     pytest.param('flash', marks=pytest.mark.gpu),
     pytest.param('triton', marks=pytest.mark.gpu),
-    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1247,7 +1241,6 @@ def test_forward_with_cache(attn_impl: str, pos_emb_config: dict,
     'torch',
     pytest.param('flash', marks=pytest.mark.gpu),
     pytest.param('triton', marks=pytest.mark.gpu),
-    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1347,18 +1340,12 @@ def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict,
     'torch',
     pytest.param('flash', marks=pytest.mark.gpu),
     pytest.param('triton', marks=pytest.mark.gpu),
-    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('generation_kwargs', [{
     'max_new_tokens': 2,
-    'num_beams': 4
-}, {
-    'max_new_tokens': 2,
+    'num_beams': 4,
     'top_k': 5,
     'penalty_alpha': 0.4
-}, {
-    'do_sample': True,
-    'top_p': 0.95
 }])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1425,7 +1412,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str,
 
     with get_precision_context('amp_bf16' if composer_device.name ==
                                'gpu' else 'fp32'):
-        # no padding in the input
         no_padding_input_ids = torch.tensor([[11274, 16390, 11]])
         no_padding_input_ids = composer_device.tensor_to_device(
             no_padding_input_ids)
@@ -1442,7 +1428,6 @@
 
 
 @pytest.mark.gpu
-@pytest.mark.parametrize('attention_impl', ['torch', 'flash', 'triton'])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
     'rope': False
 }, {
     'alibi': True,
     'rope': False
 }, {
     'alibi': False,
     'rope': True,
     'rope_theta': 10000,
     'rope_impl': 'dail',
     'rope_dail_config': {
         'type': 'original',
         'pos_idx_in_fp32': True,
         'xpos_scale_base': 512,
     },
 }, {
     'alibi': False,
     'rope': True,
     'rope_theta': 10000,
     'rope_impl': 'hf',
     'rope_hf_config': {
         'type': 'no_scaling',
         'factor': 1.0,
@@ -1470,12 +1455,8 @@
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_model_to(attention_impl: str, pos_emb_config: dict,
-                  tie_word_embeddings: bool):
+def test_model_to(pos_emb_config: dict, tie_word_embeddings: bool):
     # test that moving the model to diff devices and dtypes in diff ways does not break the model
-    if pos_emb_config['alibi'] and attention_impl == 'flash':
-        pytest.skip(f'alibi only implemented with torch and triton attention.')
-
     if pos_emb_config['rope'] and pos_emb_config[
             'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(f'dail implementation of rope requires flash attention 2.')
@@ -1490,7 +1471,7 @@ def test_model_to(attention_impl: str, pos_emb_config: dict,
         emb_pdrop=0.1,
         resid_pdrop=0.2,
         attn_config={
-            'attn_impl': attention_impl,
+            'attn_impl': 'torch',
             **pos_emb_config,
         },
         init_config={
@@ -1514,8 +1495,7 @@
     mpt = mpt.to('cpu')
 
     # verify the model still works
-    if attention_impl == 'torch' and not (
-            pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'):
+    if not (pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'):
         with torch.autocast('cpu', dtype=torch.bfloat16, enabled=True):
             _ = mpt(input_ids.to('cpu'),
                     attention_mask=attention_mask.to('cpu'))
@@ -1523,8 +1503,7 @@
     mpt = mpt.float()
 
     # verify the model still works
-    if attention_impl == 'torch' and not (
-            pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'):
+    if not (pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'):
         _ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu'))
 
     mpt = mpt.to(0)  # move to rank0
@@ -1586,16 +1565,11 @@ def test_alibi_vs_hf():
         'factor': 1.0,
     },
 }])
-@pytest.mark.parametrize('output_attentions', [True, False])
-@pytest.mark.parametrize('output_hidden_states', [True, False])
-@pytest.mark.parametrize('tie_word_embeddings', [True, False])
 def test_forward_with_output_attentions_and_output_hidden_states(
-        attn_impl: str, pos_emb_config: dict, output_attentions: bool,
-        output_hidden_states: bool, tie_word_embeddings: bool):
-    # Test that model forward with output_attentions_and_output_hidden_states
+        attn_impl: str, pos_emb_config: dict):
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if output_attentions and attn_impl in ['flash', 'triton']:
+    if attn_impl in ['flash', 'triton']:
         pytest.skip(f'output_attentions only implemented with torch attention.')
     if pos_emb_config['rope'] and pos_emb_config[
             'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(f'dail implementation of rope requires flash attention 2.')
@@ -1624,7 +1598,7 @@
             'name': 'baseline_',
             'init_std': 0.02,
         },
-        tie_word_embeddings=tie_word_embeddings,
+        tie_word_embeddings=True,
     )
     mpt = MPTForCausalLM(hf_config)
     mpt = composer_device.module_to_device(mpt)
@@ -1637,20 +1611,16 @@
     attention_mask = torch.tensor([[1, 1, 1]]).bool()
     attention_mask = composer_device.tensor_to_device(attention_mask)
 
-    # start with passing the first three tokens through
     outputs = mpt(
         input_ids,
         attention_mask=attention_mask,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
+        output_attentions=True,
+        output_hidden_states=True,
     )
 
-    if output_attentions:
-        assert len(outputs.attentions) == n_layers
-        assert all(
-            attn.shape == (1, 4, 3, 3) for attn in outputs.attentions)
-    if output_hidden_states:
-        assert len(outputs.hidden_states) == n_layers + 1
+    assert len(outputs.attentions) == n_layers
+    assert all(attn.shape == (1, 4, 3, 3) for attn in outputs.attentions)
+    assert len(outputs.hidden_states) == n_layers + 1
 
 
 @pytest.mark.gpu
@@ -1663,10 +1633,6 @@ def test_hf_init(tmp_path: pathlib.Path,
     if not torch.cuda.device_count() >= world_size:
         pytest.skip(f'This test requires {world_size} GPUs.')
 
-    torch.cuda.empty_cache()
-    gc.collect()  #just in case
-    torch.cuda.synchronize()
-
     test_cfg = get_config(
         conf_path='scripts/train/yamls/pretrain/testing.yaml')
     test_cfg.device = torch.cuda.current_device()