From c125b3b021a85365d90334673090c3b5eda61041 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sat, 18 Nov 2023 00:23:00 +0000
Subject: [PATCH] wip

---
 tests/test_flash_triton_torch.py   | 18 ++++++++++++------
 tests/test_hf_conversion_script.py | 31 ++++++++++++++++---------------
 tests/test_model.py                | 25 ++++++-------------------
 3 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/tests/test_flash_triton_torch.py b/tests/test_flash_triton_torch.py
index 1ede36c0b5..1d1919217c 100644
--- a/tests/test_flash_triton_torch.py
+++ b/tests/test_flash_triton_torch.py
@@ -5,6 +5,7 @@
 import torch
 from omegaconf import OmegaConf as om
 
+from llmfoundry.models.layers import attention
 from llmfoundry.models.layers.attention import is_flash_v2_installed
 from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding
 
@@ -17,8 +18,14 @@ def allclose_helper(t0: torch.Tensor,
 
 
 @pytest.mark.gpu
-@pytest.mark.parametrize('attn_impl_0', ['flash', 'triton', 'torch'])
-@pytest.mark.parametrize('attn_impl_1', ['flash', 'triton', 'torch'])
+@pytest.mark.parametrize('attn_impl_0,attn_impl_1', [
+    ('flash', 'flash'),
+    ('flash', 'triton'),
+    ('flash', 'torch'),
+    ('triton', 'triton'),
+    ('triton', 'torch'),
+    ('torch', 'torch'),
+])
 @pytest.mark.parametrize('clip_qkv', [True, False])
 @pytest.mark.parametrize('qk_ln', [True, False])
 @pytest.mark.parametrize('pos_emb_config', [{
@@ -62,11 +69,10 @@ def test_attn_impl(attn_impl_0: str,
     Includes testing with and without attn_clip_qkv, attn_qk_ln, alibi, and
     rope.
     """
-    from llmfoundry.models.layers import attention
     alibi = pos_emb_config['alibi']
     rope = pos_emb_config['rope']
     if alibi and (attn_impl_0 == 'flash' or attn_impl_1 == 'flash'):
-        pytest.xfail('flash attn does not support alibi')
+        pytest.skip('flash attn does not support alibi')
 
     if rope and (pos_emb_config['rope_impl']
                  == 'dail') and (not is_flash_v2_installed()):
@@ -81,7 +87,7 @@ def test_attn_impl(attn_impl_0: str,
         'qk_ln': qk_ln,
     })
 
-    n, s, f = 2, 16, cfg.d_model
+    n, s, f = 2, 4, cfg.d_model
     assert cfg.d_model % cfg.n_heads == 0
     if attn_type == 'grouped_query_attention':
         cfg.kv_n_heads = 2
@@ -311,7 +317,7 @@ def test_grouped_attention_heads(attn_impl: str,
         'kv_n_heads': kv_n_heads
     })
 
-    n, s, f = 2, 16, cfg.d_model
+    n, s, f = 2, 4, cfg.d_model
 
     mmhsa = attention.GroupedQueryAttention(**cfg).to(device)
 
diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py
index 713b13d305..0d4cde342d 100644
--- a/tests/test_hf_conversion_script.py
+++ b/tests/test_hf_conversion_script.py
@@ -263,7 +263,7 @@ def test_callback_inits():
 @pytest.mark.parametrize('log_to_mlflow', [True, False])
 @pytest.mark.parametrize(
     'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints',
-    [('3ba', '2ba', '7ba', 3, 4), ('1dur', '2ba', '1ep', 1, 4)])
+    [('3ba', '2ba', '4ba', 2, 2), ('1dur', '2ba', '1ep', 1, 2)])
 @patch('os.cpu_count', MagicMock(return_value=None))
 def test_huggingface_conversion_callback_interval(
         tmp_path: pathlib.Path, log_to_mlflow: bool, hf_save_interval: str,
@@ -273,12 +273,12 @@ def test_huggingface_conversion_callback_interval(
 
     dist.initialize_dist(get_device('gpu'))
 
-    max_seq_len = 16
-    device_batch_size = 1
-    dataset_size = 14
+    max_seq_len = 4
+    device_batch_size = 2
+    dataset_size = 8
     precision_str = 'bfloat16'
     precision = torch.bfloat16
-    batches_per_epoch = math.ceil(dataset_size / (device_batch_size * 2))
+    batches_per_epoch = math.ceil(dataset_size / device_batch_size)
 
     checkpointer_callback = HuggingFaceCheckpointer(
         save_folder=os.path.join(tmp_path, 'checkpoints'),
@@ -292,7 +292,7 @@ def test_huggingface_conversion_callback_interval(
     model_cfg = {
         'name': 'mpt_causal_lm',
         'init_device': 'cpu',
-        'd_model': 128,
+        'd_model': 64,
         'n_heads': 2,
         'n_layers': 2,
         'expansion_ratio': 4,
@@ -401,7 +401,7 @@ def test_huggingface_conversion_callback_interval(
     ]
     assert len(normal_checkpoints) == expected_normal_checkpoints
     assert len(huggingface_checkpoints) == expected_hf_checkpoints
-
+    print(huggingface_checkpoints)
     # Load the last huggingface checkpoint
     loaded_model = transformers.AutoModelForCausalLM.from_pretrained(
         os.path.join(tmp_path, 'checkpoints', 'huggingface',
@@ -428,7 +428,7 @@ def test_huggingface_conversion_callback_interval(
         trust_remote_code=True,
     )
 
-    check_hf_model_equivalence(trainer.state.model.module.model.to(precision),
+    check_hf_model_equivalence(trainer.state.model.model.to(precision),
                                loaded_model)
     check_hf_tokenizer_equivalence(tokenizer, loaded_tokenizer)
 
@@ -442,14 +442,15 @@ def test_huggingface_conversion_callback_interval(
     [('mpt', True), ('mpt', False), ('neo', None), ('llama2', None)],
 )
 @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None])
+@pytest.mark.parametrize(
+    'hf_save_interval,save_interval,max_duration,expected_hf_checkpoints,expected_normal_checkpoints',
+    [('3ba', '2ba', '7ba', 3, 4)])
 @patch('os.cpu_count', MagicMock(return_value=None))
-def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path,
-                                         tie_word_embeddings: bool,
-                                         fsdp_state_dict_type: Optional[str],
-                                         hf_save_interval: str,
-                                         save_interval: str, max_duration: str,
-                                         expected_hf_checkpoints: int,
-                                         expected_normal_checkpoints: int):
+def test_huggingface_conversion_callback(
+        model: str, tmp_path: pathlib.Path, tie_word_embeddings: bool,
+        fsdp_state_dict_type: Optional[str],
+        hf_save_interval: str, save_interval: str, max_duration: str,
+        expected_hf_checkpoints: int, expected_normal_checkpoints: int):
     delete_transformers_cache()
 
     dist.initialize_dist(get_device('gpu'))
diff --git a/tests/test_model.py b/tests/test_model.py
index c160c064dc..51180a6c28 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -874,11 +874,11 @@ def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
     save_path = tmp_path / 'test-device-map'
     hf_config = MPTConfig(
         init_device='cpu',
-        d_model=128,
+        d_model=64,
         n_heads=4,
         n_layers=2,
         expansion_ratio=2,
-        max_seq_len=2048,
+        max_seq_len=4,
         emb_pdrop=0.1,
         resid_pdrop=0.2,
         attn_config={
@@ -914,8 +914,8 @@ def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
     )
     with torch.autocast('cuda', dtype=torch.bfloat16):
         _ = pipe(
-            'The quick fox jumped over',
-            max_length=10,
+            'The fox',
+            max_new_tokens=2,
             do_sample=True,
         )
 
@@ -1482,18 +1482,17 @@ def test_model_to(attention_impl: str, pos_emb_config: dict,
 
     hf_config = MPTConfig(
         init_device='cpu',
-        d_model=128,
+        d_model=64,
         n_heads=4,
         n_layers=2,
         expansion_ratio=2,
-        max_seq_len=2048,
+        max_seq_len=4,
         emb_pdrop=0.1,
         resid_pdrop=0.2,
         attn_config={
             'attn_impl': attention_impl,
             **pos_emb_config,
         },
-        use_cache=True,
         init_config={
             'name': 'baseline_',
             'init_std': 0.02,
@@ -1509,11 +1508,9 @@ def test_model_to(attention_impl: str, pos_emb_config: dict,
     input_ids = torch.tensor([[11274, 16390, 11]]).to('cuda')
     attention_mask = torch.tensor([[1, 1, 1]]).bool().to('cuda')
 
-    # with get_precision_context('amp_bf16'):
     _ = mpt(input_ids, attention_mask=attention_mask)
 
     # move the model around using different methods
-    mpt = mpt.bfloat16()
     mpt = mpt.to('cpu')
 
     # verify the model still works
@@ -1523,15 +1520,6 @@ def test_model_to(attention_impl: str, pos_emb_config: dict,
         _ = mpt(input_ids.to('cpu'),
                 attention_mask=attention_mask.to('cpu'))
 
-    mpt = mpt.cuda()
-    mpt = mpt.bfloat16()
-
-    # verify the model still works
-    if attention_impl == 'torch':
-        with torch.autocast('cuda', dtype=torch.bfloat16, enabled=True):
-            _ = mpt(input_ids, attention_mask=attention_mask)
-
-    mpt = mpt.to('cpu')
     mpt = mpt.float()
 
     # verify the model still works
@@ -1539,7 +1527,6 @@ def test_model_to(attention_impl: str, pos_emb_config: dict,
             pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail'):
         _ = mpt(input_ids.to('cpu'), attention_mask=attention_mask.to('cpu'))
 
-    mpt = mpt.half()
     mpt = mpt.to(0)  # move to rank0
     mpt = mpt.bfloat16()