diff --git a/scripts/inference/hf_generate.py b/scripts/inference/hf_generate.py
index 96592ca477..45ddc6b63e 100644
--- a/scripts/inference/hf_generate.py
+++ b/scripts/inference/hf_generate.py
@@ -217,6 +217,7 @@ def main(args: Namespace) -> None:
         if device is not None:
             print(f'Placing model on {device=}...')
             model.to(device)
+            model.to(model_dtype)
     except Exception as e:
         raise RuntimeError(
             'Unable to load HF model. ' +
diff --git a/setup.py b/setup.py
index 05d1d1bbbe..afdfce8d48 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@
 ]
 
 install_requires = [
-    'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.16.4,<0.17',
+    'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17,<0.18',
     'accelerate>=0.20,<0.21',  # for HF inference `device_map`
     'transformers>=4.34.1,<4.35',
     'mosaicml-streaming>=0.7.1,<0.8',
@@ -84,11 +84,11 @@
 ]
 
 extra_deps['databricks'] = [
-    'mosaicml[databricks]',
+    'mosaicml[databricks]>=0.17,<0.18',
 ]
 
 extra_deps['tensorboard'] = [
-    'mosaicml[tensorboard]>=0.16.1,<0.17',
+    'mosaicml[tensorboard]>=0.17,<0.18',
 ]
 
 extra_deps['gpu'] = [
diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py
index 39032146b6..16dd01347d 100644
--- a/tests/fixtures/data.py
+++ b/tests/fixtures/data.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 
 from composer.utils import dist
 from omegaconf import DictConfig
@@ -25,6 +26,7 @@ def tiny_ft_dataset_path(tmp_path: Path, dataset_size: int = 4) -> Path:
 
 
 @fixture
+@patch('os.cpu_count', MagicMock(return_value=None))
 def tiny_ft_dataloader(tiny_ft_dataset_path: Path,
                        mpt_tokenizer: PreTrainedTokenizerBase,
                        max_seq_len: int = 128,
diff --git a/tests/test_model.py b/tests/test_model.py
index f6564059d3..882573bfdd 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -304,17 +304,13 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2):
     assert not torch.equal(original_params, updated_params)
 
 
+@pytest.mark.gpu
 @pytest.mark.parametrize(
     'attn_impl,precision',
     [('torch', torch.float16), ('torch', torch.bfloat16),
      pytest.param('flash', torch.float16, marks=pytest.mark.gpu),
      pytest.param('flash', torch.bfloat16, marks=pytest.mark.gpu)])
 def test_determinism(attn_impl: str, precision: torch.dtype):
-    if not torch.cuda.is_available():
-        pytest.skip(
-            'This test requires CUDA to be available in order to run with bfloat16 precision.'
-        )
-
     conf_path = 'scripts/train/yamls/pretrain/testing.yaml'
     with open(conf_path) as f:
         test_cfg = om.load(f)
@@ -519,11 +515,12 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool):
         assert block.resid_ffn_dropout.p == 0.2
 
 
+
 @pytest.mark.parametrize('attention_impl', [
     'torch',
     pytest.param('flash', marks=pytest.mark.gpu),
     pytest.param('triton', marks=pytest.mark.gpu),
-    pytest.param('torch', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu)
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': True,
@@ -642,10 +639,12 @@ def test_sequence_id_based_masking(attention_impl: str,
                        atol=atol)
 
 
-@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'),
-                                                   ('flash', 'gpu'),
-                                                   ('triton', 'gpu'),
-                                                   ('torch', 'gpu')])
+@pytest.mark.parametrize('attention_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu)
+])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
     'rope': False
@@ -673,24 +672,20 @@
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_forward_with_padding(attention_impl: str, device: str,
-                              pos_emb_config: dict, tie_word_embeddings: bool):
+def test_forward_with_padding(attention_impl: str, pos_emb_config: dict,
+                              tie_word_embeddings: bool):
     # Test that different placement of padding does not affect the output.
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
-        )
     alibi = pos_emb_config['alibi']
     if alibi and attention_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
     rope = pos_emb_config['rope']
-    if rope and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if rope and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -866,12 +861,12 @@ def test_advanced_mask_building(attention_impl: str):
     assert torch.equal(attn_bias, expected_attn_bias)
 
 
-@pytest.mark.parametrize('attention_impl,device,precision', [
-    ('torch', 'cpu', 'fp32'),
-    ('flash', 'gpu', 'amp_bf16'),
-    ('triton', 'gpu', 'amp_bf16'),
-    ('torch', 'gpu', 'amp_bf16'),
-    ('torch', 'gpu', 'fp32'),
+@pytest.mark.parametrize('attention_impl,precision', [
+    ('torch', 'fp32'),
+    pytest.param('flash', 'amp_bf16', marks=pytest.mark.gpu),
+    pytest.param('triton', 'amp_bf16', marks=pytest.mark.gpu),
+    pytest.param('torch', 'amp_bf16', marks=pytest.mark.gpu),
+    pytest.param('torch', 'fp32', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -900,25 +895,21 @@
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_generate(attention_impl: str, device: str, precision: str,
-                  pos_emb_config: dict, tie_word_embeddings: bool):
+def test_generate(attention_impl: str, precision: str, pos_emb_config: dict,
+                  tie_word_embeddings: bool):
     # Test that generate works, and produces the same output with or without
     # padding in the input.
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attention_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
     if attention_impl == 'torch' and precision == 'amp_bf16' and tie_word_embeddings == False:
         pytest.skip(f'This test configuration has precision / sampling issues.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -1001,8 +992,6 @@ def test_generate(attention_impl: str, device: str, precision: str,
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
 def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
                                   use_cache: bool, tie_word_embeddings: bool):
-    if not torch.cuda.is_available():
-        pytest.skip(f'This test requires CUDA to be available.')
     if not torch.cuda.device_count() >= world_size:
         pytest.skip(f'This test requires {world_size} GPUs.')
 
@@ -1101,11 +1090,11 @@ def test_save_from_pretrained(tmp_path: pathlib.Path):
     check_hf_model_equivalence(mpt, mpt2)
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1134,22 +1123,17 @@
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_forward_with_cache_and_padding(attn_impl: str, device: str,
-                                        pos_emb_config: dict,
+def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict,
                                         tie_word_embeddings: bool):
     # Tests that the result is the same with or without padding when using kv caching
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -1243,11 +1227,11 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str,
                        rtol=1e-6)
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1276,23 +1260,19 @@
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict,
+def test_forward_with_cache(attn_impl: str, pos_emb_config: dict,
                             tie_word_embeddings: bool):
     # Test that model forward with and without the key-value cache produces the
     # same output.
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -1387,11 +1367,11 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict,
     )
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1420,20 +1400,16 @@
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_generate_with_past_kv(attn_impl: str, device: str,
-                               pos_emb_config: dict, tie_word_embeddings: bool):
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
+def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict,
+                               tie_word_embeddings: bool):
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -1491,11 +1467,11 @@ def test_generate_with_past_kv(attn_impl: str, device: str,
                                 hf_config.d_model)
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('generation_kwargs', [{
     'max_new_tokens': 2,
@@ -1535,24 +1511,22 @@
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
+def test_generation_kwargs_dont_crash(attn_impl: str,
                                       generation_kwargs: Dict[str, Any],
                                       pos_emb_config: dict,
                                       tie_word_embeddings: bool):
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
-    composer_device = get_device(device)
-    if device == 'gpu':  # Switch deteminism off
+
+    composer_device = get_device(None)
+
+    if composer_device.name == 'gpu':
         torch.use_deterministic_algorithms(False)
+
     hf_config = MPTConfig(
         init_device='cpu',
         d_model=128,
@@ -1586,7 +1560,8 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
     _ = mpt.generate(input_ids=no_padding_input_ids,
                      attention_mask=no_padding_attention_mask,
                      **generation_kwargs)
-    if device == 'gpu':  # Switch deteminism back on
+
+    if composer_device.name == 'gpu':
         reproducibility.configure_deterministic_mode()
 
 
@@ -1622,10 +1597,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
 def test_model_to(attention_impl: str, pos_emb_config: dict,
                   tie_word_embeddings: bool):
     # test that moving the model to diff devices and dtypes in diff ways does not break the model
-    if not torch.cuda.is_available():
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attention_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
 
@@ -1720,11 +1691,11 @@ def test_alibi_vs_hf():
     torch.testing.assert_close(alibi_bias_hf, alibi_bias_m)
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1756,24 +1727,19 @@ def test_alibi_vs_hf():
 @pytest.mark.parametrize('output_hidden_states', [True, False])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
 def test_forward_with_output_attentions_and_output_hidden_states(
-        attn_impl: str, device: str, pos_emb_config: dict,
-        output_attentions: bool, output_hidden_states: bool,
-        tie_word_embeddings: bool):
+        attn_impl: str, pos_emb_config: dict, output_attentions: bool,
+        output_hidden_states: bool, tie_word_embeddings: bool):
     # Test that model forward with output_attentions_and_output_hidden_states
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
     if output_attentions and attn_impl in ['flash', 'triton']:
         pytest.skip(f'output_attentions only implemented with torch attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     n_layers = 2
 
@@ -1831,8 +1797,6 @@
 def test_hf_init(tmp_path: pathlib.Path,
                  init_device: str,
                  world_size: int,
                  batch_size: int = 1):
-    if not torch.cuda.is_available():
-        pytest.skip(f'This test requires CUDA to be available.')
     if not torch.cuda.device_count() >= world_size:
         pytest.skip(f'This test requires {world_size} GPUs.')
 
diff --git a/tests/test_mpt_gen.py b/tests/test_mpt_gen.py
index 413e39bf8c..9f022ef487 100644
--- a/tests/test_mpt_gen.py
+++ b/tests/test_mpt_gen.py
@@ -95,9 +95,7 @@ def test_mpt_generate_multi_gpu(attn_impl: str, use_alibi: bool,
 @pytest.mark.gpu
 @pytest.mark.parametrize('attn_impl', ['triton', 'torch'])
 @pytest.mark.parametrize('use_alibi', [True, False])
-@pytest.mark.parametrize('tie_word_embeddings', [True, False])
 def test_mpt_generate_callback(attn_impl: str, use_alibi: bool,
-                               tie_word_embeddings: bool,
                                build_tiny_mpt: Callable[...,
                                                         ComposerMPTCausalLM],
                                tiny_ft_dataloader: DataLoader):
@@ -105,7 +103,7 @@ def test_mpt_generate_callback(attn_impl: str, use_alibi: bool,
 
     # build mpt model
     model = build_tiny_mpt(
-        tie_word_embeddings=tie_word_embeddings,
+        tie_word_embeddings=True,
         attn_config={
             'attn_impl': attn_impl,
             'attn_uses_sequence_id': False,
@@ -143,3 +141,54 @@ def test_mpt_generate_callback(attn_impl: str, use_alibi: bool,
 
     generate.generate.assert_called_once()
     trainer.logger.log_table.assert_called_once()
+
+
+@pytest.mark.gpu
+@pytest.mark.parametrize('attn_impl', ['triton', 'torch'])
+@pytest.mark.parametrize('use_alibi', [True, False])
+def test_mpt_generate_callback_not_tied(
+        use_alibi: bool, attn_impl: str,
+        build_tiny_mpt: Callable[..., ComposerMPTCausalLM],
+        tiny_ft_dataloader: DataLoader):
+    device = get_device('gpu')
+
+    # build mpt model
+    model = build_tiny_mpt(
+        tie_word_embeddings=False,
+        attn_config={
+            'attn_impl': attn_impl,
+            'attn_uses_sequence_id': False,
+            'alibi': use_alibi,
+        },
+    )
+    model = device.module_to_device(model)
+
+    # generate callback
+    prompts = [
+        'The best banana bread recipe is',
+        '2+2=',
+        'how much wood could a woodchuck chuck',
+    ]
+    gen_interval = 1
+    generate = ComposerGenerate(
+        prompts,
+        interval=f'{gen_interval}ba',
+        max_new_tokens=5,
+        batch_size=len(prompts),
+        use_cache=True,
+    )
+    generate.generate = Mock(wraps=generate.generate, autospec=True)
+
+    # build trainer
+    trainer = Trainer(
+        model=model,
+        train_dataloader=tiny_ft_dataloader,
+        device=device,
+        max_duration=f'{gen_interval}ba',
+        callbacks=[generate],
+    )
+    trainer.logger.log_table = Mock()
+    trainer.fit()
+
+    generate.generate.assert_called_once()
+    trainer.logger.log_table.assert_called_once()
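---

The recurring refactor in the tests/test_model.py hunks above replaces a runtime
`torch.cuda.is_available()` check (plus an explicit `device` parameter) with
collection-time GPU marks and automatic device selection. Below is a minimal
sketch of the resulting pattern, assuming the repository's custom `gpu` pytest
marker and composer's `get_device` helper; `test_example` and its body are
illustrative only, not code from this patch.

import pytest
from composer.utils import get_device


# GPU-only variants are deselected at collection time wherever the 'gpu'
# marker is filtered out, so the test body needs no availability check.
@pytest.mark.parametrize('attn_impl', [
    'torch',
    pytest.param('flash', marks=pytest.mark.gpu),
    pytest.param('triton', marks=pytest.mark.gpu),
    pytest.param('torch', marks=pytest.mark.gpu),
])
def test_example(attn_impl: str):
    # get_device(None) resolves to DeviceGPU when CUDA is available and to
    # DeviceCPU otherwise, which is why the 'device' parameter and the
    # ('torch', 'cpu') / ('torch', 'gpu') tuples could be dropped.
    composer_device = get_device(None)
    assert composer_device.name in ('cpu', 'gpu')

Note that the plain 'torch' entry and pytest.param('torch', marks=pytest.mark.gpu)
are distinct cases: the first exercises torch attention on whatever device the
runner provides (CPU on CPU-only runners), while the second forces an additional
run on GPU runners.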