diff --git a/tests/data_utils.py b/tests/data_utils.py
index 1f6c26b72e..67c1be9f6e 100644
--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -251,6 +251,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str:
     shutil.copytree(
         os.path.join(c4_dir, 'val_xxsmall'),
         os.path.join(c4_dir, mocked_split),
+        dirs_exist_ok=True,
     )
     assert os.path.exists(c4_dir)
     return c4_dir
diff --git a/tests/tp/test_tp_strategies.py b/tests/tp/test_tp_strategies.py
index 1aaff9e099..085e51787f 100644
--- a/tests/tp/test_tp_strategies.py
+++ b/tests/tp/test_tp_strategies.py
@@ -1,10 +1,13 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 from pathlib import Path
+from shutil import rmtree
 from tempfile import TemporaryDirectory
 
 import pytest
+from composer.utils import dist
 from icecream import install
 from omegaconf import OmegaConf as om
 from torch.distributed._tensor import Replicate, Shard
@@ -104,10 +107,12 @@ def test_ffn_tp_strategy():
 @pytest.mark.parametrize('tp_strategy', ['ffn'])
 def test_tp_train(tp_strategy: str):
     """Test that we can train with FSDP-TP."""
-    with TemporaryDirectory() as tmp_path:
+    data_dir = '/my-data-dir/'
+    try:
         # Make `train_cfg`` with a tensor parallelism strategy
-        dataset_name = create_c4_dataset_xxsmall(Path(tmp_path))
+        dataset_name = create_c4_dataset_xxsmall(Path(data_dir))
         train_cfg = gpt_tiny_cfg(dataset_name, 'gpu')
+        train_cfg.variables.run_name = 'tp-test'
         train_cfg.tp_config = {
             'strategy': tp_strategy,
             'tensor_parallel_degree': 2,
@@ -115,6 +120,12 @@
 
         # Train
         train(train_cfg)
+    except Exception as e:
+        raise e
+    finally:
+        # always remove data directory
+        if os.path.isdir(data_dir):
+            rmtree(data_dir)
 
 
 @pytest.mark.gpu
@@ -152,3 +163,7 @@ def test_tp_train_with_moes():
         match='Tensor Parallelism is not currently supported for MoE models.',
     ):
         process_init_device(model_cfg, fsdp_cfg, tp_cfg)
+
+
+if __name__ == '__main__':
+    test_tp_train('ffn')
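
For context on the `copytree` change: `dirs_exist_ok` was added to `shutil.copytree` in Python 3.8, and without it the call raises `FileExistsError` whenever the destination directory already exists. That is exactly the situation this diff creates by pointing the test at a fixed, reused data directory instead of a fresh `TemporaryDirectory`. A minimal standalone sketch of that failure mode, using made-up temp paths rather than anything from this PR:

```python
# Sketch only: demonstrates the FileExistsError that
# `dirs_exist_ok=True` avoids. Paths and file names here are
# illustrative, not the ones used in tests/data_utils.py.
import shutil
import tempfile
from pathlib import Path

src = Path(tempfile.mkdtemp())
(src / 'shard.jsonl').write_text('{"text": "hello"}\n')
dst = Path(tempfile.mkdtemp()) / 'mocked_split'

shutil.copytree(src, dst)  # first copy: destination is created, succeeds
try:
    shutil.copytree(src, dst)  # second copy: destination exists, raises
except FileExistsError:
    print('copytree failed on an existing destination')

# With the flag (Python 3.8+), re-copying into the same destination
# is idempotent: existing files are overwritten in place.
shutil.copytree(src, dst, dirs_exist_ok=True)
```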