diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py
index 29878714f6..520257218d 100644
--- a/llmfoundry/command_utils/train.py
+++ b/llmfoundry/command_utils/train.py
@@ -5,7 +5,6 @@
 import os
 import time
 import warnings
-from copy import deepcopy
 from typing import Any, Optional, Union
 
 import torch
@@ -351,7 +350,7 @@ def train(cfg: DictConfig) -> Trainer:
     # Initialize context
     init_context = process_init_device(model_config, fsdp_config, tp_config)
     logged_cfg.update({'fsdp_config': fsdp_config}, merge=True)
-    logged_cfg.update({'tp_config': deepcopy(tp_config)}, merge=True)
+    logged_cfg.update({'tp_config': tp_config}, merge=True)
 
     # Build tokenizer
     log.info('Building tokenizer...')
@@ -517,6 +516,7 @@ def train(cfg: DictConfig) -> Trainer:
 
     # TP config
     if tp_config is not None:
+        strategy = tp_config.pop('strategy', None)
         assert isinstance(strategy, str), '`strategy` must be in `tp_config`.'
         tp_config['layer_plan'] = build_tp_strategies(strategy, model)
 
diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py
index c22495993c..75f2d3ffdd 100644
--- a/llmfoundry/utils/config_utils.py
+++ b/llmfoundry/utils/config_utils.py
@@ -538,6 +538,21 @@ def process_init_device(
         # Set defaults for mixed initialization
         fsdp_config.setdefault('load_monolith_rank0_only', True)
 
+    if tp_config is not None:
+        # Check tp_config has required fields
+        if 'strategy' not in tp_config or 'tensor_parallel_degree' not in tp_config:
+            raise ValueError(
+                "`tp_config` requires 'strategy' and 'tensor_parallel_degree' values. "
+            )
+
+        # Check we are not using tensor parallelism with MoEs
+        if 'ffn_config' in model_cfg and model_cfg['ffn_config'].get(
+            'ffn_type', None
+        ) in ffns_with_megablocks:
+            raise ValueError(
+                'Tensor Parallelism is not currently supported for MoE models.',
+            )
+
     # Check we are not using tensor parallelism with MoEs
     if tp_config is not None and 'ffn_config' in model_cfg and model_cfg[
         'ffn_config'].get('ffn_type', None) in ffns_with_megablocks:
diff --git a/tests/tp/test_tp_strategies.py b/tests/tp/test_tp_strategies.py
index fd2fa384ce..1aaff9e099 100644
--- a/tests/tp/test_tp_strategies.py
+++ b/tests/tp/test_tp_strategies.py
@@ -5,6 +5,7 @@
 from tempfile import TemporaryDirectory
 
 import pytest
+from icecream import install
 from omegaconf import OmegaConf as om
 from torch.distributed._tensor import Replicate, Shard
 from torch.distributed.tensor.parallel import (
@@ -19,6 +20,8 @@
 from llmfoundry.utils.config_utils import process_init_device
 from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg
 
+install()
+
 
 @pytest.mark.gpu
 @pytest.mark.filterwarnings(
@@ -97,8 +100,26 @@ def test_ffn_tp_strategy():
 
 
 @pytest.mark.gpu
-def test_no_tp_with_one_gpu():
-    """Test that when we have one GPU, we use DDP and not FSDP-TP."""
+@pytest.mark.world_size(4)
+@pytest.mark.parametrize('tp_strategy', ['ffn'])
+def test_tp_train(tp_strategy: str):
+    """Test that we can train with FSDP-TP."""
+    with TemporaryDirectory() as tmp_path:
+        # Make `train_cfg`` with a tensor parallelism strategy
+        dataset_name = create_c4_dataset_xxsmall(Path(tmp_path))
+        train_cfg = gpt_tiny_cfg(dataset_name, 'gpu')
+        train_cfg.tp_config = {
+            'strategy': tp_strategy,
+            'tensor_parallel_degree': 2,
+        }
+
+        # Train
+        train(train_cfg)
+
+
+@pytest.mark.gpu
+def test_tp_train_with_one_gpu():
+    """Test that when we have one GPU, we train with DDP and not FSDP-TP."""
     with TemporaryDirectory() as tmp_path:
         # Make `train_cfg`` with a tensor parallelism strategy
         dataset_name = create_c4_dataset_xxsmall(Path(tmp_path))
@@ -115,7 +136,7 @@ def test_no_tp_with_one_gpu():
 
 
 @pytest.mark.gpu  # use gpu because `megablocks` only installed with `gpu` dependencies
-def test_no_tp_with_moes():
+def test_tp_train_with_moes():
     """Test that tensor parallelism is not compatible with MoEs."""
     # Make `cfg` for MoE model, fsdp, and tp
     train_cfg_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml'
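
For reference, a minimal sketch of how a run opts into the FSDP-TP path exercised by this patch, mirroring the new `test_tp_train` test above. The only requirement the new `process_init_device` check enforces is that `tp_config` carries both `strategy` and `tensor_parallel_degree`; the `'ffn'` strategy and degree of 2 below are simply the values the test uses, not requirements, and the dataset/config helpers are the same test utilities the diff imports.

```python
# Sketch only: values are illustrative; helpers come from tests/data_utils as in the diff.
from pathlib import Path
from tempfile import TemporaryDirectory

from llmfoundry.command_utils.train import train
from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg

with TemporaryDirectory() as tmp_path:
    dataset_name = create_c4_dataset_xxsmall(Path(tmp_path))
    train_cfg = gpt_tiny_cfg(dataset_name, 'gpu')

    # Both keys are required: process_init_device raises a ValueError if either is
    # missing, and train() pops 'strategy' to build the layer plan via build_tp_strategies.
    train_cfg.tp_config = {
        'strategy': 'ffn',
        'tensor_parallel_degree': 2,
    }

    train(train_cfg)
```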