diff --git a/examples/config_nanoset.yaml b/examples/config_nanoset.yaml
index 127ddb5e..b89b0bec 100644
--- a/examples/config_nanoset.yaml
+++ b/examples/config_nanoset.yaml
@@ -88,12 +88,12 @@ optimizer:
   weight_decay: 0.01
   zero_stage: 0
 parallelism:
-  dp: 1
+  dp: -1
   expert_parallel_size: 1
   pp: 1
   pp_engine: 1f1b
   tp: 1
-  tp_linear_async_communication: true
+  tp_linear_async_communication: false
   tp_mode: REDUCE_SCATTER
 profiler: null
 tokenizer:
diff --git a/src/nanotron/config/parallelism_config.py b/src/nanotron/config/parallelism_config.py
index 5912425b..55d58652 100644
--- a/src/nanotron/config/parallelism_config.py
+++ b/src/nanotron/config/parallelism_config.py
@@ -1,3 +1,4 @@
+import os
 from dataclasses import dataclass
 from typing import Optional
 
@@ -16,7 +17,7 @@ class ParallelismArgs:
     """Arguments related to TP/PP/DP

     Args:
-        dp: Number of DP replicas
+        dp: Number of DP replicas. Set to -1 to automatically compute DP size after dividing the model w/ PP & TP
         pp: Number of PP stages
         tp: Number of TP replicas
         expert_parallel_size: Number of expert parallel replicas (used only for MoEs)
@@ -47,3 +48,6 @@ def __post_init__(self):
             self.pp_engine = cast_str_to_pipeline_engine(self.pp_engine)
         if isinstance(self.tp_mode, str):
             self.tp_mode = TensorParallelLinearMode[self.tp_mode.upper()]
+
+        if self.dp == -1:
+            self.dp = int(os.environ["WORLD_SIZE"]) // (self.tp * self.pp)
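
Below is a minimal, standalone sketch (not part of the patch) of how the `dp: -1` sentinel is resolved at runtime, mirroring the `__post_init__` change above. The helper name `resolve_dp` is illustrative only; the values of `WORLD_SIZE`, `tp`, and `pp` are assumed for the example.

```python
# Sketch only: reproduces the dp == -1 auto-resolution logic from the diff above.
# WORLD_SIZE is the total process count normally set by the launcher (e.g. torchrun).
import os


def resolve_dp(dp: int, tp: int, pp: int) -> int:
    """Return the data-parallel size, auto-computing it when dp == -1."""
    if dp == -1:
        dp = int(os.environ["WORLD_SIZE"]) // (tp * pp)
    return dp


# Example: with 8 processes, tp=1 and pp=1 (as in config_nanoset.yaml), dp resolves to 8.
os.environ.setdefault("WORLD_SIZE", "8")
print(resolve_dp(dp=-1, tp=1, pp=1))  # -> 8
```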