Added Automatic DP computation
TJ-Solergibert committed Jul 18, 2024
1 parent 50da275 commit 9a7d4a3
Showing 2 changed files with 7 additions and 3 deletions.
4 changes: 2 additions & 2 deletions examples/config_nanoset.yaml
@@ -88,12 +88,12 @@ optimizer:
   weight_decay: 0.01
   zero_stage: 0
 parallelism:
-  dp: 1
+  dp: -1
   expert_parallel_size: 1
   pp: 1
   pp_engine: 1f1b
   tp: 1
-  tp_linear_async_communication: true
+  tp_linear_async_communication: false
   tp_mode: REDUCE_SCATTER
 profiler: null
 tokenizer:
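With dp: -1, the example config no longer hard-codes the data-parallel size: it is derived at startup from the total number of launched ranks, as implemented in the Python change below (a worked sketch of the computation follows that diff).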
6 changes: 5 additions & 1 deletion src/nanotron/config/parallelism_config.py
@@ -1,3 +1,4 @@
+import os
 from dataclasses import dataclass
 from typing import Optional
 
@@ -16,7 +17,7 @@ class ParallelismArgs:
     """Arguments related to TP/PP/DP
     Args:
-        dp: Number of DP replicas
+        dp: Number of DP replicas. Set to -1 to automatically compute DP size after dividing the model w/ PP & TP
         pp: Number of PP stages
         tp: Number of TP replicas
         expert_parallel_size: Number of expert parallel replicas (used only for MoEs)
@@ -47,3 +48,6 @@ def __post_init__(self):
             self.pp_engine = cast_str_to_pipeline_engine(self.pp_engine)
         if isinstance(self.tp_mode, str):
            self.tp_mode = TensorParallelLinearMode[self.tp_mode.upper()]
+
+        if self.dp == -1:
+            self.dp = int(os.environ["WORLD_SIZE"]) // (self.tp * self.pp)
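For intuition, here is a minimal, self-contained sketch of the resolution logic this commit adds. The WORLD_SIZE environment variable matches the convention of torchrun-style launchers; the standalone resolve_dp helper is illustrative only and not part of nanotron:

    import os

    def resolve_dp(dp: int, tp: int, pp: int) -> int:
        """Mirror of the __post_init__ logic above: dp == -1 means
        'use whatever ranks remain after carving out TP and PP'."""
        if dp == -1:
            # torchrun/torch.distributed launchers export WORLD_SIZE
            # as the total number of ranks in the job.
            return int(os.environ["WORLD_SIZE"]) // (tp * pp)
        return dp

    # Example: an 8-GPU job with tp=2 and pp=2 leaves 8 // (2 * 2) = 2
    # data-parallel replicas.
    os.environ["WORLD_SIZE"] = "8"
    assert resolve_dp(-1, tp=2, pp=2) == 2
    # An explicit dp is passed through untouched.
    assert resolve_dp(4, tp=2, pp=2) == 4

Note the floor division: if WORLD_SIZE is not a multiple of tp * pp, the computed dp silently rounds down, and the mismatch would presumably surface later when the process grid is built.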
