fix the bug so that the sequence parallel norm is all-reduced when overlap is False (InternLM#534)
yingtongxiong authored Dec 12, 2023
1 parent d904730 commit 432bd5e
Showing 1 changed file with 10 additions and 10 deletions.
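Background for this fix: under sequence parallelism, each tensor-parallel rank sees a different shard of the sequence, so the gradient a single rank computes for a shared norm weight (e.g., RMSNorm) is only a partial sum; the full gradient is the sum over the tensor-parallel group. Before this commit the all-reduce hook was only attached when gradient overlap was enabled, so with overlap disabled the norm weights were updated from incomplete gradients. A minimal sketch of the required collective, written against plain torch.distributed (allreduce_norm_grad and tp_group are illustrative names, not the repo's API):

import torch
import torch.distributed as dist

def allreduce_norm_grad(param: torch.Tensor, tp_group: dist.ProcessGroup) -> None:
    # Each TP rank holds a partial gradient for the shared norm weight;
    # summing across the tensor-parallel group recovers the full gradient.
    if param.grad is not None:
        dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, group=tp_group)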
20 changes: 10 additions & 10 deletions internlm/solver/optimizer/hybrid_zero_optim.py
@@ -219,10 +219,7 @@ def __init__(
         # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled.
         self.skip_grad_reduce = False

-        # reduction hook is only used if overlapping communication
-        # if it is stage 1 without overlapping, no hook will be attached
-        if self._overlap_sync_grad:
-            self._attach_reduction_hook()
+        self._attach_reduction_hook()

     @property
     def zero_local_rank(self):
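The hunk above removes the overlap guard: _attach_reduction_hook() now runs unconditionally, and the overlap condition moves inside the hook-registration logic (second hunk below). For readers unfamiliar with get_grad_accumulate_object, it presumably follows the standard PyTorch trick of grabbing a parameter's AccumulateGrad node so a hook fires right after the gradient is written; a hedged sketch of that pattern (assumed to match the repo's helper, not copied from it):

import torch

def get_grad_accumulate_object(param):
    # expand_as(param) creates a view whose grad_fn points back at the
    # parameter's AccumulateGrad node via next_functions[0][0].
    return param.expand_as(param).grad_fn.next_functions[0][0]

param = torch.nn.Parameter(torch.randn(4))
accum_grad_obj = get_grad_accumulate_object(param)
accum_grad_obj.register_hook(lambda *args: print("grad accumulated"))
(param * 2.0).sum().backward()  # the hook fires once param.grad is ready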
@@ -321,12 +318,15 @@ def reduce_grad_hook_sp(*args):  # pylint: disable=W0613

                         # if sequence_parallel is True,
                         # the grad of norm should be all-reduce across the tp process group
-                        if gpc.config.parallel.sequence_parallel is True:
-                            if hasattr(param, IS_SEQUENCE_PARALLEL) and getattr(param, IS_SEQUENCE_PARALLEL) is True:
-                                accum_grad_obj_sp = get_grad_accumulate_object(param)
-                                accum_grad_obj_sp.register_hook(reduce_grad_hook_sp)
-
-                        accum_grad_obj.register_hook(reduce_grad_hook)
+                        if (
+                            gpc.config.parallel.sequence_parallel is True
+                            and hasattr(param, IS_SEQUENCE_PARALLEL)
+                            and getattr(param, IS_SEQUENCE_PARALLEL) is True
+                        ):
+                            accum_grad_obj.register_hook(reduce_grad_hook_sp)
+
+                        if self._overlap_sync_grad:
+                            accum_grad_obj.register_hook(reduce_grad_hook)

                     _define_and_attach(param, reduce_rank)

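Net effect of the second hunk: the nested if that created a separate accum_grad_obj_sp is collapsed into one condition, the sequence-parallel hook is registered on the same accumulator object, and only the bucket-reduction hook stays gated on self._overlap_sync_grad. A hedged sketch of what the two hooks plausibly do (hook bodies are assumptions inferred from the surrounding code and commit message, not the repo's implementation):

import torch.distributed as dist

def make_hooks(param, tp_group, reduction_func, should_skip):
    def reduce_grad_hook_sp(*args):
        # always attached for sequence-parallel params after this commit:
        # all-reduce the norm grad across the tensor-parallel group
        if not should_skip() and param.grad is not None:
            dist.all_reduce(param.grad, group=tp_group)

    def reduce_grad_hook(*args):
        # attached only when overlap_sync_grad is True: hand the grad to the
        # ZeRO bucket so reduction overlaps with the rest of backward
        if not should_skip():
            reduction_func(param)

    return reduce_grad_hook_sp, reduce_grad_hook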