From 05f8751122af75d04573ebd304f191af89b78c1d Mon Sep 17 00:00:00 2001
From: Fei <33940270+YangFei1990@users.noreply.github.com>
Date: Wed, 13 Dec 2023 18:14:04 -0800
Subject: [PATCH] Fix zero-1 bug for inferring local ranks (#5936)

---
 torch_xla/distributed/zero_redundancy_optimizer.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/torch_xla/distributed/zero_redundancy_optimizer.py b/torch_xla/distributed/zero_redundancy_optimizer.py
index e18ae0b5003..f00929eeb86 100644
--- a/torch_xla/distributed/zero_redundancy_optimizer.py
+++ b/torch_xla/distributed/zero_redundancy_optimizer.py
@@ -83,7 +83,17 @@ def __init__(
 
   def init_zero(self):
     self.local_world_size = len(self.sharding_groups[0])
-    self.local_rank = self.global_rank // len(self.sharding_groups)
+    # Infer the local rank from the group
+    self.local_rank = None
+    for group in self.sharding_groups:
+      if self.global_rank in group:
+        if not isinstance(group, list):
+          group = list(group)
+        self.local_rank = group.index(self.global_rank)
+    if self.local_rank is None:
+      raise ValueError(
+          f"Current rank {self.global_rank} is missing from the sharding_groups {self.sharding_groups}"
+      )
     # Shard parameters for use in optimizer
     sharded_param_groups = self._shard_parameters()
     # Optimizer initialization