Skip to content

Commit

Permalink
Fix zero-1 bug for inferring local ranks (#5936)
Browse files (browse the repository at this point in the history)
  • Loading branch information
YangFei1990 authored and bhavya01 committed Apr 22, 2024
1 parent 0a2725d commit 05f8751
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion torch_xla/distributed/zero_redundancy_optimizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -83,7 +83,17 @@ def __init__(

def init_zero(self):
self.local_world_size = len(self.sharding_groups[0])
self.local_rank = self.global_rank // len(self.sharding_groups)
# Infer the local rank from the group
self.local_rank = None
for group in self.sharding_groups:
if self.global_rank in group:
if not isinstance(group, list):
group = list(group)
self.local_rank = group.index(self.global_rank)
if self.local_rank is None:
raise ValueError(
f"Current rank {self.global_rank} is missing from the sharding_groups {self.sharding_groups}"
)
# Shard parameters for use in optimizer
sharded_param_groups = self._shard_parameters()
# Optimizer initialization
Expand Down

0 comments on commit 05f8751

Please sign in to comment.