From 702fbb5661089420b7e6b90e08bb0e391643a4db Mon Sep 17 00:00:00 2001 From: Jon Bolin Date: Thu, 12 Oct 2023 02:13:30 +0000 Subject: [PATCH] Use rank from tracked process group --- torch_xla/experimental/distributed_checkpoint/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_xla/experimental/distributed_checkpoint/manager.py b/torch_xla/experimental/distributed_checkpoint/manager.py index 2b87ab10bba..476bae5b882 100644 --- a/torch_xla/experimental/distributed_checkpoint/manager.py +++ b/torch_xla/experimental/distributed_checkpoint/manager.py @@ -193,7 +193,7 @@ def _release_oldest_checkpoints(self): Delete oldest checkpoints until the number of tracked checkpoints is below self.max_to_keep. This operation is only execution on the rank 0 process. """ - if dist.get_rank() == 0 and self.max_to_keep > 0: + if dist.get_rank(self.pg) == 0 and self.max_to_keep > 0: while len(self._tracked_chkpts) > self.max_to_keep: oldest_chkpt = self._tracked_chkpts.popleft() self._delete_chkpt_at_step(oldest_chkpt.step)