From d5f966d0d846bd32d24cf84140bbee94e1f042a4 Mon Sep 17 00:00:00 2001 From: jonb377 Date: Tue, 14 Nov 2023 15:21:58 +1100 Subject: [PATCH] Add GKE support and various usability improvements in CheckpointManager (#5770) * Add GKE support and various usability improvements in CheckpointManager * Bug fix for async checkpointing fully sharded state dicts --- test/spmd/test_xla_distributed_checkpoint.py | 21 ++- torch_xla/_internal/tpu.py | 5 +- .../distributed_checkpoint/_helpers.py | 7 +- .../distributed_checkpoint/manager.py | 130 ++++++++++-------- 4 files changed, 97 insertions(+), 66 deletions(-) diff --git a/test/spmd/test_xla_distributed_checkpoint.py b/test/spmd/test_xla_distributed_checkpoint.py index 910b35f324b0..55fd7d9c1551 100644 --- a/test/spmd/test_xla_distributed_checkpoint.py +++ b/test/spmd/test_xla_distributed_checkpoint.py @@ -273,6 +273,20 @@ def test_save_state_dict_with_cpu_shards(self): self.assertTrue( isinstance(planner.sharded_state_dict['fc1.weight'], _CpuShards)) + @unittest.skipUnless(xr.global_runtime_device_count() > 1, + "Multiple devices required for sharded test") + def test_cpu_state_dict_flattening(self): + # In the case of a nested state_dict with fully sharded parameters, + # _CpuShards should be treated as terminal nodes. + t = torch.randn(128, 128).to(xm.xla_device()) + mesh = self._get_mesh((self.n_devices, 1)) + xs.mark_sharding(t, mesh, (0, 1)) + state_dict = _sharded_cpu_state_dict({'model': {'weight': t}}) + planner = SPMDSavePlanner() + planner.set_up_planner(state_dict, True) + # model.weight should be flattened and tracked in the sharded state dict. + self.assertCountEqual(planner.sharded_state_dict, ["model.weight"]) + def test_local_save_plan(self): def _write_item_assertions(plan, n_devices, parameter_count): @@ -433,13 +447,14 @@ def test_manager_async(self, tmpdir): # Patch the manager's save method to block until this thread signals. cond = threading.Condition() - old_save = chkpt_mgr.save + old_save = chkpt_mgr._save def patched_save(*args, **kwargs): - cond.wait() + with cond: + cond.wait() old_save(*args, **kwargs) - with unittest.mock.patch.object(chkpt_mgr, 'save', patched_save): + with unittest.mock.patch.object(chkpt_mgr, '_save', patched_save): chkpt_mgr.save_async(10, state_dict) # No new steps should be tracked immediately after calling save_async diff --git a/torch_xla/_internal/tpu.py b/torch_xla/_internal/tpu.py index 108ca7945a32..385566b1d358 100644 --- a/torch_xla/_internal/tpu.py +++ b/torch_xla/_internal/tpu.py @@ -5,6 +5,7 @@ import os import pathlib import re +import socket from typing import NamedTuple, Optional, List from typing_extensions import TypedDict import requests @@ -299,10 +300,12 @@ def discover_master_worker_ip(use_localhost: bool = True) -> str: return worker_ips[master_worker_id] -def _spmd_find_master_ip(current_worker_ip: str) -> str: +def _spmd_find_master_ip(current_worker_hostname: str) -> str: import torch_xla.runtime as xr import torch_xla.distributed.spmd as xs from_cpu_shards = torch_xla._XLAC._global_tensor_from_cpu_shards + # Translate the hostname to an IP address, e.g. for TPUs on GKE. 
+ current_worker_ip = socket.gethostbyname(current_worker_hostname) ip_int = int(ip_address(current_worker_ip)) n_dev = xr.global_runtime_device_count() local_ndev = len(torch_xla._XLAC._xla_get_runtime_devices()) diff --git a/torch_xla/experimental/distributed_checkpoint/_helpers.py b/torch_xla/experimental/distributed_checkpoint/_helpers.py index 6ab2da163ac2..62c3c6f2ee0b 100644 --- a/torch_xla/experimental/distributed_checkpoint/_helpers.py +++ b/torch_xla/experimental/distributed_checkpoint/_helpers.py @@ -34,8 +34,13 @@ CONTAINER_TYPE = MutableMapping[PATH_ITEM, STATE_DICT_ITEM] +# TODO(jonbolin): Logic here is modified from the upstream to enable async +# checkpointing. If the state_dict is comprised entirely of _CpuShards, +# flatten_state_dict will not actually flatten the dict. +# Once we can represent XLAShardedTensor on CPU, either directly or through +# DistributedTensor, we can reuse the upstream logic. def _keep_visiting_tensors(value: STATE_DICT_ITEM) -> bool: - return isinstance(value, torch.Tensor) + return isinstance(value, torch.Tensor) or isinstance(value, _CpuShards) def _traverse_state_dict( diff --git a/torch_xla/experimental/distributed_checkpoint/manager.py b/torch_xla/experimental/distributed_checkpoint/manager.py index 0eaf184910ae..89bb20f50769 100644 --- a/torch_xla/experimental/distributed_checkpoint/manager.py +++ b/torch_xla/experimental/distributed_checkpoint/manager.py @@ -2,11 +2,11 @@ import logging import os import pickle -import queue import threading import torch.distributed as dist import torch.distributed.checkpoint as dist_cp import torch_xla +import torch_xla.core.xla_model as xm import torch_xla.runtime as xr import torch_xla.experimental.distributed_checkpoint as xc import traceback @@ -16,6 +16,7 @@ from collections import deque from fsspec.core import url_to_fs from os.path import basename +from concurrent.futures import ThreadPoolExecutor, wait from typing import Deque, List, Optional, Union from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE from ._helpers import _sharded_cpu_state_dict @@ -81,7 +82,7 @@ class CheckpointManager: step_period, as would be the case in auto checkpointing. This class is inspired by Orbax's CheckpointManager, which can be found here: - https://github.com/google/orbax/blob/efc079c4e5b437782a80138913d322cb3ed365c7/checkpoint/orbax/checkpoint/checkpoint_manager.py + https://github.com/google/orbax/blob/efc079c/checkpoint/orbax/checkpoint/checkpoint_manager.py """ # The base path to write checkpoints to. Each checkpoint taken by the manager @@ -102,7 +103,7 @@ def __init__(self, path: str, save_interval: int, max_to_keep: Optional[int] = 0, - async_queue_size: Optional[int] = 1, + max_pending_async: Optional[int] = 1, process_group: dist.ProcessGroup = None, chkpt_on_preemption: bool = True): """ @@ -116,11 +117,11 @@ def __init__(self, CheckpointManager. When a new checkpoint will be taken, the checkpoint for the lowest tracked step will be deleted. Default: 0, indicating no upper bound on the number of checkpoints. - async_queue_size: The size of the execution queue which processes async - checkpoints. This should be a small value to ensure training doesn't + max_pending_async: The maximum number of async checkpoints which can be + pending. This should be a small value to ensure training doesn't get too far ahead of the last finished checkpoint, but increasing - the value to 2 can unblock training when there are transient - network issues which slow down the active checkpoint. 
+ the value can unblock training when there are transient issues which + slow down the active checkpoint. Default: 1, which only allows a single async checkpoint to be pending at a time. process_group: The process group to use when coordinating the checkpoint. @@ -132,31 +133,33 @@ def __init__(self, """ assert dist.is_initialized(), "A process group is required." assert save_interval > 0, "save_interval must be positive" - assert async_queue_size > 0, "async_queue_size must be positive" + assert max_pending_async > 0, "max_pending_async must be positive" assert max_to_keep >= 0, "max_to_keep must be non-negative" - self.base_path = path + self.base_path = os.path.join(path, '') # Ensure the base path ends in '/' self.save_interval = save_interval self.max_to_keep = max_to_keep self.chkpt_on_preemption = chkpt_on_preemption - self._tracked_chkpts = self._load_tracked_chkpts() - self._async_queue = queue.Queue(maxsize=async_queue_size) - self._alive = threading.Event() - self._alive.set() - self._chkpt_thread = threading.Thread( - target=self._async_worker, daemon=True) - self._chkpt_thread.start() - # Create a new group if none is provided # TODO(jonbolin): Verify subgroup on GPU backend self.pg = process_group or dist.new_group() + # Thread pool to run the async checkpoints. `_async_sem` is used to guard + # the number of pending checkpoints, and `_async_futures` tracks all + # futures returned by the pool. + self._async_worker_pool = ThreadPoolExecutor(max_workers=1) + self._async_sem = threading.Semaphore(max_pending_async) + self._async_futures = [] + # Mutex to ensure only a single thread can write a checkpoint at a time. + self._save_mutex = threading.Lock() + + self._tracked_chkpts = self._load_tracked_chkpts() + if self.chkpt_on_preemption: # Initialize the distributed runtime for preemption detection - master_ip = xr.get_master_ip() torch_xla._XLAC._ensure_xla_coordinator_initialized( - xr.process_index(), xr.process_count(), master_ip) + xr.process_index(), xr.process_count(), xr.get_master_ip()) torch_xla._XLAC._activate_preemption_sync_manager() def _load_tracked_chkpts(self) -> Deque[_CheckpointMetadata]: @@ -166,36 +169,20 @@ def _load_tracked_chkpts(self) -> Deque[_CheckpointMetadata]: all_chkpts = [] invalid_paths = [] fs, raw_path = url_to_fs(self.base_path) - for path in fs.ls(raw_path, detail=False): - try: - with fsspec.open(os.path.join(path, _MANAGER_METADATA_FILE), 'rb') as f: - all_chkpts.append(pickle.load(f)) - except: - invalid_paths.append(path) + if not fs.exists(raw_path): + fs.mkdir(raw_path) + else: + for path in fs.ls(raw_path, detail=False): + try: + with fs.open(os.path.join(path, _MANAGER_METADATA_FILE), 'rb') as f: + all_chkpts.append(pickle.load(f)) + except: + invalid_paths.append(path) if invalid_paths: logging.warning(f'Ignoring invalid checkpoints: {invalid_paths}') return deque(sorted(all_chkpts, key=lambda m: m.ts)) - def __del__(self): - self._alive.clear() - # Send a sentinel value to tell the worker to exit, and wait for pending - # checkpoints to complete. 
- self._async_queue.put(None) - self._chkpt_thread.join() - - def _async_worker(self): - while self._alive.is_set(): - try: - item = self._async_queue.get() - if item: - step, state_dict = item - self.save(step, state_dict, force=True) - except: - traceback.print_exc() - finally: - self._async_queue.task_done() - def _get_path(self, step: int) -> str: return os.path.join(self.base_path, str(step)) @@ -215,6 +202,35 @@ def _release_oldest_checkpoints(self): oldest_chkpt = self._tracked_chkpts.popleft() self._delete_chkpt_at_step(oldest_chkpt.step) + def _wait_for_data(self): + xm.mark_step() + xm.wait_device_ops() + + def _save(self, step, state_dict): + """ + The actual checkpointing logic, which is shared between async and + synchronous checkpointing. + + The caller must ensure that data is accessible within the state_dict before + calling, which can be achieved with `self._wait_for_data`. + """ + with self._save_mutex: + path = self._get_path(step) + # Delete any existing checkpoint at the current step. + self._delete_chkpt_at_step(step) + dist_cp.save_state_dict( + state_dict=state_dict, + storage_writer=FsspecWriter(path), + planner=xc.SPMDSavePlanner(), + process_group=self.pg, + ) + metadata = _CheckpointMetadata(step=step, ts=datetime.now()) + self._tracked_chkpts.append(metadata) + if dist.get_rank(self.pg) == 0: + with fsspec.open(os.path.join(path, _MANAGER_METADATA_FILE), 'wb') as f: + pickle.dump(metadata, f) + self._release_oldest_checkpoints() + def should_save(self, step: int) -> bool: """ Returns true if a checkpoint should be saved for the current step. A @@ -247,20 +263,8 @@ def save(self, True if a checkpoint was taken and False otherwise. """ if self.should_save(step) or force: - path = self._get_path(step) - # Delete any existing checkpoint at the current step. - self._delete_chkpt_at_step(step) - dist_cp.save_state_dict( - state_dict=state_dict, - storage_writer=FsspecWriter(path), - planner=xc.SPMDSavePlanner(), - process_group=self.pg, - ) - metadata = _CheckpointMetadata(step=step, ts=datetime.now()) - with fsspec.open(os.path.join(path, _MANAGER_METADATA_FILE), 'wb') as f: - pickle.dump(metadata, f) - self._tracked_chkpts.append(metadata) - self._release_oldest_checkpoints() + self._wait_for_data() + self._save(step, state_dict) return True return False @@ -288,9 +292,13 @@ def save_async(self, True if a checkpoint was taken and False otherwise. """ if self.should_save(step) or force: + self._wait_for_data() # Move the state_dict to CPU cpu_state_dict = _sharded_cpu_state_dict(state_dict) - self._async_queue.put((step, cpu_state_dict)) + self._async_sem.acquire() + future = self._async_worker_pool.submit(self._save, step, cpu_state_dict) + future.add_done_callback(lambda _: self._async_sem.release()) + self._async_futures.append(future) return True return False @@ -322,8 +330,8 @@ def all_steps(self) -> List[int]: return sorted(x.step for x in self._tracked_chkpts) def join(self): - """ Wait for all pending async checkpoints to complete. """ - self._async_queue.join() + """ Wait for any pending async checkpoints to complete. """ + wait(self._async_futures) def reached_preemption(self, step: int) -> bool: """ Returns True if a preemption has been detected at the given step. """
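
Usage note (not part of the patch): the sketch below shows how the async API
introduced here might be driven from an SPMD training loop. The model,
optimizer, train_step, and checkpoint path are hypothetical placeholders, and
it assumes CheckpointManager is exported from
torch_xla.experimental.distributed_checkpoint and that a torch.distributed
process group has already been initialized, since the constructor asserts
dist.is_initialized().

    # Minimal sketch under the assumptions stated above; not part of the patch.
    from torch_xla.experimental.distributed_checkpoint import CheckpointManager

    def train_loop(model, optimizer, batches):
      chkpt_mgr = CheckpointManager(
          path='gs://my-bucket/checkpoints',  # hypothetical destination
          save_interval=100,                  # checkpoint every 100 steps
          max_to_keep=3,                      # retain the three newest steps
          max_pending_async=1)                # at most one in-flight async save

      for step, batch in enumerate(batches):
        train_step(model, optimizer, batch)   # placeholder training step
        state_dict = {
            'model': model.state_dict(),
            'optim': optimizer.state_dict(),
        }
        # save_async is a no-op unless should_save(step) is True. When it does
        # save, it moves the sharded data to CPU and returns immediately; the
        # actual write runs on the manager's worker pool, gated by the
        # semaphore so at most max_pending_async checkpoints are in flight.
        chkpt_mgr.save_async(step, state_dict)
        # With the default chkpt_on_preemption=True, force a checkpoint at the
        # current step when a preemption is detected.
        if chkpt_mgr.reached_preemption(step):
          chkpt_mgr.save_async(step, state_dict, force=True)
          break

      # Block until all pending async checkpoints have been written.
      chkpt_mgr.join()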