Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update orbax handler to use bulk read APIs. #33

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 92 additions & 5 deletions pathwaysutils/persistence/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
"""Helper functions for persistence."""

import base64
import concurrent.futures
import datetime
import json
from typing import Sequence, Union
from typing import Sequence, Tuple, Union

import jax
from jax import core
Expand Down Expand Up @@ -82,6 +83,7 @@ def get_write_request(
name: str,
jax_array: jax.Array,
timeout: datetime.timedelta,
return_dict: bool = False,
) -> str:
"""Returns a string representation of the plugin program which writes the given jax_array to the given location."""
sharding = jax_array.sharding
Expand All @@ -91,7 +93,7 @@ def get_write_request(
timeout.total_seconds(), 1
)
timeout_nanoseconds = timeout_fractional_seconds * 1e9
return json.dumps({
d = {
"persistenceWriteRequest": {
"b64_location": string_to_base64(location_path),
"b64_name": string_to_base64(name),
Expand All @@ -112,7 +114,29 @@ def get_write_request(
"nanos": int(timeout_nanoseconds),
},
}
})
}

if return_dict:
return d
return json.dumps(d)


def get_bulk_write_request(
    location_path: str,
    names: Sequence[str],
    jax_arrays: Sequence[jax.Array],
    timeout: datetime.timedelta,
) -> str:
  """Returns a string representation of a bulk write request, writes multiple arrays with one call.

  Args:
    location_path: Directory the arrays are persisted under.
    names: One name per array; paired element-wise with `jax_arrays`.
    jax_arrays: The arrays to persist.
    timeout: Per-request timeout forwarded to each inner write request.

  Returns:
    A JSON string wrapping one inner write request per (name, array) pair.
  """
  inner_requests = []
  for array_name, array in zip(names, jax_arrays):
    # Reuse the single-array builder in dict mode and keep only the
    # inner request payload.
    single = get_write_request(location_path, array_name, array, timeout, True)
    inner_requests.append(single["persistenceWriteRequest"])
  return json.dumps(
      {"bulk_persistence_write_request": {"write_requests": inner_requests}}
  )


def get_read_request(
Expand All @@ -123,6 +147,7 @@ def get_read_request(
sharding: jax.sharding.Sharding,
devices: Sequence[jax.Device],
timeout: datetime.timedelta,
return_dict: bool = False,
) -> str:
"""Returns a string representation of the plugin program which reads the given array from the given location into the provided sharding."""
if not isinstance(devices, np.ndarray):
Expand All @@ -132,7 +157,7 @@ def get_read_request(
timeout.total_seconds(), 1
)
timeout_nanoseconds = timeout_fractional_seconds * 1e9
return json.dumps({
d = {
"persistenceReadRequest": {
"b64_location": string_to_base64(location_path),
"b64_shape_proto_string": get_shape_string(dtype, shape),
Expand All @@ -148,7 +173,32 @@ def get_read_request(
"nanos": int(timeout_nanoseconds),
},
}
})
}

if return_dict:
return d
return json.dumps(d)


def get_bulk_read_request(
    location_path: str,
    names: Sequence[str],
    dtypes: Sequence[np.dtype],
    shapes: Sequence[Sequence[int]],
    shardings: Sequence[jax.sharding.Sharding],
    devices: Sequence[jax.Device],
    timeout: datetime.timedelta,
) -> str:
  """Returns a string representation of a bulk read request, reads multiple arrays with one call.

  Args:
    location_path: Directory the arrays were persisted under.
    names: One name per array; zipped element-wise with `dtypes`, `shapes`,
      and `shardings`.
    dtypes: Per-array dtypes.
    shapes: Per-array global shapes.
    shardings: Per-array output shardings.
    devices: Devices shared by every inner read request.
    timeout: Per-request timeout forwarded to each inner read request.

  Returns:
    A JSON string wrapping one inner read request per array.
  """
  # Build each inner request via the single-array helper in dict mode and
  # keep only the inner payload.
  read_requests = [
      get_read_request(
          location_path, name, dtype, shape, sharding, devices, timeout, True
      )["persistenceReadRequest"]
      for name, dtype, shape, sharding in zip(names, dtypes, shapes, shardings)
  ]
  return json.dumps(
      {"bulk_persistence_read_request": {"read_requests": read_requests}}
  )


def write_one_array(
Expand All @@ -164,6 +214,19 @@ def write_one_array(
return write_future


def write_arrays(
    location: str,
    names: Sequence[str],
    values: Sequence[jax.Array],
    timeout: datetime.timedelta,
):
  """Creates the write array plugin program string, compiles it to an executable, calls it and returns an awaitable future.

  Args:
    location: Directory to persist the arrays under.
    names: One name per array, paired element-wise with `values`.
    values: The arrays to persist.
    timeout: Timeout forwarded to the bulk write request.

  Returns:
    The future produced by the executable call; await it for completion.
  """
  request = get_bulk_write_request(location, names, values, timeout)
  executable = plugin_executable.PluginExecutable(request)
  # The executable consumes the arrays; only the future is of interest here.
  _, future = executable.call(values)
  return future


def read_one_array(
location: str,
name: str,
Expand All @@ -190,3 +253,27 @@ def read_one_array(
)
read_future.result()
return read_array[0]


def read_arrays(
    location: str,
    names: Sequence[str],
    dtypes: Sequence[np.dtype],
    shapes: Sequence[Sequence[int]],
    shardings: Sequence[jax.sharding.Sharding],
    devices: Union[Sequence[jax.Device], np.ndarray],
    timeout: datetime.timedelta,
) -> Tuple[Sequence[jax.Array], concurrent.futures.Future[None]]:
  """Creates the read array plugin program string, compiles it to an executable, calls it and returns the result.

  Args:
    location: Directory the arrays were persisted under.
    names: One name per array; zipped element-wise with `dtypes`, `shapes`,
      and `shardings`.
    dtypes: Per-array dtypes.
    shapes: Per-array global shapes (each element is itself a shape).
    shardings: Per-array output shardings.
    devices: Devices to restore onto.
    timeout: Timeout forwarded to the bulk read request.

  Returns:
    A tuple of (restored arrays, future); await the future for completion.
  """
  bulk_read_request = get_bulk_read_request(
      location, names, dtypes, shapes, shardings, devices, timeout
  )
  bulk_read_executable = plugin_executable.PluginExecutable(bulk_read_request)
  # One abstract value per requested array so the executable knows the
  # expected output shapes/dtypes.
  out_avals = [
      core.ShapedArray(shape, dtype) for shape, dtype in zip(shapes, dtypes)
  ]
  arrays, read_future = bulk_read_executable.call(
      out_shardings=shardings, out_avals=out_avals
  )
  return (arrays, read_future)
54 changes: 21 additions & 33 deletions pathwaysutils/persistence/pathways_orbax_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def __init__(
self._read_timeout = read_timeout

if use_ocdbt:
raise ValueError('OCDBT not supported for Pathways.')
raise ValueError("OCDBT not supported for Pathways.")
super().__init__()

async def serialize(
Expand All @@ -73,12 +73,10 @@ async def serialize(
type_handlers.check_input_arguments(values, infos, args)

if any([arg.dtype is not None for arg in args]):
raise ValueError('Casting during save not supported for Pathways.')
raise ValueError("Casting during save not supported for Pathways.")

locations, names = extract_parent_dir_and_name(infos)
f = functools.partial(
helper.write_one_array, timeout=self._read_timeout
)
f = functools.partial(helper.write_one_array, timeout=self._read_timeout)
return list(map(f, locations, names, values))

async def deserialize(
Expand All @@ -88,7 +86,7 @@ async def deserialize(
) -> Sequence[jax.Array]:
"""Uses Pathways Persistence API to deserialize a jax array."""
if args is None:
raise ValueError('Must provide ArrayRestoreArgs to restore as jax.Array.')
raise ValueError("Must provide ArrayRestoreArgs to restore as jax.Array.")
type_handlers.check_input_arguments(infos, args)

global_meshes = []
Expand All @@ -101,14 +99,14 @@ async def deserialize(
for arg in args:
if not isinstance(arg, ArrayRestoreArgs):
raise ValueError(
'To restore jax.Array, provide ArrayRestoreArgs; found'
f' {type(arg).__name__}'
"To restore jax.Array, provide ArrayRestoreArgs; found"
f" {type(arg).__name__}"
)
arg = typing.cast(ArrayRestoreArgs, arg)
if arg.sharding is None and (arg.mesh is None or arg.mesh_axes is None):
raise ValueError(
'Sharding of jax.Array cannot be None. Provide `mesh`'
' and `mesh_axes` OR `sharding`.'
"Sharding of jax.Array cannot be None. Provide `mesh`"
" and `mesh_axes` OR `sharding`."
)
if arg.sharding is None:
global_meshes.append(arg.mesh)
Expand All @@ -118,15 +116,15 @@ async def deserialize(
)
else:
if not isinstance(arg.sharding, jax.sharding.NamedSharding):
raise ValueError('Pathways only supports jax.sharding.NamedSharding.')
raise ValueError("Pathways only supports jax.sharding.NamedSharding.")
sharding = typing.cast(jax.sharding.NamedSharding, arg.sharding)
global_meshes.append(sharding.mesh)
mesh_axes.append(sharding.spec)
shardings.append(sharding)
if arg.global_shape is None or arg.dtype is None:
logger.warning(
'Shape or dtype not provided for restoration. Provide these'
' properties for improved performance.'
"Shape or dtype not provided for restoration. Provide these"
" properties for improved performance."
)
should_open_metadata = True
global_shapes.append(arg.global_shape)
Expand All @@ -153,27 +151,17 @@ async def deserialize(
grouped_dtypes = [dtypes[idx] for idx in idxs]
grouped_shardings = [shardings[idx] for idx in idxs]
locations, names = extract_parent_dir_and_name(grouped_infos)
f = functools.partial(
helper.read_one_array,
devices=global_mesh.devices,
grouped_arrays, read_future = helper.read_arrays(
locations[0],
names,
grouped_dtypes,
grouped_global_shapes,
grouped_shardings,
global_mesh.devices,
timeout=self._read_timeout,
)
grouped_arrays = [
f(
location=location,
name=name,
dtype=dtype,
shape=shape,
shardings=sharding,
)
for location, name, dtype, shape, sharding in zip(
locations,
names,
grouped_dtypes,
grouped_global_shapes,
grouped_shardings,
)
]
# each persistence call is awaited serially.
read_future.result()
for idx, arr in zip(idxs, grouped_arrays):
results[idx] = arr
return results # pytype: disable=bad-return-type
Expand All @@ -184,7 +172,7 @@ def register_pathways_handlers(
):
"""Function that must be called before saving or restoring with Pathways."""
logger.debug(
'Registering CloudPathwaysArrayHandler (Pathways Persistence API).'
"Registering CloudPathwaysArrayHandler (Pathways Persistence API)."
)
type_handlers.register_type_handler(
jax.Array,
Expand Down
Loading