Support PJRT C API create_options
#6289
Changes from 6 commits
@@ -1,11 +1,32 @@
 import os
 from torch_xla.experimental import plugins
 from torch_xla._internal import tpu
+import torch_xla.utils.utils as xu


 class GpuPlugin(plugins.DevicePlugin):
   def library_path(self) -> str:
     return os.path.join(os.path.dirname(__file__), 'pjrt_c_api_gpu_plugin.so')

   def physical_chip_count(self) -> int:
     # TODO: default to actual device count
-    return int(os.getenv('GPU_NUM_DEVICES', '1'))
+    return xu.getenv_as('GPU_NUM_DEVICES', int, 1)

+  def client_create_options(self) -> dict:
Review comment: Who will call this?
+    local_process_rank = xu.getenv_as("LOCAL_RANK", int, 0)
+    global_process_rank = xu.getenv_as("RANK", int, local_process_rank)
+    local_world_size = xu.getenv_as("LOCAL_WORLD_SIZE", int, 1)
+    global_world_size = xu.getenv_as("WORLD_SIZE", int, local_world_size)
+
+    # The available options are defined in OpenXLA: https://github.com/openxla/xla/blob/1bb2a74be91fabf5f9aa2702b2592b5b022c9052/xla/pjrt/c/pjrt_c_api_gpu_internal.cc#L58-L67
+    return {
+        "platform_name": "gpu",
+        # TODO(wcromar): make this configurable
+        "allocator": "cuda_async" if xu.getenv_as("PJRT_ALLOCATOR_CUDA_ASYNC", bool, False) else "default",
Review comment: For these 3 settings, is it possible not to use the hardcoded defaults (False, .75, True)? For example, see xla/torch_xla/csrc/runtime/pjrt_registry.cc, lines 22 to 27 at 4bf8d44.
Reply: Good catch. Removed some options entirely when the environment variable is not set.
"memory_fraction": xu.getenv_as("PJRT_ALLOCATOR_FRACTION", float, .75), | ||||||||||||||
"preallocate": xu.getenv_as("PJRT_ALLOCATOR_PREALLOCATE", bool, True), | ||||||||||||||
Review comment: Are these env vars new?
Reply: These all exist in …
"visible_devices": [local_process_rank], | ||||||||||||||
will-cromar marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||
"node_id": global_process_rank, | ||||||||||||||
"num_nodes": global_world_size, | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
def requires_xla_coordinator(self) -> bool: | ||||||||||||||
return True | ||||||||||||||
Review comment: I wonder why this always returns True. For single-process runs we probably don't need the coordinator, so should it depend on whether this is a single-process run?
Reply: I think in a previous draft, I caught this case in …
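As a rough illustration of what client_create_options returns under a torchrun-style launch, here is a self-contained sketch (not part of this PR) that mirrors the logic above with plain os.environ lookups. The env-var names come from the diff; the boolean parsing, the helper name, and the demo values are assumptions.

# Minimal sketch: mirrors client_create_options() above using only the
# standard library, so it can run without torch_xla installed.
import os

def sketch_create_options() -> dict:
  # torchrun populates LOCAL_RANK/RANK/LOCAL_WORLD_SIZE/WORLD_SIZE.
  local_rank = int(os.environ.get('LOCAL_RANK', 0))
  global_rank = int(os.environ.get('RANK', local_rank))
  local_world = int(os.environ.get('LOCAL_WORLD_SIZE', 1))
  global_world = int(os.environ.get('WORLD_SIZE', local_world))

  options = {
      'platform_name': 'gpu',
      'visible_devices': [local_rank],
      'node_id': global_rank,
      'num_nodes': global_world,
  }
  # Per the review discussion, allocator options are only added when the
  # corresponding environment variable is actually set. The '== 1'
  # truthiness check is a simplification of xu.getenv_as.
  if 'PJRT_ALLOCATOR_CUDA_ASYNC' in os.environ:
    options['allocator'] = ('cuda_async'
                            if os.environ['PJRT_ALLOCATOR_CUDA_ASYNC'] == '1'
                            else 'default')
  if 'PJRT_ALLOCATOR_FRACTION' in os.environ:
    options['memory_fraction'] = float(os.environ['PJRT_ALLOCATOR_FRACTION'])
  if 'PJRT_ALLOCATOR_PREALLOCATE' in os.environ:
    options['preallocate'] = os.environ['PJRT_ALLOCATOR_PREALLOCATE'] == '1'
  return options

if __name__ == '__main__':
  # Simulate rank 1 of a 4-process single-host torchrun launch.
  os.environ.update({'LOCAL_RANK': '1', 'RANK': '1',
                     'LOCAL_WORLD_SIZE': '4', 'WORLD_SIZE': '4'})
  print(sketch_create_options())
  # -> {'platform_name': 'gpu', 'visible_devices': [1], 'node_id': 1, 'num_nodes': 4}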
torch_xla/csrc/runtime/pjrt_registry.cc
@@ -14,10 +14,16 @@
 namespace torch_xla {
 namespace runtime {

-std::unordered_map<std::string, std::string> pjrt_plugins_;
-
 namespace {

+struct PluginEntry {
+  std::string library_path;
+  absl::flat_hash_map<std::string, xla::PjRtValueType> create_options;
+  bool init_coordinator;
Review comment: I wonder if we should drop this flag. On TPU it's not currently required, but if that changes, we can always detect the environment from the GCE metadata and set the env vars automatically for the user in a distributed context, since we don't require torchrun for multicontroller execution.
Reply: In this case, I think we still want to keep it. My other idea initially was that we could ask the plugin for the master IP, local rank, global rank, and world size, perhaps just asking …
Reply: I see, that makes sense. Just for context on why I brought this up: JAX recently started requiring the coordinator to be initialized before the backend can be used (even on TPUs), but I'm not sure of the reason. Keeping …
+};
+
+std::unordered_map<std::string, PluginEntry> pjrt_plugins_;
+
 xla::GpuAllocatorConfig GetGpuAllocatorConfig() {
   auto allocator_config = xla::GpuAllocatorConfig{};
   if (sys_util::GetEnvString(env::kEnvPjrtAllocatorCudaAsync, "").empty() &&

@@ -35,17 +41,21 @@ xla::GpuAllocatorConfig GetGpuAllocatorConfig() {
   return allocator_config;
 }

-std::optional<std::string> GetPjRtPluginPath(const std::string& device_type) {
+std::optional<PluginEntry> GetPjRtPlugin(const std::string& device_type) {
   auto plugin_path = pjrt_plugins_.find(device_type);
   return plugin_path != pjrt_plugins_.end() ? std::optional(plugin_path->second)
                                             : std::nullopt;
 }

 }  // namespace

-void RegisterPjRtPlugin(std::string name, std::string library_path) {
+void RegisterPjRtPlugin(
+    std::string name, std::string library_path,
+    absl::flat_hash_map<std::string, xla::PjRtValueType> create_options,
+    bool init_coordinator) {
   TF_VLOG(3) << "Registering PjRt plugin " << name << " at " << library_path;
-  pjrt_plugins_[name] = library_path;
+  pjrt_plugins_[name] = {std::move(library_path), std::move(create_options),
+                         init_coordinator};
 }

 std::tuple<std::unique_ptr<xla::PjRtClient>, std::unique_ptr<XlaCoordinator>>

@@ -54,13 +64,46 @@ InitializePjRt(const std::string& device_type) {
   std::unique_ptr<XlaCoordinator> coordinator;

   if (sys_util::GetEnvBool(env::kEnvPjrtDynamicPlugins, false)) {
-    std::optional<std::string> plugin_path = GetPjRtPluginPath(device_type);
-    if (plugin_path) {
+    std::optional<PluginEntry> plugin = GetPjRtPlugin(device_type);
+    if (plugin) {
       TF_VLOG(1) << "Initializing client for PjRt plugin " << device_type;

+      xla::PjRtClient::KeyValueGetCallback kv_get = nullptr;
+      xla::PjRtClient::KeyValuePutCallback kv_put = nullptr;
+      if (plugin->init_coordinator) {
+        int global_process_rank = sys_util::GetEnvInt("RANK", 0);
+        int global_world_size = sys_util::GetEnvInt("WORLD_SIZE", 1);
Review comment: Didn't we already get it in the …
Reply: Sorry I didn't catch it earlier. What if the user starts single-host training in a non-torchrun way, such as …
Reply: This case is slightly wrong. The precedence should be …
Reply: Shouldn't it be WORLD_SIZE -> PJRT_LOCAL_PROCESS_COUNT or LOCAL_WORLD_SIZE -> 1?
Reply: Yeah, you're right. I tripped over this while testing as well.
+        std::string master_addr =
+            runtime::sys_util::GetEnvString("MASTER_ADDR", "localhost");
+        std::string port = runtime::sys_util::GetEnvString(
+            "XLA_COORDINATOR_PORT", XlaCoordinator::kDefaultCoordinatorPort);
+
+        if (global_world_size > 1) {
+          // Use the XlaCoordinator as the distributed key-value store.
+          coordinator = std::make_unique<XlaCoordinator>(
+              global_process_rank, global_world_size, master_addr, port);
+          std::shared_ptr<xla::DistributedRuntimeClient> distributed_client =
+              coordinator->GetClient();
+          std::string key_prefix = "gpu:";
+          kv_get = [distributed_client, key_prefix](
+                       std::string_view k,
+                       absl::Duration timeout) -> xla::StatusOr<std::string> {
+            return distributed_client->BlockingKeyValueGet(
+                absl::StrCat(key_prefix, k), timeout);
+          };
+          kv_put = [distributed_client, key_prefix](
+                       std::string_view k, std::string_view v) -> xla::Status {
+            return distributed_client->KeyValueSet(absl::StrCat(key_prefix, k),
+                                                   v);
+          };
+        }
+      }
       const PJRT_Api* c_api = *pjrt::LoadPjrtPlugin(
-          absl::AsciiStrToLower(device_type), *plugin_path);
+          absl::AsciiStrToLower(device_type), plugin->library_path);
       XLA_CHECK_OK(pjrt::InitializePjrtPlugin(device_type));
-      client = xla::GetCApiClient(absl::AsciiStrToUpper(device_type)).value();
+      client = xla::GetCApiClient(absl::AsciiStrToUpper(device_type),
+                                  plugin->create_options, kv_get, kv_put)
+                   .value();
       profiler::RegisterProfilerForPlugin(c_api);
     }
   } else if (device_type == "CPU") {
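To summarize the new coordinator wiring in InitializePjRt, here is a small Python sketch (illustrative only, not part of the PR). It reads the same RANK, WORLD_SIZE, MASTER_ADDR, and XLA_COORDINATOR_PORT variables as the diff above; the helper name and the plain dict standing in for the XlaCoordinator-backed key-value store are assumptions.

# Illustrative sketch only: mirrors the init_coordinator branch above.
# A plain dict stands in for the XlaCoordinator/DistributedRuntimeClient
# key-value store; the real C++ code passes kv_get/kv_put callbacks into
# xla::GetCApiClient.
import os

def sketch_coordinator_setup():
  rank = int(os.environ.get('RANK', 0))
  world_size = int(os.environ.get('WORLD_SIZE', 1))
  master_addr = os.environ.get('MASTER_ADDR', 'localhost')
  # The real code falls back to XlaCoordinator::kDefaultCoordinatorPort.
  port = os.environ.get('XLA_COORDINATOR_PORT')

  kv_get = kv_put = None  # like the nullptr defaults above
  if world_size > 1:
    store = {}        # stand-in for the distributed key-value store
    prefix = 'gpu:'   # same key prefix as the C++ lambdas
    kv_get = lambda key: store[prefix + key]
    kv_put = lambda key, value: store.update({prefix + key: value})
  return rank, world_size, (master_addr, port), kv_get, kv_put

# Single process (no WORLD_SIZE set): kv callbacks stay None, so no
# coordinator is started, matching the review discussion above.
print(sketch_coordinator_setup())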
Review comment: Perhaps irrelevant to this PR, but I just want to confirm that the "# TODO: default to actual device count" still holds, since GPU_NUM_DEVICES is not always set and the default value may not be 1.
Review comment (follow-up): I see it's only used in run_multiprocess, so using GPU_NUM_DEVICES preserves the current behavior. Looks good to me then.
Reply: Yeah, this is the same as the current behavior. Ideally this should check the PCI device IDs like we do for TPUs.
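Regarding the TODO discussed in this last thread, one possible way to "default to actual device count" would be to mirror the TPU plugin's PCI-based detection. The sketch below is only an illustration: the sysfs paths, fallback order, and helper names are assumptions, not part of this PR.

# Illustrative sketch only: count NVIDIA display controllers on the PCI bus
# and fall back to GPU_NUM_DEVICES, then 1. Linux-only.
import glob
import os

NVIDIA_VENDOR_ID = '0x10de'

def detected_gpu_count() -> int:
  count = 0
  for device_dir in glob.glob('/sys/bus/pci/devices/*'):
    try:
      with open(os.path.join(device_dir, 'vendor')) as f:
        vendor = f.read().strip()
      with open(os.path.join(device_dir, 'class')) as f:
        device_class = f.read().strip()
    except OSError:
      continue
    # PCI class 0x03xxxx covers display/3D controllers, which excludes the
    # GPU's companion audio functions.
    if vendor == NVIDIA_VENDOR_ID and device_class.startswith('0x03'):
      count += 1
  return count

def physical_chip_count_sketch() -> int:
  if 'GPU_NUM_DEVICES' in os.environ:   # explicit override wins
    return int(os.environ['GPU_NUM_DEVICES'])
  return detected_gpu_count() or 1      # preserve the current default of 1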