From 10d3cdb7c249103f837acb88d1f429831e286cd6 Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Mon, 30 Oct 2023 22:13:09 +0000 Subject: [PATCH 1/2] Destroy the ComputationClient when the program exits --- torch_xla/csrc/init_python_bindings.cpp | 1 - torch_xla/csrc/runtime/computation_client.h | 2 -- torch_xla/csrc/runtime/pjrt_computation_client.h | 2 -- torch_xla/csrc/runtime/runtime.cc | 12 ++++++------ 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp index 1884310c5fd..1e6bb020fe5 100644 --- a/torch_xla/csrc/init_python_bindings.cpp +++ b/torch_xla/csrc/init_python_bindings.cpp @@ -95,7 +95,6 @@ void PrepareToExit() { runtime::GetComputationClientIfInitialized(); if (client != nullptr) { XLAGraphExecutor::Get()->WaitDeviceOps({}); - client->PrepareToExit(); } } diff --git a/torch_xla/csrc/runtime/computation_client.h b/torch_xla/csrc/runtime/computation_client.h index 1e610be7959..db4bac21916 100644 --- a/torch_xla/csrc/runtime/computation_client.h +++ b/torch_xla/csrc/runtime/computation_client.h @@ -344,8 +344,6 @@ class ComputationClient { virtual MemoryInfo GetMemoryInfo(const std::string& device) = 0; - virtual void PrepareToExit() = 0; - // Block until pass in devices' async operation are finished. If empty, all // the local devices will be waited for. 
virtual void WaitDeviceOps(const std::vector<std::string>& devices) = 0; diff --git a/torch_xla/csrc/runtime/pjrt_computation_client.h b/torch_xla/csrc/runtime/pjrt_computation_client.h index d7a11611a03..f4fc73bb79e 100644 --- a/torch_xla/csrc/runtime/pjrt_computation_client.h +++ b/torch_xla/csrc/runtime/pjrt_computation_client.h @@ -85,8 +85,6 @@ class PjRtComputationClient : public ComputationClient { std::shared_ptr<std::vector<std::string>> GetReplicationDevices() override; - void PrepareToExit() override { return; }; - void WaitDeviceOps(const std::vector<std::string>& devices) override; std::map<std::string, Metric> GetMetrics() const override; diff --git a/torch_xla/csrc/runtime/runtime.cc b/torch_xla/csrc/runtime/runtime.cc index 8cfd0695184..2ae59aa7da4 100644 --- a/torch_xla/csrc/runtime/runtime.cc +++ b/torch_xla/csrc/runtime/runtime.cc @@ -10,10 +10,11 @@ namespace torch_xla { namespace runtime { namespace { -std::atomic<ComputationClient*> g_computation_client(nullptr); -std::once_flag g_computation_client_once; +std::atomic<bool> g_computation_client_initialized(false); ComputationClient* CreateClient() { + bool was_initialized = g_computation_client_initialized.exchange(true); + XLA_CHECK(!was_initialized) << "ComputationClient already initialized"; if (sys_util::GetEnvBool("XLA_DUMP_FATAL_STACK", false)) { tsl::testing::InstallStacktraceHandler(); } @@ -34,13 +35,12 @@ ComputationClient* CreateClient() { } // namespace ComputationClient* GetComputationClient() { - std::call_once(g_computation_client_once, - [&]() { g_computation_client = std::move(CreateClient()); }); - return g_computation_client.load(); + static auto client = std::unique_ptr<ComputationClient>(CreateClient()); + return client.get(); } ComputationClient* GetComputationClientIfInitialized() { - return g_computation_client.load(); + return g_computation_client_initialized ?
GetComputationClient() : nullptr; } } // namespace runtime From 022d8540900879c4c13fe39d0979b23b58f0aef2 Mon Sep 17 00:00:00 2001 From: Will Cromar Date: Tue, 31 Oct 2023 17:18:25 +0000 Subject: [PATCH 2/2] Fix extra error when PJRT_DEVICE is not set --- torch_xla/csrc/runtime/runtime.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/torch_xla/csrc/runtime/runtime.cc b/torch_xla/csrc/runtime/runtime.cc index 2ae59aa7da4..69e5bb74319 100644 --- a/torch_xla/csrc/runtime/runtime.cc +++ b/torch_xla/csrc/runtime/runtime.cc @@ -24,6 +24,7 @@ ComputationClient* CreateClient() { if (sys_util::GetEnvString(env::kEnvPjRtDevice, "") != "") { client = new PjRtComputationClient(); } else { + g_computation_client_initialized = false; XLA_ERROR() << "$PJRT_DEVICE is not set." << std::endl; }