diff --git a/torch_xla/csrc/device.cpp b/torch_xla/csrc/device.cpp index 443023009a2..16c77001904 100644 --- a/torch_xla/csrc/device.cpp +++ b/torch_xla/csrc/device.cpp @@ -91,6 +91,12 @@ bool UseVirtualDevice(bool force_spmd) { return use_virtual_device; } +bool IsVirtualDevice(const std::string& device) { + XlaDeviceType hw_type = + static_cast(ParseDeviceString(device).type()); + return hw_type == XlaDeviceType::SPMD; +} + bool GetLockSpmdConfig() { return spmd_config_is_locked; } bool CheckTpuDevice(XlaDeviceType hw_type) { diff --git a/torch_xla/csrc/device.h b/torch_xla/csrc/device.h index 7161b03eb6b..84fdf7f0b16 100644 --- a/torch_xla/csrc/device.h +++ b/torch_xla/csrc/device.h @@ -45,6 +45,9 @@ torch::lazy::BackendDevice GetVirtualDevice(); // Optionally, `force_spmd` to set `use_virtual_device` to true. bool UseVirtualDevice(bool force_spmd = false); +// Return true if `device` is of SPMD device type. +bool IsVirtualDevice(const std::string& device); + // Return true if SPMD config can be switches. That is, no device has been // initialized, yet. bool GetLockSpmdConfig(); diff --git a/torch_xla/csrc/init_python_bindings.cpp b/torch_xla/csrc/init_python_bindings.cpp index 86a08238ec8..c66bfe915b0 100644 --- a/torch_xla/csrc/init_python_bindings.cpp +++ b/torch_xla/csrc/init_python_bindings.cpp @@ -1939,16 +1939,8 @@ void InitXlaModuleBindings(py::module m) { std::vector sharding_specs; sharding_specs.reserve(tensors.size()); for (const at::Tensor& tensor : tensors) { - XLATensorPtr xtensor = bridge::GetXlaTensor(tensor); - XLATensor::ShardingSpecPtr sharding_spec = - xtensor ? xtensor->sharding_spec() : nullptr; - if (sharding_spec != nullptr) { - sharding_specs.push_back( - xla::HloSharding::FromProto(sharding_spec->sharding) - ->ToString()); - } else { - sharding_specs.push_back(""); - } + sharding_specs.push_back( + GetXLAShardingSpec(bridge::GetXlaTensor(tensor))); } return sharding_specs; }); diff --git a/torch_xla/csrc/ops/device_data.h b/torch_xla/csrc/ops/device_data.h index dc70924be3d..217f2cef197 100644 --- a/torch_xla/csrc/ops/device_data.h +++ b/torch_xla/csrc/ops/device_data.h @@ -36,12 +36,10 @@ class DeviceData : public XlaNode { // backend data with a partitioned one in the node operands. Note that // this is permitted only if the node holds a placeholder. void Assign(std::shared_ptr data) { - // TODO(yeounoh) check if the existing data is a placeholder after we - // address the issue where some of the sync tensors spill with device node. XLA_CHECK(data->shape() == data_->shape()) << "Shape mismatch: expected (" << data_->shape().to_string() << "), actual (" << data->shape().to_string() << ")"; - data_ = data; + data_.reset(data.get()); } static DeviceData* Cast(const torch::lazy::Node* node); diff --git a/torch_xla/csrc/tensor_util.cpp b/torch_xla/csrc/tensor_util.cpp index 9c00e212ccc..db9d8390724 100644 --- a/torch_xla/csrc/tensor_util.cpp +++ b/torch_xla/csrc/tensor_util.cpp @@ -689,10 +689,13 @@ std::vector CreateTensorsData( return {}; } - // We assume that caller can't mix virtual device and real device. - if (devices[0] == "SPMD:0") { - // When running in SPMD mode, tensors here in the unsharded - // CreateTensorsData should be implicitly replicated to all devices. + // CreateTensorsData should be implicitly replicated to all devices. + if (IsVirtualDevice(devices[0])) { + XLA_CHECK( + std::all_of(devices.begin(), devices.end(), + [&](const std::string& s) { return s == devices[0]; })) + << "can't mix virtual device and real device."; + std::vector local_devices = runtime::GetComputationClient()->GetLocalDevices(); std::vector handles; diff --git a/torch_xla/csrc/xla_graph_executor.cpp b/torch_xla/csrc/xla_graph_executor.cpp index 22e747b1362..dd5e3faff59 100644 --- a/torch_xla/csrc/xla_graph_executor.cpp +++ b/torch_xla/csrc/xla_graph_executor.cpp @@ -1279,9 +1279,9 @@ XLAGraphExecutor::BuildInputOutputAliases( } XLAGraphExecutor::CompilationResult XLAGraphExecutor::Compile( - const std::vector& tensors, - absl::Span devices, const SyncTensorCollection& coll, - PostOrderData* po_data, const std::vector& ir_values) { + std::vector& tensors, absl::Span devices, + const SyncTensorCollection& coll, PostOrderData* po_data, + const std::vector& ir_values) { tsl::profiler::TraceMe activity( [&] { return tsl::profiler::TraceMeEncode( diff --git a/torch_xla/csrc/xla_graph_executor.h b/torch_xla/csrc/xla_graph_executor.h index 4d7d29c1b14..5e3fed65d9e 100644 --- a/torch_xla/csrc/xla_graph_executor.h +++ b/torch_xla/csrc/xla_graph_executor.h @@ -346,7 +346,9 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor { std::vector SetBufferDonors(LoweringContext* lowering_ctx); // We don't use upstream Compile to have BuildInputOutputAliases. - CompilationResult Compile(const std::vector& tensors, + // TODO(yeounoh) auto-sharding can change tensors shardings, which needs to be + // accounted for in Dynamo integration. + CompilationResult Compile(std::vector& tensors, absl::Span devices, const SyncTensorCollection& coll, PostOrderData* po_data,