From 55bb502f905c63ec5e7b7883838a2fe03f004eaa Mon Sep 17 00:00:00 2001
From: Oleg Milyutin <omilyutin@tenstorrent.com>
Date: Tue, 17 Dec 2024 10:43:36 -0500
Subject: [PATCH] #0: Remove some dead code (#16084)

1. `Tensor::deepcopy`, which doesn't actually perform a deep copy and
isn't used anywhere.
2. `cpu_sharded` / `to_host_sharded`, which don't work due to the
deprecation of slow dispatch and were previously used for debugging.

### Checklist
- [x] [Post commit CI
passes](https://github.com/tenstorrent/tt-metal/actions/runs/12365560530)
(failure unrelated)
---
 ttnn/cpp/pybind11/pytensor.cpp               |  7 ----
 ttnn/cpp/ttnn/tensor/tensor.cpp              | 16 +--------
 ttnn/cpp/ttnn/tensor/tensor.hpp              |  4 ---
 ttnn/cpp/ttnn/tensor/tensor_impl.cpp         | 37 --------------------
 ttnn/cpp/ttnn/tensor/tensor_impl.hpp         |  3 --
 ttnn/cpp/ttnn/tensor/tensor_impl_wrapper.hpp |  1 -
 ttnn/cpp/ttnn/tensor/tensor_ops.cpp          |  9 -----
 ttnn/cpp/ttnn/tensor/tensor_ops.hpp          |  2 --
 8 files changed, 1 insertion(+), 78 deletions(-)
diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp
index 68a507a2c4a..cd381eabab5 100644
--- a/ttnn/cpp/pybind11/pytensor.cpp
+++ b/ttnn/cpp/pybind11/pytensor.cpp
@@ -1161,13 +1161,6 @@ void pytensor_module(py::module& m_tensor) {
 
                 tt_tensor = tt_tensor.cpu()
         )doc")
-        .def("cpu_sharded", &Tensor::cpu_sharded, R"doc(
-            Move TT Tensor from TT accelerator device to host device in sharded orientation.
-
-            .. code-block:: python
-
-                tt_tensor = tt_tensor.cpu_sharded()
-        )doc")
         .def(
             "to",
             py::overload_cast<Layout, Device*>(&Tensor::to, py::const_),
diff --git a/ttnn/cpp/ttnn/tensor/tensor.cpp b/ttnn/cpp/ttnn/tensor/tensor.cpp
index 689cc127d34..1057b183496 100644
--- a/ttnn/cpp/ttnn/tensor/tensor.cpp
+++ b/ttnn/cpp/ttnn/tensor/tensor.cpp
@@ -482,21 +482,9 @@ void Tensor::perform_cleanup_for_async_mode() {
     }
 }
 
-void Tensor::deepcopy(const Tensor& other) {
-    ZoneScoped;
-    // Wait until the tensor being copied is populated
-    other.wait_for_tensor_data_populated();
-    // Populate tensor metadata
-    this->set_storage(other.get_storage());
-    this->set_tensor_spec(other.get_tensor_spec());
-    // Set metadata populated flag for getters
-    this->tensor_attributes->num_workers_completed++;
-}
-
 void Tensor::populate_buffers_and_metadata(const Tensor& other) {
     ZoneScoped;
-    // Similar to deepcopy, but to be applied on a tensor that has an empty storage
-    // container initialized. Require tensor storage to be correctly initialized.
+    // Applied on a tensor that has an empty storage container initialized.
     this->set_tensor_spec(other.get_tensor_spec());
     // Populate storage container with buffers + shapes
     std::visit(
@@ -698,8 +686,6 @@ Tensor Tensor::cpu(bool blocking, uint8_t cq_id, const std::vector<SubDeviceId>&
     return tensor_ops::tensor_cpu(*this, blocking, cq_id, sub_device_ids);
 }
 
-Tensor Tensor::cpu_sharded() const { return tensor_ops::tensor_cpu_sharded(*this); }
-
 Tensor Tensor::extract_shard(const CoreCoord& core) const {
     ZoneScoped;
     const auto& buffer_page_mapping = *this->buffer()->get_buffer_page_mapping();
diff --git a/ttnn/cpp/ttnn/tensor/tensor.hpp b/ttnn/cpp/ttnn/tensor/tensor.hpp
index 30e18978b8e..6827c421320 100644
--- a/ttnn/cpp/ttnn/tensor/tensor.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor.hpp
@@ -131,8 +131,6 @@ struct Tensor {
 
     void perform_cleanup_for_async_mode();
 
-    void deepcopy(const Tensor& other);
-
     void populate_buffers_and_metadata(const Tensor& other);
 
     void deallocate(bool force = false);
@@ -209,8 +207,6 @@ struct Tensor {
         uint8_t cq_id = ttnn::DefaultQueueId,
         const std::vector<SubDeviceId>& sub_device_ids = {}) const;
 
-    Tensor cpu_sharded() const;
-
     Tensor unpad(const ttnn::SimpleShape& output_tensor_start, const ttnn::SimpleShape& output_tensor_end) const;
 
     Tensor pad_to_tile(float pad_value) const;
diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp
index dc7545ac0e5..3f731c97c65 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_impl.cpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_impl.cpp
@@ -633,43 +633,6 @@ Tensor to_host<bfloat8_b>(
     return to_host<uint32_t>(tensor, blocking, cq_id, sub_device_ids);
 }
 
-// ======================================================================================
-//                                  .to_host_sharded()
-// ======================================================================================
-
-template <typename T>
-Tensor to_host_sharded(const Tensor& tensor) {
-    TT_ASSERT(tensor.is_allocated(), "Buffer must be allocated on device!");
-    auto device_buffer = tensor.buffer();
-    auto device = tensor.device();
-    TT_ASSERT(device != nullptr && "Need device to be set copy data from device to host!");
-    std::vector<T> data_vec;
-    const char* TT_METAL_SLOW_DISPATCH_MODE = std::getenv("TT_METAL_SLOW_DISPATCH_MODE");
-    if (TT_METAL_SLOW_DISPATCH_MODE == nullptr) {
-        TT_THROW("FAST_DISPATCH is not supported for to_host_sharded!");
-    }
-    ::detail::ReadFromBuffer(*device_buffer, data_vec, true);
-    auto output_buffer = owned_buffer::create<T>(std::move(data_vec));
-    return Tensor(OwnedStorage{output_buffer}, tensor.get_tensor_spec());
-}
-
-template Tensor to_host_sharded<bfloat16>(const Tensor& tensor);
-template Tensor to_host_sharded<float>(const Tensor& tensor);
-template Tensor to_host_sharded<int32_t>(const Tensor& tensor);
-template Tensor to_host_sharded<uint32_t>(const Tensor& tensor);
-template Tensor to_host_sharded<uint16_t>(const Tensor& tensor);
-template Tensor to_host_sharded<uint8_t>(const Tensor& tensor);
-
-template <>
-Tensor to_host_sharded<bfloat4_b>(const Tensor& tensor) {
-    return to_host_sharded<uint32_t>(tensor);
-}
-
-template <>
-Tensor to_host_sharded<bfloat8_b>(const Tensor& tensor) {
-    return to_host_sharded<uint32_t>(tensor);
-}
-
 // ======================================================================================
 //                               .to_device() details
 // ======================================================================================
diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp
index 87c34bdb199..0ceb2b9c1d1 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_impl.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_impl.hpp
@@ -191,9 +191,6 @@ Tensor to_host(
     uint8_t cq_id = ttnn::DefaultQueueId,
     tt::stl::Span<const SubDeviceId> sub_device_ids = {});
 
-template <typename T>
-Tensor to_host_sharded(const Tensor& tensor);
-
 template <typename T>
 Tensor to_device(
     const Tensor& tensor,
diff --git a/ttnn/cpp/ttnn/tensor/tensor_impl_wrapper.hpp b/ttnn/cpp/ttnn/tensor/tensor_impl_wrapper.hpp
index 6ab8a8dae75..9cf4c810591 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_impl_wrapper.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_impl_wrapper.hpp
@@ -39,7 +39,6 @@ inline size_t packed_buffer_size_bytes_wrapper(DataType dtype, size_t volume_unp
 
 WRAP_FUNCTION(to_host)
 WRAP_FUNCTION(extract_shard)
-WRAP_FUNCTION(to_host_sharded)
 WRAP_FUNCTION(to_device)
 WRAP_FUNCTION(to_layout)
 WRAP_FUNCTION(pad)
diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp
index 96b53b87901..c2df9f3e430 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_ops.cpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_ops.cpp
@@ -160,15 +160,6 @@ Tensor tensor_cpu(
     return host_tensor;
 }
 
-Tensor tensor_cpu_sharded(const Tensor& input_tensor) {
-    ZoneScoped;
-    GraphTracker::instance().track_function_start("Tensor::cpu_sharded", input_tensor);
-    auto output = tensor_impl::to_host_sharded_wrapper(input_tensor);
-    output = tt::tt_metal::set_tensor_id(output);
-    GraphTracker::instance().track_function_end(output);
-    return output;
-}
-
 Tensor tensor_to(const Tensor& input_tensor, Layout target_layout, Device* worker) {
     ZoneScoped;
     GraphTracker::instance().track_function_start("Tensor::to", input_tensor, target_layout, worker);
diff --git a/ttnn/cpp/ttnn/tensor/tensor_ops.hpp b/ttnn/cpp/ttnn/tensor/tensor_ops.hpp
index b8edff425f8..b65af33cb42 100644
--- a/ttnn/cpp/ttnn/tensor/tensor_ops.hpp
+++ b/ttnn/cpp/ttnn/tensor/tensor_ops.hpp
@@ -41,8 +41,6 @@ Tensor tensor_to(const Tensor& input_tensor, Layout target_layout, distributed::
 Tensor tensor_cpu(
     const Tensor& input_tensor, bool blocking, uint8_t cq_id, const std::vector<SubDeviceId>& sub_device_ids);
 
-Tensor tensor_cpu_sharded(const Tensor& input_tensor);
-
 void tensor_print(const Tensor& input_tensor);
 
 Tensor tensor_pad(