From d65bc5e609bb664b0f1f2d960d16c8c306a6c710 Mon Sep 17 00:00:00 2001 From: Borys Bradel <164946524+bbradelTT@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:51:22 -0500 Subject: [PATCH 01/13] #15106: add missing tile_c_dim to packer config call (#15920) ### Ticket Link to Github Issue #15106 ### Problem description - a parameter was not passed on when setting the config ### What's changed - pass on the parameter - remove skip for now passing test ### Checklist - [ ] Post commit CI passes N/A BH only changes - [x] Blackhole Post commit (if applicable) https://github.com/tenstorrent/tt-metal/actions/runs/12282755287 - [ ] Model regression CI testing passes (if applicable) N/A - [ ] Device performance regression CI testing passes (if applicable) N/A - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes N/A - [x] New/Existing tests provide coverage for changes --- .../python_api_testing/unit_testing/misc/test_sharded.py | 1 - tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py index 6eb3991d34da..88bafe821b3f 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py @@ -1845,7 +1845,6 @@ def test_sharded_tilize_with_val_padding(input_shape, sharding_config, output_dt assert passing -@skip_for_blackhole("GH #15234") @pytest.mark.parametrize("N", [8, 16]) @pytest.mark.parametrize("in_sharded", [True], ids=["in0_sharded"]) @pytest.mark.parametrize("out_sharded", [True], ids=["out_sharded"]) diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h index 99dd5e81e54f..62d85c771b02 100644 --- a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_pack_api.h @@ -343,6 +343,7 @@ inline void llk_pack_reduce_config_v2(uint32_t icb_out) { if constexpr (at_kernel_start) { const std::uint32_t output_id = get_output_id(icb_out); const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); + const std::uint32_t tile_c_dim = get_output_tile_c_dim(output_id); const std::uint32_t num_faces = get_output_num_faces(output_id); const bool partial_face = get_output_partial_face(output_id); const bool narrow_tile = get_output_narrow_tile(output_id); @@ -358,6 +359,7 @@ inline void llk_pack_reduce_config_v2(uint32_t icb_out) { pack_dst_format[output_id], tile_size, face_r_dim, + tile_c_dim, num_faces, partial_face, narrow_tile, From 529d6f8c13ae5a518cf0270157099f4ad522396a Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Wed, 11 Dec 2024 15:52:24 -0500 Subject: [PATCH 02/13] disambiguate semaphore lookup by including core type arg (#15133) When looking up semaphores, we now also specify which core type we are specifically looking at. There are multiple core types, each with there own logical core mappings. Logical core locations can alias between core types, even though they map to unique noc locations. 
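For illustration, a minimal sketch of the disambiguated lookup (the `CoreType::WORKER`/`CoreType::ETH` enumerators, the include path, and the wrapper function are assumptions for the example, not part of this change):

```cpp
#include <cstddef>

#include "tt_metal/impl/program/program.hpp"

using namespace tt::tt_metal;

// Sketch: the same logical (x, y) can now be queried separately per core type.
std::size_t count_semaphores_at(const Program& program, const CoreCoord& logical_core) {
    auto worker_semaphores = program.semaphores_on_core(logical_core, CoreType::WORKER);
    auto eth_semaphores = program.semaphores_on_core(logical_core, CoreType::ETH);

    // Before this change, both lookups went through semaphores_on_core(logical_core)
    // and could not tell aliasing core types apart.
    return worker_semaphores.size() + eth_semaphores.size();
}
```

Internally, `semaphores_on_core` now filters on `Semaphore::core_type()` in addition to the logical coordinate, and `get_semaphore_id` threads the core type through from `CreateSemaphore`.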
--- tt_metal/impl/program/program.cpp | 14 +++++++------- tt_metal/impl/program/program.hpp | 4 ++-- tt_metal/tt_metal.cpp | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 9ef41bce94f1..2ea7d2cd21a7 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -117,7 +117,7 @@ class Program_ { std::vector circular_buffers_unique_coreranges() const; - std::vector> semaphores_on_core(const CoreCoord &core) const; + std::vector> semaphores_on_core(const CoreCoord &core, CoreType core_type) const; size_t num_semaphores () const; void init_semaphores ( const Device & device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const; @@ -830,18 +830,18 @@ void detail::Program_::validate_circular_buffer_region(const Device *device) { } } -size_t Program::num_semaphores(const CoreCoord &core) const { return semaphores_on_core(core).size(); } +size_t Program::num_semaphores(const CoreCoord &core, CoreType core_type) const { return semaphores_on_core(core, core_type).size(); } size_t detail::Program_::num_semaphores() const { return semaphores_.size(); } size_t Program::num_semaphores() const { return pimpl_->num_semaphores(); } void detail::Program_::init_semaphores(const Device &device, const CoreCoord &logical_core, uint32_t programmable_core_type_index) const { - auto semaphores_on_core = this->semaphores_on_core(logical_core); uint64_t kernel_config_base = hal.get_dev_addr(programmable_core_type_index, HalL1MemAddrType::KERNEL_CONFIG); uint64_t addr = kernel_config_base + this->program_configs_[programmable_core_type_index].sem_offset; CoreType core_type = hal.get_core_type(programmable_core_type_index); + auto semaphores_on_core = this->semaphores_on_core(logical_core, core_type); for (auto semaphore : semaphores_on_core) { llrt::write_hex_vec_to_core( device.id(), @@ -1772,18 +1772,18 @@ void detail::Program_::release_buffers() { owned_buffer_pool = {}; } void Program::release_buffers() { pimpl_->release_buffers(); } -std::vector> detail::Program_::semaphores_on_core(const CoreCoord &core) const { +std::vector> detail::Program_::semaphores_on_core(const CoreCoord &core, CoreType core_type) const { std::vector> semaphores; for (const Semaphore &s : this->semaphores_) { - if (s.initialized_on_logical_core(core)) { + if (s.initialized_on_logical_core(core) && s.core_type() == core_type) { semaphores.emplace_back(std::cref(s)); } } return semaphores; } -std::vector> Program::semaphores_on_core(const CoreCoord &core) const { - return pimpl_->semaphores_on_core(core); +std::vector> Program::semaphores_on_core(const CoreCoord &core, CoreType core_type) const { + return pimpl_->semaphores_on_core(core, core_type); } bool detail::Program_::is_finalized() const { return this->finalized_; } diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index bd1a1c027805..2305723485b4 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -135,9 +135,9 @@ class Program { std::vector circular_buffers_unique_coreranges() const; - std::vector> semaphores_on_core(const CoreCoord &core) const; + std::vector> semaphores_on_core(const CoreCoord &core, CoreType core_type) const; - size_t num_semaphores ( const CoreCoord & core ) const; + size_t num_semaphores ( const CoreCoord & core, CoreType core_type ) const; size_t num_semaphores () const; void init_semaphores ( const Device & device, const CoreCoord 
&logical_core, uint32_t programmable_core_type_index) const; // XXXXX TODO: this should return a const reference diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index b5402a3ac05a..b8fdf165c52c 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -162,13 +162,13 @@ void ConfigureKernelGroup( } } -std::optional get_semaphore_id(const Program& program, const CoreRange& core_range) { +std::optional get_semaphore_id(const Program &program, const CoreRange& core_range, CoreType core_type) { std::optional semaphore_id = std::nullopt; std::vector semaphore_histogram(NUM_SEMAPHORES, 0); for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord logical_core(x, y); - auto semaphores = program.semaphores_on_core(logical_core); + auto semaphores = program.semaphores_on_core(logical_core, core_type); if (semaphores.size() == NUM_SEMAPHORES) { TT_THROW( "Cannot add semaphore on core {}. Max number of semaphores ({}) reached!", @@ -1158,7 +1158,7 @@ uint32_t CreateSemaphore( for (const auto& core_range : crs.ranges()) { CoreCoord start_core = core_range.start_coord; CoreCoord end_core = core_range.end_coord; - std::optional semaphore_id_candidate = get_semaphore_id(program, core_range); + std::optional semaphore_id_candidate = get_semaphore_id(program, core_range, core_type); if (!semaphore_id.has_value()) { semaphore_id = semaphore_id_candidate; } else { From aa18d6b09bed62f2db6de9b8b2352e35da528a1a Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Wed, 11 Dec 2024 15:57:59 -0500 Subject: [PATCH 03/13] Add less and emacs by request (#15870) ### Ticket #15869 ### Problem description Users of IRD want some extra tools ### What's changed Added less Added emacs Sorted the list --- scripts/docker/requirements_dev.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/docker/requirements_dev.txt b/scripts/docker/requirements_dev.txt index e32fe8958d24..e7029ab3bc7f 100644 --- a/scripts/docker/requirements_dev.txt +++ b/scripts/docker/requirements_dev.txt @@ -1,7 +1,9 @@ -sudo -nano acl +emacs jq +less +libmpfr-dev +nano openssh-server +sudo vim -libmpfr-dev From beba7fe2e1f500f3e202b9dd254d0632b0510e80 Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Wed, 11 Dec 2024 16:52:21 -0500 Subject: [PATCH 04/13] #0: [skip ci] Bump required TTKMD and tt-firmware versions for WH --- INSTALLING.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALLING.md b/INSTALLING.md index 76f8cecd6277..09092083db98 100644 --- a/INSTALLING.md +++ b/INSTALLING.md @@ -20,9 +20,9 @@ Note the current compatability matrix: | Device | OS | Python | Driver (TT-KMD) | Firmware (TT-Flash) | TT-SMI | TT-Topology | |---------------------|-----------------|----------|--------------------|--------------------------------------------|-----------------------|--------------------------------| -| Grayskull | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.9.0.0 (v80.9.0.0) | v2.2.0 or above | N/A | -| Wormhole | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.10.0.0 (v80.10.0.0) | v2.2.0 or above | N/A | -| T3000 (Wormhole) | Ubuntu 20.04 | 3.8.10 | v1.27.1 | fw_pack-80.10.0.0 (v80.10.0.0) | v2.2.0 or above | v1.1.3 or above, `mesh` config | +| Grayskull | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.9.0.0 (v80.9.0.0) | v2.2.0 or above | N/A | +| Wormhole | Ubuntu 20.04 | 3.8.10 | v1.29 | fw_pack-80.13.0.0 (v80.13.0.0) | v2.2.0 or above | N/A | +| T3000 (Wormhole) | Ubuntu 
20.04 | 3.8.10 | v1.29 | fw_pack-80.13.0.0 (v80.13.0.0) | v2.2.0 or above | v1.1.3 or above, `mesh` config | --- From f37ad77bd449689523df437c91fb447eb3f3db6b Mon Sep 17 00:00:00 2001 From: Oleg Milyutin Date: Wed, 11 Dec 2024 16:58:40 -0500 Subject: [PATCH 05/13] #0: Add gmock to tt_metal test infra (#15890) Added sample uses in `tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp` and `tt-train/tests/core/distributed_test.cpp` as a demo and confirm all is working. ### Checklist [X] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12283753251) --- tests/CMakeLists.txt | 3 +-- tests/ttnn/CMakeLists.txt | 2 +- .../tensor/test_create_tensor_multi_device.cpp | 14 ++++++++------ tt-train/tests/CMakeLists.txt | 2 +- tt-train/tests/core/distributed_test.cpp | 16 +++++++++++----- 5 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 544d624088c8..6a15d0c6db40 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,8 +5,7 @@ target_link_libraries( test_common_libs INTERFACE pthread - gtest - gtest_main + gmock_main magic_enum fmt::fmt-header-only span diff --git a/tests/ttnn/CMakeLists.txt b/tests/ttnn/CMakeLists.txt index c14a587dd727..3117e6b89205 100644 --- a/tests/ttnn/CMakeLists.txt +++ b/tests/ttnn/CMakeLists.txt @@ -6,7 +6,7 @@ function(setup_ttnn_test_target target_name) test_common_libs ttnn Metalium::Metal - GTest::gtest_main + GTest::gmock_main ) target_include_directories( ${target_name} diff --git a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp index f4279cc87535..7ef367335f6e 100644 --- a/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp +++ b/tests/ttnn/unit_tests/gtests/tensor/test_create_tensor_multi_device.cpp @@ -8,6 +8,7 @@ #include "buffers/buffer_constants.hpp" #include "gtest/gtest.h" +#include "gmock/gmock.h" #include "ttnn/cpp/ttnn/operations/creation.hpp" #include "ttnn/cpp/ttnn/tensor/types.hpp" #include "ttnn/distributed/api.hpp" @@ -17,6 +18,7 @@ namespace ttnn::distributed::test { namespace { +using ::testing::SizeIs; using ::tt::tt_metal::BufferType; using ::tt::tt_metal::Layout; using ::tt::tt_metal::MemoryConfig; @@ -57,7 +59,7 @@ TEST_P(MultiDeviceTensorCreationTest, EmptyLike) { MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}); EXPECT_EQ(tensor.storage_type(), StorageType::DEVICE); - EXPECT_EQ(tensor.get_workers().size(), 1); + EXPECT_THAT(tensor.get_workers(), SizeIs(1)); const Tensor mesh_replicated_tensor = ttnn::empty_like( tensor, @@ -67,7 +69,7 @@ TEST_P(MultiDeviceTensorCreationTest, EmptyLike) { MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}); EXPECT_EQ(mesh_replicated_tensor.storage_type(), StorageType::MULTI_DEVICE); - EXPECT_EQ(mesh_replicated_tensor.get_workers().size(), mesh_device->num_devices()); + EXPECT_THAT(mesh_replicated_tensor.get_workers(), SizeIs(mesh_device->num_devices())); const auto distributed_tensor_config = get_distributed_tensor_config_from_tensor(mesh_replicated_tensor); EXPECT_TRUE(std::holds_alternative(distributed_tensor_config)); @@ -86,7 +88,7 @@ TEST_P(MultiDeviceTensorCreationTest, Full) { MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}); EXPECT_EQ(mesh_replicated_tensor.storage_type(), StorageType::MULTI_DEVICE); - EXPECT_EQ(mesh_replicated_tensor.get_workers().size(), mesh_device->num_devices()); 
+ EXPECT_THAT(mesh_replicated_tensor.get_workers(), SizeIs(mesh_device->num_devices())); EXPECT_EQ(mesh_replicated_tensor.shape(), ttnn::SimpleShape({32, 32})); EXPECT_EQ(mesh_replicated_tensor.dtype(), DataType::BFLOAT16); EXPECT_EQ(mesh_replicated_tensor.layout(), Layout::ROW_MAJOR); @@ -109,7 +111,7 @@ TEST_P(MultiDeviceTensorCreationTest, FullLike) { MemoryConfig{TensorMemoryLayout::INTERLEAVED, BufferType::DRAM, std::nullopt}); EXPECT_EQ(tensor.storage_type(), StorageType::DEVICE); - EXPECT_EQ(tensor.get_workers().size(), 1); + EXPECT_THAT(tensor.get_workers(), SizeIs(1)); Tensor mesh_replicated_tensor = ttnn::full_like( tensor, @@ -119,7 +121,7 @@ TEST_P(MultiDeviceTensorCreationTest, FullLike) { std::ref(*mesh_device)); EXPECT_EQ(mesh_replicated_tensor.storage_type(), StorageType::MULTI_DEVICE); - EXPECT_EQ(mesh_replicated_tensor.get_workers().size(), mesh_device->num_devices()); + EXPECT_THAT(mesh_replicated_tensor.get_workers(), SizeIs(mesh_device->num_devices())); EXPECT_EQ(mesh_replicated_tensor.shape(), tensor.shape()); EXPECT_EQ(mesh_replicated_tensor.dtype(), tensor.dtype()); EXPECT_EQ(mesh_replicated_tensor.layout(), tensor.layout()); @@ -161,7 +163,7 @@ TEST_P(MultiDeviceTensorCreationTest, FullLikeWithOptTensor) { opt_output); EXPECT_EQ(mesh_replicated_tensor.storage_type(), StorageType::MULTI_DEVICE); - EXPECT_EQ(mesh_replicated_tensor.get_workers().size(), mesh_device->num_devices()); + EXPECT_THAT(mesh_replicated_tensor.get_workers(), SizeIs(mesh_device->num_devices())); EXPECT_EQ(mesh_replicated_tensor.shape(), tensor.shape()); EXPECT_EQ(mesh_replicated_tensor.dtype(), tensor.dtype()); EXPECT_EQ(mesh_replicated_tensor.layout(), tensor.layout()); diff --git a/tt-train/tests/CMakeLists.txt b/tt-train/tests/CMakeLists.txt index 0faac2d3ee35..5eee6b8e77b8 100644 --- a/tt-train/tests/CMakeLists.txt +++ b/tt-train/tests/CMakeLists.txt @@ -13,7 +13,7 @@ file( add_executable(ttml_tests ${SOURCES}) target_link_libraries( ttml_tests - GTest::gtest_main + GTest::gmock_main ttml ) add_definitions(-DTEST_DATA_DIR="${CMAKE_SOURCE_DIR}/data") diff --git a/tt-train/tests/core/distributed_test.cpp b/tt-train/tests/core/distributed_test.cpp index 0f304788ca38..4d9bc0e8ae6a 100644 --- a/tt-train/tests/core/distributed_test.cpp +++ b/tt-train/tests/core/distributed_test.cpp @@ -2,12 +2,17 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include #include #include "core/distributed_mapping.hpp" +namespace { + +using ::testing::SizeIs; + template class MeshOpsTest : public ::testing::Test { protected: @@ -25,7 +30,7 @@ TYPED_TEST(MeshOpsTest, ChunkBasicNonDivisible3) { // Chunk into 3 parts along dimension 0 auto chunks = ttml::core::chunk(tensor, 3, 0); - ASSERT_EQ(chunks.size(), 3u); + ASSERT_THAT(chunks, SizeIs(3)); EXPECT_EQ(chunks[0].shape()[0], 4u); // first chunk size 4 EXPECT_EQ(chunks[1].shape()[0], 4u); // next chunk size 4 EXPECT_EQ(chunks[2].shape()[0], 2u); // last chunk size 2 @@ -38,7 +43,7 @@ TYPED_TEST(MeshOpsTest, ChunkBasicLessChunksThanProvided) { // Chunk into 6 parts along dimension 0 auto chunks = ttml::core::chunk(tensor, 6, 0); - ASSERT_EQ(chunks.size(), 5u); + ASSERT_THAT(chunks, SizeIs(5)); EXPECT_EQ(chunks[0].shape()[0], 3u); // first chunk size 3 EXPECT_EQ(chunks[1].shape()[0], 3u); // next chunk size 3 EXPECT_EQ(chunks[2].shape()[0], 3u); // next chunk size 3 @@ -56,7 +61,7 @@ TYPED_TEST(MeshOpsTest, ShardXTensorToMeshBasicShard) { auto shards = sharder.map(tensor); // With 4 shards, each shard should have size 2 - ASSERT_EQ(shards.size(), 4u); + 
ASSERT_THAT(shards, SizeIs(4)); for (auto& s : shards) { EXPECT_EQ(s.size(), 2u); } @@ -73,7 +78,7 @@ TYPED_TEST(MeshOpsTest, ShardTensor2dMeshTwoDimSharding) { ttml::core::ShardTensor2dMesh sharder(mesh_shape, {0, 1}); auto shards = sharder.map(tensor); - ASSERT_EQ(shards.size(), 4u); + ASSERT_THAT(shards, SizeIs(4)); // Check shapes of shards for (auto& shard : shards) { EXPECT_EQ(shard.shape()[0], 2u); @@ -90,7 +95,7 @@ TYPED_TEST(MeshOpsTest, ReplicateXTensorToMeshReplication) { ttml::core::ReplicateXTensorToMesh replicator(mesh_shape); auto replicas = replicator.map(tensor); - ASSERT_EQ(static_cast(replicas.size()), num_devices); + ASSERT_THAT(replicas, SizeIs(num_devices)); for (const auto& t : replicas) { EXPECT_TRUE(xt::allclose(t, tensor)); } @@ -243,3 +248,4 @@ TYPED_TEST(MeshOpsTest, ConcatenateSameParametersAsCompose) { TypeParam(0), TypeParam(1), TypeParam(2), TypeParam(3), TypeParam(4), TypeParam(5)}; EXPECT_TRUE(xt::allclose(composed, expected)); } +} // namespace From 38fddcd45b737a9f9163221b8ee7dc63ed86b0b2 Mon Sep 17 00:00:00 2001 From: Ligang Long Date: Wed, 11 Dec 2024 14:15:39 -0800 Subject: [PATCH 06/13] #15602: ttnn-padding padding size enhancement. (#15758) ### Ticket #15602 [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/15602) ### Problem description When use padding size that's not multiples of 16, the result is erroneous ### What's changed Re-design the kernel to allow arbitrary front pad/back pad length Still need to address the alignment issue in L1 since now the total size of a stick is no longer 16B aligned. ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12281633434 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../reader_pad_dims_rm_interleaved_v2.cpp | 71 +++++++------------ .../writer_pad_dims_rm_interleaved_v2.cpp | 3 +- .../pad/device/pad_program_factory.cpp | 32 ++++++--- 3 files changed, 47 insertions(+), 59 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp index 9348316a2499..696aad3dfe4b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_interleaved_v2.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include "dataflow_api.h" inline __attribute__((always_inline)) void fill_pad_cb_with_val( @@ -37,8 +38,10 @@ void kernel_main() { constexpr uint32_t stick_size_padded_end = get_compile_time_arg_val(10); constexpr uint32_t num_zero_pad_sticks_read = get_compile_time_arg_val(11); constexpr uint32_t last_zero_stick_size = get_compile_time_arg_val(12); + constexpr uint32_t stick_size_padded_aligned = get_compile_time_arg_val(21); #define not_pad_by_zero get_compile_time_arg_val(13) == 1 +#define front_padding get_compile_time_arg_val(9) #if (not_pad_by_zero) constexpr uint32_t packed_pad_value = get_compile_time_arg_val(14); constexpr uint32_t row_major_min_bytes = get_compile_time_arg_val(15); @@ -47,8 +50,9 @@ void kernel_main() { constexpr uint32_t num_sticks_padded_read = get_compile_time_arg_val(18); #endif - constexpr auto cb_in0 = 
tt::CBIndex::c_0; - constexpr auto cb_pad = tt::CBIndex::c_1; + constexpr uint32_t cb_in0 = tt::CBIndex::c_0; + constexpr uint32_t cb_pad = tt::CBIndex::c_1; + constexpr uint32_t cb_pad_align = tt::CBIndex::c_2; #define stick_size_is_pow2 get_compile_time_arg_val(19) == 1 #if (stick_size_is_pow2) @@ -68,8 +72,14 @@ void kernel_main() { uint64_t pad_val_addr = get_read_ptr(cb_pad); uint64_t pad_val_noc_addr = get_noc_addr(pad_val_addr); + uint64_t pad_align_addr = get_read_ptr(cb_pad_align); + uint64_t pad_align_write_addr = get_write_ptr(cb_pad_align); + uint64_t pad_align_noc_addr = get_noc_addr(pad_align_addr); + #if (not_pad_by_zero) - fill_pad_cb_with_val(cb_pad, row_major_min_bytes, packed_pad_value); + fill_pad_cb_with_val(cb_pad, stick_size_padded, packed_pad_value); +#else + fill_pad_cb_with_val(cb_pad, stick_size_padded, 0); #endif uint32_t i_stick = start_id; @@ -82,55 +92,23 @@ void kernel_main() { bool read_stick = (curr_h >= front_pad_h and curr_h < H) and (curr_c >= front_pad_c and curr_c < C) and (curr_n >= front_pad_n and curr_n < N); uint64_t read_noc_addr = get_noc_addr(i_stick, s); + noc_async_read(pad_val_noc_addr, l1_write_addr, stick_size_padded); if (read_stick) { -#if (not_pad_by_zero) - if constexpr (stick_size_padded_front != 0) { - for (uint32_t j = 0; j < num_front_pad_sticks_read; ++j) { - noc_async_read(pad_val_noc_addr, l1_write_addr, row_major_min_bytes); - l1_write_addr += row_major_min_bytes; - } - } +#if (front_padding) + // Read noc into cb_pad_align l1 + noc_async_read(read_noc_addr, get_write_ptr(cb_pad_align), stick_size_bytes); + noc_async_read_barrier(); + memmove( + (void*)(l1_write_addr + stick_size_padded_front), + (void*)(get_read_ptr(cb_pad_align)), + (size_t)(stick_size_bytes)); #else - if constexpr (stick_size_padded_front != 0) { - noc_async_read(zeros_noc_addr, l1_write_addr, stick_size_padded_front); - l1_write_addr += stick_size_padded_front; - } -#endif - noc_async_read(read_noc_addr, l1_write_addr, stick_size_bytes); - l1_write_addr += stick_size_bytes; - i_stick++; - -#if (not_pad_by_zero) - if constexpr (stick_size_padded_end != 0) { - for (uint32_t j = 0; j < num_end_pad_sticks_read; ++j) { - noc_async_read(pad_val_noc_addr, l1_write_addr, row_major_min_bytes); - l1_write_addr += row_major_min_bytes; - } - } -#else - if constexpr (stick_size_padded_end != 0) { - noc_async_read(zeros_noc_addr, l1_write_addr, stick_size_padded_end); - l1_write_addr += stick_size_padded_end; - } -#endif - - } else { -#if (not_pad_by_zero) - for (uint32_t j = 0; j < num_sticks_padded_read; ++j) { - noc_async_read(pad_val_noc_addr, l1_write_addr, row_major_min_bytes); - l1_write_addr += row_major_min_bytes; - } -#else - for (uint32_t j = 0; j < num_zero_pad_sticks_read; ++j) { - auto read_bytes = j == num_zero_pad_sticks_read - 1 ? 
last_zero_stick_size : 512; - noc_async_read(zeros_noc_addr, l1_write_addr, read_bytes); - l1_write_addr += read_bytes; - } #endif + i_stick++; } - + l1_write_addr += stick_size_padded_aligned; curr_h++; if (curr_h == H_padded) { curr_c++; @@ -142,7 +120,6 @@ void kernel_main() { } } noc_async_read_barrier(); - cb_push_back(cb_in0, num_read_per_barrier); } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp index 658a24bb6684..80ff4013d627 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/writer_pad_dims_rm_interleaved_v2.cpp @@ -13,6 +13,7 @@ void kernel_main() { constexpr uint32_t cb_out0 = get_compile_time_arg_val(0); constexpr bool dst_is_dram = get_compile_time_arg_val(1) == 1; constexpr uint32_t W_size_bytes = get_compile_time_arg_val(2); + constexpr uint32_t stick_size_padded_aligned = get_compile_time_arg_val(5); const uint32_t stick_size_bytes = W_size_bytes; @@ -38,7 +39,7 @@ void kernel_main() { for (uint32_t i = 0; i < num_read_per_barrier; ++i) { uint64_t write_noc_addr = get_noc_addr(i_stick, s); noc_async_write(l1_read_addr, write_noc_addr, stick_size_bytes); - l1_read_addr += stick_size_bytes; + l1_read_addr += stick_size_padded_aligned; i_stick += 1; } noc_async_write_barrier(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp index e0b5e4859d01..af7ea10d2259 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp @@ -1029,6 +1029,7 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2( auto stick_size_padded = W_padded * a.element_size(); auto stick_size_padded_front = front_pad[-1] * a.element_size(); auto stick_size_padded_end = stick_size_padded - stick_size - stick_size_padded_front; + uint32_t stick_size_padded_aligned = align(stick_size_padded, hal.get_alignment(HalMemType::L1)); uint32_t row_major_min_bytes = 16; tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); @@ -1050,24 +1051,31 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2( num_sticks_padded_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, NCH_padded); - uint32_t src0_cb_index = 0; + uint32_t src0_cb_index = tt::CBIndex::c_0; auto num_sticks = num_sticks_padded_per_core_group_1 > num_sticks_padded_per_core_group_2 ? 
num_sticks_padded_per_core_group_1 : num_sticks_padded_per_core_group_2; tt::tt_metal::CircularBufferConfig cb_src0_config = - tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, stick_size_padded); + tt::tt_metal::CircularBufferConfig(num_sticks * stick_size_padded_aligned, {{src0_cb_index, cb_data_format}}) + .set_page_size(src0_cb_index, stick_size_padded_aligned); auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); // construct const buffer with the pad_value bool not_pad_by_zero = pad_value != 0; - if (not_pad_by_zero) { - uint32_t src1_cb_index = 1; - tt::tt_metal::CircularBufferConfig cb_src1_config = - tt::tt_metal::CircularBufferConfig(row_major_min_bytes, {{src1_cb_index, cb_data_format}}) - .set_page_size(src1_cb_index, row_major_min_bytes); - auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src1_config); + + uint32_t src1_cb_index = tt::CBIndex::c_1; + tt::tt_metal::CircularBufferConfig cb_src1_config = + tt::tt_metal::CircularBufferConfig(stick_size_padded_aligned, {{src1_cb_index, cb_data_format}}) + .set_page_size(src1_cb_index, stick_size_padded_aligned); + auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src1_config); + + if (stick_size_padded_front != 0) { + uint32_t src2_cb_index = tt::CBIndex::c_2; + tt::tt_metal::CircularBufferConfig cb_src2_config = + tt::tt_metal::CircularBufferConfig(stick_size_padded_aligned, {{src2_cb_index, cb_data_format}}) + .set_page_size(src2_cb_index, stick_size_padded_aligned); + auto cb_src2 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src2_config); } Buffer* src0_buffer = a.buffer(); @@ -1104,13 +1112,15 @@ operation::ProgramWithCallbacks pad_rm_reader_writer_multi_core_v2( (std::uint32_t)(stick_size_padded_end / row_major_min_bytes), (std::uint32_t)(stick_size_padded / row_major_min_bytes), (std::uint32_t)src_stick_size_is_power_of_two, - (std::uint32_t)src_stick_size_is_power_of_two ? src_log2_stick_size : stick_size}; + (std::uint32_t)src_stick_size_is_power_of_two ? src_log2_stick_size : stick_size, + (std::uint32_t)stick_size_padded_aligned}; std::vector writer_ct_args = { (std::uint32_t)src0_cb_index, (std::uint32_t)dst_is_dram, (std::uint32_t)stick_size_padded, (std::uint32_t)dst_stick_size_is_power_of_two, - (std::uint32_t)dst_stick_size_is_power_of_two ? dst_log2_stick_size : stick_size_padded}; + (std::uint32_t)dst_stick_size_is_power_of_two ? dst_log2_stick_size : stick_size_padded, + (std::uint32_t)stick_size_padded_aligned}; KernelHandle reader_kernel_id = CreateKernel( program, From 49d39e9b2ead9e2b4e560ebe12f25fbf9571b39a Mon Sep 17 00:00:00 2001 From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com> Date: Wed, 11 Dec 2024 18:58:57 -0500 Subject: [PATCH 07/13] #0: [skip ci] Disable and file issues for flaky nightly fast dispatch tests and rename pipeline (#15947) ### Ticket These tests have been dogging us and we need to be green. ### Problem description Provide context for the problem. ### What's changed Describe the approach used to solve the problem. Summarize the changes made and its impact. 
### Checklist - [ ] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --- .github/workflows/_produce-data.yaml | 1 + ...atch-full-regressions-and-models-impl.yaml | 28 ++++++++----------- ...-dispatch-full-regressions-and-models.yaml | 2 +- .../tests/test_unet_2d_condition_model.py | 1 + .../stable_diffusion/tests/test_upblock_2d.py | 1 + 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/.github/workflows/_produce-data.yaml b/.github/workflows/_produce-data.yaml index c66c5bb57023..1f13fd8274e5 100644 --- a/.github/workflows/_produce-data.yaml +++ b/.github/workflows/_produce-data.yaml @@ -24,6 +24,7 @@ on: - "(Single-card) Demo tests" - "(Single-card) Tests for new models" - "Nightly fast dispatch tests" + - "(Single-card) Nightly model and ttnn tests" - "(Single-card) Tests for new models" - "(T3K) T3000 demo tests" - "(T3K) T3000 model perf tests" diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml index 0af646345b18..8b71190eb2b3 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml @@ -40,13 +40,6 @@ jobs: cmd: tests/scripts/single_card/nightly/run_ttnn.sh, timeout: 70 }, - { - name: "WH N300 pgm dispatch nightly", - arch: wormhole_b0, - runs-on: ["cloud-virtual-machine", "N300", "in-service"], - cmd: ./tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/compare_pgm_dispatch_perf_ci.sh, - timeout: 10 - }, { name: "GS-only models", arch: grayskull, @@ -151,18 +144,19 @@ jobs: test-config: - model: "stable_diffusion" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/stable_diffusion - - model: "mamba 1" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 - - model: "mamba 2" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 2 - - model: "mamba 3" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 3 - - model: "mamba 4" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 4 + # Skipping due to issue #15932 + # - model: "mamba 1" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 1 + # - model: "mamba 2" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 2 + # - model: "mamba 3" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 3 + # - model: "mamba 4" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 4 - model: "mamba 5" cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 5 - - model: "mamba 6" - cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 6 + # - model: "mamba 6" + # cmd: pytest --timeout 900 -n auto tests/nightly/single_card/mamba --splits 6 --group 6 card: [N150, N300] name: "[Unstable] Nightly ${{ matrix.card }} ${{ matrix.test-config.model }}" env: diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml 
b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml index 409833993f46..65bacbceada8 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml @@ -1,4 +1,4 @@ -name: Nightly fast dispatch tests +name: "(Single-card) Nightly model and ttnn tests" on: workflow_dispatch: diff --git a/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py index 72efdb4e178e..24b9e22dea2d 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_unet_2d_condition_model.py @@ -72,6 +72,7 @@ def unsqueeze_all_params_to_4d(params): (2, 4, 64, 64), ], ) +@pytest.mark.skip(reason="#15931: Failing, skip for now") def test_unet_2d_condition_model_512x512(device, batch_size, in_channels, input_height, input_width): device.enable_program_cache() diff --git a/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py index 0445d58164be..cb9fd165f3f7 100644 --- a/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py +++ b/models/demos/wormhole/stable_diffusion/tests/test_upblock_2d.py @@ -29,6 +29,7 @@ @pytest.mark.parametrize("res_hidden_states_tuple", [([2, 1280, 8, 8], [2, 1280, 8, 8], [2, 1280, 8, 8])]) @pytest.mark.parametrize("hidden_states", [[2, 1280, 8, 8]]) @pytest.mark.parametrize("temb", [[1, 1, 2, 1280]]) +@pytest.mark.skip(reason="#15931: Fails, need to investigate") def test_upblock_512x512(reset_seeds, device, res_hidden_states_tuple, hidden_states, temb): # TODO # setup pytorch model From 7e1ec659e01e0550b567c896e80b522eb405e9f3 Mon Sep 17 00:00:00 2001 From: Raymond Kim Date: Thu, 12 Dec 2024 00:01:46 +0000 Subject: [PATCH 08/13] #0: [skip ci] Bump device and e2e thresholds for yolov4 and compile time for vgg for recent changes from https://github.com/tenstorrent/tt-metal/commit/c1a246ef0563d409dd0f6bf4e42d6243dd1a280d --- models/demos/vgg/tests/test_perf_vgg.py | 2 +- models/demos/yolov4/tests/test_perf_yolo.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/models/demos/vgg/tests/test_perf_vgg.py b/models/demos/vgg/tests/test_perf_vgg.py index 9cc0397bb073..8faff8a5b733 100644 --- a/models/demos/vgg/tests/test_perf_vgg.py +++ b/models/demos/vgg/tests/test_perf_vgg.py @@ -22,7 +22,7 @@ def get_expected_times(vgg): - return (16, 10.5) + return (17, 10.5) @pytest.mark.models_performance_bare_metal diff --git a/models/demos/yolov4/tests/test_perf_yolo.py b/models/demos/yolov4/tests/test_perf_yolo.py index f04566ebbd1e..a9e0009e6f5b 100644 --- a/models/demos/yolov4/tests/test_perf_yolo.py +++ b/models/demos/yolov4/tests/test_perf_yolo.py @@ -23,7 +23,7 @@ def get_expected_times(): - return (40, 16) + return (40, 16.2) @pytest.mark.models_performance_bare_metal @@ -96,7 +96,7 @@ def test_perf_device_bare_metal_yolov4(batch_size, model_name): num_iterations = 1 margin = 0.03 - expected_perf = 197.89 + expected_perf = 199.89 command = f"pytest tests/ttnn/integration_tests/yolov4/test_ttnn_yolov4.py" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] From bc004389d2720a30b8bdf1a60aec075506f30727 Mon Sep 17 00:00:00 2001 From: Roman Furko Date: Wed, 11 Dec 2024 16:33:08 -0800 Subject: [PATCH 09/13] [tt-train] GPT2-S Matmul tests (#15937) ### Problem description Some matmuls fail during GPT2S training. 
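For illustration, two of the combinations exercised by the new test (rows copied verbatim from the table in `gpt2s_test.cpp` below; each entry is `{{shape_a, shape_b, transpose_a, transpose_b}, expected_result}`). The first currently succeeds, the second currently throws:

```cpp
{{{768, 65536}, {65536, 3072}, false, false}, ExpectedResult::OK},
{{{65536, 768}, {1, 1, 768, 3072}, false, false}, ExpectedResult::ERROR},
```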
### What's changed Add tests for all matmul combinations from GPT2-S. ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/12286281817 - [x] New/Existing tests provide coverage for changes --- tt-train/tests/model/gpt2s_test.cpp | 95 +++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tt-train/tests/model/gpt2s_test.cpp diff --git a/tt-train/tests/model/gpt2s_test.cpp b/tt-train/tests/model/gpt2s_test.cpp new file mode 100644 index 000000000000..bfad28597d8d --- /dev/null +++ b/tt-train/tests/model/gpt2s_test.cpp @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include "autograd/auto_context.hpp" +#include "core/compute_kernel_config.hpp" +#include "core/tt_tensor_utils.hpp" + +enum class ExpectedResult { OK, ERROR }; + +struct MatmulInput { + ttnn::Shape shape_a; + ttnn::Shape shape_b; + bool transpose_a{false}; + bool transpose_b{false}; +}; + +struct MatmulTest { + MatmulInput input; + ExpectedResult expected_result; +}; + +// Matmul tests are based on GPT2-S model with batch size 64 +TEST(GPT2SBatch64Test, Matmul) { + std::vector tests = { + {{{64, 12, 64, 1024}, {64, 12, 1024, 64}, false, false}, ExpectedResult::OK}, + {{{64, 12, 1024, 64}, {64, 12, 1024, 64}, false, true}, ExpectedResult::OK}, + {{{64, 12, 1024, 64}, {64, 12, 1024, 64}, true, false}, ExpectedResult::OK}, + {{{64, 12, 1024, 64}, {64, 12, 64, 1024}, false, false}, ExpectedResult::OK}, + {{{64, 12, 1024, 1024}, {64, 12, 1024, 64}, false, false}, ExpectedResult::OK}, + {{{768, 65536}, {65536, 96}, false, false}, ExpectedResult::OK}, + {{{65536, 768}, {65536, 96}, true, false}, ExpectedResult::OK}, + {{{65536, 96}, {1, 1, 96, 768}, false, false}, ExpectedResult::ERROR}, + {{{65536, 96}, {1, 1, 768, 96}, false, true}, ExpectedResult::ERROR}, + {{{3072, 65536}, {65536, 768}, false, false}, ExpectedResult::OK}, + {{{65536, 3072}, {65536, 768}, true, false}, ExpectedResult::OK}, + {{{65536, 768}, {1, 1, 768, 3072}, false, false}, ExpectedResult::ERROR}, + {{{65536, 768}, {1, 1, 3072, 768}, false, true}, ExpectedResult::ERROR}, + {{{768, 65536}, {65536, 3072}, false, false}, ExpectedResult::OK}, + {{{65536, 768}, {65536, 3072}, true, false}, ExpectedResult::OK}, + {{{65536, 3072}, {1, 1, 3072, 768}, false, false}, ExpectedResult::ERROR}, + {{{65536, 3072}, {1, 1, 768, 3072}, false, true}, ExpectedResult::ERROR}, + {{{65536, 3072}, {3072, 768}, false, false}, ExpectedResult::ERROR}, + {{{65536, 3072}, {768, 3072}, false, true}, ExpectedResult::ERROR}, + {{{768, 65536}, {65536, 768}, false, false}, ExpectedResult::OK}, + {{{65536, 768}, {65536, 768}, true, false}, ExpectedResult::OK}, + {{{65536, 768}, {1, 1, 768, 768}, false, false}, ExpectedResult::ERROR}, + {{{768, 65536}, {1, 1, 768, 768}, true, false}, ExpectedResult::ERROR}, + {{{768, 65536}, {65536, 2304}, false, false}, ExpectedResult::OK}, + {{{65536, 768}, {65536, 2304}, true, false}, ExpectedResult::OK}, + {{{65536, 768}, {768, 50257}, false, false}, ExpectedResult::ERROR}, + {{{65536, 768}, {50257, 768}, false, true}, ExpectedResult::ERROR}, + {{{65536, 50257}, {50257, 768}, false, false}, ExpectedResult::ERROR}, + }; + + auto run_matmul = [](auto& a, auto& b, bool transpose_a, bool transpose_b) { + fmt::println( + "Running matmul with shapes {} and {}, tranpose_a {} transpose_b {}", + a.get_shape(), + b.get_shape(), + transpose_a, + transpose_b); + [[maybe_unused]] auto c = ttnn::matmul( + a, + b, + 
transpose_a, + transpose_b, + /* memory_config */ std::nullopt, + /* dtype */ std::nullopt, + /* program_config */ std::nullopt, + /* activation */ std::nullopt, + /* compute_kernel_config */ + ttml::core::ComputeKernelConfig::matmul(), + /* core_grid */ ttnn::CoreGrid{7, 8}, + /* output_tile */ std::nullopt); + }; + + for (const auto& [input, expected_result] : tests) { + auto [shape_a, shape_b, transpose_a, transpose_b] = input; + + auto* device = &ttml::autograd::ctx().get_device(); + auto a = ttml::core::empty(shape_a, device, {}); + auto b = ttml::core::empty(shape_b, device, {}); + + if (expected_result == ExpectedResult::OK) { + EXPECT_NO_THROW(run_matmul(a, b, transpose_a, transpose_b)); + } else { + EXPECT_ANY_THROW(run_matmul(a, b, transpose_a, transpose_b)); + } + } +} From 5b04331a431e60048ce625b4449ba63c3869ccf3 Mon Sep 17 00:00:00 2001 From: Brian Liu Date: Tue, 3 Dec 2024 16:16:42 +0000 Subject: [PATCH 10/13] #13127: Use tensor spec in conversion between python and tt tensors - Significant changes in ttnn/cpp/pybind11/pytensor.cpp: * Use tensor spec in create_owned_tensor * Add conversion between ROW_MAJOR and TILE layouts for ttnn.Tensor(...)/tensor.to(...) APIs ** For ttnn.Tensor(python_tensor, ...), handling is now internal and not through .to(layout) ** For ttnn.Tensor(float_vector, ...), use .to(layout) to convert to TILE if needed ** Make tilize, tilize_to_list, and untilize python utility functions no-ops and mark as deprecated * Add analogous create_row_major_owned_buffer from tensor buffer ** Commonize handling of BFLOAT8_B/BFLOAT4_B as float tensors/buffers ** Always use OwnedBuffer if conversion to/from TILE layout is required * Automatically deduce python dtype from owned buffers instead of mapping based on tt dtype * Set defaults for pybound init so it is more usable * Invert meaning of enable_borrow (now called override_enable_borrow) ** Make enable_borrow internal to create_tt_tensor_from_py_data - Update tensor init documentation and sample code for tile arg and creating tensors on device - Add memory_config() to TensorSpec - Commonize tt_dtype_to_torch_dtype and tt_dtype_to_np_dtype dicts across ttnn unit tests - Add test for host side tensor conversion in tests/ttnn/unit_tests/tensor/test_tensor_conversion.py - Add new tests/ttnn/unit_tests/tensor/test_tensor_creation.py tests * Coverage for directly creating device tensors with ttnn.Tensor(...) * Coverage for API parity between ttnn.from_device/ttnn.to_device and ttnn.Tensor(...)/tensor.to(...) 
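To make the API-parity bullets above concrete, the following sketch mirrors the round trip exercised by the new `tests/ttnn/unit_tests/tensor/test_tensor_creation.py` added below (the shape, dtype, and `ttnn.open_device` call are illustrative choices; the two construction calls are taken from that test):

```python
import torch
import ttnn

device = ttnn.open_device(device_id=0)  # any open device handle works here
py_tensor = torch.rand((2, 3, 64, 96), dtype=torch.bfloat16)

# Both paths now build an equivalent device tensor; the ROW_MAJOR -> TILE
# conversion happens inside the constructor instead of a separate .to(layout).
tt_tensor_1 = ttnn.Tensor(py_tensor, ttnn.bfloat16, device, ttnn.TILE_LAYOUT)
tt_tensor_2 = ttnn.from_torch(py_tensor, ttnn.bfloat16, device=device, layout=ttnn.TILE_LAYOUT)

assert torch.allclose(tt_tensor_1.cpu().to_torch(), tt_tensor_2.cpu().to_torch())

ttnn.close_device(device)
```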
--- models/utility_functions.py | 100 +---- .../unit_testing/misc/test_indexed_fill.py | 10 +- .../unit_testing/misc/test_non_zero.py | 10 +- .../unit_testing/misc/test_sharded_tensor.py | 7 +- .../tensor/test_tensor_conversion.py | 40 +- .../unit_tests/tensor/test_tensor_creation.py | 122 ++++++ .../tensor/test_tensor_serialization.py | 10 +- tests/ttnn/unit_tests/test_print_tensor.py | 10 +- tests/ttnn/utils_for_testing.py | 27 ++ ttnn/cpp/pybind11/pytensor.cpp | 371 ++++++++++++------ ttnn/cpp/ttnn/tensor/tensor_spec.hpp | 1 + ttnn/tt_lib/fused_ops/softmax.py | 2 +- ttnn/tt_lib/utils.py | 100 +---- 13 files changed, 444 insertions(+), 366 deletions(-) create mode 100644 tests/ttnn/unit_tests/tensor/test_tensor_creation.py diff --git a/models/utility_functions.py b/models/utility_functions.py index 2b652f815424..f13fd48d8ca0 100644 --- a/models/utility_functions.py +++ b/models/utility_functions.py @@ -15,6 +15,8 @@ from ttnn.device import Arch +from typing_extensions import deprecated + ### Math operations ### def _nearest_32(x): @@ -430,108 +432,22 @@ def convert_act_2d_matrix(activation, kernel_y, kernel_x, stride_y, stride_x, pa ### Tilizing / Untilizing ### +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def tilize(x): - """ - This function tilizes a tensor. The last two tensor dims must be divisible by 32, after which this function - produces row major tiles and creates faces. The output of this function is a flattened list that - we can send to the device. - - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. - """ - nearest_32 = _nearest_32 - - assert isinstance( - x, (torch.Tensor, np.ndarray) - ), "Input to this function must be an instance of torch.Tensor or np.array" - assert len(x.shape) == 4, "Only 4D tensors suppported" - assert (x.shape[-2] % 32) == 0 and ( - x.shape[-1] % 32 - ) == 0, "The last two dimensions of the tensor must be divisible by 32" - - if isinstance(x, torch.Tensor): - ret = torch.zeros(np.prod(x.shape)) - else: - ret = np.zeros(np.prod(x.shape)) - - idx = 0 - for B in range(x.shape[0]): - for C in range(x.shape[1]): - for H in range(0, x.shape[2], 32): - for W in range(0, x.shape[3], 32): - unfaced_tile = x[B, C, H : H + 32, W : W + 32] - - face0 = unfaced_tile[:16, :16] - face1 = unfaced_tile[:16, 16:] - face2 = unfaced_tile[16:, :16] - face3 = unfaced_tile[16:, 16:] - - for face in (face0, face1, face2, face3): - ret[idx : idx + 256] = face.reshape(-1) - idx += 256 - - return ret.reshape(x.shape) + return x +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def tilize_to_list(x): """ - Tilize a PyTorch and then return the values as a flat list. The last two - tensor dims must be divisible by 32, after which this function produces row - major tiles and creates faces. - - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. + Returns a flattened list of the tensor """ - return tilize(x).reshape(-1).tolist() +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def untilize(x): - """ - This function untilizes a tensor to row major format. 
- - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. - """ - nearest_32 = _nearest_32 - - assert isinstance(x, (torch.Tensor, np.ndarray)), "Input to this function must be an instance of torch.Tensor" - assert len(x.shape) == 4, "Only 4D tensors suppported" - assert (x.shape[-2] % 32) == 0 and ( - x.shape[-1] % 32 - ) == 0, "The last two dimensions of the tensor must be divisible by 32" - - if isinstance(x, torch.Tensor): - ret = torch.zeros(x.shape, dtype=x.dtype) - else: - ret = np.zeros(x.shape, dtype=x.dtype) - - for B in range(x.shape[0]): - for C in range(x.shape[1]): - x_hw = x[B, C, :].reshape(-1) - hw = 0 - for h in range(0, x.shape[2], 32): - for w in range(0, x.shape[3], 32): - f_tile = x_hw[hw : hw + 256].reshape(16, 16) - ret[B, C, h : h + 16, w : w + 16] = f_tile - - f_tile = x_hw[hw + 256 : hw + 512].reshape(16, 16) - ret[B, C, h : h + 16, w + 16 : w + 32] = f_tile - - f_tile = x_hw[hw + 512 : hw + 768].reshape(16, 16) - ret[B, C, h + 16 : h + 32, w : w + 16] = f_tile - - f_tile = x_hw[hw + 768 : hw + 1024].reshape(16, 16) - ret[B, C, h + 16 : h + 32, w + 16 : w + 32] = f_tile - hw += 1024 # traverse tiles in RM-order - - return ret + return x ### Measuring accuracy and other metrics ### diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_indexed_fill.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_indexed_fill.py index 4245a35c3c2d..3044f6bbb892 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_indexed_fill.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_indexed_fill.py @@ -9,15 +9,7 @@ import ttnn import torch import numpy as np - - -tt_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, -} +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype @pytest.mark.parametrize( diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_non_zero.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_non_zero.py index e672856c3e20..b280d8e0b66c 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_non_zero.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_non_zero.py @@ -10,15 +10,7 @@ import torch import numpy as np import ttnn - - -tt_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, -} +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype @pytest.mark.parametrize( diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded_tensor.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded_tensor.py index 050099d62d60..1c19b8137e68 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded_tensor.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded_tensor.py @@ -11,14 +11,9 @@ import ttnn from models.utility_functions import get_debug_tensor +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype from enum import Enum -tt_dtype_to_torch_dtype = { - ttnn.uint32: torch.int32, - ttnn.uint16: torch.int16, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, -} TILE_WIDTH = 32 TILE_HEIGHT = 32 diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py 
b/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py index 634423088311..2fff322de44d 100644 --- a/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py +++ b/tests/ttnn/unit_tests/tensor/test_tensor_conversion.py @@ -11,29 +11,10 @@ import numpy as np import ttnn - -tt_dtype_to_torch_dtype = { - ttnn.uint8: torch.uint8, - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.int32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, - ttnn.bfloat4_b: torch.float, -} - -tt_dtype_to_np_dtype = { - ttnn.uint8: np.ubyte, - ttnn.uint16: np.int16, - ttnn.uint32: np.int32, - ttnn.int32: np.int32, - ttnn.float32: np.float32, - ttnn.bfloat8_b: np.float32, - ttnn.bfloat4_b: np.float32, -} +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype, tt_dtype_to_np_dtype +@pytest.mark.parametrize("convert_to_device", [True, False]) @pytest.mark.parametrize( "tt_dtype", [ @@ -49,7 +30,7 @@ ) @pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) @pytest.mark.parametrize("python_lib", [torch, np]) -def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): +def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, convert_to_device, device): torch.manual_seed(0) if python_lib == torch: @@ -64,7 +45,7 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): elif python_lib == np: if tt_dtype == ttnn.bfloat16: - pytest.skip("ttnn.bloat16 dtype is not supported yet for numpy tensors!") + pytest.skip("ttnn.bfloat16 dtype is not supported yet for numpy tensors!") dtype = tt_dtype_to_np_dtype[tt_dtype] if dtype in {np.ubyte, np.int16, np.int32}: @@ -82,8 +63,9 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED assert tt_tensor.layout == ttnn.ROW_MAJOR_LAYOUT - tt_tensor = tt_tensor.to(device) - tt_tensor = tt_tensor.cpu() + if convert_to_device: + tt_tensor = tt_tensor.to(device) + tt_tensor = tt_tensor.cpu() if python_lib == torch: py_tensor_after_round_trip = tt_tensor.to_torch() @@ -123,6 +105,7 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): } +@pytest.mark.parametrize("convert_to_device", [True, False]) @pytest.mark.parametrize( "python_dtype_str", [ @@ -137,7 +120,7 @@ def test_tensor_conversion_with_tt_dtype(python_lib, shape, tt_dtype, device): ) @pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) @pytest.mark.parametrize("python_lib", [torch, np]) -def test_tensor_conversion_with_python_dtype(python_lib, shape, python_dtype_str, device): +def test_tensor_conversion_with_python_dtype(python_lib, shape, python_dtype_str, convert_to_device, device): torch.manual_seed(0) if python_lib == torch: @@ -165,8 +148,9 @@ def test_tensor_conversion_with_python_dtype(python_lib, shape, python_dtype_str tt_tensor = ttnn.Tensor(py_tensor) assert tt_tensor.storage_type() == ttnn.StorageType.BORROWED - tt_tensor = tt_tensor.to(device) - tt_tensor = tt_tensor.cpu() + if convert_to_device: + tt_tensor = tt_tensor.to(device) + tt_tensor = tt_tensor.cpu() if python_lib == torch: py_tensor_after_round_trip = tt_tensor.to_torch() diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_creation.py b/tests/ttnn/unit_tests/tensor/test_tensor_creation.py new file mode 100644 index 000000000000..f0615abba973 --- /dev/null +++ b/tests/ttnn/unit_tests/tensor/test_tensor_creation.py @@ -0,0 +1,122 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +import os +import pathlib + +import torch +import numpy as np + +import ttnn +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype + + +@pytest.mark.parametrize( + "layout", + [ + ttnn.ROW_MAJOR_LAYOUT, + ttnn.TILE_LAYOUT, + ], +) +@pytest.mark.parametrize( + "tt_dtype", + [ + ttnn.uint8, + ttnn.uint16, + ttnn.uint32, + ttnn.int32, + ttnn.float32, + ttnn.bfloat16, + ttnn.bfloat8_b, + ttnn.bfloat4_b, + ], +) +@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) +def test_tensor_creation(shape, tt_dtype, layout, device): + torch.manual_seed(0) + + dtype = tt_dtype_to_torch_dtype[tt_dtype] + + if dtype in {torch.uint8, torch.int16, torch.int32}: + py_tensor = torch.randint(torch.iinfo(dtype).min, torch.iinfo(dtype).max, shape, dtype=dtype) + else: + py_tensor = torch.rand(shape, dtype=dtype) + + tt_tensor = ttnn.Tensor(py_tensor, tt_dtype, device, layout) + + tt_tensor = tt_tensor.cpu() + + py_tensor_after_round_trip = tt_tensor.to_torch() + + assert py_tensor.dtype == py_tensor_after_round_trip.dtype + assert py_tensor.shape == py_tensor_after_round_trip.shape + + allclose_kwargs = {} + if tt_dtype == ttnn.bfloat8_b: + allclose_kwargs = dict(atol=1e-2) + elif tt_dtype == ttnn.bfloat4_b: + allclose_kwargs = dict(atol=0.2) + + passing = torch.allclose(py_tensor, py_tensor_after_round_trip, **allclose_kwargs) + assert passing + + +@pytest.mark.parametrize( + "layout", + [ + ttnn.ROW_MAJOR_LAYOUT, + ttnn.TILE_LAYOUT, + ], +) +@pytest.mark.parametrize( + "tt_dtype", + [ + ttnn.uint8, + ttnn.uint16, + ttnn.uint32, + ttnn.int32, + ttnn.float32, + ttnn.bfloat16, + ttnn.bfloat8_b, + ttnn.bfloat4_b, + ], +) +@pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) +def test_tensor_creation_api_parity(shape, tt_dtype, layout, device): + torch.manual_seed(0) + + if tt_dtype in (ttnn.bfloat8_b, ttnn.bfloat4_b) and layout == ttnn.ROW_MAJOR_LAYOUT: + pytest.skip("{} is only valid for ttnn.TILE_LAYOUT!".format(tt_dtype)) + + dtype = tt_dtype_to_torch_dtype[tt_dtype] + + if dtype in {torch.uint8, torch.int16, torch.int32}: + py_tensor = torch.randint(torch.iinfo(dtype).min, torch.iinfo(dtype).max, shape, dtype=dtype) + else: + py_tensor = torch.rand(shape, dtype=dtype) + + tt_tensor_1 = ttnn.Tensor(py_tensor, tt_dtype, device, layout) + tt_tensor_2 = ttnn.from_torch(py_tensor, tt_dtype, device=device, layout=layout) + + tt_tensor_1 = tt_tensor_1.cpu() + tt_tensor_2 = tt_tensor_2.cpu() + + py_tensor_after_round_trip_1 = tt_tensor_1.to_torch() + py_tensor_after_round_trip_2 = tt_tensor_2.to_torch() + py_tensor_after_round_trip_3 = ttnn.to_torch(tt_tensor_1) + py_tensor_after_round_trip_4 = ttnn.to_torch(tt_tensor_2) + + allclose_kwargs = {} + if tt_dtype == ttnn.bfloat8_b: + allclose_kwargs = dict(atol=1e-2) + elif tt_dtype == ttnn.bfloat4_b: + allclose_kwargs = dict(atol=0.2) + + passing = torch.allclose(py_tensor, py_tensor_after_round_trip_1, **allclose_kwargs) + passing = torch.allclose(py_tensor, py_tensor_after_round_trip_2, **allclose_kwargs) + passing = torch.allclose(py_tensor, py_tensor_after_round_trip_3, **allclose_kwargs) + passing = torch.allclose(py_tensor, py_tensor_after_round_trip_4, **allclose_kwargs) + assert passing diff --git a/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py b/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py index 1db497c08439..a56dde83d199 100644 --- a/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py +++ b/tests/ttnn/unit_tests/tensor/test_tensor_serialization.py @@ -11,15 
+11,7 @@ import numpy as np import ttnn - -tt_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, - ttnn.bfloat4_b: torch.float, -} +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype @pytest.mark.parametrize("shape", [(2, 3, 64, 96)]) diff --git a/tests/ttnn/unit_tests/test_print_tensor.py b/tests/ttnn/unit_tests/test_print_tensor.py index 66254f7d3634..90f1ecd51579 100644 --- a/tests/ttnn/unit_tests/test_print_tensor.py +++ b/tests/ttnn/unit_tests/test_print_tensor.py @@ -7,14 +7,8 @@ import torch import ttnn +from tests.ttnn.utils_for_testing import tt_dtype_to_torch_dtype -ttnn_dtype_to_torch_dtype = { - ttnn.uint16: torch.int16, - ttnn.uint32: torch.int32, - ttnn.float32: torch.float, - ttnn.bfloat16: torch.bfloat16, - ttnn.bfloat8_b: torch.float, -} GOLDEN_TENSOR_STRINGS = { ( @@ -77,7 +71,7 @@ def test_print(device, dtype, layout, profile, deallocate): ttnn.set_printoptions(profile=profile) - torch_dtype = ttnn_dtype_to_torch_dtype[dtype] + torch_dtype = tt_dtype_to_torch_dtype[dtype] shape = (2, 16, 64, 32) if torch_dtype in {torch.int16, torch.int32}: diff --git a/tests/ttnn/utils_for_testing.py b/tests/ttnn/utils_for_testing.py index fb083a681fff..92849b32e57d 100644 --- a/tests/ttnn/utils_for_testing.py +++ b/tests/ttnn/utils_for_testing.py @@ -10,6 +10,33 @@ from models.utility_functions import comp_pcc, comp_equal, divup, roundup from typing import Tuple +import ttnn +import torch +import numpy as np + + +# Dictionaries for converting dtypes +tt_dtype_to_torch_dtype = { + ttnn.uint8: torch.uint8, + ttnn.uint16: torch.int16, + ttnn.uint32: torch.int32, + ttnn.int32: torch.int32, + ttnn.float32: torch.float, + ttnn.bfloat16: torch.bfloat16, + ttnn.bfloat8_b: torch.float, + ttnn.bfloat4_b: torch.float, +} + +tt_dtype_to_np_dtype = { + ttnn.uint8: np.ubyte, + ttnn.uint16: np.int16, + ttnn.uint32: np.int32, + ttnn.int32: np.int32, + ttnn.float32: np.float32, + ttnn.bfloat8_b: np.float32, + ttnn.bfloat4_b: np.float32, +} + def construct_pcc_assert_message(message, expected_pytorch_result, actual_pytorch_result): messages = [] diff --git a/ttnn/cpp/pybind11/pytensor.cpp b/ttnn/cpp/pybind11/pytensor.cpp index 48a360fb3cb2..17de2f3493e0 100644 --- a/ttnn/cpp/pybind11/pytensor.cpp +++ b/ttnn/cpp/pybind11/pytensor.cpp @@ -66,17 +66,17 @@ void log_external_operation( #endif template -Tensor create_owned_tensor( - T* data_ptr, - size_t num_elements, - tt::stl::Span shape, - DataType data_type, - Layout layout, - const std::optional& optional_tile = std::nullopt) { - auto data = std::vector(data_ptr, data_ptr + num_elements); +Tensor create_owned_tensor(T* data_ptr, const ttnn::TensorSpec& tensor_spec) { + std::size_t num_elements = tensor_spec.logical_shape().volume(); + auto data = std::vector(data_ptr, data_ptr + num_elements); auto buffer = owned_buffer::create(std::move(data)); + + if (tensor_spec.layout() == Layout::TILE) { + data = tensor_impl::convert_layout_row_major_to_tile(tensor_spec.physical_shape(), tensor_spec.tile(), buffer); + buffer = owned_buffer::create(std::move(data)); + } auto storage = OwnedStorage{std::move(buffer)}; - return Tensor(std::move(storage), shape, data_type, layout, optional_tile); + return Tensor(std::move(storage), tensor_spec); } OwnedBuffer create_owned_buffer_from_vector_of_floats(std::vector&& data, DataType data_type) { @@ -138,7 +138,7 @@ Tensor convert_float_vector_to_tt_tensor( return tensor; } auto 
owned_buffer = create_owned_buffer_from_vector_of_floats(std::move(data), data_type); - auto tensor = Tensor(OwnedStorage{owned_buffer}, shape, data_type, layout, tile); + auto tensor = Tensor(OwnedStorage{owned_buffer}, shape, data_type, Layout::ROW_MAJOR, tile).to(layout); if (device) { return tensor.to(device, memory_config.value_or(MemoryConfig{})); } @@ -146,23 +146,30 @@ Tensor convert_float_vector_to_tt_tensor( } Tensor create_tt_tensor_from_py_data( - std::size_t num_elements, std::size_t py_data_ptr, - const ttnn::SmallVector& shape, - const DataType data_type, - const std::optional& optional_tile, - bool enable_borrow, - const std::function& on_creation_callback = [] {}, - const std::function& on_destruction_callback = [] {}) { + const TensorSpec& tensor_spec, + Device* device, + bool override_enable_borrow, + const std::function& on_creation_callback, + const std::function& on_destruction_callback) { + auto layout = tensor_spec.layout(); + + bool enable_borrow = true; + if (layout != Layout::ROW_MAJOR or override_enable_borrow) { + enable_borrow = false; + } + + auto data_type = tensor_spec.data_type(); + std::size_t num_elements = tensor_spec.logical_shape().volume(); switch (data_type) { case DataType::UINT8: { auto data_ptr = reinterpret_cast(py_data_ptr); if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); + return Tensor(std::move(storage), tensor_spec); } else { - return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile); + return create_owned_tensor(data_ptr, tensor_spec); } } case DataType::UINT16: { @@ -170,9 +177,9 @@ Tensor create_tt_tensor_from_py_data( if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); + return Tensor(std::move(storage), tensor_spec); } else { - return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile); + return create_owned_tensor(data_ptr, tensor_spec); } } case DataType::INT32: { @@ -180,9 +187,9 @@ Tensor create_tt_tensor_from_py_data( if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); + return Tensor(std::move(storage), tensor_spec); } else { - return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile); + return create_owned_tensor(data_ptr, tensor_spec); } } case DataType::UINT32: { @@ -190,9 +197,9 @@ Tensor create_tt_tensor_from_py_data( if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); + return Tensor(std::move(storage), tensor_spec); } else { - return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile); + return create_owned_tensor(data_ptr, tensor_spec); } } case DataType::FLOAT32: { @@ -200,9 +207,9 @@ Tensor create_tt_tensor_from_py_data( if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, 
num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); + return Tensor(std::move(storage), tensor_spec); } else { - return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile); + return create_owned_tensor(data_ptr, tensor_spec); } } // TODO: This is not supported for numpy @@ -211,27 +218,28 @@ Tensor create_tt_tensor_from_py_data( if (enable_borrow) { auto storage = BorrowedStorage( borrowed_buffer::Buffer(data_ptr, num_elements), on_creation_callback, on_destruction_callback); - return Tensor(std::move(storage), shape, data_type, Layout::ROW_MAJOR, optional_tile); + return Tensor(std::move(storage), tensor_spec); } else { - return create_owned_tensor(data_ptr, num_elements, shape, data_type, Layout::ROW_MAJOR, optional_tile); + return create_owned_tensor(data_ptr, tensor_spec); } } case DataType::BFLOAT8_B: case DataType::BFLOAT4_B: { auto data_ptr = reinterpret_cast(py_data_ptr); - auto data = std::vector(data_ptr, data_ptr + num_elements); - auto buffer = owned_buffer::create(std::move(data)); - auto tile = optional_tile.value_or(Tile()); - auto tensor = Tensor(OwnedStorage{buffer}, shape, DataType::FLOAT32, Layout::ROW_MAJOR, optional_tile) - .to(Layout::TILE); - auto output_float_data = owned_buffer::get_as(tensor).get(); + auto float_tensor_spec = TensorSpec( + tensor_spec.logical_shape(), + TensorLayout(DataType::FLOAT32, tensor_spec.page_config(), tensor_spec.memory_config())); + auto float_tensor = create_owned_tensor(data_ptr, float_tensor_spec); + + auto tile = tensor_spec.tensor_layout().get_page_config().get_tile(); + auto output_float_data = owned_buffer::get_as(float_tensor).get(); auto output_packed_data = data_type == DataType::BFLOAT8_B ? 
pack_fp32_vec_as_bfp8_tiles( output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false, tile) : pack_fp32_vec_as_bfp4_tiles( output_float_data, /*row_major_input=*/false, /*is_exp_a=*/false, tile); auto output_buffer = owned_buffer::create(std::move(output_packed_data)); - return Tensor(std::move(OwnedStorage{std::move(output_buffer)}), shape, data_type, Layout::TILE, tile); + return Tensor(std::move(OwnedStorage{std::move(output_buffer)}), tensor_spec); } default: { TT_THROW("Unsupported DataType: {}", data_type); @@ -242,16 +250,26 @@ Tensor create_tt_tensor_from_py_data( Tensor convert_python_tensor_to_tt_tensor( const py::handle& py_tensor, - std::optional optional_data_type = std::nullopt, - const std::optional& optional_tile = std::nullopt, - bool enable_borrow = true) { + std::optional optional_data_type, + std::optional optional_layout, + const std::optional& optional_tile, + const MemoryConfig& memory_config, + Device* device, + bool override_enable_borrow = false) { GraphTracker::instance().track_function_start( - "tt::tt_metal::detail::convert_python_tensor_to_tt_tensor", py_tensor, optional_data_type, enable_borrow); + "tt::tt_metal::detail::convert_python_tensor_to_tt_tensor", + py_tensor, + optional_data_type, + optional_layout, + optional_tile, + memory_config, + device, + override_enable_borrow); py::object torch = py::module_::import("torch"); py::object np = py::module_::import("numpy"); auto py_dtype = py_tensor.attr("dtype"); - auto shape = py::cast>(py_tensor.attr("shape")); + auto shape = ttnn::SimpleShape(py::cast>(py_tensor.attr("shape"))); DataType data_type; @@ -323,7 +341,7 @@ Tensor convert_python_tensor_to_tt_tensor( num_elements = py::cast(contiguous_py_tensor.attr("numel")()); py_data_ptr = py::cast(contiguous_py_tensor.attr("data_ptr")()); } else if (py::isinstance(py_tensor, np.attr("ndarray"))) { - TT_FATAL(enable_borrow, "Owned storage for numpy tensors is untested!"); + TT_FATAL(!override_enable_borrow, "Disabling borrowed buffers for numpy tensors is untested!"); contiguous_py_tensor = np.attr("ascontiguousarray")(py_tensor); @@ -386,17 +404,35 @@ Tensor convert_python_tensor_to_tt_tensor( TT_THROW("The argument must be of type torch.Tensor or numpy.ndarray!"); } + // TODO: Remove check of num_elements from python against volume of ttnn::SimpleShape + TT_FATAL( + num_elements == shape.volume(), + "Number of elements from python tensor {} must match volume of shape {}!", + num_elements, + shape.volume()); + + Layout layout = optional_layout.value_or(Layout::ROW_MAJOR); + if (data_type == DataType::BFLOAT8_B or data_type == DataType::BFLOAT4_B) { + if (optional_layout.has_value() and optional_layout.value() != Layout::TILE) { + log_warning( + tt::LogAlways, + "Tensor layout must be Layout::TILE for bfloat8_b or bfloat4_b! 
Tensor layout will be {} instead of " + "the requested {}!", + Layout::TILE, + optional_layout.value()); + } + layout = Layout::TILE; + } + + auto tensor_spec = TensorSpec(shape, TensorLayout(data_type, PageConfig(layout, optional_tile), memory_config)); auto on_creation_callback = [tensor = contiguous_py_tensor] { tensor.inc_ref(); }; auto on_destruction_callback = [tensor = contiguous_py_tensor] { tensor.dec_ref(); }; auto output = create_tt_tensor_from_py_data( - num_elements, - py_data_ptr, - shape, - data_type, - optional_tile, - enable_borrow, - on_creation_callback, - on_destruction_callback); + py_data_ptr, tensor_spec, device, override_enable_borrow, on_creation_callback, on_destruction_callback); + + if (device) { + output = output.to(device, memory_config); + } output = tt::tt_metal::set_tensor_id(output); GraphTracker::instance().track_function_end(output); return output; @@ -411,7 +447,8 @@ Tensor convert_python_tensors_to_tt_tensors( "tt::tt_metal::detail::convert_python_tensors_to_tt_tensors", tensor_shards, data_type, strategy); std::vector tt_shards; for (const auto& shard : tensor_shards) { - tt_shards.push_back(detail::convert_python_tensor_to_tt_tensor(shard, data_type, tile, false)); + tt_shards.push_back(detail::convert_python_tensor_to_tt_tensor( + shard, data_type, Layout::ROW_MAJOR, tile, MemoryConfig{}, nullptr, true)); } std::vector host_owned_buffers; std::vector host_owned_shapes; @@ -432,15 +469,68 @@ Tensor convert_python_tensors_to_tt_tensors( return output; } -std::pair, DataType> get_buffer_and_dtype_from_tensor( - const Tensor& tt_tensor) { +template +owned_buffer::Buffer create_row_major_owned_buffer( + owned_buffer::Buffer owned_buffer, const ttnn::TensorSpec& tensor_spec) { + if (tensor_spec.layout() == Layout::TILE) { + auto data = tensor_impl::convert_layout_tile_to_row_major( + tensor_spec.physical_shape(), tensor_spec.tile(), owned_buffer); + return owned_buffer::create(std::move(data)); + } + return owned_buffer; +} + +std::variant get_host_buffer_from_tensor(const Tensor& tt_tensor) { TT_ASSERT(tt_tensor.storage_type() == StorageType::OWNED or tt_tensor.storage_type() == StorageType::BORROWED); - auto buffer = std::visit( - [](auto&& storage) -> std::variant { + const auto& tensor_spec = tt_tensor.get_tensor_spec(); + return std::visit( + [&tensor_spec, &tt_tensor](auto&& storage) -> std::variant { using T = std::decay_t; if constexpr (std::is_same_v) { - return storage.buffer; + auto tt_dtype = tensor_spec.data_type(); + switch (tt_dtype) { + case DataType::UINT8: { + return create_row_major_owned_buffer( + owned_buffer::get_as(storage.buffer), tensor_spec); + } + case DataType::UINT16: { + return create_row_major_owned_buffer( + owned_buffer::get_as(storage.buffer), tensor_spec); + } + case DataType::INT32: { + return create_row_major_owned_buffer( + owned_buffer::get_as(storage.buffer), tensor_spec); + } + case DataType::UINT32: { + return create_row_major_owned_buffer( + owned_buffer::get_as(storage.buffer), tensor_spec); + } + case DataType::FLOAT32: { + return create_row_major_owned_buffer(owned_buffer::get_as(storage.buffer), tensor_spec); + } + case DataType::BFLOAT16: { + return create_row_major_owned_buffer( + owned_buffer::get_as<::bfloat16>(storage.buffer), tensor_spec); + } + case DataType::BFLOAT8_B: + case DataType::BFLOAT4_B: { + const auto& tile = tensor_spec.tile(); + auto uint32_data = owned_buffer::get_as(storage.buffer).get(); + auto float_unpacked_data = + tt_dtype == DataType::BFLOAT8_B + ? 
unpack_bfp8_tiles_into_float_vec( + uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile) + : unpack_bfp4_tiles_into_float_vec( + uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile); + auto input_float_buffer = owned_buffer::create(std::move(float_unpacked_data)); + return create_row_major_owned_buffer(input_float_buffer, tensor_spec); + } + default: { + TT_THROW("Unsupported DataType: {}", tt_dtype); + break; + } + } } else if constexpr (std::is_same_v) { TT_THROW("Device tensor cannot be converted to torch"); } else if constexpr (std::is_same_v) { @@ -456,52 +546,64 @@ std::pair, DataType> get_buffer_and_dt } }, tt_tensor.get_storage()); - - const auto tile = tt_tensor.get_tensor_spec().tile(); - auto tt_dtype = tt_tensor.get_dtype(); - if (tt_dtype == DataType::BFLOAT8_B || tt_dtype == DataType::BFLOAT4_B) { - TT_ASSERT( - std::holds_alternative(buffer), - "Unexpected type {}", - tt::stl::get_active_type_name_in_variant(buffer)); - auto uint32_data = std::get>(std::get(buffer)).get(); - auto float_unpacked_data = - tt_dtype == DataType::BFLOAT8_B - ? unpack_bfp8_tiles_into_float_vec(uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile) - : unpack_bfp4_tiles_into_float_vec(uint32_data, /*row_major_output=*/false, /*is_exp_a=*/false, tile); - auto input_float_buffer = owned_buffer::create(std::move(float_unpacked_data)); - auto float_tensor = Tensor( - OwnedStorage{input_float_buffer}, - tt_tensor.get_shape(), - DataType::FLOAT32, - tt_tensor.get_layout(), - tile) - .to(Layout::ROW_MAJOR); - auto output_float_data = owned_buffer::get_as(float_tensor).get(); - buffer = owned_buffer::create(std::move(output_float_data)); - tt_dtype = DataType::FLOAT32; - } - - return {buffer, tt_dtype}; } py::object convert_tt_tensor_to_torch_tensor(const Tensor& tt_tensor) { GraphTracker::instance().track_function_start("tt::tt_metal::detail::convert_tt_tensor_to_torch_tensor", tt_tensor); - auto [buffer, buffer_dtype] = get_buffer_and_dtype_from_tensor(tt_tensor); + auto buffer = get_host_buffer_from_tensor(tt_tensor); py::object torch = py::module_::import("torch"); auto frombuffer = torch.attr("frombuffer"); - const auto tt_dtype_to_torch_dtype = std::map{ - {DataType::UINT8, torch.attr("uint8")}, - {DataType::UINT16, torch.attr("int16")}, // TODO(arakhmati): add DataType::INT16 - {DataType::INT32, torch.attr("int32")}, - {DataType::UINT32, torch.attr("int32")}, // TODO(arakhmati): add DataType::INT32 - {DataType::FLOAT32, torch.attr("float32")}, - {DataType::BFLOAT16, torch.attr("bfloat16")}, - }; - auto torch_dtype = tt_dtype_to_torch_dtype.at(buffer_dtype); + auto torch_dtype = [&]() { + if (std::holds_alternative(buffer)) { + return std::visit( + [&torch](auto& owned_buffer) -> py::object { + using T = std::decay_t; + if constexpr (std::is_same_v>) { + return torch.attr("uint8"); + } else if constexpr (std::is_same_v>) { + return torch.attr("int16"); + } else if constexpr (std::is_same_v>) { + return torch.attr("int32"); + } else if constexpr (std::is_same_v>) { + return torch.attr("int32"); + } else if constexpr (std::is_same_v>) { + return torch.attr("float32"); + } else if constexpr (std::is_same_v>) { + return torch.attr("bfloat16"); + } else { + static_assert(tt::stl::concepts::always_false_v, "Unsupported buffer!"); + } + }, + std::get(buffer)); + + } else if (std::holds_alternative(buffer)) { + return std::visit( + [&torch](auto& borrowed_buffer) -> py::object { + using T = std::decay_t; + if constexpr (std::is_same_v>) { + return torch.attr("uint8"); 
+ } else if constexpr (std::is_same_v>) { + return torch.attr("int16"); + } else if constexpr (std::is_same_v>) { + return torch.attr("int32"); + } else if constexpr (std::is_same_v>) { + return torch.attr("int32"); + } else if constexpr (std::is_same_v>) { + return torch.attr("float32"); + } else if constexpr (std::is_same_v>) { + return torch.attr("bfloat16"); + } else { + static_assert(tt::stl::concepts::always_false_v, "Unsupported buffer!"); + } + }, + std::get(buffer)); + } else { + TT_THROW("Only OwnedBuffer or BorrowedBuffer is supported for converting to python buffers!"); + } + }(); auto shape = tt_tensor.get_legacy_shape(); auto torch_shape = std::vector(std::begin(shape), std::end(shape)); @@ -527,19 +629,59 @@ py::object convert_tt_tensor_to_torch_tensor(const Tensor& tt_tensor) { py::object convert_tt_tensor_to_numpy_tensor(const Tensor& tt_tensor) { GraphTracker::instance().track_function_start("tt::tt_metal::detail::convert_tt_tensor_to_numpy_tensor", tt_tensor); - auto [buffer, buffer_dtype] = get_buffer_and_dtype_from_tensor(tt_tensor); + auto buffer = get_host_buffer_from_tensor(tt_tensor); py::object np = py::module_::import("numpy"); auto frombuffer = np.attr("frombuffer"); - const auto tt_dtype_to_np_dtype = std::map{ - {DataType::UINT8, np.attr("ubyte")}, - {DataType::UINT16, np.attr("int16")}, // TODO(arakhmati): add DataType::INT16 - {DataType::INT32, np.attr("int32")}, - {DataType::UINT32, np.attr("int32")}, // TODO(arakhmati): add DataType::INT32 - {DataType::FLOAT32, np.attr("float32")}, - }; - auto np_dtype = tt_dtype_to_np_dtype.at(buffer_dtype); + auto np_dtype = [&]() { + if (std::holds_alternative(buffer)) { + return std::visit( + [&np](auto& owned_buffer) -> py::object { + using T = std::decay_t; + if constexpr (std::is_same_v>) { + return np.attr("ubyte"); + } else if constexpr (std::is_same_v>) { + return np.attr("int16"); + } else if constexpr (std::is_same_v>) { + return np.attr("int32"); + } else if constexpr (std::is_same_v>) { + return np.attr("int32"); + } else if constexpr (std::is_same_v>) { + return np.attr("float32"); + } else if constexpr (std::is_same_v>) { + TT_THROW("Bfloat16 is not supported for numpy!"); + } else { + static_assert(tt::stl::concepts::always_false_v, "Unsupported buffer!"); + } + }, + std::get(buffer)); + + } else if (std::holds_alternative(buffer)) { + return std::visit( + [&np](auto& borrowed_buffer) -> py::object { + using T = std::decay_t; + if constexpr (std::is_same_v>) { + return np.attr("ubyte"); + } else if constexpr (std::is_same_v>) { + return np.attr("int16"); + } else if constexpr (std::is_same_v>) { + return np.attr("int32"); + } else if constexpr (std::is_same_v>) { + return np.attr("int32"); + } else if constexpr (std::is_same_v>) { + return np.attr("float32"); + } else if constexpr (std::is_same_v>) { + TT_THROW("Bfloat16 is not supported for numpy!"); + } else { + static_assert(tt::stl::concepts::always_false_v, "Unsupported buffer!"); + } + }, + std::get(buffer)); + } else { + TT_THROW("Only OwnedBuffer or BorrowedBuffer is supported for converting to python buffers!"); + } + }(); auto shape = tt_tensor.get_legacy_shape(); auto np_shape = std::vector(std::begin(shape), std::end(shape)); @@ -842,7 +984,8 @@ void pytensor_module(py::module& m_tensor) { if (py::isinstance(tensor)) { return detail::convert_python_tensors_to_tt_tensors(tensor, data_type, tile, strategy); } - return detail::convert_python_tensor_to_tt_tensor(tensor, data_type, tile); + return detail::convert_python_tensor_to_tt_tensor( + 
tensor, data_type, std::nullopt, tile, MemoryConfig{}, nullptr); }), py::arg("tensor"), py::arg("data_type") = std::nullopt, @@ -857,6 +1000,8 @@ void pytensor_module(py::module& m_tensor) { +--------------+------------------------+ | data_type | TT Tensor data type | +--------------+------------------------+ + | tile | TT Tile Spec | + +--------------+------------------------+ Example of creating a TT Tensor that uses torch.Tensor's storage as its own storage: @@ -872,16 +1017,15 @@ void pytensor_module(py::module& m_tensor) { Layout layout, const MemoryConfig& mem_config, const std::optional& tile) { - auto tensor = detail::convert_python_tensor_to_tt_tensor(python_tensor, data_type, tile); - auto layout_tensor = tensor.to(layout); - return layout_tensor.to(device, mem_config); + return detail::convert_python_tensor_to_tt_tensor( + python_tensor, data_type, layout, tile, mem_config, device); }), py::arg("tensor"), py::arg("data_type") = std::nullopt, - py::arg("device").noconvert(), - py::arg("layout").noconvert(), - py::arg("mem_config").noconvert(), - py::arg("tile") = std::nullopt, + py::arg("device") = nullptr, + py::arg("layout").noconvert() = Layout::ROW_MAJOR, + py::arg("mem_config").noconvert() = MemoryConfig{}, + py::arg("tile").noconvert() = std::nullopt, py::return_value_policy::move, R"doc( +--------------+------------------------+ @@ -897,14 +1041,17 @@ void pytensor_module(py::module& m_tensor) { +--------------+------------------------+ | mem_config | TT memory_config | +--------------+------------------------+ + | tile | TT Tile Spec | + +--------------+------------------------+ - Example of creating a TT Tensor that uses torch.Tensor's storage as its own storage: + Example of creating a TT Tensor from numpy tensor: .. code-block:: python + device = ttnn.open_device(device_id=0) py_tensor = np.zeros((1, 1, 32, 32)) - ttnn.Tensor(py_tensor) + ttnn.Tensor(py_tensor, ttnn.bfloat16, device, ttnn.TILE_LAYOUT) )doc") .def_property_readonly("shape", [](const Tensor& self) { return self.get_shape(); }) .def_property_readonly("dtype", [](const Tensor& self) { return self.get_dtype(); }) diff --git a/ttnn/cpp/ttnn/tensor/tensor_spec.hpp b/ttnn/cpp/ttnn/tensor/tensor_spec.hpp index 125b3bb719f4..172e0d881f5f 100644 --- a/ttnn/cpp/ttnn/tensor/tensor_spec.hpp +++ b/ttnn/cpp/ttnn/tensor/tensor_spec.hpp @@ -28,6 +28,7 @@ class TensorSpec final { DataType data_type() const { return tensor_layout_.get_data_type(); } Layout layout() const { return tensor_layout_.get_layout(); } PageConfig page_config() const { return tensor_layout_.get_page_config(); } + const MemoryConfig& memory_config() const { return tensor_layout_.get_memory_config(); } const ttnn::SimpleShape& padded_shape() const { return cached_padded_shape_; } const Size& physical_shape() const { return cached_physical_shape_; } ttnn::Shape shape() const { return ttnn::Shape(logical_shape_.view(), cached_padded_shape_.view()); } diff --git a/ttnn/tt_lib/fused_ops/softmax.py b/ttnn/tt_lib/fused_ops/softmax.py index f5b2f5fceb4d..904b4cea008e 100644 --- a/ttnn/tt_lib/fused_ops/softmax.py +++ b/ttnn/tt_lib/fused_ops/softmax.py @@ -42,7 +42,7 @@ def ref_stable_softmax(x): if __name__ == "__main__": - device = ttnn.open_device(0) + device = ttnn.open_device(device_id=0) H, W = 64, 96 torch.manual_seed(123) diff --git a/ttnn/tt_lib/utils.py b/ttnn/tt_lib/utils.py index 9883666b81f0..a61f97594641 100644 --- a/ttnn/tt_lib/utils.py +++ b/ttnn/tt_lib/utils.py @@ -8,6 +8,8 @@ import torch import numpy as np +from typing_extensions import 
deprecated + def _nearest_32(x): return math.ceil(x / 32) * 32 @@ -134,108 +136,22 @@ def convert_act_2d_matrix(activation, kernel_y, kernel_x, stride_y, stride_x, pa return ret.reshape(ret_shape) +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def tilize(x): - """ - This function tilizes a tensor. The last two tensor dims must be divisible by 32, after which this function - produces row major tiles and creates faces. The output of this function is a flattened list that - we can send to the device. - - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. - """ - nearest_32 = _nearest_32 - - assert isinstance( - x, (torch.Tensor, np.ndarray) - ), "Input to this function must be an instance of torch.Tensor or np.array" - assert len(x.shape) == 4, "Only 4D tensors suppported" - assert (x.shape[-2] % 32) == 0 and ( - x.shape[-1] % 32 - ) == 0, "The last two dimensions of the tensor must be divisible by 32" - - if isinstance(x, torch.Tensor): - ret = torch.zeros(np.prod(x.shape)) - else: - ret = np.zeros(np.prod(x.shape)) - - idx = 0 - for B in range(x.shape[0]): - for C in range(x.shape[1]): - for H in range(0, x.shape[2], 32): - for W in range(0, x.shape[3], 32): - unfaced_tile = x[B, C, H : H + 32, W : W + 32] - - face0 = unfaced_tile[:16, :16] - face1 = unfaced_tile[:16, 16:] - face2 = unfaced_tile[16:, :16] - face3 = unfaced_tile[16:, 16:] - - for face in (face0, face1, face2, face3): - ret[idx : idx + 256] = face.reshape(-1) - idx += 256 - - return ret.reshape(x.shape) + return x +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def tilize_to_list(x): """ - Tilize a PyTorch and then return the values as a flat list. The last two - tensor dims must be divisible by 32, after which this function produces row - major tiles and creates faces. - - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. + Returns a flattened list of the tensor """ - return tilize(x).reshape(-1).tolist() +@deprecated("PyTorch data is handled automatically in tensor infra. This function does nothing now:") def untilize(x): - """ - This function untilizes a tensor to row major format. - - :param x: Input PyTorch Tensor - :type x: class:`torch.Tensor` - - WARNING: This function should eventually be retired in favour of fully tilizing on device. 
- """ - nearest_32 = _nearest_32 - - assert isinstance(x, (torch.Tensor, np.ndarray)), "Input to this function must be an instance of torch.Tensor" - assert len(x.shape) == 4, "Only 4D tensors suppported" - assert (x.shape[-2] % 32) == 0 and ( - x.shape[-1] % 32 - ) == 0, "The last two dimensions of the tensor must be divisible by 32" - - if isinstance(x, torch.Tensor): - ret = torch.zeros(x.shape) - else: - ret = np.zeros(x.shape) - - for B in range(x.shape[0]): - for C in range(x.shape[1]): - x_hw = x[B, C, :].reshape(-1) - hw = 0 - for h in range(0, x.shape[2], 32): - for w in range(0, x.shape[3], 32): - f_tile = x_hw[hw : hw + 256].reshape(16, 16) - ret[B, C, h : h + 16, w : w + 16] = f_tile - - f_tile = x_hw[hw + 256 : hw + 512].reshape(16, 16) - ret[B, C, h : h + 16, w + 16 : w + 32] = f_tile - - f_tile = x_hw[hw + 512 : hw + 768].reshape(16, 16) - ret[B, C, h + 16 : h + 32, w : w + 16] = f_tile - - f_tile = x_hw[hw + 768 : hw + 1024].reshape(16, 16) - ret[B, C, h + 16 : h + 32, w + 16 : w + 32] = f_tile - hw += 1024 # traverse tiles in RM-order - - return ret + return x def print_diff_argmax(a, b, annotation=""): From dd874ddd970e40a593cd4ac545962d69fbeadd41 Mon Sep 17 00:00:00 2001 From: Andrew Fuller Date: Wed, 11 Dec 2024 22:36:33 -0500 Subject: [PATCH 11/13] Enforce 'const' left of the type, even via clang-tidy's FIX-ITs (#15948) ### Ticket None ### Problem description Clang-tidy was trying to insert `const` right of the type. ### What's changed * Told clang-format we want it on the left. * Told clang-tidy to do what clang-format says. --- .clang-format | 1 + .clang-tidy | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.clang-format b/.clang-format index d29b0dbdcef5..301ae10318a3 100644 --- a/.clang-format +++ b/.clang-format @@ -95,6 +95,7 @@ PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left +QualifierAlignment: Left RawStringFormats: - Language: Cpp Delimiters: diff --git a/.clang-tidy b/.clang-tidy index 273f9d885e80..9e775a897962 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -195,3 +195,5 @@ Checks: > CheckOptions: - key: readability-function-cognitive-complexity.IgnoreMacros value: true + +FormatStyle: 'file' From 74311e95eea60f9e91c56efb1c7aed469f297b21 Mon Sep 17 00:00:00 2001 From: Nilaykumar Patel Date: Thu, 12 Dec 2024 09:36:34 +0530 Subject: [PATCH 12/13] #15894: Add conv1d tests in convolution short sweep. 
Signed-off-by: Nilaykumar Patel --- .../sweep_utils/conv2d_common.py | 78 ++++++++++++++++++- .../sweeps/conv2d/full/conv2d_misc.py | 4 +- .../sweeps/conv2d/full/conv2d_sharding.py | 4 +- .../conv2d/full/conv2d_sliding_window.py | 4 +- .../sweeps/conv2d/short/conv2d_short_sweep.py | 35 ++++++--- 5 files changed, 108 insertions(+), 17 deletions(-) diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py index c7509247213e..a7354f8363a3 100644 --- a/tests/sweep_framework/sweep_utils/conv2d_common.py +++ b/tests/sweep_framework/sweep_utils/conv2d_common.py @@ -48,7 +48,7 @@ def mesh_device_fixture(): ttnn.close_device(device) -def run_full( +def run_conv2d_full_sweep( input_specs, input_channels, output_channels, @@ -174,7 +174,7 @@ def run_full( return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf] -def run_short( +def run_conv2d_short_sweep( input_specs, device, ) -> list: @@ -256,3 +256,77 @@ def run_short( torch_output_tensor = torch.permute(torch_output_tensor, (0, 3, 1, 2)) return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf] + + +def run_conv1d_short_sweep( + input_specs, + device, +) -> list: + [ + batch_size, + output_channels, + input_channels, + input_length, + kernel_size, + stride, + padding, + groups, + has_bias, + dilation, + ] = input_specs + print(input_specs) + + # has_bias = False + torch.manual_seed(0) + conv_input_shape = [batch_size, input_channels, input_length] + conv_weight_shape = [output_channels, input_channels // groups, kernel_size] + conv_bias_shape = [1, 1, 1, output_channels] + torch_input_tensor_ncl = torch.randn(conv_input_shape, dtype=torch.bfloat16).float() + torch_input_tensor = torch.permute(torch_input_tensor_ncl, (0, 2, 1)) + torch_weight_tensor = torch.randn(conv_weight_shape, dtype=torch.bfloat16).float() + torch_bias_tensor = torch.randn(conv_bias_shape, dtype=torch.bfloat16).float() if has_bias else None + torch_out_golden_tensor = torch.nn.functional.conv1d( + torch_input_tensor_ncl, + torch_weight_tensor, + bias=torch_bias_tensor.reshape(-1) if has_bias else None, + stride=stride, + padding=padding, + groups=groups, + ) + + tt_weight_tensor = ttnn.from_torch(torch_weight_tensor, ttnn.bfloat16) + tt_bias_tensor = None + if has_bias: + tt_bias_tensor = ttnn.from_torch(torch_bias_tensor, ttnn.bfloat16) + + tt_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) + + start_time = start_measuring_time() + [tt_output_tensor_on_device, out_length, [weights_device, bias_device]] = ttnn.Conv1d( + input_tensor=tt_input_tensor, + weight_tensor=tt_weight_tensor, + in_channels=input_channels, + out_channels=output_channels, + device=device, + bias_tensor=tt_bias_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + batch_size=batch_size, + input_length=input_length, + groups=groups, + return_output_dim=True, + return_weights_and_bias=True, + ) + + tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) + torch_output_tensor = ttnn.to_torch(tt_output_tensor) + e2e_perf = stop_measuring_time(start_time) + + # torch_output_tensor is in row major layout and NLC shape + # NLC to NCL + torch_output_tensor = torch_output_tensor.reshape(batch_size, out_length, output_channels) + + torch_output_tensor = torch.permute(torch_output_tensor, (0, 2, 1)) + + return [check_with_pcc(torch_output_tensor, torch_out_golden_tensor, pcc=0.998), e2e_perf] diff --git 
a/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py b/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py index 144640d642d6..0bb58c4dac09 100644 --- a/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py +++ b/tests/sweep_framework/sweeps/conv2d/full/conv2d_misc.py @@ -12,7 +12,7 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -from tests.sweep_framework.sweep_utils.conv2d_common import run_full, get_input_specs, mesh_device_fixture +from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_full_sweep, get_input_specs, mesh_device_fixture # Override the default timeout in seconds for hang detection. TIMEOUT = 30 @@ -242,7 +242,7 @@ def run( *, device, ) -> list: - return run_full( + return run_conv2d_full_sweep( input_specs, input_channels, output_channels, diff --git a/tests/sweep_framework/sweeps/conv2d/full/conv2d_sharding.py b/tests/sweep_framework/sweeps/conv2d/full/conv2d_sharding.py index 4b6a3a29259a..bfcc0d058009 100644 --- a/tests/sweep_framework/sweeps/conv2d/full/conv2d_sharding.py +++ b/tests/sweep_framework/sweeps/conv2d/full/conv2d_sharding.py @@ -12,7 +12,7 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -from tests.sweep_framework.sweep_utils.conv2d_common import run_full, get_input_specs, mesh_device_fixture +from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_full_sweep, get_input_specs, mesh_device_fixture # Override the default timeout in seconds for hang detection. TIMEOUT = 30 @@ -111,7 +111,7 @@ def run( *, device, ) -> list: - return run_full( + return run_conv2d_full_sweep( input_specs, input_channels, output_channels, diff --git a/tests/sweep_framework/sweeps/conv2d/full/conv2d_sliding_window.py b/tests/sweep_framework/sweeps/conv2d/full/conv2d_sliding_window.py index 3b3db7da7ed2..83c2f233fd5e 100644 --- a/tests/sweep_framework/sweeps/conv2d/full/conv2d_sliding_window.py +++ b/tests/sweep_framework/sweeps/conv2d/full/conv2d_sliding_window.py @@ -11,7 +11,7 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -from tests.sweep_framework.sweep_utils.conv2d_common import run_full, get_input_specs, mesh_device_fixture +from tests.sweep_framework.sweep_utils.conv2d_common import run_conv2d_full_sweep, get_input_specs, mesh_device_fixture # Override the default timeout in seconds for hang detection. 
TIMEOUT = 30 @@ -109,7 +109,7 @@ def run( *, device, ) -> list: - return run_full( + return run_conv2d_full_sweep( input_specs, input_channels, output_channels, diff --git a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py index c41f6be90928..743d5ac652df 100644 --- a/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py +++ b/tests/sweep_framework/sweeps/conv2d/short/conv2d_short_sweep.py @@ -12,10 +12,14 @@ from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time from models.utility_functions import torch_random -from tests.sweep_framework.sweep_utils.conv2d_common import run_short, mesh_device_fixture +from tests.sweep_framework.sweep_utils.conv2d_common import ( + run_conv2d_short_sweep, + run_conv1d_short_sweep, + mesh_device_fixture, +) parameters = { - "short_sweep_suite": { + "short_sweep_suite_conv2d": { "input_specs": [ # Contains following params # [batch_size, output_channels, input_channels, input_height, input_width, kernel_height, kernel_width, stride_x, stride_y, pad_x, pad_y, groups, bias, dilation] @@ -1566,6 +1570,18 @@ [1, 320, 960, 64, 64, 1, 1, 1, 1, 0, 0, 1, True, 1], [1, 320, 960, 64, 64, 3, 3, 1, 1, 1, 1, 1, True, 1], ], + "is_conv1d": [False], + }, + "short_sweep_suite_conv1d": { + "input_specs": [ + # Contains following params + # [batch_size, output_channels, input_channels, input_length, kernel_size, stride, pad, groups, bias, dilation] + [1, 256, 1024, 512, 1, 1, 0, 1, True, 1], + [1, 1024, 256, 512, 1, 1, 0, 1, True, 1], + [1, 768, 768, 3000, 3, 2, 1, 1, True, 1], + [1, 768, 80, 3000, 3, 1, 1, 1, True, 1], + ], + "is_conv1d": [True], }, } @@ -1576,22 +1592,23 @@ def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]: def run( input_specs, + is_conv1d=False, *, device, ) -> list: - return run_short( - input_specs, - device, - ) + if is_conv1d: + return run_conv1d_short_sweep(input_specs, device) + else: + return run_conv2d_short_sweep(input_specs, device) import pytest -@pytest.mark.parametrize("input_spec", parameters["short_sweep_suite"]["input_specs"]) +@pytest.mark.parametrize("input_spec", parameters["short_sweep_suite_conv2d"]["input_specs"]) @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_conv2d_localrun(device, input_spec): - run_short( + run_conv2d_short_sweep( input_spec, device, ) @@ -1658,7 +1675,7 @@ def test_conv2d_localrun(device, input_spec): @pytest.mark.parametrize("input_spec", failing_parameters) @pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True) def test_conv2d_localrun_fail_only(device, input_spec): - run_short( + run_conv2d_short_sweep( input_spec, device, ) From 6e983a7c924c166307361968e7ea98518859dab8 Mon Sep 17 00:00:00 2001 From: Radomir Djogo <159184120+rdjogoTT@users.noreply.github.com> Date: Thu, 12 Dec 2024 00:13:56 -0500 Subject: [PATCH 13/13] Add LLK and API for binary shift left/right (#15926) ### Ticket [Link to Github Issue](https://github.com/tenstorrent/tt-metal/issues/10034) ### What's changed Implemented binary left/right shift as binary SFPU OPs. 
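For reference, a minimal sketch (not part of this change) of how a compute kernel could exercise the new entry points, assuming two Int32 operand tiles have already been loaded into DST at indices 0 and 1 and DST is held in the acquired state:

```cpp
#include "compute_kernel_api/binary_shift.h"

// Inside a compute kernel, with DST acquired and Int32 tiles at DST[0] and DST[1]:
binary_shift_tile_init();          // shared init for both shift directions
binary_left_shift_tile(0, 1);      // DST[0] = DST[0] << DST[1], elementwise; result overwrites DST[0]
// binary_right_shift_tile(0, 1);  // alternatively: DST[0] = DST[0] >> DST[1]
```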
### Checklist - [x] Post commit CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/12287866881 - [x] Blackhole Post commit: https://github.com/tenstorrent/tt-metal/actions/runs/12287867702 - [x] New/Existing tests provide coverage for changes - will be added in future PR --- .../llk_api/llk_sfpu/ckernel_sfpu_shift.h | 27 ++++++++ .../llk_math_eltwise_binary_sfpu_shift.h | 34 ++++++++++ .../llk_api/llk_sfpu/ckernel_sfpu_shift.h | 27 ++++++++ .../llk_math_eltwise_binary_sfpu_shift.h | 34 ++++++++++ .../compute_kernel_api/add_int32_sfpu.h | 2 + .../compute_kernel_api/binary_bitwise_sfpu.h | 2 + .../include/compute_kernel_api/binary_shift.h | 68 +++++++++++++++++++ .../compute_kernel_api/eltwise_binary_sfpu.h | 2 + tt_metal/third_party/tt_llk_blackhole | 2 +- tt_metal/third_party/tt_llk_wormhole_b0 | 2 +- 10 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h create mode 100644 tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h create mode 100644 tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h create mode 100644 tt_metal/include/compute_kernel_api/binary_shift.h diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h new file mode 100644 index 000000000000..ccd4b2e6df29 --- /dev/null +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_binary_left_shift(const uint dst_offset) { + _calculate_binary_left_shift_(dst_offset); +} + +template +inline void calculate_binary_right_shift(const uint dst_offset) { + _calculate_binary_right_shift_(dst_offset); +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h new file mode 100644 index 000000000000..337fdd9df5c7 --- /dev/null +++ b/tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_binary_sfpu_init.h" +#include "llk_math_eltwise_binary_sfpu_params.h" +#include "ckernel_sfpu_shift.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_binary_sfpu_shift_init() { + llk_math_eltwise_binary_sfpu_init(); +} + +template +inline void llk_math_eltwise_binary_sfpu_left_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_binary_left_shift, dst_index0, dst_index1, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_right_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_binary_right_shift, dst_index0, dst_index1, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h new file mode 100644 index 000000000000..ccd4b2e6df29 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_shift.h @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "ckernel.h" +#include "ckernel_defs.h" +#include "sfpi.h" + +using namespace sfpi; + +namespace ckernel { +namespace sfpu { + +template +inline void calculate_binary_left_shift(const uint dst_offset) { + _calculate_binary_left_shift_(dst_offset); +} + +template +inline void calculate_binary_right_shift(const uint dst_offset) { + _calculate_binary_right_shift_(dst_offset); +} + +} // namespace sfpu +} // namespace ckernel diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h new file mode 100644 index 000000000000..337fdd9df5c7 --- /dev/null +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_shift.h @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llk_math_eltwise_binary_sfpu_init.h" +#include "llk_math_eltwise_binary_sfpu_params.h" +#include "ckernel_sfpu_shift.h" + +namespace ckernel { + +// New LLK SFPU APIs + +template +inline void llk_math_eltwise_binary_sfpu_shift_init() { + llk_math_eltwise_binary_sfpu_init(); +} + +template +inline void llk_math_eltwise_binary_sfpu_left_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_binary_left_shift, dst_index0, dst_index1, vector_mode); +} + +template +inline void llk_math_eltwise_binary_sfpu_right_shift( + uint dst_index0, uint32_t dst_index1, int vector_mode = VectorMode::RC) { + llk_math_eltwise_binary_sfpu_params( + ckernel::sfpu::calculate_binary_right_shift, dst_index0, dst_index1, vector_mode); +} + +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/add_int32_sfpu.h b/tt_metal/include/compute_kernel_api/add_int32_sfpu.h index 4de5ee5b55af..f566c7e34da1 100644 --- a/tt_metal/include/compute_kernel_api/add_int32_sfpu.h +++ b/tt_metal/include/compute_kernel_api/add_int32_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise add operation with the two integer inputs: y = add(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. * diff --git a/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h b/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h index cf2a20d00901..1ec6d40cecab 100644 --- a/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h +++ b/tt_metal/include/compute_kernel_api/binary_bitwise_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise binary bitwise operation with the two inputs: y = bitwise(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. * diff --git a/tt_metal/include/compute_kernel_api/binary_shift.h b/tt_metal/include/compute_kernel_api/binary_shift.h new file mode 100644 index 000000000000..3bd2ddb9a59d --- /dev/null +++ b/tt_metal/include/compute_kernel_api/binary_shift.h @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "compute_kernel_api/common_globals.h" +#ifdef TRISC_MATH +#include "llk_math_eltwise_binary_sfpu_shift.h" +#define MAIN math_main() +#define MATH(x) x +#else +#define MATH(x) +#endif + +namespace ckernel { + +/** + * Performs an elementwise shift operation to the left on the input at idst0, by input at idst1: y = x0 << x1 + * Both inputs must be of Int32 data type only. Output overwrites first operand in DST. + * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. 
+ * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, + * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | + * Required | + * |----------------|-----------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | Must be less + * than the size of the DST register buffer | True | | idst1 | The index of the tile in DST register buffer + * to use as second operand | uint32_t | Must be less than the size of the DST register buffer | True | + */ +ALWI void binary_left_shift_tile(uint32_t idst0, uint32_t idst1) { + MATH((llk_math_eltwise_binary_sfpu_left_shift(idst0, idst1))); +} + +/** + * Performs an elementwise shift operation to the right on the input at idst0, by input at idst1: y = x0 >> x1 + * Both inputs must be of Int32 data type only. Output overwrites first operand in DST. + * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. + * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, + * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. + * + * Return value: None + * + * | Argument | Description | Type | Valid Range | + * Required | + * |----------------|-----------------------------------------------------------------------|----------|-------------------------------------------------------|----------| + * | idst0 | The index of the tile in DST register buffer to use as first operand | uint32_t | Must be less + * than the size of the DST register buffer | True | | idst1 | The index of the tile in DST register buffer + * to use as second operand | uint32_t | Must be less than the size of the DST register buffer | True | + */ + +ALWI void binary_right_shift_tile(uint32_t idst0, uint32_t idst1) { + MATH((llk_math_eltwise_binary_sfpu_right_shift(idst0, idst1))); +} + +/** + * Please refer to documentation for any_init. + */ +ALWI void binary_shift_tile_init() { MATH((llk_math_eltwise_binary_sfpu_shift_init())); } + +} // namespace ckernel diff --git a/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h b/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h index 22fc4c13fcf0..239958919401 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h +++ b/tt_metal/include/compute_kernel_api/eltwise_binary_sfpu.h @@ -19,6 +19,8 @@ namespace ckernel { * Performs an elementwise binop operation with the two floating point inputs: y = binop(x0,x1) * Output overwrites first operand in DST. * + * The DST register buffer must be in acquired state via *acquire_dst* call. This call is blocking and is only available + * on the compute engine. * A maximum of 4 tiles from each operand can be loaded into DST at once, for a total of 8 tiles, * when using 16 bit formats. This gets reduced to 2 tiles from each operand for 32 bit formats. 
* diff --git a/tt_metal/third_party/tt_llk_blackhole b/tt_metal/third_party/tt_llk_blackhole index 7536fbacd75a..973288fb014a 160000 --- a/tt_metal/third_party/tt_llk_blackhole +++ b/tt_metal/third_party/tt_llk_blackhole @@ -1 +1 @@ -Subproject commit 7536fbacd75a4ad62047c63c9c54176fae079e06 +Subproject commit 973288fb014a22ce72cdba1c38a9f41f48532d6d diff --git a/tt_metal/third_party/tt_llk_wormhole_b0 b/tt_metal/third_party/tt_llk_wormhole_b0 index 0f57d4e9dec6..33a7f6a02671 160000 --- a/tt_metal/third_party/tt_llk_wormhole_b0 +++ b/tt_metal/third_party/tt_llk_wormhole_b0 @@ -1 +1 @@ -Subproject commit 0f57d4e9dec602b68671be8891e7af876285f275 +Subproject commit 33a7f6a026719af509a119d8a4e8e36c7c31854c