Merge branch 'main' into llama32-vision

tenstorrent · Oct 24, 2024 · 4e79091 · 4e79091
2 parents 2c5ff7f + af83135
commit 4e79091
Show file tree

Hide file tree

Showing 55 changed files with 607 additions and 792 deletions.
diff --git a/CODEOWNERS b/CODEOWNERS
@@ -105,7 +105,7 @@ ttnn/cpp/ttnn/operations/ccl/ @SeanNijjar @cfjchu
 ttnn/cpp/ttnn/operations/pool/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic
 ttnn/cpp/ttnn/operations/conv/ @mywoodstock @shwetankTT @sankarmanoj-tt @pavlejosipovic @bbradelTT
 ttnn/cpp/ttnn/operations/sliding_window/ @mywoodstock @sankarmanoj-tt @pavlejosipovic
-ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @yan-zaretskiy @jaykru-tt
+ttnn/cpp/ttnn/operations/data_movement/ @ntarafdar @sjameelTT @jaykru-tt @yugi957
 ttnn/cpp/ttnn/operations/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
 ttnn/cpp/ttnn/operations/experimental/matmul/ @TT-BrianLiu @bbradelTT @yugaoTT
 ttnn/cpp/ttnn/operations/eltwise/ @patrickroberts @yan-zaretskiy @eyonland

diff --git a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp
@@ -25,6 +25,18 @@ using namespace tt;
 using namespace tt_metal;
 using namespace constants;
 
+namespace {
+uint32_t get_device_buffer_address(const Tensor& tensor) {
+    TT_FATAL(std::holds_alternative<DeviceStorage>(tensor.get_storage()), "Tensor storage is not DeviceStorage");
+    auto buffer = std::get<DeviceStorage>(tensor.get_storage()).buffer;
+    uint32_t result = 0;
+    buffer->device()->push_work([&]() {
+        result = buffer->address();
+    }, true);
+    return result;
+}
+}
+
 TEST_F(CommonFixture, TestTensorOwnershipSanity) {
     // Sanity test tensor read, write and update paths with synchronous
     // Ensure that tensor data is copied and owned as expected
@@ -129,17 +141,17 @@ TEST_F(CommonFixture, TestAsyncEltwiseBinary) {
         Tensor output_tensor_host = output_tensor_device_2.cpu();
         // Test tensor deallocation in async mode: deallocate tensors after using them
         if (i == 0) {
-            input_a_addr = std::get<DeviceStorage>(input_tensor_a.get_storage()).buffer->address();
-            input_b_addr = std::get<DeviceStorage>(input_tensor_b.get_storage()).buffer->address();
-            input_c_addr = std::get<DeviceStorage>(input_tensor_c.get_storage()).buffer->address();
-            output_1_addr = std::get<DeviceStorage>(output_tensor_device.get_storage()).buffer->address();
-            output_2_addr = std::get<DeviceStorage>(output_tensor_device_2.get_storage()).buffer->address();
+            input_a_addr = get_device_buffer_address(input_tensor_a);
+            input_b_addr = get_device_buffer_address(input_tensor_b);
+            input_c_addr = get_device_buffer_address(input_tensor_c);
+            output_1_addr = get_device_buffer_address(output_tensor_device);
+            output_2_addr = get_device_buffer_address(output_tensor_device_2);
         } else {
-            EXPECT_EQ(std::get<DeviceStorage>(input_tensor_a.get_storage()).buffer->address(), input_a_addr);
-            EXPECT_EQ(std::get<DeviceStorage>(input_tensor_b.get_storage()).buffer->address(), input_b_addr);
-            EXPECT_EQ(std::get<DeviceStorage>(input_tensor_c.get_storage()).buffer->address(), input_c_addr);
-            EXPECT_EQ(std::get<DeviceStorage>(output_tensor_device.get_storage()).buffer->address(), output_1_addr);
-            EXPECT_EQ(std::get<DeviceStorage>(output_tensor_device_2.get_storage()).buffer->address(), output_2_addr);
+            EXPECT_EQ(get_device_buffer_address(input_tensor_a), input_a_addr);
+            EXPECT_EQ(get_device_buffer_address(input_tensor_b), input_b_addr);
+            EXPECT_EQ(get_device_buffer_address(input_tensor_c), input_c_addr);
+            EXPECT_EQ(get_device_buffer_address(output_tensor_device), output_1_addr);
+            EXPECT_EQ(get_device_buffer_address(output_tensor_device_2), output_2_addr);
         }
         input_tensor_a.deallocate();
         input_tensor_b.deallocate();
@@ -171,7 +183,7 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
             ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
         Tensor tensor2 =
             ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
-        uint32_t tensor2_device_buf_addr = tensor2.device_buffer()->address();
+        uint32_t tensor2_device_buf_addr = get_device_buffer_address(tensor2);
         // Assign tensor1 to tensor2 and ensure that ref counts are appropriately updated with the buffer for tensor2
         // deallocated
         tensor2 = tensor1;
@@ -181,19 +193,19 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
         // prev addr for tensor2
         Tensor tensor3 =
             ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
-        EXPECT_EQ(tensor3.device_buffer()->address(), tensor2_device_buf_addr);
-        EXPECT_EQ(tensor1.device_buffer()->address(), tensor2.device_buffer()->address());
+        EXPECT_EQ(get_device_buffer_address(tensor3), tensor2_device_buf_addr);
+        EXPECT_EQ(get_device_buffer_address(tensor1), get_device_buffer_address(tensor2));
     }
     log_info(LogTest, "Testing Device tensor self-assignment through function");
     for (int i = 0; i < 5; i++) {
         Tensor device_tensor =
             ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(i), DataType::BFLOAT16).to(device);
-        uint32_t device_tensor_address = device_tensor.device_buffer()->address();
+        uint32_t device_tensor_address = get_device_buffer_address(device_tensor);
         // This step will copy the tensor to a temp rval and std::move it back to the caller's instance of device_tensor
         // Ensure ref count and address remain unchanged
         device_tensor = tensor_identity_copy_function(device_tensor);
         EXPECT_EQ(device_tensor.tensor_attributes->main_thread_ref_count, 1);
-        EXPECT_EQ(device_tensor.device_buffer()->address(), device_tensor_address);
+        EXPECT_EQ(get_device_buffer_address(device_tensor), device_tensor_address);
     }
 
     log_info(LogTest, "Testing Device tensor move assignment");
@@ -208,11 +220,11 @@ TEST_F(CommonFixture, TestAsyncRefCountManager) {
     log_info(LogTest, "Testing Device tensor self-assignment");
     Tensor tensor_to_self_assign =
         ttnn::numpy::full<float>(tt::tt_metal::LegacyShape({1, 1, 1024, 1024}), static_cast<float>(0), DataType::BFLOAT16).to(device);
-    uint32_t tensor_to_self_assign_address = tensor_to_self_assign.device_buffer()->address();
+    uint32_t tensor_to_self_assign_address = get_device_buffer_address(tensor_to_self_assign);
     tensor_to_self_assign = tensor_to_self_assign;
     EXPECT_EQ(tensor_to_self_assign.tensor_attributes->main_thread_ref_count, 1);
     tensor_to_self_assign = std::move(tensor_to_self_assign);
-    EXPECT_EQ(tensor_to_self_assign.device_buffer()->address(), tensor_to_self_assign_address);
+    EXPECT_EQ(get_device_buffer_address(tensor_to_self_assign), tensor_to_self_assign_address);
     auto barrier_tensor = tensor_to_self_assign.cpu();
     device->enable_async(false);
 }

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp
@@ -126,7 +126,7 @@ int main(int argc, char** argv) {
 
         // limit size of the L1 buffer to do not exceed global L1 size
         uint32_t l1_buffer_size = num_cores_r * num_cores_c * (num_tiles > 256 ? 256 : num_tiles) * page_size;
-        auto l1_buffer = tt_metal::Buffer(device, l1_buffer_size, page_size, tt_metal::BufferType::L1);
+        auto l1_buffer = tt_metal::Buffer::create(device, l1_buffer_size, page_size, tt_metal::BufferType::L1);
 
         ////////////////////////////////////////////////////////////////////////////
         //                      Application Setup
@@ -166,7 +166,7 @@ int main(int argc, char** argv) {
             for (int j = 0; j < num_cores_c; j++) {
                 CoreCoord core = {(std::size_t)j, (std::size_t)i};
                 uint32_t core_index = i * num_cores_c + j;
-                uint32_t l1_buffer_addr = l1_buffer.address();
+                uint32_t l1_buffer_addr = l1_buffer->address();
 
                 const std::array noc_runtime_args = {core_index, l1_buffer_addr, num_tiles, num_cores_r * num_cores_c};
                 SetRuntimeArgs(program, noc_kernel, core, noc_runtime_args);

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp
@@ -77,14 +77,14 @@ int main(int argc, char** argv) {
             log_error(tt::LogTest, "Command line arguments found exception", e.what());
         }
 
-        TT_ASSERT(transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size);
+        TT_ASSERT(page_size == 0 ? transfer_size == 0 : transfer_size % page_size == 0, "Transfer size {}B should be divisible by page size {}B", transfer_size, page_size);
 
         // Device setup
         int device_id = 0;
         tt_metal::Device* device = tt_metal::CreateDevice(device_id);
 
         // Application setup
-        auto buffer = tt_metal::Buffer(
+        auto buffer = tt_metal::Buffer::create(
             device, transfer_size, page_size, buffer_type == 0 ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1);
 
         std::vector<uint32_t> src_vec = create_random_vector_of_bfloat16(
@@ -104,7 +104,7 @@ int main(int argc, char** argv) {
             // Execute application
             if (!skip_write) {
                 auto t_begin = std::chrono::steady_clock::now();
-                EnqueueWriteBuffer(device->command_queue(), buffer, src_vec, false);
+                EnqueueWriteBuffer(device->command_queue(), *buffer, src_vec, false);
                 Finish(device->command_queue());
                 auto t_end = std::chrono::steady_clock::now();
                 auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
@@ -119,7 +119,7 @@ int main(int argc, char** argv) {
 
             if (!skip_read) {
                 auto t_begin = std::chrono::steady_clock::now();
-                EnqueueReadBuffer(device->command_queue(), buffer, result_vec, true);
+                EnqueueReadBuffer(device->command_queue(), *buffer, result_vec, true);
                 auto t_end = std::chrono::steady_clock::now();
                 auto elapsed_us = duration_cast<microseconds>(t_end - t_begin).count();
                 d2h_bandwidth.push_back((transfer_size / 1024.0 / 1024.0 / 1024.0) / (elapsed_us / 1000.0 / 1000.0));

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp
@@ -192,7 +192,7 @@ int main(int argc, char **argv) {
         ////////////////////////////////////////////////////////////////////////////
         std::vector<uint32_t> input_vec = create_random_vector_of_bfloat16(
             input_size, 100, std::chrono::system_clock::now().time_since_epoch().count());
-        tt_metal::Buffer input_buffer(
+        auto input_buffer = Buffer::create(
             device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);
 
         ////////////////////////////////////////////////////////////////////////////
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
             num_tiles_per_core_group_1,
             num_tiles_per_core_group_2,
             kernel,
-            input_buffer.address(),
+            input_buffer->address(),
             num_reqs_at_a_time,
             single_tile_size,
             tile_format);
@@ -221,7 +221,7 @@ int main(int argc, char **argv) {
         //                      Copy Input To DRAM or L1
         ////////////////////////////////////////////////////////////////////////////
         if (access_type == 0) {
-            tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
+            tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);
         } else {
             for (uint32_t i = 0, input_offset = 0; i < num_cores; ++i) {
                 CoreCoord core = {i / num_cores_y, i % num_cores_y};
@@ -276,7 +276,7 @@ int main(int argc, char **argv) {
         ////////////////////////////////////////////////////////////////////////////
         pass = validation(
             device,
-            input_buffer,
+            *input_buffer,
             input_vec,
             num_cores,
             num_cores_y,

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp
@@ -673,18 +673,18 @@ int main(int argc, char **argv) {
                 input_size, 100, 1234);
         }
 
-        tt_metal::Buffer input_buffer(
+        auto input_buffer = tt_metal::Buffer::create(
             device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);
 
         ////////////////////////////////////////////////////////////////////////////
         //                      Application Setup
         ////////////////////////////////////////////////////////////////////////////
-        auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer.address());
+        auto [program, kernel, cb_addr] = create_program(device, all_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_cores_list, bank_start_id, input_buffer->address());
 
         ////////////////////////////////////////////////////////////////////////////
         //                      Copy Input To DRAM or L1
         ////////////////////////////////////////////////////////////////////////////
-        tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
+        tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);
 
         ////////////////////////////////////////////////////////////////////////////
         //                      Execution Application
@@ -713,7 +713,7 @@ int main(int argc, char **argv) {
 
         pass = validation(
             device,
-            input_buffer,
+            *input_buffer,
             input_vec,
             num_cores,
             all_cores_list,

diff --git a/...etal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/...etal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp
@@ -869,18 +869,18 @@ int main(int argc, char **argv) {
                 input_size, 100, 1234);
         }
 
-        tt_metal::Buffer input_buffer(
+        auto input_buffer = tt_metal::Buffer::create(
             device, input_vec.size() * sizeof(uint32_t), single_tile_size, tt_metal::BufferType::DRAM);
 
         ////////////////////////////////////////////////////////////////////////////
         //                      Application Setup
         ////////////////////////////////////////////////////////////////////////////
-        auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer.address());
+        auto [program, kernel, output_cb_addr] = create_program(device, all_dram_reader_cores, all_l1_receiver_cores, single_tile_size, tile_format, num_tiles_cb, num_tiles_per_core, k, n, num_blocks, num_banks, all_dram_reader_cores_ordered, all_l1_writer_cores_ordered, bank_start_id, input_buffer->address());
 
         ////////////////////////////////////////////////////////////////////////////
         //                      Copy Input To DRAM or L1
         ////////////////////////////////////////////////////////////////////////////
-        tt_metal::detail::WriteToBuffer(input_buffer, input_vec);
+        tt_metal::detail::WriteToBuffer(*input_buffer, input_vec);
 
         ////////////////////////////////////////////////////////////////////////////
         //                      Execution Application
@@ -909,7 +909,7 @@ int main(int argc, char **argv) {
 
         pass = validation(
             device,
-            input_buffer,
+            *input_buffer,
             input_vec,
             num_cores,
             all_l1_writer_cores_ordered,

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/pgm_dispatch_perf.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/pgm_dispatch_perf.cpp
@@ -1,15 +1,18 @@
-// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2023, 2024 Tenstorrent Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
 
-// NULL kernel is not 0, subtract off overhead
-#if KERNEL_BYTES > 16
-constexpr uint32_t empty_kernel_bytes = 16;
-uint8_t data1[KERNEL_BYTES - empty_kernel_bytes] __attribute__ ((section ("l1_data_test_only"))) __attribute__((used));
+// An empty kernel is 16 bytes, so only pad above that to fake a
+// bigger kernel.
+#define EMPTY_KERNEL_BYTES 16
+#if KERNEL_BYTES > EMPTY_KERNEL_BYTES
+[[gnu::section(".text"), gnu::used]]
+static uint8_t lorem_ipsum[KERNEL_BYTES - EMPTY_KERNEL_BYTES];
 #endif
 
 #ifdef KERNEL_GLOBAL
-volatile uint32_t global = 4;
+[[gnu::section(".data"), gnu::used]]
+static uint32_t global;
 #endif
 
 #ifdef COMPILE_FOR_TRISC

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp
@@ -189,7 +189,7 @@ int main(int argc, char **argv) {
             activations_addr,
             activations_addr / 1024,
             Nt);
-        std::vector<tt_metal::Buffer> l1_buffers;
+        std::vector<std::shared_ptr<tt_metal::Buffer>> l1_buffers;
 
         int l1_buffers_size = 1;
         if (!(single_read || one_buffer_share)) {
@@ -199,8 +199,8 @@ int main(int argc, char **argv) {
         l1_buffers.reserve(l1_buffers_size);
         for (int r = 0; r < num_cores_r; ++r) {
             for (int c = 0; c < num_cores_c; ++c) {
-                l1_buffers.emplace_back(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1);
-                tt_metal::detail::WriteToBuffer(l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]);
+                l1_buffers.push_back(tt_metal::Buffer::create(device, total_tiles_size_bytes, single_tile_size, tt_metal::BufferType::L1));
+                tt_metal::detail::WriteToBuffer(*l1_buffers[r * num_cores_c + c], packed_tensors[r * num_cores_c + c]);
 
                 if (single_read || one_buffer_share)
                     break;
@@ -213,7 +213,7 @@ int main(int argc, char **argv) {
         for (int r = 0; r < num_cores_r; ++r) {
             for (int c = 0; c < num_cores_c; ++c) {
                 std::vector<uint32_t> result_vec;
-                tt_metal::detail::ReadFromBuffer(l1_buffers[r * num_cores_c + c], result_vec);
+                tt_metal::detail::ReadFromBuffer(*l1_buffers[r * num_cores_c + c], result_vec);
                 auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result_vec);
 
                 if (print_tensor) {
@@ -260,7 +260,7 @@ int main(int argc, char **argv) {
                 CoreCoord core = {(size_t)c, (size_t)r};
 
                 int l1_buffers_idx = (single_read || one_buffer_share) ? (0) : (r * num_cores_c + c);
-                auto l1_buffer_addr = l1_buffers[l1_buffers_idx].address();
+                auto l1_buffer_addr = l1_buffers[l1_buffers_idx]->address();
 
                 uint32_t l1_buffer_offset = (one_buffer_share) ? ((r * num_cores_c + c) * Nt) : (0);
 

diff --git a/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/unit_tests/allocator/test_l1_banking_allocator.cpp
@@ -35,7 +35,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) {
 
     uint64_t alloc_limit = unit_tests::test_l1_banking_allocator::get_alloc_limit(device);
 
-    std::vector<std::unique_ptr<Buffer>> buffers;
+    std::vector<std::shared_ptr<Buffer>> buffers;
     int alloc_size_idx = 0;
     uint32_t total_buffer_size = 0;
     while (total_size_bytes < alloc_limit) {
@@ -44,7 +44,7 @@ TEST_F(BasicFixture, TestL1BuffersAllocatedTopDown) {
         if (total_buffer_size + buffer_size >= alloc_limit) {
             break;
         }
-        std::unique_ptr<tt::tt_metal::Buffer> buffer = std::make_unique<tt::tt_metal::Buffer>(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1);
+        auto buffer = tt::tt_metal::Buffer::create(device, buffer_size, buffer_size, tt::tt_metal::BufferType::L1);
         buffers.emplace_back(std::move(buffer));
         total_buffer_size += buffer_size;
         EXPECT_EQ(buffers.back()->address(), device->l1_size_per_core() - total_buffer_size);