From 06713f06f9907d8aa56b1c6e814a0f1a277c84f9 Mon Sep 17 00:00:00 2001
From: Ilkoo Lee <ilkoo.lee@moreh.io>
Date: Wed, 18 Dec 2024 14:25:59 +0000
Subject: [PATCH] #16134: add test case for pre-allocated CreateBuffer

---
 tests/tt_metal/tt_metal/api/test_dram.cpp     | 66 +++++++++++++++++++
 .../unit_tests/gtests/test_async_runtime.cpp  |  2 +
 tt_metal/third_party/pybind11                 |  1 +
 3 files changed, 69 insertions(+)
 create mode 160000 tt_metal/third_party/pybind11

diff --git a/tests/tt_metal/tt_metal/api/test_dram.cpp b/tests/tt_metal/tt_metal/api/test_dram.cpp
index 293a10a5cafd..bd0e6e475bb9 100644
--- a/tests/tt_metal/tt_metal/api/test_dram.cpp
+++ b/tests/tt_metal/tt_metal/api/test_dram.cpp
@@ -122,6 +122,54 @@ bool dram_single_core(
     fixture->ReadBuffer(device, output_dram_buffer, result_vec);
     return result_vec == src_vec;
 }
+
+bool dram_single_core_pre_allocated(
+    DispatchFixture* fixture, tt_metal::Device* device, const DRAMConfig& cfg, std::vector<uint32_t> src_vec) {
+    // Create a program
+    tt_metal::Program program = CreateProgram();
+
+    tt_metal::InterleavedBufferConfig dram_config{
+        .device = device,
+        .size = cfg.dram_buffer_size,
+        .page_size = cfg.dram_buffer_size,
+        .buffer_type = tt_metal::BufferType::DRAM};
+
+    auto input_dram_buffer = tt_metal::CreateBuffer(dram_config);
+    uint32_t input_dram_buffer_addr = input_dram_buffer->address();
+    auto input_dram_pre_allocated_buffer = tt_metal::CreateBuffer(dram_config, input_dram_buffer_addr);
+    uint32_t input_dram_pre_allocated_buffer_addr = input_dram_pre_allocated_buffer->address();
+
+    TT_FATAL(input_dram_buffer_addr == input_dram_pre_allocated_buffer_addr, "Error");
+
+    auto output_dram_buffer = tt_metal::CreateBuffer(dram_config);
+    uint32_t output_dram_buffer_addr = output_dram_buffer->address();
+    auto output_dram_pre_allocated_buffer = tt_metal::CreateBuffer(dram_config, output_dram_buffer_addr);
+    uint32_t output_dram_pre_allocated_buffer_addr = output_dram_pre_allocated_buffer->address();
+
+    TT_FATAL(output_dram_buffer_addr == output_dram_pre_allocated_buffer_addr, "Error");
+
+    // Create the kernel
+    auto dram_kernel = tt_metal::CreateKernel(program, cfg.kernel_file, cfg.core_range, cfg.data_movement_cfg);
+    fixture->WriteBuffer(device, input_dram_pre_allocated_buffer, src_vec);
+
+    tt_metal::SetRuntimeArgs(
+        program,
+        dram_kernel,
+        cfg.core_range,
+        {cfg.l1_buffer_addr,
+         input_dram_pre_allocated_buffer_addr,
+         (std::uint32_t)0,
+         output_dram_pre_allocated_buffer_addr,
+         (std::uint32_t)0,
+         cfg.dram_buffer_size});
+
+    fixture->RunProgram(device, program);
+
+    std::vector<uint32_t> result_vec;
+    fixture->ReadBuffer(device, output_dram_pre_allocated_buffer, result_vec);
+
+    return result_vec == src_vec;
+}
 }  // namespace unit_tests_common::dram::test_dram
 
 TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCore) {
@@ -142,6 +190,24 @@ TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCore) {
     }
 }
 
+TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCorePreAllocated) {
+    uint32_t buffer_size = 2 * 1024 * 25;
+    std::vector<uint32_t> src_vec =
+        create_random_vector_of_bfloat16(buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count());
+    unit_tests_common::dram::test_dram::DRAMConfig dram_test_config = {
+        .core_range = {{0, 0}, {0, 0}},
+        .kernel_file = "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp",
+        .dram_buffer_size = buffer_size,
+        .l1_buffer_addr = 400 * 1024,
+        .data_movement_cfg =
+            {.processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default},
+    };
+    for (unsigned int id = 0; id < devices_.size(); id++) {
+        ASSERT_TRUE(unit_tests_common::dram::test_dram::dram_single_core_pre_allocated(
+            this, devices_.at(id), dram_test_config, src_vec));
+    }
+}
+
 TEST_F(DispatchFixture, TensixDRAMLoopbackSingleCoreDB) {
     if (!this->IsSlowDispatch()) {
         tt::log_info(tt::LogTest, "This test is only supported in slow dispatch mode");
diff --git a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp
index b5495a324dbb..7474f324ccab 100644
--- a/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp
+++ b/tests/ttnn/unit_tests/gtests/test_async_runtime.cpp
@@ -74,11 +74,13 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) {
     ttnn::record_event(device_->command_queue(io_cq), write_event);
     // Host stalls until write is completed, before sending workload
     ttnn::event_synchronize(write_event);
+    EXPECT_EQ(ttnn::event_query(write_event), true);
     // Dispatch workload. Preallocated output_tensor is populated by op/
     ttnn::moreh_sum(input_tensor, /*dim*/ 3, false, output_tensor, std::nullopt, std::nullopt);
     // Record completion of workload
     ttnn::record_event(device_->command_queue(workload_dispatch_cq), workload_event);
     ttnn::event_synchronize(workload_event);
+    EXPECT_EQ(ttnn::event_query(workload_event), true);
     // Read output back, once workload is complete
     ttnn::read_buffer(io_cq, output_tensor, {readback_data});
     // Ensure that reference count book keeping is done correctly
diff --git a/tt_metal/third_party/pybind11 b/tt_metal/third_party/pybind11
new file mode 160000
index 000000000000..b8f28551cc3a
--- /dev/null
+++ b/tt_metal/third_party/pybind11
@@ -0,0 +1 @@
+Subproject commit b8f28551cc3a98ea9fbfc15c05b513c8f2d23e84