From a9a992d3b278857be55d4679b4aa12d4c21af065 Mon Sep 17 00:00:00 2001 From: Eyon Land <41128502+eyonland@users.noreply.github.com> Date: Mon, 25 Nov 2024 14:01:06 -0600 Subject: [PATCH] =?UTF-8?q?#7493:=20Accidently=20added=20two=20tests=20tha?= =?UTF-8?q?t=20should=20have=20been=20deleted=20durin=E2=80=A6=20(#15431)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Ticket #7493 ### Problem description Accidently added two tests that should have been deleted during a rebase. ### What's changed Deleted tests ### Checklist - [ ] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- .../command_queue/test_HostAsyncCQ.cpp | 348 ------------------ .../command_queue/test_EnqueueTrace.cpp | 241 ------------ 2 files changed, 589 deletions(-) delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp delete mode 100644 tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp deleted file mode 100644 index 6ebe8f64da2..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_HostAsyncCQ.cpp +++ /dev/null @@ -1,348 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "command_queue_fixture.hpp" -#include "command_queue_test_utils.hpp" -#include "gtest/gtest.h" -#include "impl/buffers/buffer.hpp" -#include "tt_metal/common/bfloat16.hpp" -#include "tt_metal/common/scoped_timer.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/detail/util.hpp" -#include "tt_metal/detail/tt_metal.hpp" -#include "tt_metal/impl/device/device.hpp" -#include "tt_metal/impl/dispatch/command_queue.hpp" -#include "tt_metal/impl/buffers/circular_buffer.hpp" - -using std::vector; -using namespace tt::tt_metal; - -namespace host_cq_test_utils { -// Utility functions for Async Queue Flatten stress test -// Add more utils here for testing other ops/workloads -uint32_t prod(vector &shape) { - uint32_t shape_prod = 1; - - for (uint32_t shape_i: shape) { - shape_prod *= shape_i; - } - - return shape_prod; -} - -inline std::vector gold_standard_flatten(std::vector src_vec, vector shape) { - - int numel_in_tensor = prod(shape) / 2; - int idx = 0; - std::vector expected_dst_vec; - - uint32_t num_tile_rows = shape.at(shape.size() - 2) / 32; - uint32_t num_tile_cols = shape.at(shape.size() - 1) / 32; - - uint32_t start_dram_addr_offset_for_tensor_row = 0; - - for (int i = 0; i < num_tile_rows; i++) { - for (uint32_t j = 0; j < 32; j++) { - uint32_t src_addr_ = start_dram_addr_offset_for_tensor_row; - for (uint32_t k = 0; k < num_tile_cols; k++) { - - // Copy a row - for (uint32_t l = 0; l < 16; l++) { - uint32_t src_addr = src_addr_ + l; - expected_dst_vec.push_back(src_vec.at(src_addr_ + l)); - } - - // Zero padding - for (uint32_t l = 0; l < 31 * 16; l++) { - expected_dst_vec.push_back(0); - } - src_addr_ += 32 * 16; - } - start_dram_addr_offset_for_tensor_row += 16; - } - start_dram_addr_offset_for_tensor_row += num_tile_cols * 16; - } - - TT_FATAL(expected_dst_vec.size() == (num_tile_rows * 32) * (num_tile_cols * 16) * 32, "Error"); - return expected_dst_vec; -} - -bool flatten(Device *device, uint32_t num_tiles_r = 5, uint32_t num_tiles_c = 5) { - // Test Simulating Program Caching with Async Command Queues - bool pass = true; - // Create a program used across all loops - Program program = CreateProgram(); - - CoreCoord core = {0, 0}; - - uint32_t single_tile_size = 2 * 1024; - - uint32_t num_tiles = num_tiles_r * num_tiles_c; - uint32_t num_bytes_per_tensor_row = num_tiles_c * 64; - uint32_t num_bytes_per_tile = num_tiles * single_tile_size; - - uint32_t dram_buffer_size = single_tile_size * num_tiles * 32; - - - InterleavedBufferConfig dram_config{ - .device=device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = BufferType::DRAM - }; - uint32_t src0_cb_index = tt::CBIndex::c_0; - uint32_t num_input_tiles = 8; - CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(src0_cb_index, single_tile_size); - auto cb_src0 = CreateCircularBuffer(program, core, cb_src0_config); - - uint32_t ouput_cb_index = tt::CBIndex::c_16; - uint32_t num_output_tiles = 1; - CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(ouput_cb_index, single_tile_size); - auto cb_output = CreateCircularBuffer(program, core, cb_output_config); - - auto flatten_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp", - core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - - auto unary_writer_kernel = CreateKernel( - program, - "tt_metal/kernels/dataflow/writer_unary.cpp", - core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - vector compute_kernel_args = { - num_tiles * 32 - }; - - auto eltwise_unary_kernel = CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy.cpp", - core, - ComputeConfig{.compile_args = compute_kernel_args} - ); - - // Inside the loop, run async runtime functions - for (int i = 0; i < 1000; i++) { - // Create Device Buffers Asynchronously - auto src_dram_buffer = CreateBuffer(dram_config); - auto dst_dram_buffer = CreateBuffer(dram_config); - - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // Create the source vector - std::shared_ptr> src_vec = std::make_shared>(create_random_vector_of_bfloat16( - dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count())); - - std::vector golden = gold_standard_flatten(*src_vec, {num_tiles_r * 32, num_tiles_c * 32}); - // Set the runtime args asynchronously - std::shared_ptr writer_runtime_args = std::make_shared(); - std::shared_ptr compute_runtime_args = std::make_shared(); - *compute_runtime_args = { - src_dram_buffer.get(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles_r, - num_tiles_c, - num_bytes_per_tensor_row - }; - *writer_runtime_args = { - dst_dram_buffer.get(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tiles * 32 - }; - - SetRuntimeArgs( - device, - detail::GetKernel(program, flatten_kernel), - core, - compute_runtime_args); - - SetRuntimeArgs( - device, - detail::GetKernel(program, unary_writer_kernel), - core, - writer_runtime_args); - // Async write input - EnqueueWriteBuffer(device->command_queue(), src_dram_buffer, src_vec, false); - // Share ownership of buffer with program - AssignGlobalBufferToProgram(src_dram_buffer, program); - // Main thread gives up ownership of buffer and src data (this is what python does) - src_dram_buffer.reset(); - src_vec.reset(); - // Queue up program - EnqueueProgram(device->command_queue(), program, false); - // Blocking read - std::vector result_vec; - EnqueueReadBuffer(device->command_queue(), dst_dram_buffer, result_vec, true); - - // Validation of data - TT_FATAL(golden.size() == result_vec.size(), "Error"); - pass &= (golden == result_vec); - - if (not pass) { - std::cout << "GOLDEN" << std::endl; - print_vec_of_uint32_as_packed_bfloat16(golden, num_tiles * 32); - - std::cout << "RESULT" << std::endl; - print_vec_of_uint32_as_packed_bfloat16(result_vec, num_tiles * 32); - } - } - return pass; -} -} - -namespace host_command_queue_tests { - -TEST_F(CommandQueueFixture, TestAsyncCommandQueueSanityAndProfile) { - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; - - CoreRange cr({0, 0}, {0, 0}); - CoreRangeSet cr_set({cr}); - // Add an NCRISC blank manually, but in compile program, the BRISC blank will be - // added separately - auto dummy_reader_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/arbiter_hang.cpp", cr_set, DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - // Use scoper timer to benchmark time for pushing 2 commands - { - tt::ScopedTimer timer("AsyncCommandQueue"); - EnqueueProgram(command_queue, program, false); - Finish(command_queue); - } - command_queue.set_mode(current_mode); -} - -TEST_F(CommandQueueFixture, DISABLED_TestAsyncBufferRW) { - // Test Async Enqueue Read and Write + Get Addr + Buffer Allocation and Deallocation - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; - for (int j = 0; j < 10; j++) { - // Asynchronously initialize a buffer on device - uint32_t first_buf_value = j + 1; - uint32_t second_buf_value = j + 2; - uint32_t first_buf_size = 4096; - uint32_t second_buf_size = 2048; - // Asynchronously allocate buffer on device - std::shared_ptr buffer = Buffer::create(this->device_, first_buf_size, first_buf_size, BufferType::DRAM); - std::shared_ptr allocated_buffer_address = std::make_shared(); - EnqueueGetBufferAddr(this->device_->command_queue(), allocated_buffer_address.get(), buffer.get(), true); - // Ensure returned addr is correct - EXPECT_EQ((*allocated_buffer_address), buffer->address()); - - std::shared_ptr> vec = std::make_shared>(first_buf_size / 4, first_buf_value); - std::vector readback_vec = {}; - // Write first vector to existing on device buffer. - EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); - // Reallocate the vector in the main thread after asynchronously pushing it (ensure that worker still has access to this data) - vec = std::make_shared>(second_buf_size / 4, second_buf_value); - // Simulate what tt-eager does: Share buffer ownership with program - AssignGlobalBufferToProgram(buffer, program); - // Reallocate buffer (this is safe, since the program also owns the existing buffer, which will not be deallocated) - buffer = Buffer::create(this->device_, second_buf_size, second_buf_size, BufferType::DRAM); - // Write second vector to second buffer - EnqueueWriteBuffer(this->device_->command_queue(), buffer, vec, false); - // Have main thread give up ownership immediately after writing - vec.reset(); - // Read both buffer and ensure data is correct - EnqueueReadBuffer(this->device_->command_queue(), buffer, readback_vec, true); - for (int i = 0; i < readback_vec.size(); i++) { - EXPECT_EQ(readback_vec[i], second_buf_value); - } - } - command_queue.set_mode(current_mode); -} - -TEST_F(CommandQueueFixture, DISABLED_TestAsyncCBAllocation) { - // Test asynchronous allocation of buffers and their assignment to CBs - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; - - const uint32_t num_pages = 1; - const uint32_t page_size = detail::TileSize(tt::DataFormat::Float16_b); - const tt::DataFormat data_format = tt::DataFormat::Float16_b; - - auto buffer_size = page_size; - tt::tt_metal::InterleavedBufferConfig buff_config{ - .device=this->device_, - .size = buffer_size, - .page_size = buffer_size, - .buffer_type = tt::tt_metal::BufferType::L1 - }; - // Asynchronously allocate an L1 Buffer - auto l1_buffer = CreateBuffer(buff_config); - CoreRange cr({0, 0}, {0, 2}); - CoreRangeSet cr_set({cr}); - std::vector buffer_indices = {16, 24}; - - CircularBufferConfig config1 = CircularBufferConfig(page_size, {{buffer_indices[0], data_format}, {buffer_indices[1], data_format}}, *l1_buffer) - .set_page_size(buffer_indices[0], page_size) - .set_page_size(buffer_indices[1], page_size); - // Asynchronously assign the L1 Buffer to the CB - auto multi_core_cb = CreateCircularBuffer(program, cr_set, config1); - auto cb_ptr = detail::GetCircularBuffer(program, multi_core_cb); - Finish(this->device_->command_queue()); - // Addresses should match - EXPECT_EQ(cb_ptr->address(), l1_buffer->address()); - // Asynchronously allocate a new L1 buffer - auto l1_buffer_2 = CreateBuffer(buff_config); - // Asynchronously update CB address to match new L1 buffer - UpdateDynamicCircularBufferAddress(program, multi_core_cb, *l1_buffer_2); - Finish(this->device_->command_queue()); - // Addresses should match - EXPECT_EQ(cb_ptr->address(), l1_buffer_2->address()); - command_queue.set_mode(current_mode); -} - -TEST_F(CommandQueueFixture, DISABLED_TestAsyncAssertForDeprecatedAPI) { - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - Program program; - CoreCoord core = {0, 0}; - uint32_t buf_size = 4096; - uint32_t page_size = 4096; - auto dummy_kernel = CreateKernel( - program, - "tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp", - core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - auto src0 = Buffer::create(this->device_, buf_size, page_size, BufferType::DRAM); - std::vector runtime_args = {src0->address()}; - try { - SetRuntimeArgs(program, dummy_kernel, core, runtime_args); - } - catch (std::runtime_error &e) { - std::string expected = "This variant of SetRuntimeArgs can only be called when Asynchronous SW Command Queues are disabled for Fast Dispatch."; - const string error = string(e.what()); - EXPECT_TRUE(error.find(expected) != std::string::npos); - } - command_queue.set_mode(current_mode); -} - -TEST_F(CommandQueueFixture, DISABLED_TestAsyncFlattenStress){ - auto& command_queue = this->device_->command_queue(); - auto current_mode = CommandQueue::default_mode(); - command_queue.set_mode(CommandQueue::CommandQueueMode::ASYNC); - uint32_t num_tiles_r = 2; - uint32_t num_tiles_c = 2; - if (!getenv("TT_METAL_SLOW_DISPATCH_MODE")){ - num_tiles_r = 1; - num_tiles_c = 1; - } - ASSERT_TRUE(host_cq_test_utils::flatten(this->device_, num_tiles_r, num_tiles_c)); - command_queue.set_mode(current_mode); -} -} diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp deleted file mode 100644 index eb2894cdf75..00000000000 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueTrace.cpp +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "command_queue_fixture.hpp" -#include "gtest/gtest.h" -#include "tt_metal/common/scoped_timer.hpp" -#include "tt_metal/host_api.hpp" -#include "tt_metal/impl/device/device.hpp" - -using std::vector; -using namespace tt::tt_metal; - -Program create_simple_unary_program(const Buffer& input, const Buffer& output) { - Program program = CreateProgram(); - - CoreCoord worker = {0, 0}; - auto reader_kernel = CreateKernel( - program, - "tt_metal/kernels/dataflow/reader_unary.cpp", - worker, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - - auto writer_kernel = CreateKernel( - program, - "tt_metal/kernels/dataflow/writer_unary.cpp", - worker, - DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - auto sfpu_kernel = CreateKernel( - program, - "tt_metal/kernels/compute/eltwise_sfpu.cpp", - worker, - ComputeConfig{ - .math_approx_mode = true, - .compile_args = {1, 1}, - .defines = {{"SFPU_OP_EXP_INCLUDE", "1"}, {"SFPU_OP_CHAIN_0", "exp_tile_init(); exp_tile(0);"}}}); - - CircularBufferConfig input_cb_config = CircularBufferConfig(2048, {{tt::CBIndex::c_0, tt::DataFormat::Float16_b}}) - .set_page_size(tt::CBIndex::c_0, 2048); - - CoreRange core_range({0, 0}); - CreateCircularBuffer(program, core_range, input_cb_config); - vector writer_rt_args = { - output.address(), - (uint32_t)output.noc_coordinates().x, - (uint32_t)output.noc_coordinates().y, - output.num_pages() - }; - SetRuntimeArgs(program, writer_kernel, worker, writer_rt_args); - - CircularBufferConfig output_cb_config = CircularBufferConfig(2048, {{tt::CBIndex::c_16, tt::DataFormat::Float16_b}}) - .set_page_size(tt::CBIndex::c_16, 2048); - - CreateCircularBuffer(program, core_range, output_cb_config); - vector reader_rt_args = { - input.address(), - (uint32_t)input.noc_coordinates().x, - (uint32_t)input.noc_coordinates().y, - input.num_pages() - }; - SetRuntimeArgs(program, reader_kernel, worker, reader_rt_args); - - return program; -} - -// All basic trace tests just assert that the replayed result exactly matches -// the eager mode results -namespace basic_tests { - -TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTrace) { - Setup(2048, 2); - auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - - CommandQueue& command_queue = this->device_->command_queue(0); - CommandQueue& data_movement_queue = this->device_->command_queue(1); - - Program simple_program = create_simple_unary_program(*input, *output); - vector input_data(input->size() / sizeof(uint32_t), 0); - for (uint32_t i = 0; i < input_data.size(); i++) { - input_data[i] = i; - } - - // Eager mode - vector eager_output_data; - eager_output_data.resize(input_data.size()); - - EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); - EnqueueProgram(command_queue, simple_program, true); - EnqueueReadBuffer(data_movement_queue, output, eager_output_data.data(), true); - - // Trace mode - vector trace_output_data; - trace_output_data.resize(input_data.size()); - - EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); - - uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); - EnqueueProgram(command_queue, simple_program, false); - EndTraceCapture(this->device_, command_queue.id(), tid); - - EnqueueTrace(command_queue, tid, true); - EnqueueReadBuffer(data_movement_queue, *output, trace_output_data.data(), true); - EXPECT_TRUE(eager_output_data == trace_output_data); - - // Done - Finish(command_queue); - ReleaseTrace(this->device_, tid); -} - -TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceLoops) { - Setup(4096, 2); - auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - - CommandQueue& command_queue = this->device_->command_queue(0); - CommandQueue& data_movement_queue = this->device_->command_queue(1); - - Program simple_program = create_simple_unary_program(*input, *output); - vector input_data(input->size() / sizeof(uint32_t), 0); - for (uint32_t i = 0; i < input_data.size(); i++) { - input_data[i] = i; - } - - // Trace mode output - uint32_t num_loops = 10; - vector> trace_outputs; - - for (auto i = 0; i < num_loops; i++) { - trace_outputs.push_back({}); - trace_outputs[i].resize(input_data.size()); - } - - // Compile - EnqueueProgram(command_queue, simple_program, true); - - // Trace mode execution - uint32_t trace_id = 0; - bool trace_captured = false; - for (auto i = 0; i < num_loops; i++) { - EnqueueWriteBuffer(data_movement_queue, *input, input_data.data(), true); - - if (not trace_captured) { - trace_id = BeginTraceCapture(this->device_, command_queue.id()); - EnqueueProgram(command_queue, simple_program, false); - EndTraceCapture(this->device_, command_queue.id(), trace_id); - trace_captured = true; - } - - EnqueueTrace(command_queue, trace_id, false); - EnqueueReadBuffer(data_movement_queue, *output, trace_outputs[i].data(), true); - - // Expect same output across all loops - EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); - } - - // Done - Finish(command_queue); - ReleaseTrace(this->device_, trace_id); -} - -TEST_F(SingleDeviceTraceFixture, EnqueueOneProgramTraceBenchmark) { - Setup(6144, 2); - auto input = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - auto output = Buffer::create(this->device_, 2048, 2048, BufferType::DRAM); - - constexpr bool kBlocking = true; - constexpr bool kNonBlocking = false; - vector blocking_flags = {kBlocking, kNonBlocking}; - - // Single Q for data and commands - // Keep this queue in passthrough mode for now - CommandQueue& command_queue = this->device_->command_queue(0); - - auto simple_program = create_simple_unary_program(*input, *output); - vector input_data(input->size() / sizeof(uint32_t), 0); - for (uint32_t i = 0; i < input_data.size(); i++) { - input_data[i] = i; - } - - // Trace mode output - uint32_t num_loops = 10; - vector> trace_outputs; - - for (auto i = 0; i < num_loops; i++) { - trace_outputs.push_back({}); - trace_outputs[i].resize(input_data.size()); - } - - // Eager mode - vector expected_output_data; - vector eager_output_data; - expected_output_data.resize(input_data.size()); - eager_output_data.resize(input_data.size()); - - // Warm up and use the eager blocking run as the expected output - EnqueueWriteBuffer(command_queue, *input, input_data.data(), kBlocking); - EnqueueProgram(command_queue, simple_program, kBlocking); - EnqueueReadBuffer(command_queue, *output, expected_output_data.data(), kBlocking); - Finish(command_queue); - - for (bool blocking : blocking_flags) { - std::string mode = blocking ? "Eager-B" : "Eager-NB"; - for (auto i = 0; i < num_loops; i++) { - tt::ScopedTimer timer(mode + " loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, *input, input_data.data(), blocking); - EnqueueProgram(command_queue, simple_program, blocking); - EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), blocking); - } - if (not blocking) { - // (Optional) wait for the last non-blocking command to finish - Finish(command_queue); - } - EXPECT_TRUE(eager_output_data == expected_output_data); - } - - // Capture trace on a trace queue - uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); - EnqueueProgram(command_queue, simple_program, false); - EndTraceCapture(this->device_, command_queue.id(), tid); - - // Trace mode execution - for (auto i = 0; i < num_loops; i++) { - tt::ScopedTimer timer("Trace loop " + std::to_string(i)); - EnqueueWriteBuffer(command_queue, *input, input_data.data(), kNonBlocking); - EnqueueTrace(command_queue, tid, kNonBlocking); - EnqueueReadBuffer(command_queue, *output, trace_outputs[i].data(), kNonBlocking); - } - Finish(command_queue); - - // Expect same output across all loops - for (auto i = 0; i < num_loops; i++) { - EXPECT_TRUE(trace_outputs[i] == trace_outputs[0]); - } - ReleaseTrace(this->device_, tid); -} - -} // end namespace basic_tests