diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp index fa4d83593e8..4c23e804277 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp @@ -22,54 +22,50 @@ TEST_F(CommandQueueFixture, TestDataMovementEventsWrittenToCompletionQueueInOrde uint32_t page_size = 2048; vector page(page_size / sizeof(uint32_t)); uint32_t expected_event_id = 0; - - auto current_mode = CommandQueue::default_mode(); uint32_t last_read_address = 0; - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH, CommandQueue::CommandQueueMode::ASYNC}) { - for (const DataMovementMode data_movement_mode: {DataMovementMode::READ, DataMovementMode::WRITE}) { - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - this->device_->command_queue().set_mode(mode); - auto start = std::chrono::system_clock::now(); - - uint32_t completion_queue_base = this->device_->sysmem_manager().get_completion_queue_read_ptr(0); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_->id()); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); - constexpr uint32_t completion_queue_event_alignment = 32; - - vector> buffers; - for (size_t i = 0; i < num_buffers; i++) { - buffers.push_back(std::make_shared(this->device_, page_size, page_size, BufferType::DRAM)); - if (data_movement_mode == DataMovementMode::WRITE) { - EnqueueWriteBuffer(this->device_->command_queue(), buffers.back(), page, false); - } else if (data_movement_mode == DataMovementMode::READ) { - EnqueueReadBuffer(this->device_->command_queue(), buffers.back(), page, false); - } - } - Finish(this->device_->command_queue()); + for (const DataMovementMode data_movement_mode: {DataMovementMode::READ, DataMovementMode::WRITE}) { + + auto start = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); + uint32_t completion_queue_base = this->device_->sysmem_manager().get_completion_queue_read_ptr(0); + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_->id()); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); + constexpr uint32_t completion_queue_event_alignment = 32; + + vector> buffers; + for (size_t i = 0; i < num_buffers; i++) { + buffers.push_back(std::make_shared(this->device_, page_size, page_size, BufferType::DRAM)); - // Read completion queue and ensure we see events 0-99 inclusive in order - uint32_t event; if (data_movement_mode == DataMovementMode::WRITE) { - for (size_t i = 0; i < num_buffers; i++) { - uint32_t host_addr = last_read_address + i * completion_queue_event_alignment; - tt::Cluster::instance().read_sysmem(&event, 4, host_addr, mmio_device_id, channel); - EXPECT_EQ(event, expected_event_id++); - } + EnqueueWriteBuffer(this->device_->command_queue(), buffers.back(), page, false); } else if (data_movement_mode == DataMovementMode::READ) { - for (size_t i = 0; i < num_buffers; i++) { - uint32_t host_addr = completion_queue_base + (i * (completion_queue_event_alignment + page_size)); - tt::Cluster::instance().read_sysmem(&event, 4, host_addr, mmio_device_id, channel); - EXPECT_EQ(event, expected_event_id++); - last_read_address = host_addr + completion_queue_event_alignment + page_size; - } + EnqueueReadBuffer(this->device_->command_queue(), buffers.back(), page, false); + } + } + Finish(this->device_->command_queue()); + + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); + + // Read completion queue and ensure we see events 0-99 inclusive in order + uint32_t event; + if (data_movement_mode == DataMovementMode::WRITE) { + for (size_t i = 0; i < num_buffers; i++) { + uint32_t host_addr = last_read_address + i * completion_queue_event_alignment; + tt::Cluster::instance().read_sysmem(&event, 4, host_addr, mmio_device_id, channel); + EXPECT_EQ(event, expected_event_id++); + } + } else if (data_movement_mode == DataMovementMode::READ) { + for (size_t i = 0; i < num_buffers; i++) { + uint32_t host_addr = completion_queue_base + (i * (completion_queue_event_alignment + page_size)); + tt::Cluster::instance().read_sysmem(&event, 4, host_addr, mmio_device_id, channel); + EXPECT_EQ(event, expected_event_id++); + last_read_address = host_addr + completion_queue_event_alignment + page_size; } } } - this->device_->command_queue().set_mode(current_mode); + } // Basic test, record events, check that Event struct was updated. Enough commands to trigger issue queue wrap. @@ -78,29 +74,19 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventIssueQueueWrap) { const size_t num_events = 100000; // Enough to wrap issue queue. 768MB and cmds are 22KB each, so 35k cmds. uint32_t cmds_issued_per_cq = 0; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH, CommandQueue::CommandQueueMode::ASYNC}) { - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - this->device_->command_queue().set_mode(mode); - auto start = std::chrono::system_clock::now(); - - for (size_t i = 0; i < num_events; i++) { - auto event = std::make_shared(); // type is std::shared_ptr - EnqueueRecordEvent(this->device_->command_queue(), event); - - if (mode == CommandQueue::CommandQueueMode::ASYNC) { - event->wait_until_ready(); // To check Event fields from host, must block until async cq populated event. - } - EXPECT_EQ(event->event_id, cmds_issued_per_cq); - EXPECT_EQ(event->cq_id, this->device_->command_queue().id()); - cmds_issued_per_cq++; - } - Finish(this->device_->command_queue()); + auto start = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); + for (size_t i = 0; i < num_events; i++) { + auto event = std::make_shared(); // type is std::shared_ptr + EnqueueRecordEvent(this->device_->command_queue(), event); + EXPECT_EQ(event->event_id, cmds_issued_per_cq); + EXPECT_EQ(event->cq_id, this->device_->command_queue().id()); + cmds_issued_per_cq++; } - this->device_->command_queue().set_mode(current_mode); + Finish(this->device_->command_queue()); + + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); } // Test where Host synchronously waits for event to be completed. @@ -108,37 +94,32 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronize) { const size_t num_events = 100; const size_t num_events_between_sync = 10; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH, CommandQueue::CommandQueueMode::ASYNC}) { - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); - this->device_->command_queue().set_mode(mode); + auto start = std::chrono::system_clock::now(); - std::vector> sync_events; + std::vector> sync_events; - // A bunch of events recorded, occasionally will sync from host. - for (size_t i = 0; i < num_events; i++) { - auto event = sync_events.emplace_back(std::make_shared()); - EnqueueRecordEvent(this->device_->command_queue(), event); + // A bunch of events recorded, occasionally will sync from host. + for (size_t i = 0; i < num_events; i++) { + auto event = sync_events.emplace_back(std::make_shared()); + EnqueueRecordEvent(this->device_->command_queue(), event); - // Host synchronize every N number of events. - if (i > 0 && ((i % num_events_between_sync) == 0)) { - EventSynchronize(event); - } + // Host synchronize every N number of events. + if (i > 0 && ((i % num_events_between_sync) == 0)) { + EventSynchronize(event); } + } - // A bunch of bonus syncs where event_id is mod on earlier ID's. - EventSynchronize(sync_events.at(2)); - EventSynchronize(sync_events.at(sync_events.size() - 2)); - EventSynchronize(sync_events.at(5)); + // A bunch of bonus syncs where event_id is mod on earlier ID's. + EventSynchronize(sync_events.at(2)); + EventSynchronize(sync_events.at(sync_events.size() - 2)); + EventSynchronize(sync_events.at(5)); - Finish(this->device_->command_queue()); + Finish(this->device_->command_queue()); - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); - } - this->device_->command_queue().set_mode(current_mode); + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); } + // Negative test. Host syncing on a future event that isn't actually issued. // Ensure that expected hang is seen, which indicates event sync feature is working properly. TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { @@ -199,35 +180,29 @@ TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) { const size_t num_events = 50; const size_t num_events_between_sync = 5; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH, CommandQueue::CommandQueueMode::ASYNC}) { - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); - this->device_->command_queue().set_mode(mode); - std::vector> sync_events; - - // A bunch of events recorded, occasionally will sync from device. - for (size_t i = 0; i < num_events; i++) { - auto event = sync_events.emplace_back(std::make_shared()); - EnqueueRecordEvent(this->device_->command_queue(), event); - - // Device synchronize every N number of events. - if (i > 0 && ((i % num_events_between_sync) == 0)) { - log_debug(tt::LogTest, "Going to WaitForEvent for i: {}", i); - EnqueueWaitForEvent(this->device_->command_queue(), event); - } - } + auto start = std::chrono::system_clock::now(); + std::vector> sync_events; - // A bunch of bonus syncs where event_id is mod on earlier ID's. - EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(0)); - EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(sync_events.size() - 5)); - EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(4)); - Finish(this->device_->command_queue()); + // A bunch of events recorded, occasionally will sync from device. + for (size_t i = 0; i < num_events; i++) { + auto event = sync_events.emplace_back(std::make_shared()); + EnqueueRecordEvent(this->device_->command_queue(), event); - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); + // Device synchronize every N number of events. + if (i > 0 && ((i % num_events_between_sync) == 0)) { + log_debug(tt::LogTest, "Going to WaitForEvent for i: {}", i); + EnqueueWaitForEvent(this->device_->command_queue(), event); + } } - this->device_->command_queue().set_mode(current_mode); + + // A bunch of bonus syncs where event_id is mod on earlier ID's. + EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(0)); + EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(sync_events.size() - 5)); + EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(4)); + Finish(this->device_->command_queue()); + + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); } // Mix of WritesBuffers, RecordEvent, WaitForEvent, EventSynchronize with some checking. @@ -239,54 +214,39 @@ TEST_F(CommandQueueFixture, TestEventsMixedWriteBufferRecordWaitSynchronize) { const uint32_t num_cmds_per_cq = 3; // Record, Write, Wait uint32_t expected_event_id = 0; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH, CommandQueue::CommandQueueMode::ASYNC}) { - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); - this->device_->command_queue().set_mode(mode); - - uint32_t completion_queue_base = this->device_->sysmem_manager().get_completion_queue_read_ptr(0); - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_->id()); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); - constexpr uint32_t completion_queue_event_alignment = 32; - for (size_t i = 0; i < num_buffers; i++) { - - log_debug(tt::LogTest, "Mode: {} i: {} - Going to record event, write, wait, synchronize.", mode, i); - auto event = std::make_shared(); // type is std::shared_ptr - EnqueueRecordEvent(this->device_->command_queue(), event); + auto start = std::chrono::system_clock::now(); - // Cannot count on event being populated with async cq, so only check with passthrough. - if (mode == CommandQueue::CommandQueueMode::PASSTHROUGH) { - EXPECT_EQ(event->cq_id, this->device_->command_queue().id()); - EXPECT_EQ(event->event_id, cmds_issued_per_cq); - } + uint32_t completion_queue_base = this->device_->sysmem_manager().get_completion_queue_read_ptr(0); + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_->id()); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); + constexpr uint32_t completion_queue_event_alignment = 32; + for (size_t i = 0; i < num_buffers; i++) { - std::shared_ptr buf = std::make_shared(this->device_, page_size, page_size, BufferType::DRAM); - EnqueueWriteBuffer(this->device_->command_queue(), buf, page, false); - EnqueueWaitForEvent(this->device_->command_queue(), event); + log_debug(tt::LogTest, "i: {} - Going to record event, write, wait, synchronize.", i); + auto event = std::make_shared(); // type is std::shared_ptr + EnqueueRecordEvent(this->device_->command_queue(), event); + EXPECT_EQ(event->cq_id, this->device_->command_queue().id()); + EXPECT_EQ(event->event_id, cmds_issued_per_cq); - if (i % 10 == 0) { - EventSynchronize(event); - // For async, can verify event fields here since previous function already called wait-until-ready. - if (mode == CommandQueue::CommandQueueMode::ASYNC) { - EXPECT_EQ(event->cq_id, this->device_->command_queue().id()); - EXPECT_EQ(event->event_id, cmds_issued_per_cq); - } - } - cmds_issued_per_cq += num_cmds_per_cq; - } - Finish(this->device_->command_queue()); + std::shared_ptr buf = std::make_shared(this->device_, page_size, page_size, BufferType::DRAM); + EnqueueWriteBuffer(this->device_->command_queue(), buf, page, false); + EnqueueWaitForEvent(this->device_->command_queue(), event); - // Read completion queue and ensure we see expected event IDs - uint32_t event_id; - for (size_t i = 0; i < num_buffers * num_cmds_per_cq; i++) { - uint32_t host_addr = completion_queue_base + i * completion_queue_event_alignment; - tt::Cluster::instance().read_sysmem(&event_id, 4, host_addr, mmio_device_id, channel); - EXPECT_EQ(event_id, expected_event_id++); + if (i % 10 == 0) { + EventSynchronize(event); } + cmds_issued_per_cq += num_cmds_per_cq; + } + Finish(this->device_->command_queue()); - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); + // Read completion queue and ensure we see expected event IDs + uint32_t event_id; + for (size_t i = 0; i < num_buffers * num_cmds_per_cq; i++) { + uint32_t host_addr = completion_queue_base + i * completion_queue_event_alignment; + tt::Cluster::instance().read_sysmem(&event_id, 4, host_addr, mmio_device_id, channel); + EXPECT_EQ(event_id, expected_event_id++); } - this->device_->command_queue().set_mode(current_mode); + + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); } diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp index e7e9403377e..49f1aade887 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue/command_queue/test_EnqueueWaitForEvent.cpp @@ -22,11 +22,6 @@ void FinishAllCqs(vector>& cqs) { } } -void SetAllCqsMode(vector>& cqs, CommandQueue::CommandQueueMode mode) { - for (uint i = 0; i < cqs.size(); i++) { - cqs[i].get().set_mode(mode); - } -} } @@ -40,45 +35,37 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEventSynchronizeSanity) { TT_ASSERT(cqs.size() == 2); const int num_cmds_per_cq = 1; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH, CommandQueue::CommandQueueMode::ASYNC}) { - local_test_functions::SetAllCqsMode(cqs, mode); - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); - - std::unordered_map>> sync_events; - const size_t num_events = 10; + auto start = std::chrono::system_clock::now(); + std::unordered_map>> sync_events; + const size_t num_events = 10; - for (size_t j = 0; j < num_events; j++) { - for (uint i = 0; i < cqs.size(); i++) { - log_debug(tt::LogTest, "Mode: {} j : {} Recording and Host Syncing on event for CQ ID: {}", mode, j, cqs[i].get().id()); - auto event = sync_events[i].emplace_back(std::make_shared()); - EnqueueRecordEvent(cqs[i], event); - EventSynchronize(event); - // Can check events fields after prev sync w/ async CQ. - EXPECT_EQ(event->cq_id, cqs[i].get().id()); - EXPECT_EQ(event->event_id, cmds_issued_per_cq[i]); - cmds_issued_per_cq[i] += num_cmds_per_cq; - } + for (size_t j = 0; j < num_events; j++) { + for (uint i = 0; i < cqs.size(); i++) { + log_debug(tt::LogTest, "j : {} Recording and Host Syncing on event for CQ ID: {}", j, cqs[i].get().id()); + auto event = sync_events[i].emplace_back(std::make_shared()); + EnqueueRecordEvent(cqs[i], event); + EventSynchronize(event); + // Can check events fields after prev sync w/ async CQ. + EXPECT_EQ(event->cq_id, cqs[i].get().id()); + EXPECT_EQ(event->event_id, cmds_issued_per_cq[i]); + cmds_issued_per_cq[i] += num_cmds_per_cq; } + } - // Sync on earlier events again per CQ just to show it works. - for (uint i = 0; i < cqs.size(); i++) { - for (size_t j = 0; j < num_events; j++) { - EventSynchronize(sync_events.at(i)[j]); - } + // Sync on earlier events again per CQ just to show it works. + for (uint i = 0; i < cqs.size(); i++) { + for (size_t j = 0; j < num_events; j++) { + EventSynchronize(sync_events.at(i)[j]); } + } - local_test_functions::FinishAllCqs(cqs); - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); + local_test_functions::FinishAllCqs(cqs); + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); - } - local_test_functions::SetAllCqsMode(cqs, current_mode); } -// Simplest test to record and wait-for-events on same CQ. Only check event struct members in passthrough mode to not add any extra -// sync/delay via wait_until_ready(). +// Simplest test to record and wait-for-events on same CQ. TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventSanity) { vector> cqs = {this->device_->command_queue(0), this->device_->command_queue(1)}; vector cmds_issued_per_cq = {0, 0}; @@ -87,34 +74,22 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventSanity TT_ASSERT(cqs.size() == 2); const int num_cmds_per_cq = 2; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH, CommandQueue::CommandQueueMode::ASYNC}) { - local_test_functions::SetAllCqsMode(cqs, mode); - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); - - for (size_t j = 0; j < num_events; j++) { - for (uint i = 0; i < cqs.size(); i++) { - log_debug(tt::LogTest, "Mode: {} j : {} Recording and Device Syncing on event for CQ ID: {}", mode, j, cqs[i].get().id()); - auto event = std::make_shared(); - EnqueueRecordEvent(cqs[i], event); - - // Only in passthrough mode is Event populated right away. - if (mode == CommandQueue::CommandQueueMode::PASSTHROUGH) { - EXPECT_EQ(event->cq_id, cqs[i].get().id()); - EXPECT_EQ(event->event_id, cmds_issued_per_cq[i]); - } + auto start = std::chrono::system_clock::now(); - EnqueueWaitForEvent(cqs[i], event); - cmds_issued_per_cq[i] += num_cmds_per_cq; - } + for (size_t j = 0; j < num_events; j++) { + for (uint i = 0; i < cqs.size(); i++) { + log_debug(tt::LogTest, "j : {} Recording and Device Syncing on event for CQ ID: {}", j, cqs[i].get().id()); + auto event = std::make_shared(); + EnqueueRecordEvent(cqs[i], event); + EXPECT_EQ(event->cq_id, cqs[i].get().id()); + EXPECT_EQ(event->event_id, cmds_issued_per_cq[i]); + EnqueueWaitForEvent(cqs[i], event); + cmds_issued_per_cq[i] += num_cmds_per_cq; } - local_test_functions::FinishAllCqs(cqs); - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); - } - local_test_functions::SetAllCqsMode(cqs, current_mode); + local_test_functions::FinishAllCqs(cqs); + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); } // Record event on one CQ, wait-for-that-event on another CQ. Then do the flip. Occasionally insert @@ -129,66 +104,56 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsEnqueueWaitForEventCrossC const int num_cmds_per_cq = 1; vector expected_event_id = {0, 0}; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH, CommandQueue::CommandQueueMode::ASYNC}) { - local_test_functions::SetAllCqsMode(cqs, mode); - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); - - // Store completion queue base address from initial rdptr, for later readback. - vector completion_queue_base; - for (uint i = 0; i < cqs.size(); i++) { - completion_queue_base.push_back(this->device_->sysmem_manager().get_completion_queue_read_ptr(i)); - } - - // Issue a number of Event Record/Waits per CQ, with Record/Wait on alternate CQs - for (size_t j = 0; j < num_events_per_cq; j++) { - for (uint i = 0; i < cqs.size(); i++) { - auto cq_idx_record = i; - auto cq_idx_wait = (i + 1) % cqs.size(); - auto event = std::make_shared(); - log_debug(tt::LogTest, "Mode: {} j : {} Recording event on CQ ID: {} and Device Syncing on CQ ID: {}", mode, j, cqs[cq_idx_record].get().id(), cqs[cq_idx_wait].get().id()); - EnqueueRecordEvent(cqs[cq_idx_record], event); + auto start = std::chrono::system_clock::now(); - if (mode == CommandQueue::CommandQueueMode::ASYNC) { - event->wait_until_ready(); - } + // Store completion queue base address from initial rdptr, for later readback. + vector completion_queue_base; + for (uint i = 0; i < cqs.size(); i++) { + completion_queue_base.push_back(this->device_->sysmem_manager().get_completion_queue_read_ptr(i)); + } - EXPECT_EQ(event->cq_id, cqs[cq_idx_record].get().id()); - EXPECT_EQ(event->event_id, cmds_issued_per_cq[i]); - EnqueueWaitForEvent(cqs[cq_idx_wait], event); + // Issue a number of Event Record/Waits per CQ, with Record/Wait on alternate CQs + for (size_t j = 0; j < num_events_per_cq; j++) { + for (uint i = 0; i < cqs.size(); i++) { - // Occasionally do host wait for extra coverage from both CQs. - if (j > 0 && ((j % 3) == 0)) { - EventSynchronize(event); - } - cmds_issued_per_cq[cq_idx_record] += num_cmds_per_cq; - cmds_issued_per_cq[cq_idx_wait] += num_cmds_per_cq; + auto cq_idx_record = i; + auto cq_idx_wait = (i + 1) % cqs.size(); + auto event = std::make_shared(); + log_debug(tt::LogTest, "j : {} Recording event on CQ ID: {} and Device Syncing on CQ ID: {}", j, cqs[cq_idx_record].get().id(), cqs[cq_idx_wait].get().id()); + EnqueueRecordEvent(cqs[cq_idx_record], event); + EXPECT_EQ(event->cq_id, cqs[cq_idx_record].get().id()); + EXPECT_EQ(event->event_id, cmds_issued_per_cq[i]); + EnqueueWaitForEvent(cqs[cq_idx_wait], event); + + // Occasionally do host wait for extra coverage from both CQs. + if (j > 0 && ((j % 3) == 0)) { + EventSynchronize(event); } + cmds_issued_per_cq[cq_idx_record] += num_cmds_per_cq; + cmds_issued_per_cq[cq_idx_wait] += num_cmds_per_cq; } + } - local_test_functions::FinishAllCqs(cqs); + local_test_functions::FinishAllCqs(cqs); - // Check that completion queue per device is correct. Ensure expected event_ids seen in order. - chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_->id()); - uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); - constexpr uint32_t completion_queue_event_alignment = 32; - uint32_t event; + // Check that completion queue per device is correct. Ensure expected event_ids seen in order. + chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device_->id()); + uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device_->id()); + constexpr uint32_t completion_queue_event_alignment = 32; + uint32_t event; - for (uint cq_id = 0; cq_id < cqs.size(); cq_id++) { - for (size_t i = 0; i < num_cmds_per_cq * cqs.size() * num_events_per_cq; i++) { - uint32_t host_addr = completion_queue_base[cq_id] + i * completion_queue_event_alignment; - tt::Cluster::instance().read_sysmem(&event, 4, host_addr, mmio_device_id, channel); - log_debug(tt::LogTest, "Checking completion queue. mode: {} cq_id: {} i: {} host_addr: {}. Got event_id: {}", mode, cq_id, i, host_addr, event); - EXPECT_EQ(event, expected_event_id[cq_id]++); - } + for (uint cq_id = 0; cq_id < cqs.size(); cq_id++) { + for (size_t i = 0; i < num_cmds_per_cq * cqs.size() * num_events_per_cq; i++) { + uint32_t host_addr = completion_queue_base[cq_id] + i * completion_queue_event_alignment; + tt::Cluster::instance().read_sysmem(&event, 4, host_addr, mmio_device_id, channel); + log_debug(tt::LogTest, "Checking completion queue. cq_id: {} i: {} host_addr: {}. Got event_id: {}", cq_id, i, host_addr, event); + EXPECT_EQ(event, expected_event_id[cq_id]++); } - - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); } - local_test_functions::SetAllCqsMode(cqs, current_mode); + + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); } // Simple 2CQ test to mix reads, writes, record-event, wait-for-event in a basic way. It's simple because @@ -203,43 +168,37 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent size_t num_buffers_per_cq = 10; bool pass = true; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH}) { - local_test_functions::SetAllCqsMode(cqs, mode); - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); - - std::unordered_map>> sync_events; + auto start = std::chrono::system_clock::now(); - for (uint buf_idx = 0; buf_idx < num_buffers_per_cq; buf_idx++) { - vector> buffers; - vector> srcs; - for (uint i = 0; i < cqs.size(); i++) { - uint32_t wr_data_base = (buf_idx * 1000) + (i * 100); - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); - srcs.push_back(generate_arange_vector(buffers[i]->size(), wr_data_base)); - log_debug(tt::LogTest, "Mode: {} buf_idx: {} Doing Write to cq_id: {} of data: {}", mode, buf_idx, i, srcs[i]); - EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false); - auto event = sync_events[i].emplace_back(std::make_shared()); - EnqueueRecordEvent(cqs[i], event); - } + std::unordered_map>> sync_events; - for (uint i = 0; i < cqs.size(); i++) { - auto event = sync_events[i][buf_idx]; - EnqueueWaitForEvent(cqs[i], event); - vector result; - EnqueueReadBuffer(cqs[i], *buffers[i], result, true); // Blocking. - bool local_pass = (srcs[i] == result); - log_debug(tt::LogTest, "Mode: {} Checking buf_idx: {} cq_idx: {} local_pass: {} write_data: {} read_results: {}", mode, buf_idx, i, local_pass, srcs[i], result); - pass &= local_pass; - } + for (uint buf_idx = 0; buf_idx < num_buffers_per_cq; buf_idx++) { + vector> buffers; + vector> srcs; + for (uint i = 0; i < cqs.size(); i++) { + uint32_t wr_data_base = (buf_idx * 1000) + (i * 100); + buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + srcs.push_back(generate_arange_vector(buffers[i]->size(), wr_data_base)); + log_debug(tt::LogTest, "buf_idx: {} Doing Write to cq_id: {} of data: {}", buf_idx, i, srcs[i]); + EnqueueWriteBuffer(cqs[i], *buffers[i], srcs[i], false); + auto event = sync_events[i].emplace_back(std::make_shared()); + EnqueueRecordEvent(cqs[i], event); } - local_test_functions::FinishAllCqs(cqs); - std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); + for (uint i = 0; i < cqs.size(); i++) { + auto event = sync_events[i][buf_idx]; + EnqueueWaitForEvent(cqs[i], event); + vector result; + EnqueueReadBuffer(cqs[i], *buffers[i], result, true); // Blocking. + bool local_pass = (srcs[i] == result); + log_debug(tt::LogTest, "Checking buf_idx: {} cq_idx: {} local_pass: {} write_data: {} read_results: {}", buf_idx, i, local_pass, srcs[i], result); + pass &= local_pass; + } } - local_test_functions::SetAllCqsMode(cqs, current_mode); + + local_test_functions::FinishAllCqs(cqs); + std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); + tt::log_info(tt::LogTest, "Test Finished in {:.2f} us", elapsed_seconds.count() * 1000 * 1000); EXPECT_TRUE(pass); } @@ -258,54 +217,48 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent size_t num_buffers_per_cq = 50; bool pass = true; - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH}) { - local_test_functions::SetAllCqsMode(cqs, mode); - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); + auto start = std::chrono::system_clock::now(); - for (uint buf_idx = 0; buf_idx < num_buffers_per_cq; buf_idx++) { + for (uint buf_idx = 0; buf_idx < num_buffers_per_cq; buf_idx++) { - // Increase number of pages and page size every 10 buffers, to change async timing betwen CQs. - if (buf_idx > 0 && ((buf_idx % 10) == 0)) { - config.page_size *= 2; - config.num_pages *= 2; - } - - vector> buffers; - vector> srcs; - size_t buf_size = config.num_pages * config.page_size; - - for (uint i = 0; i < cqs.size(); i++) { - - uint32_t wr_data_base = (buf_idx * 1000) + (i * 100); - auto &cq_write = cqs[i]; - auto &cq_read = cqs[(i + 1) % cqs.size()]; - auto event = std::make_shared(); - vector result; - - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); - srcs.push_back(generate_arange_vector(buffers[i]->size(), wr_data_base)); - - // Blocking Read after Non-Blocking Write on alternate CQs, events ensure ordering. - log_debug(tt::LogTest, "Mode: {} buf_idx: {} Doing Write (page_size: {} num_pages: {}) to cq_id: {}", mode, buf_idx, config.page_size, config.num_pages, cq_write.get().id()); - EnqueueWriteBuffer(cq_write, *buffers[i], srcs[i], false); - EnqueueRecordEvent(cq_write, event); - EnqueueWaitForEvent(cq_read, event); - EnqueueReadBuffer(cq_read, *buffers[i], result, true); - bool local_pass = (srcs[i] == result); - log_debug(tt::LogTest, "Mode: {} Checking buf_idx: {} cq_idx: {} local_pass: {} write_data: {} read_results: {}", mode, buf_idx, i, local_pass, srcs[i], result); - pass &= local_pass; - } + // Increase number of pages and page size every 10 buffers, to change async timing betwen CQs. + if (buf_idx > 0 && ((buf_idx % 10) == 0)) { + config.page_size *= 2; + config.num_pages *= 2; } - local_test_functions::FinishAllCqs(cqs); + vector> buffers; + vector> srcs; + size_t buf_size = config.num_pages * config.page_size; + + for (uint i = 0; i < cqs.size(); i++) { - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = (end-start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {}us", mode, elapsed_seconds.count() * 1000 * 1000); + uint32_t wr_data_base = (buf_idx * 1000) + (i * 100); + auto &cq_write = cqs[i]; + auto &cq_read = cqs[(i + 1) % cqs.size()]; + auto event = std::make_shared(); + vector result; + + buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + srcs.push_back(generate_arange_vector(buffers[i]->size(), wr_data_base)); + + // Blocking Read after Non-Blocking Write on alternate CQs, events ensure ordering. + log_debug(tt::LogTest, "buf_idx: {} Doing Write (page_size: {} num_pages: {}) to cq_id: {}", buf_idx, config.page_size, config.num_pages, cq_write.get().id()); + EnqueueWriteBuffer(cq_write, *buffers[i], srcs[i], false); + EnqueueRecordEvent(cq_write, event); + EnqueueWaitForEvent(cq_read, event); + EnqueueReadBuffer(cq_read, *buffers[i], result, true); + bool local_pass = (srcs[i] == result); + log_debug(tt::LogTest, "Checking buf_idx: {} cq_idx: {} local_pass: {} write_data: {} read_results: {}", buf_idx, i, local_pass, srcs[i], result); + pass &= local_pass; + } } - local_test_functions::SetAllCqsMode(cqs, current_mode); + + local_test_functions::FinishAllCqs(cqs); + + auto end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = (end-start); + tt::log_info(tt::LogTest, "Test Finished in {}us", elapsed_seconds.count() * 1000 * 1000); EXPECT_TRUE(pass); } @@ -330,90 +283,85 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestEventsReadWriteWithWaitForEvent TT_ASSERT(cqs.size() == 2); - auto current_mode = CommandQueue::default_mode(); - for (const CommandQueue::CommandQueueMode mode : {CommandQueue::CommandQueueMode::PASSTHROUGH}) { - local_test_functions::SetAllCqsMode(cqs, mode); - tt::log_info(tt::LogTest, "Using CQ Mode: {}", mode); - auto start = std::chrono::system_clock::now(); + auto start = std::chrono::system_clock::now(); - // Repeat test starting with different CQ ID. Could have placed this loop lower down. - for (uint cq_idx = 0; cq_idx < cqs.size(); cq_idx++) { + // Repeat test starting with different CQ ID. Could have placed this loop lower down. + for (uint cq_idx = 0; cq_idx < cqs.size(); cq_idx++) { - auto &cq_write = cqs[cq_idx]; - auto &cq_read = cqs[(cq_idx + 1) % cqs.size()]; + auto &cq_write = cqs[cq_idx]; + auto &cq_read = cqs[(cq_idx + 1) % cqs.size()]; - // Another loop for increased testing. Repeat test multiple times for different buffers. - for (int i = 0; i < num_buffers; i++) { + // Another loop for increased testing. Repeat test multiple times for different buffers. + for (int i = 0; i < num_buffers; i++) { - vector> write_data; - vector> read_results; - vector> buffers; + vector> write_data; + vector> read_results; + vector> buffers; - buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); + buffers.push_back(std::make_shared(this->device_, buf_size, config.page_size, config.buftype)); - // Number of write-read combos per buffer. Fewer make RAW race without events easier to hit. - for (uint j = 0; j < num_wr_rd_per_buf; j++) { + // Number of write-read combos per buffer. Fewer make RAW race without events easier to hit. + for (uint j = 0; j < num_wr_rd_per_buf; j++) { - // 2 Events to synchronize delaying the read after write, and delaying the next write after read. - auto event_sync_read_after_write = std::make_shared(); - auto event_sync_write_after_read = std::make_shared(); + // 2 Events to synchronize delaying the read after write, and delaying the next write after read. + auto event_sync_read_after_write = std::make_shared(); + auto event_sync_write_after_read = std::make_shared(); - // Add entry in resutls vector, and construct write data, unique per loop - read_results.emplace_back(); - write_data.push_back(generate_arange_vector(buffers.back()->size(), j * 100)); + // Add entry in resutls vector, and construct write data, unique per loop + read_results.emplace_back(); + write_data.push_back(generate_arange_vector(buffers.back()->size(), j * 100)); - // Issue non-blocking write via first CQ and record event to synchronize with read on other CQ. - log_debug(tt::LogTest, "Mode: {} cq_idx: {} Doing Write j: {} (page_size: {} num_pages: {}) to cq_id: {} write_data: {}", mode, cq_idx, j, config.page_size, config.num_pages, cq_write.get().id(), write_data.back()); - EnqueueWriteBuffer(cq_write, *buffers.back(), write_data.back(), false); - if (use_events) EnqueueRecordEvent(cq_write, event_sync_read_after_write); + // Issue non-blocking write via first CQ and record event to synchronize with read on other CQ. + log_debug(tt::LogTest, "cq_idx: {} Doing Write j: {} (page_size: {} num_pages: {}) to cq_id: {} write_data: {}", cq_idx, j, config.page_size, config.num_pages, cq_write.get().id(), write_data.back()); + EnqueueWriteBuffer(cq_write, *buffers.back(), write_data.back(), false); + if (use_events) EnqueueRecordEvent(cq_write, event_sync_read_after_write); - // Issue wait for write to complete, and non-blocking read from the second CQ. - if (use_events) EnqueueWaitForEvent(cq_read, event_sync_read_after_write); - EnqueueReadBuffer(cq_read, *buffers.back(), read_results.back(), false); - log_debug(tt::LogTest, "Mode: {} cq_idx: {} Issued Read for j: {} to cq_id: {} got data: {}", mode, cq_idx, j, cq_read.get().id(), read_results.back()); // Data not ready since non-blocking. + // Issue wait for write to complete, and non-blocking read from the second CQ. + if (use_events) EnqueueWaitForEvent(cq_read, event_sync_read_after_write); + EnqueueReadBuffer(cq_read, *buffers.back(), read_results.back(), false); + log_debug(tt::LogTest, "cq_idx: {} Issued Read for j: {} to cq_id: {} got data: {}", cq_idx, j, cq_read.get().id(), read_results.back()); // Data not ready since non-blocking. - // If more loops, Record Event on second CQ and wait for it to complete on first CQ before next loop's write. - if (use_events && j < num_wr_rd_per_buf-1) { - EnqueueRecordEvent(cq_read, event_sync_write_after_read); - EnqueueWaitForEvent(cq_write, event_sync_write_after_read); - } + // If more loops, Record Event on second CQ and wait for it to complete on first CQ before next loop's write. + if (use_events && j < num_wr_rd_per_buf-1) { + EnqueueRecordEvent(cq_read, event_sync_write_after_read); + EnqueueWaitForEvent(cq_write, event_sync_write_after_read); } + } - // Basically like Finish, but use host sync on event to ensure all read cmds are finished. - if (use_events) { - auto event_done_reads = std::make_shared(); - EnqueueRecordEvent(cq_read, event_done_reads); - EventSynchronize(event_done_reads); - } + // Basically like Finish, but use host sync on event to ensure all read cmds are finished. + if (use_events) { + auto event_done_reads = std::make_shared(); + EnqueueRecordEvent(cq_read, event_done_reads); + EventSynchronize(event_done_reads); + } - TT_ASSERT(write_data.size() == read_results.size()); - TT_ASSERT(write_data.size() == num_wr_rd_per_buf); - - for (uint j = 0; j < num_wr_rd_per_buf; j++) { - // Make copy of read results, helpful for comparison without events, since vector may be updated between comparison and debug log. - auto read_results_snapshot = read_results[j]; - bool local_pass = write_data[j] == read_results_snapshot; - if (!local_pass) { - log_warning(tt::LogTest, "Mode: {} cq_idx: {} Checking j: {} local_pass: {} write_data: {} read_results: {}", mode, cq_idx, j, local_pass, write_data[j], read_results_snapshot); - } - pass &= local_pass; + TT_ASSERT(write_data.size() == read_results.size()); + TT_ASSERT(write_data.size() == num_wr_rd_per_buf); + + for (uint j = 0; j < num_wr_rd_per_buf; j++) { + // Make copy of read results, helpful for comparison without events, since vector may be updated between comparison and debug log. + auto read_results_snapshot = read_results[j]; + bool local_pass = write_data[j] == read_results_snapshot; + if (!local_pass) { + log_warning(tt::LogTest, "cq_idx: {} Checking j: {} local_pass: {} write_data: {} read_results: {}", cq_idx, j, local_pass, write_data[j], read_results_snapshot); } + pass &= local_pass; + } - // Before starting test with another buffer, drain CQs. Without this, see segfaults after - // adding num_buffers loop. - local_test_functions::FinishAllCqs(cqs); + // Before starting test with another buffer, drain CQs. Without this, see segfaults after + // adding num_buffers loop. + local_test_functions::FinishAllCqs(cqs); - } // num_buffers + } // num_buffers - } // cqs + } // cqs - local_test_functions::FinishAllCqs(cqs); + local_test_functions::FinishAllCqs(cqs); + + auto end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = (end-start); + tt::log_info(tt::LogTest, "Test Finished in {}us", elapsed_seconds.count() * 1000 * 1000); - auto end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = (end-start); - tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {}us", mode, elapsed_seconds.count() * 1000 * 1000); - } - local_test_functions::SetAllCqsMode(cqs, current_mode); EXPECT_TRUE(pass); }