diff --git a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp b/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp index e232e350447..71ceb3f5c61 100644 --- a/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp +++ b/tests/tt_metal/tt_metal/unit_tests_common/common/watcher_fixture.hpp @@ -59,7 +59,7 @@ class WatcherFixture: public CommonFixture { tt::llrt::OptionsG.set_watcher_dump_all(watcher_previous_dump_all); tt::llrt::OptionsG.set_watcher_append(watcher_previous_append); tt::llrt::OptionsG.set_test_mode_enabled(test_mode_previous); - tt::watcher_server_clear_error_flag(); + tt::watcher_server_set_error_flag(false); } void RunTestOnDevice( diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp index 079a88fcd26..6c99e82b654 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_events.cpp @@ -8,6 +8,7 @@ #include "tt_metal/common/bfloat16.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/detail/tt_metal.hpp" +#include "impl/debug/watcher_server.hpp" using namespace tt::tt_metal; @@ -110,12 +111,6 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronize) { EventSynchronize(sync_events.at(sync_events.size() - 2)); EventSynchronize(sync_events.at(5)); - // Uncomment this to confirm future events not yet seen would hang. - // log_debug(tt::LogTest, "The next event is not yet seen, would hang"); - // auto future_event = sync_events.at(0); - // future_event->event_id = num_events + 2; - // EventSynchronize(future_event); - Finish(this->device_->command_queue()); std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); @@ -123,6 +118,59 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronize) { } this->device_->command_queue().set_mode(current_mode); } +// Negative test. Host syncing on a future event that isn't actually issued. +// Ensure that expected hang is seen, which indicates event sync feature is working properly. +TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) { + tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout. + + auto future_event = std::make_shared(); + EnqueueRecordEvent(this->device_->command_queue(), future_event); + future_event->wait_until_ready(); // in case async used, must block until async cq populated event. + future_event->event_id = 0xFFFF; // Modify event_id to be a future event that isn't issued yet. + + // Launch Host Sync in an async thread, expected to hang, with timeout and kill signal. + auto future = std::async(std::launch::async, [this, future_event]() { + return EventSynchronize(future_event); + }); + + bool seen_expected_hang = future.wait_for(std::chrono::seconds(1)) == std::future_status::timeout; + tt::watcher_server_set_error_flag(seen_expected_hang); // Signal to terminate thread. Don't care about it's exception. + + // Briefly wait before clearing error flag, and wrapping up via finish. + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + tt::watcher_server_set_error_flag(false); + Finish(this->device_->command_queue()); + + log_info(tt::LogTest, "Note: Test expects to see a hang if events feature is working. seen_expected_hang: {}", seen_expected_hang); + EXPECT_TRUE(seen_expected_hang); +} + +// Negative test. Device sync. Single CQ here syncing on a future event that isn't actually issued. +// Ensure that expected hang is seen, which indicates event sync feature is working properly. +TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventHang) { + tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout. + + auto future_event = std::make_shared(); + EnqueueRecordEvent(this->device_->command_queue(), future_event); + future_event->wait_until_ready(); // in case async used, must block until async cq populated event. + future_event->event_id = 0xFFFF; // Modify event_id to be a future event that isn't issued yet. + EnqueueWaitForEvent(this->device_->command_queue(), future_event); + + // Launch Finish in an async thread, expected to hang, with timeout and kill signal. + auto future = std::async(std::launch::async, [this]() { + return Finish(this->device_->command_queue()); + }); + + bool seen_expected_hang = future.wait_for(std::chrono::seconds(1)) == std::future_status::timeout; + tt::watcher_server_set_error_flag(seen_expected_hang); // Signal to terminate thread. Don't care about it's exception. + + // Clear error flag before exiting to restore state for next test. + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + tt::watcher_server_set_error_flag(false); + + log_info(tt::LogTest, "Note: Test expects to see a hang if events feature is working. seen_expected_hang: {}", seen_expected_hang); + EXPECT_TRUE(seen_expected_hang); +} // Device sync. Single CQ here, less interesting than 2CQ but still useful. Ensure no hangs. TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) { @@ -153,12 +201,7 @@ TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) { EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(0)); EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(sync_events.size() - 5)); EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(4)); - - // Uncomment this to confirm future events not yet seen would hang. - // auto future_event = sync_events.at(0); - // future_event->event_id = (num_events * 2) + 2; - // log_debug(tt::LogTest, "The next event (event_id: {}) is not yet seen, would hang", future_event->event_id); - // EnqueueWaitForEvent(this->device_->command_queue(), future_event); + Finish(this->device_->command_queue()); std::chrono::duration elapsed_seconds = (std::chrono::system_clock::now() - start); tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000); diff --git a/tt_metal/impl/debug/watcher_server.cpp b/tt_metal/impl/debug/watcher_server.cpp index 094bce3ea3f..13943805d9e 100644 --- a/tt_metal/impl/debug/watcher_server.cpp +++ b/tt_metal/impl/debug/watcher_server.cpp @@ -648,8 +648,8 @@ bool watcher_server_killed_due_to_error() { return watcher::watcher_killed_due_to_error; } -void watcher_server_clear_error_flag() { - watcher::watcher_killed_due_to_error = false; +void watcher_server_set_error_flag(bool val) { + watcher::watcher_killed_due_to_error = val; } void watcher_clear_log() { diff --git a/tt_metal/impl/debug/watcher_server.hpp b/tt_metal/impl/debug/watcher_server.hpp index 3a8d935b3d7..03c5e82d4c7 100644 --- a/tt_metal/impl/debug/watcher_server.hpp +++ b/tt_metal/impl/debug/watcher_server.hpp @@ -19,10 +19,10 @@ int watcher_register_kernel(const string& name); // Check whether the watcher server has been killed due to an error detected. bool watcher_server_killed_due_to_error(); -// Function to clear this flag, so that non-watcher runs can continue as normal. +// Function to set this flag to true/false, so that non-watcher runs can continue as normal when set to false. // TODO(dma): this doesn't currently clear the actual error codes on the device. Once watcher is // moved out of llrt we can change this to watcher_clear_errors(). -void watcher_server_clear_error_flag(); +void watcher_server_set_error_flag(bool val); // Helper function to clear the watcher log file void watcher_clear_log(); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 012666ae3a1..e9612692868 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1191,7 +1191,7 @@ void HWCommandQueue::finish() { this->dprint_server_hang = true; return; } else if (tt::watcher_server_killed_due_to_error()) { - // Illegal NOC txn killed wathcer. Mark state and early exit. Assert in main thread. + // Illegal NOC txn killed watcher. Mark state and early exit. Assert in main thread. this->exit_condition = true; this->illegal_noc_txn_hang = true; return; @@ -1517,7 +1517,11 @@ void EventSynchronize(std::shared_ptr event) { log_trace(tt::LogMetal, "Issuing host sync on Event(device_id: {} cq_id: {} event_id: {})", event->device->id(), event->cq_id, event->event_id); while (event->device->sysmem_manager().get_last_completed_event(event->cq_id) < event->event_id) { - std::this_thread::sleep_for(std::chrono::microseconds(10)); + if (tt::llrt::OptionsG.get_test_mode_enabled() && tt::watcher_server_killed_due_to_error()) { + TT_ASSERT(false, "Command Queue could not complete EventSynchronize. See {} for details.", tt::watcher_get_log_file_name()); + return; + } + std::this_thread::sleep_for(std::chrono::microseconds(5)); } }