Skip to content

Commit

Permalink
#6093: Add pair of negative Host/Device Event sync tests on future ev…
Browse files Browse the repository at this point in the history
…ents (expected hangs)

 - Helps verify Events feature is working (spoiler: it wasn't lately)
   by syncing on a future event that is never issued, and ensure a hang
   is seen from Host and from Device (seperate tests)

 - watcher_server_clear_error_flag() is renamed to be
   watcher_server_set_error_flag(bool) for use here.

 - Add watcher_server_killed_due_to_error() support in EventSynchronize
   for host sync negative test to work, and to receive exit signal from
   main thread similar to Finish()
  • Loading branch information
kmabeeTT committed Mar 8, 2024
1 parent 2439641 commit a5f35a8
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class WatcherFixture: public CommonFixture {
tt::llrt::OptionsG.set_watcher_dump_all(watcher_previous_dump_all);
tt::llrt::OptionsG.set_watcher_append(watcher_previous_append);
tt::llrt::OptionsG.set_test_mode_enabled(test_mode_previous);
tt::watcher_server_clear_error_flag();
tt::watcher_server_set_error_flag(false);
}

void RunTestOnDevice(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "tt_metal/common/bfloat16.hpp"
#include "tt_metal/host_api.hpp"
#include "tt_metal/detail/tt_metal.hpp"
#include "impl/debug/watcher_server.hpp"

using namespace tt::tt_metal;

Expand Down Expand Up @@ -110,19 +111,66 @@ TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronize) {
EventSynchronize(sync_events.at(sync_events.size() - 2));
EventSynchronize(sync_events.at(5));

// Uncomment this to confirm future events not yet seen would hang.
// log_debug(tt::LogTest, "The next event is not yet seen, would hang");
// auto future_event = sync_events.at(0);
// future_event->event_id = num_events + 2;
// EventSynchronize(future_event);

Finish(this->device_->command_queue());

std::chrono::duration<double> elapsed_seconds = (std::chrono::system_clock::now() - start);
tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000);
}
this->device_->command_queue().set_mode(current_mode);
}
// Negative test. Host syncing on a future event that isn't actually issued.
// Ensure that expected hang is seen, which indicates event sync feature is working properly.
TEST_F(CommandQueueFixture, TestEventsEnqueueRecordEventAndSynchronizeHang) {
tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout.

auto future_event = std::make_shared<Event>();
EnqueueRecordEvent(this->device_->command_queue(), future_event);
future_event->wait_until_ready(); // in case async used, must block until async cq populated event.
future_event->event_id = 0xFFFF; // Modify event_id to be a future event that isn't issued yet.

// Launch Host Sync in an async thread, expected to hang, with timeout and kill signal.
auto future = std::async(std::launch::async, [this, future_event]() {
return EventSynchronize(future_event);
});

bool seen_expected_hang = future.wait_for(std::chrono::seconds(1)) == std::future_status::timeout;
tt::watcher_server_set_error_flag(seen_expected_hang); // Signal to terminate thread. Don't care about it's exception.

// Briefly wait before clearing error flag, and wrapping up via finish.
std::this_thread::sleep_for(std::chrono::milliseconds(200));
tt::watcher_server_set_error_flag(false);
Finish(this->device_->command_queue());

log_info(tt::LogTest, "Note: Test expects to see a hang if events feature is working. seen_expected_hang: {}", seen_expected_hang);
EXPECT_TRUE(seen_expected_hang);
}

// Negative test. Device sync. Single CQ here syncing on a future event that isn't actually issued.
// Ensure that expected hang is seen, which indicates event sync feature is working properly.
TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventHang) {
tt::llrt::OptionsG.set_test_mode_enabled(true); // Required for finish hang breakout.

auto future_event = std::make_shared<Event>();
EnqueueRecordEvent(this->device_->command_queue(), future_event);
future_event->wait_until_ready(); // in case async used, must block until async cq populated event.
future_event->event_id = 0xFFFF; // Modify event_id to be a future event that isn't issued yet.
EnqueueWaitForEvent(this->device_->command_queue(), future_event);

// Launch Finish in an async thread, expected to hang, with timeout and kill signal.
auto future = std::async(std::launch::async, [this]() {
return Finish(this->device_->command_queue());
});

bool seen_expected_hang = future.wait_for(std::chrono::seconds(1)) == std::future_status::timeout;
tt::watcher_server_set_error_flag(seen_expected_hang); // Signal to terminate thread. Don't care about it's exception.

// Clear error flag before exiting to restore state for next test.
std::this_thread::sleep_for(std::chrono::milliseconds(200));
tt::watcher_server_set_error_flag(false);

log_info(tt::LogTest, "Note: Test expects to see a hang if events feature is working. seen_expected_hang: {}", seen_expected_hang);
EXPECT_TRUE(seen_expected_hang);
}

// Device sync. Single CQ here, less interesting than 2CQ but still useful. Ensure no hangs.
TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) {
Expand Down Expand Up @@ -153,12 +201,7 @@ TEST_F(CommandQueueFixture, TestEventsQueueWaitForEventBasic) {
EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(0));
EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(sync_events.size() - 5));
EnqueueWaitForEvent(this->device_->command_queue(), sync_events.at(4));

// Uncomment this to confirm future events not yet seen would hang.
// auto future_event = sync_events.at(0);
// future_event->event_id = (num_events * 2) + 2;
// log_debug(tt::LogTest, "The next event (event_id: {}) is not yet seen, would hang", future_event->event_id);
// EnqueueWaitForEvent(this->device_->command_queue(), future_event);
Finish(this->device_->command_queue());

std::chrono::duration<double> elapsed_seconds = (std::chrono::system_clock::now() - start);
tt::log_info(tt::LogTest, "Test with CQ Mode: {} Finished in {:.2f} us", mode, elapsed_seconds.count() * 1000 * 1000);
Expand Down
4 changes: 2 additions & 2 deletions tt_metal/impl/debug/watcher_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -648,8 +648,8 @@ bool watcher_server_killed_due_to_error() {
return watcher::watcher_killed_due_to_error;
}

void watcher_server_clear_error_flag() {
watcher::watcher_killed_due_to_error = false;
void watcher_server_set_error_flag(bool val) {
watcher::watcher_killed_due_to_error = val;
}

void watcher_clear_log() {
Expand Down
4 changes: 2 additions & 2 deletions tt_metal/impl/debug/watcher_server.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ int watcher_register_kernel(const string& name);

// Check whether the watcher server has been killed due to an error detected.
bool watcher_server_killed_due_to_error();
// Function to clear this flag, so that non-watcher runs can continue as normal.
// Function to set this flag to true/false, so that non-watcher runs can continue as normal when set to false.
// TODO(dma): this doesn't currently clear the actual error codes on the device. Once watcher is
// moved out of llrt we can change this to watcher_clear_errors().
void watcher_server_clear_error_flag();
void watcher_server_set_error_flag(bool val);

// Helper function to clear the watcher log file
void watcher_clear_log();
Expand Down
8 changes: 6 additions & 2 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1191,7 +1191,7 @@ void HWCommandQueue::finish() {
this->dprint_server_hang = true;
return;
} else if (tt::watcher_server_killed_due_to_error()) {
// Illegal NOC txn killed wathcer. Mark state and early exit. Assert in main thread.
// Illegal NOC txn killed watcher. Mark state and early exit. Assert in main thread.
this->exit_condition = true;
this->illegal_noc_txn_hang = true;
return;
Expand Down Expand Up @@ -1517,7 +1517,11 @@ void EventSynchronize(std::shared_ptr<Event> event) {
log_trace(tt::LogMetal, "Issuing host sync on Event(device_id: {} cq_id: {} event_id: {})", event->device->id(), event->cq_id, event->event_id);

while (event->device->sysmem_manager().get_last_completed_event(event->cq_id) < event->event_id) {
std::this_thread::sleep_for(std::chrono::microseconds(10));
if (tt::llrt::OptionsG.get_test_mode_enabled() && tt::watcher_server_killed_due_to_error()) {
TT_ASSERT(false, "Command Queue could not complete EventSynchronize. See {} for details.", tt::watcher_get_log_file_name());
return;
}
std::this_thread::sleep_for(std::chrono::microseconds(5));
}
}

Expand Down

0 comments on commit a5f35a8

Please sign in to comment.