Skip to content

Commit

Permalink
#0: Bugfix
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-dma committed Nov 27, 2024
1 parent 677c4f2 commit b94e5ca
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 56 deletions.
53 changes: 46 additions & 7 deletions tests/scripts/run_tools_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,63 @@ if [[ -z "$TT_METAL_HOME" ]]; then
fi

if [[ -z "$TT_METAL_SLOW_DISPATCH_MODE" ]] ; then
check_list="Semaphore Allocated Configure Sysmem"
# Temporary dispatch compile args testing
echo "FD Compile Args Test - 1CQ"
./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherRingBufferBrisc
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_old.csv
TT_METAL_NEW=1 ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherRingBufferBrisc

TT_METAL_NEW=1 ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherRingBufferBrisc | tee log.new
for i in $check_list; do
grep $i log.new > $i.new; sort -n -o $i.new{,}
done
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_new.csv
rm -rf built

./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherRingBufferBrisc | tee log.old
for i in $check_list; do
grep $i log.old > $i.old; sort -n -o $i.old{,}
done
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_old.csv
rm -rf built

for i in $check_list; do
if diff $i.old $i.new; then
echo "$i matches."
else
echo "FD Compile Args Test - 1CQ FAIL $i mismatch"
exit 1
fi
done
if diff kernel_args_old.csv kernel_args_new.csv; then
echo "FD Compile Args Test - 1CQ PASS"
echo "Kernel Args match."
else
echo "FD Compile Args Test - 1CQ FAIL"
exit 1
fi
echo "FD Compile Args Test - 1CQ PASS"

if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
echo "FD Compile Args Test - 2CQ"
TT_METAL_GTEST_ETH_DISPATCH=1 TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherRingBufferBrisc
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_old.csv
TT_METAL_GTEST_ETH_DISPATCH=1 TT_METAL_GTEST_NUM_HW_CQS=2 TT_METAL_NEW=1 ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherRingBufferBrisc

TT_METAL_GTEST_ETH_DISPATCH=1 TT_METAL_GTEST_NUM_HW_CQS=2 TT_METAL_NEW=1 ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherRingBufferBrisc | tee log.new
for i in $check_list; do
grep $i log.new > $i.new; sort -n -o $i.new{,}
done
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_new.csv

TT_METAL_GTEST_ETH_DISPATCH=1 TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_debug_tools --gtest_filter=*WatcherRingBufferBrisc | tee log.old
for i in $check_list; do
grep $i log.old > $i.old; sort -n -o $i.old{,}
done
find . -name "kernel_args.csv" | xargs -I {} cp {} kernel_args_old.csv

for i in $check_list; do
if diff $i.old $i.new; then
echo "$i matches."
else
echo "FD Compile Args Test - 2CQ FAIL $i mismatch"
exit 1
fi
done
if diff kernel_args_old.csv kernel_args_new.csv; then
echo "FD Compile Args Test - 2CQ PASS"
else
Expand Down
66 changes: 48 additions & 18 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2740,8 +2740,6 @@ void Device::configure_command_queue_programs_new() {
chip_id_t device_id = this->id();
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(device_id);
Device *mmio_device = tt::DevicePool::instance().get_active_device(mmio_device_id);
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device_id);
log_debug(tt::LogMetal, "Device {} - Channel {}", this->id_, channel);

std::vector<uint32_t> zero = {0x0}; // Reset state in case L1 Clear is disabled.
std::vector<uint32_t> pointers;
Expand All @@ -2751,30 +2749,43 @@ void Device::configure_command_queue_programs_new() {
Program& command_queue_program = *this->command_queue_programs[0];
uint8_t num_hw_cqs = this->num_hw_cqs();

// Reset host-side command queue pointers
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(mmio_device_id);
uint32_t host_issue_q_rd_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_RD);
uint32_t host_issue_q_wr_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_WR);
uint32_t host_completion_q_wr_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
uint32_t host_completion_q_rd_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_RD);
uint32_t cq_start = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
pointers.resize(cq_start/sizeof(uint32_t));
for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) {
// Reset the host manager's pointer for this command queue
this->sysmem_manager_->reset(cq_id);
// Reset host-side command queue pointers for all channels controlled by this mmio device
if (this->is_mmio_capable()) {
for (chip_id_t serviced_device_id : tt::Cluster::instance().get_devices_controlled_by_mmio_device(device_id)) {
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(serviced_device_id);
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(mmio_device_id);
uint32_t host_issue_q_rd_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_RD);
uint32_t host_issue_q_wr_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::ISSUE_Q_WR);
uint32_t host_completion_q_wr_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
uint32_t host_completion_q_rd_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_RD);
uint32_t cq_start = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
pointers.resize(cq_start/sizeof(uint32_t));
for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) {
// Reset the host manager's pointer for this command queue
this->sysmem_manager_->reset(cq_id);

pointers[host_issue_q_rd_ptr / sizeof(uint32_t)] = (cq_start + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;
pointers[host_issue_q_wr_ptr / sizeof(uint32_t)] = (cq_start + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;
pointers[host_completion_q_wr_ptr / sizeof(uint32_t)] = (cq_start + this->sysmem_manager_->get_issue_queue_size(cq_id) + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;
pointers[host_completion_q_rd_ptr / sizeof(uint32_t)] = (cq_start + this->sysmem_manager_->get_issue_queue_size(cq_id) + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;
pointers[host_issue_q_rd_ptr / sizeof(uint32_t)] = (cq_start + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;
pointers[host_issue_q_wr_ptr / sizeof(uint32_t)] = (cq_start + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;
pointers[host_completion_q_wr_ptr / sizeof(uint32_t)] = (cq_start + this->sysmem_manager_->get_issue_queue_size(cq_id) + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;
pointers[host_completion_q_rd_ptr / sizeof(uint32_t)] = (cq_start + this->sysmem_manager_->get_issue_queue_size(cq_id) + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;

tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), get_absolute_cq_offset(channel, cq_id, cq_size), mmio_device_id, get_umd_channel(channel));
log_warning(
"Sysmem: Addr={:#08x} - {:#08x} {:#08x} {:#08x} {:#08x}",
get_absolute_cq_offset(channel, cq_id, cq_size),
pointers[host_issue_q_rd_ptr / sizeof(uint32_t)],
pointers[host_issue_q_wr_ptr / sizeof(uint32_t)],
pointers[host_completion_q_wr_ptr / sizeof(uint32_t)],
pointers[host_completion_q_rd_ptr / sizeof(uint32_t)]);
tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), get_absolute_cq_offset(channel, cq_id, cq_size), mmio_device_id, get_umd_channel(channel));
}
}
}

// Write device-side cq pointers
configure_dispatch_cores(this);

// Run the cq program
command_queue_program.finalize(this);
detail::ConfigureDeviceWithProgram(this, command_queue_program, true);
tt::Cluster::instance().l1_barrier(this->id());
}
Expand Down Expand Up @@ -2820,6 +2831,13 @@ void Device::configure_command_queue_programs() {
pointers[host_completion_q_wr_ptr / sizeof(uint32_t)] = (cq_start + this->sysmem_manager_->get_issue_queue_size(cq_id) + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;
pointers[host_completion_q_rd_ptr / sizeof(uint32_t)] = (cq_start + this->sysmem_manager_->get_issue_queue_size(cq_id) + get_absolute_cq_offset(channel, cq_id, cq_size)) >> 4;

log_warning(
"Sysmem: Addr={:#08x} - {:#08x} {:#08x} {:#08x} {:#08x}",
get_absolute_cq_offset(channel, cq_id, cq_size),
pointers[host_issue_q_rd_ptr / sizeof(uint32_t)],
pointers[host_issue_q_wr_ptr / sizeof(uint32_t)],
pointers[host_completion_q_wr_ptr / sizeof(uint32_t)],
pointers[host_completion_q_rd_ptr / sizeof(uint32_t)]);
tt::Cluster::instance().write_sysmem(pointers.data(), pointers.size() * sizeof(uint32_t), get_absolute_cq_offset(channel, cq_id, cq_size), mmio_device_id, get_umd_channel(channel));
}

Expand Down Expand Up @@ -2851,11 +2869,13 @@ void Device::configure_command_queue_programs() {
uint32_t completion_q0_last_event_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q0_LAST_EVENT);
uint32_t completion_q1_last_event_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q1_LAST_EVENT);
std::vector<uint32_t> prefetch_q_pcie_rd_ptr_addr_data = {get_absolute_cq_offset(channel, cq_id, cq_size) + cq_start};
log_warning("Configure Prefetch H (device {} core {})", mmio_device->id(), prefetch_location.str());
detail::WriteToDeviceL1(mmio_device, prefetch_location, prefetch_q_rd_ptr, prefetch_q_rd_ptr_addr_data, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, prefetch_location, prefetch_q_pcie_rd_ptr, prefetch_q_pcie_rd_ptr_addr_data, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, prefetch_location, prefetch_q_base, prefetch_q, dispatch_core_type);
if (not this->is_mmio_capable()) {
// Initialize event counters to 0 on dispatch_d on r-chip
log_warning("Configure Dispatch D Counters (device {} core {})", this->id(), remote_dispatcher_location.str());
detail::WriteToDeviceL1(this, remote_dispatcher_location, completion_q0_last_event_ptr, zero, dispatch_core_type);
detail::WriteToDeviceL1(this, remote_dispatcher_location, completion_q1_last_event_ptr, zero, dispatch_core_type);
}
Expand All @@ -2864,13 +2884,23 @@ void Device::configure_command_queue_programs() {
uint32_t completion_queue_start_addr = cq_start + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size);
uint32_t completion_queue_start_addr_16B = completion_queue_start_addr >> 4;
std::vector<uint32_t> completion_queue_wr_ptr = {completion_queue_start_addr_16B};
log_warning("Configure CQ Writer (device {} core {})", mmio_device->id(), completion_q_writer_location.str());
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_rd_ptr, completion_queue_wr_ptr, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_wr_ptr, completion_queue_wr_ptr, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q0_last_event_ptr, zero, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q1_last_event_ptr, zero, dispatch_core_type);

// Initialize address where workers signal completion to dispatch core(s).
// TODO: Should only initialize dispatch_s_sync_sem if this->dispatch_s_enabled()?
if (this->distributed_dispatcher()) {
tt_cxy_pair dispatch_s_location = dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id);
log_warning("Configure Dispatch S (device {} core {})", this->id(), dispatch_s_location.str());
}
log_warning("Configure Dispatch (device {} core {})", mmio_device->id(), dispatch_location.str());
if (device_id != mmio_device_id) {
tt_cxy_pair dispatch_d_location = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id);
log_warning("Configure Dispatch (device {} core {})", this->id(), dispatch_d_location.str());
}
for (uint32_t i = 0; i < dispatch_message_entries; i++) {
uint32_t dispatch_s_sync_sem_addr = dispatch_s_sync_sem_base_addr + dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(i);
uint32_t dispatch_message_addr = dispatch_message_base_addr + dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(i);
Expand Down
42 changes: 25 additions & 17 deletions tt_metal/impl/dispatch/arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,27 +190,35 @@ void configure_dispatch_cores(Device *device) {
// Set up completion_queue_writer core. This doesn't actually have a kernel so keep it out of the struct and config
// it here. TODO: should this be in the struct?
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
auto &my_dispatch_constants = dispatch_constants::get(dispatch_core_type);
uint32_t cq_start = my_dispatch_constants.get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED);
uint32_t cq_size = device->sysmem_manager().get_cq_size();
std::vector<uint32_t> zero = {0x0};
for (uint8_t cq_id = 0; cq_id < device->num_hw_cqs(); cq_id++) {
tt_cxy_pair completion_q_writer_location = dispatch_core_manager::instance().completion_queue_writer_core(device->id(), channel, cq_id);
Device *mmio_device = tt::DevicePool::instance().get_active_device(completion_q_writer_location.chip);
uint32_t completion_q_wr_ptr = my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
uint32_t completion_q_rd_ptr = my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
uint32_t completion_q0_last_event_ptr = my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q0_LAST_EVENT);
uint32_t completion_q1_last_event_ptr = my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q1_LAST_EVENT);
// Initialize completion queue write pointer and read pointer copy
uint32_t issue_queue_size = device->sysmem_manager().get_issue_queue_size(cq_id);
uint32_t completion_queue_start_addr = cq_start + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size);
uint32_t completion_queue_start_addr_16B = completion_queue_start_addr >> 4;
std::vector<uint32_t> completion_queue_wr_ptr = {completion_queue_start_addr_16B};
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_rd_ptr, completion_queue_wr_ptr, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_wr_ptr, completion_queue_wr_ptr, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q0_last_event_ptr, zero, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q1_last_event_ptr, zero, dispatch_core_type);

// Need to set up for all devices serviced by an mmio chip
if (device->is_mmio_capable()) {
for (chip_id_t serviced_device_id : tt::Cluster::instance().get_devices_controlled_by_mmio_device(device->id())) {
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(serviced_device_id);
for (uint8_t cq_id = 0; cq_id < device->num_hw_cqs(); cq_id++) {
tt_cxy_pair completion_q_writer_location = dispatch_core_manager::instance().completion_queue_writer_core(serviced_device_id, channel, cq_id);
Device *mmio_device = tt::DevicePool::instance().get_active_device(completion_q_writer_location.chip);
uint32_t completion_q_wr_ptr = my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
uint32_t completion_q_rd_ptr = my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);
uint32_t completion_q0_last_event_ptr = my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q0_LAST_EVENT);
uint32_t completion_q1_last_event_ptr = my_dispatch_constants.get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q1_LAST_EVENT);
// Initialize completion queue write pointer and read pointer copy
uint32_t issue_queue_size = device->sysmem_manager().get_issue_queue_size(cq_id);
uint32_t completion_queue_start_addr = cq_start + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size);
uint32_t completion_queue_start_addr_16B = completion_queue_start_addr >> 4;
std::vector<uint32_t> completion_queue_wr_ptr = {completion_queue_start_addr_16B};
tt::log_warning(
"Configure CQ Writer (device {} core {})", mmio_device->id(), completion_q_writer_location.str());
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_rd_ptr, completion_queue_wr_ptr, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q_wr_ptr, completion_queue_wr_ptr, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q0_last_event_ptr, zero, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, completion_q_writer_location, completion_q1_last_event_ptr, zero, dispatch_core_type);
}
}
}
std::vector<dispatch_kernel_node_t> nodes = get_nodes(device);
for (int idx = 0; idx < node_id_to_kernel.size(); idx++) {
Expand Down
Loading

0 comments on commit b94e5ca

Please sign in to comment.