Skip to content

Commit

Permalink
#0: Move remote chip event synchronization to dispatch core (#13256)
Browse files Browse the repository at this point in the history
- The previous remote-chip (R-Chip) implementation required event syncs
   to be processed on prefetch_h, because the tunneler did not have
   virtual channels (VCs), which left a latent deadlock
- With VCs in place, event synchronization can be performed on
   dispatch_d, which is more performant because the prefetcher can keep
   buffering commands while dispatch_d is waiting
  • Loading branch information
tt-asaigal authored Sep 30, 2024
1 parent d4f42f4 commit e1f2f08
Show file tree
Hide file tree
Showing 8 changed files with 16 additions and 143 deletions.
2 changes: 1 addition & 1 deletion tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ1) {
ttnn::record_event(device->command_queue(1), workload_event);
ttnn::wait_for_event(device->command_queue(0), workload_event);

ttnn::read_buffer(1, output_tensor, {readback_data, readback_data, readback_data, readback_data, readback_data, readback_data, readback_data, readback_data});
ttnn::read_buffer(0, output_tensor, {readback_data, readback_data, readback_data, readback_data, readback_data, readback_data, readback_data, readback_data});

for (int j = 0; j < 3 * 2048 * 2048; j++) {
ASSERT_EQ(readback_data[j].to_float(), -1 * (i + dev_idx) * 32 + 500);
Expand Down
13 changes: 9 additions & 4 deletions tt_metal/impl/device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1913,8 +1913,11 @@ void Device::configure_command_queue_programs() {
tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id);
tt_cxy_pair completion_q_writer_location = dispatch_core_manager::instance().completion_queue_writer_core(device_id, channel, cq_id);
tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id);
tt_cxy_pair remote_dispatcher_location;
if (not this->is_mmio_capable()) {
remote_dispatcher_location = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id);
}
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(mmio_device_id);

TT_ASSERT(prefetch_location.chip == mmio_device_id and completion_q_writer_location.chip == mmio_device_id,
"Issue queue interface is on device {} and completion queue interface is on device {} but they are expected to be on device {}", prefetch_location.chip, completion_q_writer_location.chip, mmio_device_id);

Expand All @@ -1927,9 +1930,11 @@ void Device::configure_command_queue_programs() {
detail::WriteToDeviceL1(mmio_device, prefetch_location, CQ_PREFETCH_Q_RD_PTR, prefetch_q_rd_ptr_addr_data, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, prefetch_location, CQ_PREFETCH_Q_PCIE_RD_PTR, prefetch_q_pcie_rd_ptr_addr_data, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, prefetch_location, dispatch_constants::PREFETCH_Q_BASE, prefetch_q, dispatch_core_type);
// Used for prefetch_h, since a wait_for_event on remote chips requires prefetch_h to spin until dispatch_d notifies completion
detail::WriteToDeviceL1(mmio_device, prefetch_location, CQ0_COMPLETION_LAST_EVENT, zero, dispatch_core_type);
detail::WriteToDeviceL1(mmio_device, prefetch_location, CQ1_COMPLETION_LAST_EVENT, zero, dispatch_core_type);
if (not this->is_mmio_capable()) {
// Initialize event counters to 0 on dispatch_d on r-chip
detail::WriteToDeviceL1(this, remote_dispatcher_location, CQ0_COMPLETION_LAST_EVENT, zero, dispatch_core_type);
detail::WriteToDeviceL1(this, remote_dispatcher_location, CQ1_COMPLETION_LAST_EVENT, zero, dispatch_core_type);
}
// Initialize completion queue write pointer and read pointer copy
uint32_t issue_queue_size = this->sysmem_manager_->get_issue_queue_size(cq_id);
uint32_t completion_queue_start_addr = CQ_START + issue_queue_size + get_absolute_cq_offset(channel, cq_id, cq_size);
Expand Down
27 changes: 3 additions & 24 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1521,12 +1521,6 @@ void EnqueueRecordEventCommand::process() {
sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd) + dispatch_constants::EVENT_PADDED_SIZE,
pcie_alignment); // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WRITE_LINEAR_HOST + event ID

if (not device->is_mmio_capable()) {
cmd_sequence_sizeB +=
CQ_PREFETCH_CMD_BARE_MIN_SIZE *
num_hw_cqs; // CQ_DISPATCH_REMOTE_WRITE (number of writes = number of prefetch_h cores on this CQ)
}

void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id);

HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB);
Expand Down Expand Up @@ -1564,18 +1558,6 @@ void EnqueueRecordEventCommand::process() {
event_payloads,
packed_write_max_unicast_sub_cmds);

if (not device->is_mmio_capable()) {
for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) {
tt_cxy_pair prefetch_location =
dispatch_core_manager::instance().prefetcher_core(this->device->id(), channel, cq_id);
CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_location, core_type);
command_sequence.add_dispatch_write_remote(
this->event_id,
this->device->get_noc_unicast_encoding(this->noc_index, prefetch_physical_core),
address);
}
}

bool flush_prefetch = true;
command_sequence.add_dispatch_write_host<true>(
flush_prefetch, dispatch_constants::EVENT_PADDED_SIZE, true, event_payload.data());
Expand Down Expand Up @@ -1606,18 +1588,15 @@ EnqueueWaitForEventCommand::EnqueueWaitForEventCommand(

void EnqueueWaitForEventCommand::process() {
uint32_t cmd_sequence_sizeB = CQ_PREFETCH_CMD_BARE_MIN_SIZE; // CQ_PREFETCH_CMD_RELAY_INLINE + CQ_DISPATCH_CMD_WAIT
// or CQ_PREFETCH_CMD_WAIT_FOR_EVENT

void* cmd_region = this->manager.issue_queue_reserve(cmd_sequence_sizeB, this->command_queue_id);

HugepageDeviceCommand command_sequence(cmd_region, cmd_sequence_sizeB);
uint32_t last_completed_event_address =
sync_event.cq_id == 0 ? CQ0_COMPLETION_LAST_EVENT : CQ1_COMPLETION_LAST_EVENT;
if (this->device->is_mmio_capable()) {
command_sequence.add_dispatch_wait(false, last_completed_event_address, sync_event.event_id, this->clear_count);
} else {
command_sequence.add_prefetch_wait_for_event(sync_event.event_id, last_completed_event_address);
}

command_sequence.add_dispatch_wait(false, last_completed_event_address, sync_event.event_id, this->clear_count);

this->manager.issue_queue_push_back(cmd_sequence_sizeB, this->command_queue_id);

this->manager.fetch_queue_reserve_back(this->command_queue_id);
Expand Down
29 changes: 3 additions & 26 deletions tt_metal/impl/dispatch/cq_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,7 @@ enum CQPrefetchCmdId : uint8_t {
CQ_PREFETCH_CMD_EXEC_BUF_END = 7, // finish executing commands from a buffer (return), payload like relay_inline
CQ_PREFETCH_CMD_STALL = 8, // drain pipe through dispatcher
CQ_PREFETCH_CMD_DEBUG = 9, // log waypoint data to watcher, checksum
CQ_PREFETCH_CMD_WAIT_FOR_EVENT = 10, // wait_for_event: stall until dispatcher signals event completion
CQ_PREFETCH_CMD_TERMINATE = 11, // quit
CQ_PREFETCH_CMD_TERMINATE = 10, // quit
CQ_PREFETCH_CMD_MAX_COUNT, // for checking legal IDs
};

Expand All @@ -47,9 +46,8 @@ enum CQDispatchCmdId : uint8_t {
CQ_DISPATCH_CMD_DEBUG = 10, // log waypoint data to watcher, checksum
CQ_DISPATCH_CMD_DELAY = 11, // insert delay (for testing)
CQ_DISPATCH_CMD_EXEC_BUF_END = 12, // dispatch_d notify prefetch_h that exec_buf has completed
CQ_DISPATCH_CMD_REMOTE_WRITE = 13, // dispatch_d issues write to address on L-Chip through dispatch_h
CQ_DISPATCH_CMD_SET_WRITE_OFFSET = 14, // set the offset to add to all non-host destination addresses (relocation)
CQ_DISPATCH_CMD_TERMINATE = 15, // quit
CQ_DISPATCH_CMD_SET_WRITE_OFFSET = 13, // set the offset to add to all non-host destination addresses (relocation)
CQ_DISPATCH_CMD_TERMINATE = 14, // quit
CQ_DISPATCH_CMD_MAX_COUNT, // for checking legal IDs
};

Expand Down Expand Up @@ -115,13 +113,6 @@ struct CQPrefetchRelayInlineCmd {
uint32_t stride; // explicit stride saves a few insns on device
} __attribute__((packed));

// Payload of the CQ_PREFETCH_CMD_WAIT_FOR_EVENT prefetch command (the
// `event_wait` member of CQPrefetchCmd). The prefetch_h kernel spins until the
// 32-bit value stored at `sync_event_addr` is no longer less than `sync_event`.
struct CQPrefetchWaitForEventCmd {
uint8_t pad1;  // padding (presumably keeps the 32-bit fields aligned after the 1-byte cmd id — TODO confirm)
uint16_t pad2;  // padding, see pad1
uint32_t sync_event;  // event ID that must be reached before the wait completes
uint32_t sync_event_addr;  // address of the last-completed-event counter polled by the waiter
} __attribute__((packed));

struct CQPrefetchExecBufCmd {
uint8_t pad1;
uint16_t pad2;
Expand All @@ -138,7 +129,6 @@ struct CQPrefetchCmd {
CQPrefetchRelayPagedPackedCmd relay_paged_packed;
CQPrefetchRelayInlineCmd relay_inline;
CQPrefetchExecBufCmd exec_buf;
CQPrefetchWaitForEventCmd event_wait;
CQGenericDebugCmd debug;
} __attribute__((packed));
};
Expand Down Expand Up @@ -241,18 +231,6 @@ struct CQDispatchDelayCmd {
uint32_t delay;
} __attribute__((packed));

// When dispatch_d gets this command, it will be
// forwarded to dispatch_h, which will write data
// to local noc_xy_addr at offset addr. The data
// is currently a uint32_t field, which is inlined in
// the command.
// Payload of CQ_DISPATCH_CMD_REMOTE_WRITE (the `write_from_remote` member of
// CQDispatchCmd).
struct CQDispatchRemoteWriteCmd {
uint32_t data;  // 32-bit value written by dispatch_h
uint32_t noc_xy_addr;  // NoC XY encoding of the destination core
uint32_t addr;  // destination address on that core
} __attribute__((packed));


struct CQDispatchSetWriteOffsetCmd {
uint8_t pad1;
uint16_t pad2;
Expand All @@ -270,7 +248,6 @@ struct CQDispatchCmd {
CQDispatchWritePagedCmd write_paged;
CQDispatchWritePackedCmd write_packed;
CQDispatchWritePackedLargeCmd write_packed_large;
CQDispatchRemoteWriteCmd write_from_remote;
CQDispatchWaitCmd wait;
CQGenericDebugCmd debug;
CQDispatchDelayCmd delay;
Expand Down
8 changes: 0 additions & 8 deletions tt_metal/impl/dispatch/debug_tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ uint32_t dump_dispatch_cmd(CQDispatchCmd *cmd, uint32_t cmd_addr, std::ofstream
case CQ_DISPATCH_CMD_GO: break;
case CQ_DISPATCH_CMD_SINK: break;
case CQ_DISPATCH_CMD_EXEC_BUF_END: break;
case CQ_DISPATCH_CMD_REMOTE_WRITE: break;
case CQ_DISPATCH_CMD_TERMINATE: break;
case CQ_DISPATCH_CMD_SET_WRITE_OFFSET: break;
default: TT_THROW("Unrecognized dispatch command: {}", cmd_id); break;
Expand Down Expand Up @@ -249,13 +248,6 @@ uint32_t dump_prefetch_cmd(CQPrefetchCmd *cmd, uint32_t cmd_addr, std::ofstream
val(cmd->debug.stride));
stride = cmd->debug.stride;
break;
case CQ_PREFETCH_CMD_WAIT_FOR_EVENT:
iq_file << fmt::format(
" (sync_event={:#08x}, sync_event_addr={:#08x})",
val(cmd->event_wait.sync_event),
val(cmd->event_wait.sync_event_addr));
stride = CQ_PREFETCH_CMD_BARE_MIN_SIZE + sizeof(CQPrefetchHToPrefetchDHeader);
break;
// These commands don't have any additional data to dump.
case CQ_PREFETCH_CMD_ILLEGAL: break;
case CQ_PREFETCH_CMD_STALL: break;
Expand Down
42 changes: 0 additions & 42 deletions tt_metal/impl/dispatch/device_command.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,48 +129,6 @@ class DeviceCommand {
}
}

void add_prefetch_wait_for_event(uint32_t event_id, uint32_t event_addr) {
uint32_t increment_sizeB = align(sizeof(CQPrefetchCmd), this->pcie_alignment);
auto initialize_wait_cmd = [&](CQPrefetchCmd *wait_cmd) {
*wait_cmd = {};
wait_cmd->base.cmd_id = CQ_PREFETCH_CMD_WAIT_FOR_EVENT;
wait_cmd->event_wait.sync_event = event_id;
wait_cmd->event_wait.sync_event_addr = event_addr;
};
CQPrefetchCmd *wait_cmd_dst = this->reserve_space<CQPrefetchCmd *>(increment_sizeB);
if constexpr (hugepage_write) {
alignas(MEMCPY_ALIGNMENT) CQPrefetchCmd wait_cmd;
initialize_wait_cmd(&wait_cmd);
this->memcpy(wait_cmd_dst, &wait_cmd, sizeof(CQPrefetchCmd));
} else {
initialize_wait_cmd(wait_cmd_dst);
}
}

// Appends a CQ_PREFETCH_CMD_RELAY_INLINE prefetch command immediately followed
// by a CQ_DISPATCH_CMD_REMOTE_WRITE dispatch command carrying an inlined
// 32-bit value.
//   data        - stored in write_from_remote.data
//   noc_xy_addr - stored in write_from_remote.noc_xy_addr
//   addr        - stored in write_from_remote.addr
void add_dispatch_write_remote(uint32_t data, uint32_t noc_xy_addr, uint32_t addr) {
    // Fills in both halves of the sequence: the relay header and the dispatch write.
    auto initialize_cross_prefetch_write = [&](CQPrefetchCmd *relay_write, CQDispatchCmd *write_cmd) {
        relay_write->base.cmd_id = CQ_PREFETCH_CMD_RELAY_INLINE;
        relay_write->relay_inline.length = sizeof(CQDispatchCmd);
        relay_write->relay_inline.stride = CQ_PREFETCH_CMD_BARE_MIN_SIZE;
        write_cmd->base.cmd_id = CQ_DISPATCH_CMD_REMOTE_WRITE;
        write_cmd->write_from_remote.data = data;
        write_cmd->write_from_remote.noc_xy_addr = noc_xy_addr;
        write_cmd->write_from_remote.addr = addr;
    };
    CQPrefetchCmd *relay_write_dst = this->reserve_space<CQPrefetchCmd *>(sizeof(CQPrefetchCmd));
    CQDispatchCmd *write_cmd_dst = this->reserve_space<CQDispatchCmd *>(sizeof(CQDispatchCmd));

    if constexpr (hugepage_write) {
        // Stage both commands in aligned locals, then copy them out with this->memcpy.
        alignas(MEMCPY_ALIGNMENT) CQPrefetchCmd relay_write;
        alignas(MEMCPY_ALIGNMENT) CQDispatchCmd write_cmd;
        initialize_cross_prefetch_write(&relay_write, &write_cmd);
        this->memcpy(relay_write_dst, &relay_write, sizeof(CQPrefetchCmd));
        this->memcpy(write_cmd_dst, &write_cmd, sizeof(CQDispatchCmd));
    } else {
        // Both destinations must be passed: the initializer takes the relay header
        // and the dispatch command. The previous single-argument call did not match
        // the lambda's signature and would have left the relay header unwritten.
        initialize_cross_prefetch_write(relay_write_dst, write_cmd_dst);
    }
}

void add_prefetch_relay_linear(uint32_t noc_xy_addr, uint32_t lengthB, uint32_t addr) {
uint32_t increment_sizeB = align(sizeof(CQPrefetchCmd), this->pcie_alignment);
auto initialize_relay_linear_cmd = [&](CQPrefetchCmd *relay_linear_cmd) {
Expand Down
19 changes: 0 additions & 19 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,12 +218,6 @@ void process_write_host_h(uint32_t& block_noc_writes_to_clear, uint32_t block_ne
cmd_ptr = data_ptr;
}

inline void process_remote_write_h() {
volatile tt_l1_ptr CQDispatchCmd *cmd = (volatile tt_l1_ptr CQDispatchCmd *)cmd_ptr;
noc_inline_dw_write(get_noc_addr_helper(cmd->write_from_remote.noc_xy_addr, cmd->write_from_remote.addr), cmd->write_from_remote.data);
cmd_ptr += sizeof(CQDispatchCmd);
}

void process_exec_buf_end_h() {
if (split_prefetch) {
invalidate_l1_cache();
Expand Down Expand Up @@ -884,15 +878,6 @@ static inline bool process_cmd_d(uint32_t &cmd_ptr, uint32_t* l1_cache, uint32_t
}
break;

case CQ_DISPATCH_CMD_REMOTE_WRITE:
DPRINT << "cmd_remote_write\n";
if (is_d_variant && !is_h_variant) {
// Relay write to dispatch_h, which will issue it on local chip
relay_to_next_cb<split_dispatch_page_preamble_size>(cmd_ptr, sizeof(CQDispatchCmd), block_noc_writes_to_clear, block_next_start_addr);
}
cmd_ptr += sizeof(CQDispatchCmd);
break;

case CQ_DISPATCH_CMD_SET_WRITE_OFFSET:
DPRINT << "write offset: " <<
cmd->set_write_offset.offset0 << " " <<
Expand Down Expand Up @@ -948,10 +933,6 @@ static inline bool process_cmd_h(uint32_t &cmd_ptr, uint32_t& block_noc_writes_t
DPRINT << "dispatch_h exec_buf_end\n";
process_exec_buf_end_h();
break;
case CQ_DISPATCH_CMD_REMOTE_WRITE:
DPRINT << "cmd_remote_write\n";
process_remote_write_h();
break;
case CQ_DISPATCH_CMD_TERMINATE:
DPRINT << "dispatch_h terminate\n";
cmd_ptr += sizeof(CQDispatchCmd);
Expand Down
19 changes: 0 additions & 19 deletions tt_metal/impl/dispatch/kernels/cq_prefetch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -531,16 +531,6 @@ uint32_t process_relay_paged_cmd_large(uint32_t cmd_ptr,
return CQ_PREFETCH_CMD_BARE_MIN_SIZE;
}

// Handles a wait-for-event command on prefetch_h: busy-waits until the 32-bit
// value at the command's sync_event_addr is no longer less than sync_event.
//   cmd_ptr - address of the CQPrefetchHToPrefetchDHeader that precedes the command
// Returns the number of bytes the command occupies, header included.
inline uint32_t process_prefetch_h_wait_cmd(uint32_t cmd_ptr) {
// The prefetch command itself sits right after the prefetch_h -> prefetch_d header.
volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)(cmd_ptr + sizeof(CQPrefetchHToPrefetchDHeader));
volatile tt_l1_ptr uint32_t* event_addr =
reinterpret_cast<volatile tt_l1_ptr uint32_t*>(cmd->event_wait.sync_event_addr);
// Invalidate the L1 cache before every poll (presumably so an update written by
// another core is observed rather than a stale cached value — TODO confirm).
do {
invalidate_l1_cache();
} while (*event_addr < cmd->event_wait.sync_event);
return CQ_PREFETCH_CMD_BARE_MIN_SIZE + sizeof(CQPrefetchHToPrefetchDHeader);
}

// This fn prefetches data from DRAM memory and writes data to the dispatch core.
// Reading from DRAM has the following characteristics:
// - latency is moderately high ~400 cycles on WH
Expand Down Expand Up @@ -1181,10 +1171,6 @@ bool process_cmd(uint32_t& cmd_ptr,
stride = process_debug_cmd(cmd_ptr);
break;

case CQ_PREFETCH_CMD_WAIT_FOR_EVENT:
stride = CQ_PREFETCH_CMD_BARE_MIN_SIZE;
break;

case CQ_PREFETCH_CMD_TERMINATE:
//DPRINT << "prefetch terminating_" << is_h_variant << is_d_variant << ENDL();
ASSERT(!exec_buf);
Expand Down Expand Up @@ -1319,11 +1305,6 @@ void kernel_main_h() {

volatile CQPrefetchCmd tt_l1_ptr *cmd = (volatile CQPrefetchCmd tt_l1_ptr *)(cmd_ptr + sizeof(CQPrefetchHToPrefetchDHeader));
uint32_t cmd_id = cmd->base.cmd_id;
if (cmd_id == CQ_PREFETCH_CMD_WAIT_FOR_EVENT) {
// prefetch_h will stop execution until it receives an event update from the
// dispatch_d core assigned to the other CQ
uint32_t stride = process_prefetch_h_wait_cmd(cmd_ptr);
}
// Infer that an exec_buf command is to be executed based on the stall state.
bool is_exec_buf = (stall_state == STALLED);
cmd_ptr = process_relay_inline_all(cmd_ptr, fence, is_exec_buf);
Expand Down

0 comments on commit e1f2f08

Please sign in to comment.