Skip to content

Commit

Permalink
#0: Add write barrier before mcasts in dispatcher
Browse files Browse the repository at this point in the history
  • Loading branch information
jbaumanTT committed Dec 16, 2024
1 parent 923b2bb commit 61ce70d
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 4 deletions.
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3047,7 +3047,7 @@ void HWCommandQueue::reset_config_buffer_mgr(const uint32_t num_entries) {
// Subtract 1 from the number of entries, so the watcher can read information (e.g. fired asserts) from the
// previous launch message.
// TODO(jbauman): Give correct number once async bug is fixed.
this->config_buffer_mgr[i].init_add_buffer(0, 1);
this->config_buffer_mgr[i].init_add_buffer(0, 3);
}
}

Expand Down
47 changes: 44 additions & 3 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,13 @@ void process_write_packed(
// dst_addr << " " << ENDL();
uint32_t writes = 0;
uint32_t mcasts = 0;
// Flushes the locally batched write bookkeeping, then blocks on a NOC write barrier.
// `writes` and `mcasts` are captured by reference: their current values are folded
// into the per-NOC issued/acked counters and then zeroed, so each call accounts
// only for the writes recorded since the previous call.
// NOTE(review): presumably noc_async_write_barrier() compares these software
// counters against hardware status, which is why they are updated before the
// barrier is entered — confirm against the dataflow API.
auto wait_for_barrier = [&]() {
// Publish the pending unicast/multicast bookkeeping for this NOC.
noc_nonposted_writes_num_issued[noc_index] += writes;
noc_nonposted_writes_acked[noc_index] += mcasts;
// Reset the local tallies so they are not counted a second time.
writes = 0;
mcasts = 0;
noc_async_write_barrier();
};
WritePackedSubCmd* sub_cmd_ptr = (WritePackedSubCmd*)l1_cache;
while (count != 0) {
uint32_t dst_noc = sub_cmd_ptr->noc_xy_addr;
Expand All @@ -587,6 +594,7 @@ void process_write_packed(
if (cb_fence == block_next_start_addr[rd_block_idx]) {
orphan_size = cb_fence - data_ptr;
if (orphan_size != 0) {
wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SNdL>(data_ptr, dst, orphan_size, num_dests);
writes++;
mcasts += num_dests;
Expand Down Expand Up @@ -620,6 +628,7 @@ void process_write_packed(
uint32_t remainder_xfer_size = xfer_size - orphan_size;
// Creating full NOC addr not needed as we are not programming the noc coords
uint32_t remainder_dst_addr = dst_addr + orphan_size;
wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SnDL>(
data_ptr, remainder_dst_addr, remainder_xfer_size, num_dests);
// Reset values expected below
Expand All @@ -634,6 +643,7 @@ void process_write_packed(
}
}

wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SNdl>(data_ptr, dst, xfer_size, num_dests);
writes++;
mcasts += num_dests;
Expand Down Expand Up @@ -690,6 +700,14 @@ void process_write_packed_large(
uint32_t num_dests = sub_cmd_ptr->num_mcast_dests;
uint32_t pad_size = align(length, alignment) - length;
uint32_t unlink = sub_cmd_ptr->flags & CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;
// Flushes the locally batched write bookkeeping, then blocks on a NOC write barrier.
// `writes`, `mcasts` and `num_dests` are captured by reference. Each pending write
// contributes `num_dests` to `mcasts`.
// NOTE(review): unlike the issued counter, the acked counter is ASSIGNED from
// `mcasts` (not incremented) and `mcasts` is not reset here. Presumably `mcasts`
// is a running absolute count seeded from noc_nonposted_writes_acked earlier in
// the enclosing function (not visible in this hunk) — confirm.
auto wait_for_barrier = [&]() {
// Publish the number of writes issued since the last call.
noc_nonposted_writes_num_issued[noc_index] += writes;

// Accumulate num_dests per pending write into the running total, then store that total.
mcasts += num_dests * writes;
noc_nonposted_writes_acked[noc_index] = mcasts;
// Only `writes` is cleared; `mcasts` keeps accumulating across calls.
writes = 0;
noc_async_write_barrier();
};

// Only re-init state after we have unlinked the last transaction
// Otherwise we assume NOC coord hasn't changed
Expand Down Expand Up @@ -731,22 +749,45 @@ void process_write_packed_large(
uint32_t xfer_size;
if (length > available_data) {
xfer_size = available_data;
wait_for_barrier();
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
writes++;
} else {
xfer_size = length;
if (unlink) {
uint32_t rem_xfer_size =
cq_noc_async_write_with_state_any_len<false>(data_ptr, dst_addr, xfer_size, num_dests);
uint32_t rem_xfer_size = xfer_size;
if (rem_xfer_size > NOC_MAX_BURST_SIZE) {
uint32_t src_addr = data_ptr;
uint32_t dst_addr2 = dst_addr;
wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SnDL>(src_addr, dst_addr2, NOC_MAX_BURST_SIZE, num_dests);
writes++;
src_addr += NOC_MAX_BURST_SIZE;
dst_addr2 += NOC_MAX_BURST_SIZE;
rem_xfer_size -= NOC_MAX_BURST_SIZE;
while (rem_xfer_size > NOC_MAX_BURST_SIZE) {
wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SnDl>(
src_addr, dst_addr2, NOC_MAX_BURST_SIZE, num_dests);
writes++;
src_addr += NOC_MAX_BURST_SIZE;
dst_addr2 += NOC_MAX_BURST_SIZE;
rem_xfer_size -= NOC_MAX_BURST_SIZE;
}
}
// Unset Link flag
cq_noc_async_write_init_state<CQ_NOC_sndl, true, false>(0, 0, 0);
uint32_t data_offset = xfer_size - rem_xfer_size;
wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SnDL, CQ_NOC_wait>(
data_ptr + data_offset, dst_addr + data_offset, rem_xfer_size, num_dests);
writes++;
} else {
wait_for_barrier();
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
writes++;
}
}
writes += div_up(xfer_size, NOC_MAX_BURST_SIZE);
length -= xfer_size;
data_ptr += xfer_size;
dst_addr += xfer_size;
Expand Down

0 comments on commit 61ce70d

Please sign in to comment.