Skip to content

Commit

Permalink
#15018: Re-enable async dispatch and workaround NOC hang
Browse files Browse the repository at this point in the history
When async dispatch was previously enabled, we hit a hang in LLAMA that seems
to occur when reserving the path for an mcast while a previous mcast is still
active. To work around this, force a write barrier before any mcast that isn't
linked to an earlier mcast. The dispatcher is the only core that sends mcasts
over VC 5, and even with multiple CQs only one can be mcasting programs at a
time.
  • Loading branch information
jbaumanTT committed Dec 18, 2024
1 parent d5817c6 commit 4bfb135
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 10 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
2.69, 2.69, 2.95, 3.10, 3.34, 3.98, 4.02, 4.90, 3.48, 5.54, 3.38, 4.37, 5.48, 11.21, 4.02, 4.02, 12.97, 12.29, 96.13, 107.32, 4.14, 15.38, 114.60,
2.70, 2.70, 2.96, 3.12, 3.37, 4.01, 4.05, 4.93, 3.54, 5.55, 3.39, 4.40, 5.55, 11.26, 4.04, 4.03, 13.02, 12.45, 96.91, 108.26, 4.14, 15.37, 114.52,
2.71, 2.71, 3.04, 3.24, 3.43, 4.09, 4.17, 5.03, 3.70, 5.58, 3.42, 4.48, 5.73, 11.44, 4.06, 4.12, 13.19, 13.00, 99.35, 111.19, 4.14, 15.39, 118.12,
2.77, 2.77, 3.23, 3.48, 3.69, 4.28, 4.25, 5.16, 3.99, 5.63, 3.46, 4.57, 5.94, 11.70, 4.11, 4.09, 13.40, 14.23, 102.77, 115.28, 4.14, 16.30, 122.22,
2.95, 2.95, 3.66, 4.06, 4.50, 4.91, 4.82, 5.75, 4.69, 5.84, 3.53, 5.01, 6.68, 12.44, 4.34, 4.30, 14.16, 17.92, 118.69, 133.56, 4.14, 20.50, 134.55,
3.26, 3.28, 4.58, 5.21, 6.08, 6.09, 6.46, 7.19, 6.21, 6.15, 3.80, 7.63, 8.22, 14.02, 4.63, 4.64, 15.70, 29.68, 152.22, 177.60,
3.56, 3.63, 5.47, 6.42, 7.71, 7.70,
2.70, 2.70, 2.92, 3.02, 3.15, 3.18, 3.50, 3.53, 3.46, 5.90, 4.07, 4.45, 5.69, 12.25, 4.05, 4.15, 11.55, 12.29, 74.37, 85.31, 4.13, 14.16, 109.50,
2.71, 2.71, 2.93, 3.05, 3.23, 3.23, 3.55, 3.56, 3.49, 5.91, 4.08, 4.46, 5.75, 12.28, 4.05, 4.16, 11.60, 12.38, 74.64, 85.66, 4.13, 14.23, 109.89,
2.71, 2.71, 3.02, 3.23, 3.39, 3.39, 3.72, 3.81, 3.67, 5.93, 4.10, 4.51, 6.04, 12.48, 4.06, 4.18, 11.78, 12.55, 75.12, 86.29, 4.13, 14.46, 110.95,
2.78, 2.78, 3.23, 3.40, 3.61, 3.63, 3.96, 3.94, 3.88, 6.00, 4.13, 4.64, 6.18, 12.73, 4.11, 4.21, 12.03, 13.06, 76.39, 87.99, 4.13, 15.40, 114.50,
3.00, 3.00, 3.66, 3.98, 4.32, 4.35, 4.66, 4.69, 4.64, 6.22, 4.21, 5.05, 6.78, 13.41, 4.32, 4.35, 12.73, 17.65, 82.46, 95.97, 4.13, 19.60, 122.75,
3.29, 3.29, 4.51, 5.12, 5.79, 5.79, 6.27, 7.16, 6.02, 6.49, 4.34, 7.78, 8.28, 14.76, 4.64, 4.59, 16.45, 29.33, 151.60, 176.73,
3.57, 3.57, 5.40, 6.27, 7.17, 7.21,
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ for arg in "$@"; do
esac
done

set -x

# brisc only
echo "###" brisc only
build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch -w 5000 -s 256 -n -t $trace_option $eth_dispatch_option
Expand Down
3 changes: 1 addition & 2 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3056,8 +3056,7 @@ void HWCommandQueue::reset_config_buffer_mgr(const uint32_t num_entries) {
}
// Subtract 1 from the number of entries, so the watcher can read information (e.g. fired asserts) from the
// previous launch message.
// TODO(jbauman): Give correct number once async bug is fixed.
this->config_buffer_mgr[i].init_add_buffer(0, 1);
this->config_buffer_mgr[i].init_add_buffer(0, launch_msg_buffer_num_entries - 1);
}
}

Expand Down
38 changes: 37 additions & 1 deletion tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,18 @@ void process_write_packed(
// dst_addr << " " << ENDL();
uint32_t writes = 0;
uint32_t mcasts = 0;
// Helper invoked right before each write issued at the call sites visible in
// this function. When `mcast` is set, it folds the locally batched `writes` /
// `mcasts` tallies into the per-NOC software counters, clears the tallies, and
// then blocks on a NOC write barrier. When `mcast` is false it does nothing.
auto wait_for_barrier = [&]() {
if (!mcast) {
// Nothing to flush or wait for when `mcast` is false.
return;
}
// Publish the pending counts before waiting, then zero the local tallies
// (presumably so a later flush of `writes`/`mcasts` does not count them again).
// NOTE(review): assumes noc_async_write_barrier() waits on these counters — confirm.
noc_nonposted_writes_num_issued[noc_index] += writes;
noc_nonposted_writes_acked[noc_index] += mcasts;
writes = 0;
mcasts = 0;
// Work around mcast path reservation hangs by always waiting for a write
// barrier before doing an mcast that isn't linked to a previous mcast.
noc_async_write_barrier();
};
WritePackedSubCmd* sub_cmd_ptr = (WritePackedSubCmd*)l1_cache;
while (count != 0) {
uint32_t dst_noc = sub_cmd_ptr->noc_xy_addr;
Expand All @@ -587,6 +599,7 @@ void process_write_packed(
if (cb_fence == block_next_start_addr[rd_block_idx]) {
orphan_size = cb_fence - data_ptr;
if (orphan_size != 0) {
wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SNdL>(data_ptr, dst, orphan_size, num_dests);
writes++;
mcasts += num_dests;
Expand Down Expand Up @@ -620,6 +633,7 @@ void process_write_packed(
uint32_t remainder_xfer_size = xfer_size - orphan_size;
// Creating full NOC addr not needed as we are not programming the noc coords
uint32_t remainder_dst_addr = dst_addr + orphan_size;
wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SnDL>(
data_ptr, remainder_dst_addr, remainder_xfer_size, num_dests);
// Reset values expected below
Expand All @@ -634,6 +648,7 @@ void process_write_packed(
}
}

wait_for_barrier();
cq_noc_async_write_with_state<CQ_NOC_SNdl>(data_ptr, dst, xfer_size, num_dests);
writes++;
mcasts += num_dests;
Expand Down Expand Up @@ -684,21 +699,35 @@ void process_write_packed_large(
CQDispatchWritePackedLargeSubCmd* sub_cmd_ptr = (CQDispatchWritePackedLargeSubCmd*)l1_cache;

bool init_state = true;
bool must_barrier = true;
while (count != 0) {
uint32_t dst_addr = sub_cmd_ptr->addr + local_write_offset;
uint32_t length = sub_cmd_ptr->length;
uint32_t num_dests = sub_cmd_ptr->num_mcast_dests;
uint32_t pad_size = align(length, alignment) - length;
uint32_t unlink = sub_cmd_ptr->flags & CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;
// Helper invoked right before each write issued at the call sites visible in
// this loop body. When `must_barrier` is set, it folds the pending `writes`
// count into the per-NOC software counters and then blocks on a NOC write
// barrier. When `must_barrier` is false it does nothing. The lambda does not
// clear `must_barrier` itself; the visible call sites set it to false after
// writes that do not unlink, and it is set back to true when `init_state`
// triggers a state re-init.
auto wait_for_barrier = [&]() {
if (!must_barrier) {
// Skip the barrier when `must_barrier` has been cleared.
return;
}
// Publish the pending write count before waiting.
noc_nonposted_writes_num_issued[noc_index] += writes;

// Each pending write is counted once per destination of the current
// sub-command (`num_dests` is captured by reference). The counter is
// overwritten with `mcasts` rather than incremented.
// NOTE(review): assumes `mcasts` holds the running acked total (seeded
// outside the visible code) — confirm against the rest of the function.
mcasts += num_dests * writes;
noc_nonposted_writes_acked[noc_index] = mcasts;
writes = 0;
// Work around mcast path reservation hangs by always waiting for a write
// barrier before doing an mcast that isn't linked to a previous mcast.
noc_async_write_barrier();
};

// Only re-init state after we have unlinked the last transaction
// Otherwise we assume NOC coord hasn't changed
// TODO: If we are able to send 0 length txn to unset link, we don't need a flag and can compare dst_noc to prev
// to determine linking
if (init_state) {
uint32_t dst_noc = sub_cmd_ptr->noc_xy_addr;
// TODO: Linking should be set to true once atomic txn is handled properly
cq_noc_async_write_init_state<CQ_NOC_sNdl, true, true>(0, get_noc_addr_helper(dst_noc, dst_addr));
must_barrier = true;
}

sub_cmd_ptr++;
Expand Down Expand Up @@ -731,19 +760,26 @@ void process_write_packed_large(
uint32_t xfer_size;
if (length > available_data) {
xfer_size = available_data;
wait_for_barrier();
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
must_barrier = false;
} else {
xfer_size = length;
if (unlink) {
wait_for_barrier();
uint32_t rem_xfer_size =
cq_noc_async_write_with_state_any_len<false>(data_ptr, dst_addr, xfer_size, num_dests);
// Unset Link flag
cq_noc_async_write_init_state<CQ_NOC_sndl, true, false>(0, 0, 0);
uint32_t data_offset = xfer_size - rem_xfer_size;
cq_noc_async_write_with_state<CQ_NOC_SnDL, CQ_NOC_wait>(
data_ptr + data_offset, dst_addr + data_offset, rem_xfer_size, num_dests);
// Later writes must barrier, but the `must_barrier = true` in the `if (init_state)` block above
// will see to that.
} else {
wait_for_barrier();
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
must_barrier = false;
}
}
writes += div_up(xfer_size, NOC_MAX_BURST_SIZE);
Expand Down

0 comments on commit 4bfb135

Please sign in to comment.