#13482: Resolve 2CQ Trace Hangs on TG (#13484)
#13482: Resolve incorrect dispatch_h -> prefetch_h bookkeeping for 2 CQ Trace on TG

  - On TG, mux buffers are identically sized for 1 and 2 CQs (Tensix
    dispatch allows for this due to the larger L1)
  - Prefetch_h accounted for this, but Dispatch_h assumed that the Mux
    buffers (and therefore the initial semaphore value for Prefetch_h)
    were half the size for 2 CQs vs 1 CQ (see the sketch after this list)
  - This caused 2 CQ trace, which requires handshaking between
    Prefetch_h and Dispatch_h, to hang
  - Add a 2 CQ Trace Test on TG
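
A minimal sketch of the bookkeeping mismatch, for illustration only: the struct, helper function, and page counts below are hypothetical stand-ins, not the actual tt-metal types (the real fix simply reads mux_settings.cb_pages instead of the per-CQ mux_buffer_pages value, as shown in the device.cpp diff below).

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the mux circular-buffer settings; on TG the mux
// CB is the same size whether 1 or 2 HW command queues are configured.
struct MuxSettings {
    uint32_t cb_pages;
};

// Pre-fix behaviour (illustrative): page budget split per command queue.
uint32_t mux_buffer_pages_per_cq(uint32_t total_pages, uint32_t num_hw_cqs) {
    return total_pages / num_hw_cqs;
}

int main() {
    const uint32_t num_hw_cqs = 2;
    MuxSettings mux_settings{512};  // 512 pages is a made-up value

    // Prefetch_h sizes its credit pool from the real CB capacity ...
    uint32_t prefetch_h_credits = mux_settings.cb_pages;
    // ... but Dispatch_h used the halved per-CQ figure, so the initial
    // semaphore value never matched and the 2 CQ trace handshake hung.
    uint32_t dispatch_h_view = mux_buffer_pages_per_cq(mux_settings.cb_pages, num_hw_cqs);
    assert(dispatch_h_view != prefetch_h_credits);

    // Post-fix: Dispatch_h reads the same mux settings, so both sides agree.
    dispatch_h_view = mux_settings.cb_pages;
    assert(dispatch_h_view == prefetch_h_credits);
    return 0;
}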
tt-asaigal authored Oct 4, 2024
1 parent a88a3da commit 84e73a4
Showing 2 changed files with 6 additions and 5 deletions.
8 changes: 4 additions & 4 deletions tests/ttnn/unit_tests/test_multi_device_trace_TG.py
@@ -20,8 +20,8 @@
 )
 @pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
 @pytest.mark.parametrize("enable_async", [True])
-@pytest.mark.parametrize("enable_multi_cq", [False]) # To be toggled when Galaxy supports Multi-CQ
-@pytest.mark.parametrize("device_params", [{"trace_region_size": 60000}], indirect=True)
+@pytest.mark.parametrize("enable_multi_cq", [True, False])
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 60000, "num_command_queues": 2}], indirect=True)
 def test_multi_device_single_trace(mesh_device, shape, enable_async, enable_multi_cq):
     if mesh_device.get_num_devices() < 32:
         pytest.skip("Test is only valid on Galaxy")
@@ -120,8 +120,8 @@ def event_sync(event, record_cq, wait_cq):
 )
 @pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
 @pytest.mark.parametrize("enable_async", [True])
-@pytest.mark.parametrize("enable_multi_cq", [False]) # To be toggled when Galaxy supports Multi-CQ
-@pytest.mark.parametrize("device_params", [{"trace_region_size": 200000}], indirect=True)
+@pytest.mark.parametrize("enable_multi_cq", [True, False])
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 200000, "num_command_queues": 2}], indirect=True)
 def test_multi_device_multi_trace(mesh_device, shape, enable_async, enable_multi_cq):
     torch.manual_seed(0)
     if mesh_device.get_num_devices() < 32:
3 changes: 2 additions & 1 deletion tt_metal/impl/device/device.cpp
@@ -1195,6 +1195,7 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
 TT_ASSERT(device_worker_variants[DispatchWorkerType::DEMUX].size() == 3, "Insufficient Demux cores. Expected = 3. Found = {}", device_worker_variants[DispatchWorkerType::DEMUX].size());
 uint32_t dispatch_idx = 0;
 uint32_t demux_fanout = num_dispatchers / 2;
+auto mux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::MUX][0]);
 for (int i = 1; i < 3; i++) {
 auto demux_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DEMUX][i]);
 TT_ASSERT(demux_fanout == demux_settings.semaphores.size(), "Demux does not have required number of semaphores for Dispatchers. Exptected = {}. Found = {}", num_dispatchers / 2, demux_settings.semaphores.size());
@@ -1229,7 +1230,7 @@ void Device::update_workers_build_settings(std::vector<std::vector<std::tuple<tt
 compile_args[15] = true, // split_prefetcher
 compile_args[16] = NOC_XY_ENCODING(prefetch_physical_core.x, prefetch_physical_core.y),
 compile_args[17] = prefetch_h_settings.producer_semaphore_id, // sem_id on prefetch_h that dispatch_d is meant to increment, to resume sending of cmds post exec_buf stall
-compile_args[18] = dispatch_constants::get(dispatch_core_type).mux_buffer_pages(num_hw_cqs), // XXXX should this be mux pages?
+compile_args[18] = mux_settings.cb_pages,
 compile_args[19] = settings.num_compute_cores;
 compile_args[20] = 0; // unused: dispatch_d only
 compile_args[21] = 0; // unused: dispatch_d only
