diff --git a/.github/workflows/all-static-checks.yaml b/.github/workflows/all-static-checks.yaml index a9f1fb939916..785fe2c6e573 100644 --- a/.github/workflows/all-static-checks.yaml +++ b/.github/workflows/all-static-checks.yaml @@ -58,7 +58,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Check kernel count in base metal is less than maximum - run: if (( $(find tt_metal/kernels/ -type f | wc -l) > 7 )); then exit 1; fi + run: if (( $(find tt_metal/kernels/ -type f | wc -l) > 8 )); then exit 1; fi check-doc: runs-on: ubuntu-latest steps: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 946c69f3720b..e6b3a690fc2f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -341,13 +341,13 @@ Breakpoint 1, tt::tt_metal::Device::Device (this=0x3c, device_id=21845, num_hw_c TT_METAL_WATCHER=10 ./your_program ... Always | WARNING | Watcher detected NOC error and stopped device: bad alignment in NOC transaction. - Always | WARNING | Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010] + Always | WARNING | Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010] Always | INFO | Last waypoint: NARW, W, W, W, W Always | INFO | While running kernels: Always | INFO | brisc : tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp Always | INFO | ncrisc: blank Always | INFO | triscs: blank - Test | INFO | Reported error: Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010] + Test | INFO | Reported error: Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): brisc using noc0 tried to access DRAM core w/ physical coords (x=0,y=11) 
DRAM[addr=0x00003820,len=102400], misaligned with local L1[addr=0x00064010] Always | FATAL | Watcher detected NOC error and stopped device: bad alignment in NOC transaction. ``` - If no such error is reported, but the program is hanging, check the watcher log generated in `generated/watcher/watcher.log`. There is a legend at the top of the log showing how to interpret it, and a sample portion of a log is shown below: @@ -371,22 +371,22 @@ Legend: k_ids:|| (ID map to file at end of section) ... Dump #7 at 8.992s -Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 1,y= 0) phys(x= 2,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 2,y= 0) phys(x= 3,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 3,y= 0) phys(x= 4,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 4,y= 0) phys(x= 6,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 5,y= 0) phys(x= 7,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 6,y= 0) phys(x= 8,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 7,y= 0) phys(x= 9,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 -Device 0 worker core(x= 0,y= 7) phys(x= 1,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0 -Device 0 worker core(x= 1,y= 7) phys(x= 2,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0 -Device 0 worker core(x= 2,y= 7) phys(x= 3,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|3|0 -Device 0 worker core(x= 3,y= 7) phys(x= 4,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|7|0 -Device 0 worker core(x= 4,y= 7) phys(x= 6,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:4|0|0 -Device 0 worker core(x= 5,y= 7) phys(x= 7,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:6|0|0 -Device 0 worker core(x= 6,y= 7) phys(x= 
8,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0 -Device 0 worker core(x= 7,y= 7) phys(x= 9,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0 +Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 1,y= 0) virtual(x= 2,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 2,y= 0) virtual(x= 3,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 3,y= 0) virtual(x= 4,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 4,y= 0) virtual(x= 6,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 5,y= 0) virtual(x= 7,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 6,y= 0) virtual(x= 8,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 7,y= 0) virtual(x= 9,y= 1): GW, W, W, W, W rmsg:D0D|BNT smsg:DDDD k_ids:14|13|15 +Device 0 worker core(x= 0,y= 7) virtual(x= 1,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|2|0 +Device 0 worker core(x= 1,y= 7) virtual(x= 2,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|1|0 +Device 0 worker core(x= 2,y= 7) virtual(x= 3,y=10): NTW, HQW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|3|0 +Device 0 worker core(x= 3,y= 7) virtual(x= 4,y=10): NTW,UAPW, W, W, W rmsg:H1G|bNt smsg:GDDD k_ids:0|7|0 +Device 0 worker core(x= 4,y= 7) virtual(x= 6,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:4|0|0 +Device 0 worker core(x= 5,y= 7) virtual(x= 7,y=10): NABD, W, W, W, W rmsg:H0G|Bnt smsg:DDDD k_ids:6|0|0 +Device 0 worker core(x= 6,y= 7) virtual(x= 8,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0 +Device 0 worker core(x= 7,y= 7) virtual(x= 9,y=10): GW, W, W, W, W rmsg:H0D|bnt smsg:DDDD k_ids:0|0|0 k_id[0]: blank k_id[1]: tt_metal/impl/dispatch/kernels/cq_prefetch.cpp k_id[2]: tt_metal/impl/dispatch/kernels/cq_dispatch.cpp diff --git 
a/docs/source/tt-metalium/tools/watcher.rst b/docs/source/tt-metalium/tools/watcher.rst index 9962f20b1f9d..3dd71f3c0fdb 100644 --- a/docs/source/tt-metalium/tools/watcher.rst +++ b/docs/source/tt-metalium/tools/watcher.rst @@ -217,7 +217,7 @@ per RISC in the log. If a stack overflow is detected, the core will hang and an .. code-block:: - Device 0 worker core(x= 0,y= 0) phys(x= 1,y= 1): GW, W, W, W, W rmsg:D1D|BNt smsg:DDDD k_ids:11|10|0 + Device 0 worker core(x= 0,y= 0) virtual(x= 1,y= 1): GW, W, W, W, W rmsg:D1D|BNt smsg:DDDD k_ids:11|10|0 brisc stack usage: 228/768, kernel using most stack: ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/dataflow/reader_mcast_sender_unary_sharded_gn_v2.cpp ncrisc stack usage: 192/768, kernel using most stack: ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reader_unary_sharded_blocks_interleaved_start_id.cpp trisc0 stack usage: 252/320, kernel using most stack: ttnn/cpp/ttnn/operations/normalization/groupnorm/device/kernels/compute/groupnorm_sharded_v2.cpp diff --git a/docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst b/docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst index e1c606211033..52f288a48e49 100644 --- a/docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst +++ b/docs/source/tt-metalium/tt_metal/examples/dram_loopback.rst @@ -112,6 +112,9 @@ Let's make the input and output DRAM buffers. 
Buffer output_dram_buffer = CreateBuffer(dram_config); const uint32_t output_dram_buffer_addr = output_dram_buffer.address(); + const uint32_t input_bank_id = 0; + const uint32_t output_bank_id = 0; + Sending real data into DRAM --------------------------- @@ -134,11 +137,9 @@ Setting runtime arguments for the data movement kernel const std::vector runtime_args = { l1_buffer.address(), input_dram_buffer.address(), - static_cast(input_dram_buffer.noc_coordinates().x), - static_cast(input_dram_buffer.noc_coordinates().y), + input_bank_id, output_dram_buffer.address(), - static_cast(output_dram_buffer.noc_coordinates().x), - static_cast(output_dram_buffer.noc_coordinates().y), + output_bank_id, l1_buffer.size() }; diff --git a/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst b/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst index 8749e64a4420..4a4bba21b9c9 100644 --- a/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst +++ b/docs/source/tt-metalium/tt_metal/examples/eltwise_sfpu.rst @@ -100,8 +100,7 @@ Extra runtime arguments for reader/writer core, { dst_dram_buffer.address(), - static_cast(dst_dram_buffer.noc_coordinates().x), - static_cast(dst_dram_buffer.noc_coordinates().y), + dst_bank_id, num_tiles } ); diff --git a/tech_reports/prog_examples/add_2_integers_in_compute/Tutorial_Add_Two_Integers_in_a_Compute_Kernel.md b/tech_reports/prog_examples/add_2_integers_in_compute/Tutorial_Add_Two_Integers_in_a_Compute_Kernel.md index f580ed3f39dd..df489cf3107a 100644 --- a/tech_reports/prog_examples/add_2_integers_in_compute/Tutorial_Add_Two_Integers_in_a_Compute_Kernel.md +++ b/tech_reports/prog_examples/add_2_integers_in_compute/Tutorial_Add_Two_Integers_in_a_Compute_Kernel.md @@ -27,27 +27,18 @@ tt_metal::InterleavedBufferConfig dram_config{ .page_size = single_tile_size, .buffer_type = tt_metal::BufferType::DRAM }; +uint32_t src0_bank_id = 0; +uint32_t src1_bank_id = 0; +uint32_t dst_bank_id = 0; ``` -5. 
Define the tile size to fit BFloat16 values: +5. Allocate memory for each buffer: ```std::shared_ptr src0_dram_buffer = CreateBuffer(dram_config); std::shared_ptr src1_dram_buffer = CreateBuffer(dram_config); std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); ``` -6.Allocate memory for each buffer: -```auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates(); -auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates(); -auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates(); -uint32_t src0_dram_noc_x = src0_dram_noc_coord.x; -uint32_t src0_dram_noc_y = src0_dram_noc_coord.y; -uint32_t src1_dram_noc_x = src1_dram_noc_coord.x; -uint32_t src1_dram_noc_y = src1_dram_noc_coord.y; -uint32_t dst_dram_noc_x = dst_dram_noc_coord.x; -uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; -``` - -7. Specify NoC Coordinates: +6. Create circular buffers and assign them to the program: ```constexpr uint32_t src0_cb_index = CB::c_in0; constexpr uint32_t num_input_tiles = 1; CircularBufferConfig cb_src0_config = CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}).set_page_size(src0_cb_index, single_tile_size); @@ -63,7 +54,7 @@ CircularBufferConfig cb_output_config = CircularBufferConfig(num_output_tiles * CBHandle cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); ``` -8. Create a data movement kernel: +7. Create a data movement kernel: ```KernelHandle binary_reader_kernel_id = CreateKernel( program, "tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp", @@ -77,7 +68,7 @@ KernelHandle unary_writer_kernel_id = CreateKernel( DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); ``` -9. Create a compute kernel: +8. 
Create a compute kernel: ```vector<uint32_t> compute_kernel_args = {}; KernelHandle eltwise_binary_kernel_id = CreateKernel( program, @@ -92,7 +83,7 @@ KernelHandle eltwise_binary_kernel_id = CreateKernel( ); ``` -10. Create two source vectors: +9. Create two source vectors: ```std::vector<bfloat16> src0_vec; std::vector<bfloat16> src1_vec; src0_vec = create_constant_vector_of_bfloat16(single_tile_size, 14.0f); @@ -102,8 +93,8 @@ EnqueueWriteBuffer(cq, src0_dram_buffer, src0_vec, false); EnqueueWriteBuffer(cq, src1_dram_buffer, src1_vec, false); ``` -11. Setup corresponding runtime arguments: -```SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_dram_noc_x, src0_dram_noc_y, src1_dram_noc_x, src1_dram_noc_y}); +10. Setup corresponding runtime arguments: +```SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_bank_id, src1_bank_id}); SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {}); SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_dram_noc_x, dst_dram_noc_y}); @@ -111,7 +102,7 @@ EnqueueProgram(cq, program, false); Finish(cq); ``` -12. Execute the Program: +11. Execute the Program: ```uint32_t ublock_size_bytes_0 = get_tile_size(cb_id_in0); uint32_t ublock_size_bytes_1 = get_tile_size(cb_id_in1); @@ -129,7 +120,7 @@ noc_async_read_barrier(); cb_push_back(cb_id_in1, 1); ``` -13. Unpack, compute, and pack the data: +12. Unpack, compute, and pack the data: ```binary_op_init_common(cb_in0, cb_in1, cb_out0); add_tiles_init(); @@ -153,8 +144,8 @@ cb_pop_front(cb_in1, 1); cb_push_back(cb_out0, 1); ``` -14. Write integer values to the DRAM: -```uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr); +13. 
Write integer values to the DRAM: +```uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram); constexpr uint32_t cb_id_out0 = tt::CB::c_out0; uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); @@ -166,6 +157,6 @@ noc_async_write_barrier(); cb_pop_front(cb_id_out0, 1); ``` -15. Close the device: +14. Close the device: ```CloseDevice(device); ``` diff --git a/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md b/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md index d92e7d5cd74e..0eb7ec005204 100644 --- a/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md +++ b/tech_reports/prog_examples/add_2_integers_in_compute/add_2_integers_in_compute.md @@ -47,18 +47,12 @@ std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config Next, we allocate memory for each buffer with the specified configuration for each of the input vectors and another buffer for the output vector. The source data will be sent to the corresponding DRAM buffers to be accessed by the cores, and the results of the computation will be sent to the DRAM to be read by the destination vector. ``` cpp -auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates(); -auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates(); -auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates(); -uint32_t src0_dram_noc_x = src0_dram_noc_coord.x; -uint32_t src0_dram_noc_y = src0_dram_noc_coord.y; -uint32_t src1_dram_noc_x = src1_dram_noc_coord.x; -uint32_t src1_dram_noc_y = src1_dram_noc_coord.y; -uint32_t dst_dram_noc_x = dst_dram_noc_coord.x; -uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; +uint32_t src0_bank_id = 0; +uint32_t src1_bank_id = 0; +uint32_t dst_bank_id = 0; ``` -For this example, we will also specify the NoC coordinates to pass into the kernel functions as runtime arguments. We will use this to ensure that the kernels will access the data at the correct NoC addresses. 
+For this example, we will also specify the Buffer Bank IDs to pass into the kernel functions as runtime arguments. We will use this to ensure that the kernels will access the data from the correct DRAM Memory Banks corresponding to each buffer. ``` cpp constexpr uint32_t src0_cb_index = CBIndex::c_0; @@ -129,9 +123,9 @@ EnqueueWriteBuffer(cq, src1_dram_buffer, src1_vec, false); Next, we create two source vectors, each loaded with a constant value, before queueing the command to feed it to the corresponding DRAM buffers using `EnqueueWriteBuffer`. ``` cpp -SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_dram_noc_x, src0_dram_noc_y, src1_dram_noc_x, src1_dram_noc_y}); +SetRuntimeArgs(program, binary_reader_kernel_id, core, { src0_dram_buffer->address(), src1_dram_buffer->address(), src0_bank_id, src1_bank_id}); SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {}); -SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_dram_noc_x, dst_dram_noc_y}); +SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id}); EnqueueProgram(cq, program, false); Finish(cq); @@ -192,7 +186,7 @@ In the compute kernel, a single tile is read from each of the circular buffers c ## Writer kernel function ``` cpp -uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr); +uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram); constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); diff --git a/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md b/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md index 52e5e556b1f1..bac6a4a9d161 100644 --- a/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md +++ b/tech_reports/prog_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.md @@ 
-88,7 +88,7 @@ In this example, we are using data movement processors for basic computation. As ## Configure and execute program ``` cpp -SetRuntimeArgs(program, binary_reader_kernel_id, core, {src0_dram_buffer->address(), src1_dram_buffer->address(), dst_dram_buffer->address(),}); +SetRuntimeArgs(program, binary_reader_kernel_id, core, {src0_dram_buffer->address(), src1_dram_buffer->address(), dst_dram_buffer->address(), src0_bank_id, src1_bank_id, dst_bank_id}); EnqueueProgram(cq, program, false); Finish(cq); @@ -100,9 +100,9 @@ In order to execute the program, we need to load the runtime arguments for the k ``` cpp // NoC coords (x,y) depending on DRAM location on-chip -uint64_t src0_dram_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src0_dram); -uint64_t src1_dram_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_dram); -uint64_t dst_dram_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_dram); +uint64_t src0_dram_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_dram); +uint64_t src1_dram_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_dram); +uint64_t dst_dram_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram); constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // index=0 constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; // index=1 diff --git a/tech_reports/prog_examples/dram_loopback/dram_loopback.md b/tech_reports/prog_examples/dram_loopback/dram_loopback.md index 3a0486826db1..d7f64d73207f 100644 --- a/tech_reports/prog_examples/dram_loopback/dram_loopback.md +++ b/tech_reports/prog_examples/dram_loopback/dram_loopback.md @@ -110,11 +110,9 @@ We use a non-blocking call so we can continue setting up our program. 
const std::vector runtime_args = { l1_buffer.address(), input_dram_buffer.address(), - static_cast(input_dram_buffer.noc_coordinates().x), - static_cast(input_dram_buffer.noc_coordinates().y), + input_bank_id, output_dram_buffer.address(), - static_cast(output_dram_buffer.noc_coordinates().x), - static_cast(output_dram_buffer.noc_coordinates().y), + output_bank_id, l1_buffer.size() }; @@ -131,9 +129,9 @@ particular kernel, we have to provide: - Where the L1 buffer starts (memory address) - Where the input DRAM buffer starts (memory address) -- The location of the input DRAM buffer\'s channel on the NOC +- The Bank ID of the input DRAM buffer - Where the output DRAM buffer starts (memory address) -- The location of the output DRAM buffer\'s channel on the NOC +- The Bank ID of the output DRAM buffer - The size of the buffers ## Running the program diff --git a/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md b/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md index c7964729a86d..b8954ead4c04 100644 --- a/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md +++ b/tech_reports/prog_examples/eltwise_sfpu/eltwise_sfpu.md @@ -77,8 +77,7 @@ SetRuntimeArgs( core, { dst_dram_buffer.address(), - static_cast(dst_dram_buffer.noc_coordinates().x), - static_cast(dst_dram_buffer.noc_coordinates().y), + dst_bank_id, num_tiles } ); diff --git a/tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp b/tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp index 872b02b215fe..673361837a71 100644 --- a/tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp +++ b/tests/tt_eager/kernels/dataflow/reader_unary_8bank.cpp @@ -33,7 +33,7 @@ void generate_bcast_scaler() { void kernel_main() { uint32_t src_addr = get_arg_val(0); uint32_t num_tiles = - get_arg_val(3); // same arg index as in reader_unary and in reader_unary_transpose_wh_8bank + get_arg_val(2); // same arg index as in reader_unary and in reader_unary_transpose_wh_8bank constexpr uint32_t cb_id_in0 = 0, cb_id_in1 
= 1; diff --git a/tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp b/tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp index b34105a5a562..fb6169c91f17 100644 --- a/tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp +++ b/tests/tt_eager/kernels/dataflow/reader_unary_push_4.cpp @@ -6,10 +6,9 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val<uint32_t>(0); - uint32_t src_noc_x = get_arg_val<uint32_t>(1); - uint32_t src_noc_y = get_arg_val<uint32_t>(2); - uint32_t num_tiles = get_arg_val<uint32_t>(3); + uint32_t src_addr = get_arg_val<uint32_t>(0); + uint32_t bank_id = get_arg_val<uint32_t>(1); + uint32_t num_tiles = get_arg_val<uint32_t>(2); constexpr uint32_t cb_id_in0 = 0; @@ -18,8 +17,8 @@ void kernel_main() { uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; // read a ublock of tiles from src to CB, and then push the ublock to unpacker - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src_noc_addr = get_noc_addr_from_bank_id<true>(bank_id, src_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id_in0); diff --git a/tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp b/tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp index 9dcf5c207991..1bc283a72db9 100644 --- a/tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp +++ b/tests/tt_eager/kernels/dataflow/writer_unary_8bank.cpp @@ -5,8 +5,8 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst_addr = get_arg_val<uint32_t>(0); - uint32_t num_tiles = get_arg_val<uint32_t>(3); // Index 3 to match with regular writer_unary + uint32_t dst_addr = get_arg_val<uint32_t>(0); + uint32_t num_tiles = get_arg_val<uint32_t>(2); // Index 2 to match with regular writer_unary constexpr uint32_t cb_id_out0 = 16; constexpr uint32_t onetile = 1; diff --git a/tests/tt_eager/ops/test_sfpu.cpp b/tests/tt_eager/ops/test_sfpu.cpp index 46526fb313eb..1e9df15d4bee 100644 --- a/tests/tt_eager/ops/test_sfpu.cpp +++ b/tests/tt_eager/ops/test_sfpu.cpp 
@@ -94,9 +94,6 @@ bool run_sfpu_test(const string& sfpu_name, int tile_factor = 1, bool use_DRAM = auto dst_dram_buffer = CreateBuffer(buff_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math // kernel, input CB and reader @@ -167,8 +164,7 @@ bool run_sfpu_test(const string& sfpu_name, int tile_factor = 1, bool use_DRAM = core, { dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, + 0, num_tiles, 0, 0, @@ -181,7 +177,7 @@ bool run_sfpu_test(const string& sfpu_name, int tile_factor = 1, bool use_DRAM = program, unary_writer_kernel, core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + {dram_buffer_dst_addr, 0, num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp index 8c3321858408..05dbe4c2295c 100644 --- a/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp +++ b/tests/tt_metal/tt_metal/api/circular_buffer/test_CircularBuffer_allocation.cpp @@ -452,8 +452,7 @@ TEST_F(DeviceFixture, TensixTestDataCopyWithUpdatedCircularBufferConfig) { core, { (uint32_t)src_dram_buffer->address(), - (uint32_t)src_dram_buffer->noc_coordinates().x, - (uint32_t)src_dram_buffer->noc_coordinates().y, + 0, (uint32_t)num_tiles, }); SetRuntimeArgs( @@ -462,8 +461,7 @@ TEST_F(DeviceFixture, TensixTestDataCopyWithUpdatedCircularBufferConfig) { core, { (uint32_t)dst_dram_buffer->address(), - 
(uint32_t)dst_dram_buffer->noc_coordinates().x, - (uint32_t)dst_dram_buffer->noc_coordinates().y, + 0, (uint32_t)num_tiles, }); diff --git a/tests/tt_metal/tt_metal/api/test_banked.cpp b/tests/tt_metal/tt_metal/api/test_banked.cpp index 40de1459f03e..8f057e17dea7 100644 --- a/tests/tt_metal/tt_metal/api/test_banked.cpp +++ b/tests/tt_metal/tt_metal/api/test_banked.cpp @@ -82,17 +82,9 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked auto output_buffer = CreateBuffer(out_config); tt::log_debug( - tt::LogTest, - "Input buffer: [address: {} B, size: {} B] at noc coord {}", - input_buffer->address(), - input_buffer->size(), - input_buffer->noc_coordinates().str()); + tt::LogTest, "Input buffer: [address: {} B, size: {} B]", input_buffer->address(), input_buffer->size()); tt::log_debug( - tt::LogTest, - "Output buffer: [address: {} B, size: {} B] at noc coord {}", - output_buffer->address(), - output_buffer->size(), - output_buffer->noc_coordinates().str()); + tt::LogTest, "Output buffer: [address: {} B, size: {} B]", output_buffer->address(), output_buffer->size()); TT_FATAL(cfg.num_tiles * cfg.page_size_bytes == cfg.size_bytes, "Error"); constexpr uint32_t num_pages_cb = 1; @@ -103,6 +95,10 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked bool input_is_dram = cfg.input_buffer_type == BufferType::DRAM; bool output_is_dram = cfg.output_buffer_type == BufferType::DRAM; + std::map reader_defines = { + {"INTERFACE_WITH_L1", std::to_string((uint32_t)(not input_is_dram))}}; + std::map writer_defines = { + {"INTERFACE_WITH_L1", std::to_string((uint32_t)(not output_is_dram))}}; auto reader_kernel = CreateKernel( program, @@ -111,7 +107,8 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked DataMovementConfig{ .processor = DataMovementProcessor::RISCV_0, .noc = NOC::NOC_0, - .compile_args = {cb_id, uint32_t(input_buffer->page_size()), (uint32_t)input_is_dram}}); + .compile_args = 
{cb_id, uint32_t(input_buffer->page_size()), (uint32_t)input_is_dram}, + .defines = reader_defines}); auto writer_kernel = CreateKernel( program, writer_kernel_name, @@ -119,15 +116,15 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked DataMovementConfig{ .processor = DataMovementProcessor::RISCV_1, .noc = NOC::NOC_1, - .compile_args = {cb_id, uint32_t(output_buffer->page_size()), (uint32_t)output_is_dram}}); + .compile_args = {cb_id, uint32_t(output_buffer->page_size()), (uint32_t)output_is_dram}, + .defines = writer_defines}); if (banked_reader) { reader_runtime_args = {(uint32_t)input_buffer->address(), (uint32_t)cfg.num_tiles}; } else { reader_runtime_args = { (uint32_t)input_buffer->address(), - (uint32_t)input_buffer->noc_coordinates().x, - (uint32_t)input_buffer->noc_coordinates().y, + 0, (uint32_t)cfg.num_tiles, }; } @@ -136,8 +133,7 @@ bool reader_cb_writer(Device* device, const BankedConfig& cfg, const bool banked } else { writer_runtime_args = { (uint32_t)output_buffer->address(), - (uint32_t)output_buffer->noc_coordinates().x, - (uint32_t)output_buffer->noc_coordinates().y, + 0, (uint32_t)cfg.num_tiles, }; } diff --git a/tests/tt_metal/tt_metal/api/test_direct.cpp b/tests/tt_metal/tt_metal/api/test_direct.cpp index 9f7a3de8a866..91a299f5891f 100644 --- a/tests/tt_metal/tt_metal/api/test_direct.cpp +++ b/tests/tt_metal/tt_metal/api/test_direct.cpp @@ -39,7 +39,6 @@ bool reader_only( auto input_dram_buffer = CreateBuffer(dram_config); uint32_t dram_byte_address = input_dram_buffer->address(); - auto dram_noc_xy = input_dram_buffer->noc_coordinates(); // TODO (abhullar): Use L1 buffer after bug with L1 banking and writing to < 1 MB is fixed. 
// Try this after KM uplifts TLB setup // auto l1_buffer = @@ -65,8 +64,7 @@ bool reader_only( reader_core, { (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, + 0, (uint32_t)l1_byte_address, (uint32_t)byte_size, }); @@ -100,7 +98,6 @@ bool writer_only( auto output_dram_buffer = CreateBuffer(dram_config); uint32_t dram_byte_address = output_dram_buffer->address(); - auto dram_noc_xy = output_dram_buffer->noc_coordinates(); // TODO (abhullar): Use L1 buffer after bug with L1 banking and writing to < 1 MB is fixed. // Try this after KM uplifts TLB setup // auto l1_buffer = @@ -127,8 +124,7 @@ bool writer_only( writer_core, { (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, + 0, (uint32_t)l1_byte_address, (uint32_t)byte_size, }); @@ -169,10 +165,8 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf auto input_dram_buffer = CreateBuffer(dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_cb_config = tt_metal::CircularBufferConfig(byte_size, {{cb_index, test_config.l1_data_format}}) @@ -214,8 +208,7 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf test_config.core, { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }); tt_metal::SetRuntimeArgs( @@ -224,8 +217,7 @@ bool reader_writer(tt_metal::Device* device, const ReaderWriterConfig& test_conf test_config.core, { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }); @@ -262,10 +254,8 @@ bool 
reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter .device = device, .size = byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; auto input_dram_buffer = tt_metal::CreateBuffer(dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); auto output_dram_buffer = tt_metal::CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_input_cb_config = tt_metal::CircularBufferConfig(byte_size, {{input0_cb_index, test_config.l1_input_data_format}}) @@ -321,8 +311,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter test_config.core, { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }); tt_metal::SetRuntimeArgs( @@ -331,8 +320,7 @@ bool reader_datacopy_writer(tt_metal::Device* device, const ReaderDatacopyWriter test_config.core, { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }); diff --git a/tests/tt_metal/tt_metal/api/test_dram.cpp b/tests/tt_metal/tt_metal/api/test_dram.cpp index 99bba0b1b91d..293a10a5cafd 100644 --- a/tests/tt_metal/tt_metal/api/test_dram.cpp +++ b/tests/tt_metal/tt_metal/api/test_dram.cpp @@ -51,9 +51,6 @@ bool dram_single_core_db(DispatchFixture* fixture, tt_metal::Device* device) { auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto dram_copy_kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp", @@ -70,16 
+67,14 @@ bool dram_single_core_db(DispatchFixture* fixture, tt_metal::Device* device) { dram_copy_kernel, core, {input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - dram_buffer_size_bytes, - num_tiles, - l1_buffer_addr, - total_l1_buffer_size_tiles, - total_l1_buffer_size_bytes}); + (std::uint32_t)0, + output_dram_buffer_addr, + (std::uint32_t)0, + dram_buffer_size_bytes, + num_tiles, + l1_buffer_addr, + total_l1_buffer_size_tiles, + total_l1_buffer_size_bytes}); fixture->RunProgram(device, program); @@ -105,25 +100,21 @@ bool dram_single_core( auto output_dram_buffer = tt_metal::CreateBuffer(dram_config); uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); log_debug(tt::LogVerif, "Creating kernel"); // Create the kernel auto dram_kernel = tt_metal::CreateKernel(program, cfg.kernel_file, cfg.core_range, cfg.data_movement_cfg); fixture->WriteBuffer(device, input_dram_buffer, src_vec); tt_metal::SetRuntimeArgs( - program, - dram_kernel, - cfg.core_range, - {cfg.l1_buffer_addr, - input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - cfg.dram_buffer_size}); + program, + dram_kernel, + cfg.core_range, + {cfg.l1_buffer_addr, + input_dram_buffer_addr, + (std::uint32_t)0, + output_dram_buffer_addr, + (std::uint32_t)0, + cfg.dram_buffer_size}); fixture->RunProgram(device, program); diff --git a/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp index fbb2ca02bab4..efa2bdd81af5 100644 --- a/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp +++ 
b/tests/tt_metal/tt_metal/api/test_dram_to_l1_multicast.cpp @@ -48,7 +48,6 @@ bool dram_to_l1_multicast(DispatchFixture* fixture, tt_metal::Device* device, co auto dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_addr = dram_buffer->address(); - auto dram_noc_xy = dram_buffer->noc_coordinates(); CoreCoord core_start = {0, 0}; CoreCoord grid_size = device->logical_grid_size(); @@ -65,8 +64,7 @@ bool dram_to_l1_multicast(DispatchFixture* fixture, tt_metal::Device* device, co } std::vector mcast_reader_args = { (std::uint32_t)dram_buffer_addr, - (std::uint32_t)dram_noc_xy.x, - (std::uint32_t)dram_noc_xy.y, + 0, (std::uint32_t)dram_buffer_size, (std::uint32_t)local_buffer_addr, (std::uint32_t)dest_buffer_addr, diff --git a/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp index 7417bdd13df1..13d38f4e2ef5 100644 --- a/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp +++ b/tests/tt_metal/tt_metal/api/test_global_semaphores.cpp @@ -67,7 +67,10 @@ TEST_F(DispatchFixture, CreateMultipleGlobalSemaphoresOnSameCore) { const auto& cores_vec = cores_vecs[i]; for (const auto& core : cores_vec) { auto sem_vals = tt::llrt::read_hex_vec_from_core( - device->id(), device->worker_core_from_logical_core(core), address, sizeof(uint32_t)); + device->id(), + device->worker_core_from_logical_core(core), + address, + sizeof(uint32_t)); EXPECT_EQ(sem_vals[0], initial_value); } } diff --git a/tests/tt_metal/tt_metal/api/test_noc.cpp b/tests/tt_metal/tt_metal/api/test_noc.cpp index 8699430e54d3..5820e2710e0c 100644 --- a/tests/tt_metal/tt_metal/api/test_noc.cpp +++ b/tests/tt_metal/tt_metal/api/test_noc.cpp @@ -85,8 +85,8 @@ TEST(NOC, TensixSingleDeviceHarvestingPrints) { tt::log_info("Number of Harvested Rows={}", unharvested_logical_grid_size.y - logical_grid_size.y); } - tt::log_info("Logical -- Noc Coordinates Mapping"); - tt::log_info("[Logical <-> NOC0] Coordinates"); + tt::log_info("Logical -- Virtual 
Mapping"); + tt::log_info("[Logical <-> Virtual] Coordinates"); for (int r = 0; r < logical_grid_size.y; r++) { string output_row = ""; for (int c = 0; c < logical_grid_size.x; c++) { @@ -94,7 +94,7 @@ TEST(NOC, TensixSingleDeviceHarvestingPrints) { const auto noc_coord = device->worker_core_from_logical_core(logical_coord); output_row += "{L[x" + std::to_string(c); output_row += "-y" + std::to_string(r); - output_row += "]:N[x" + std::to_string(noc_coord.x); + output_row += "]:V[x" + std::to_string(noc_coord.x); output_row += "-y" + std::to_string(noc_coord.y); output_row += "]}, "; } @@ -108,6 +108,12 @@ TEST(NOC, TensixVerifyNocNodeIDs) { tt::tt_metal::Device* device; const unsigned int device_id = 0; device = tt::tt_metal::CreateDevice(device_id); + +#if COORDINATE_VIRTUALIZATION_ENABLED != 0 + uint32_t MY_NOC_ENCODING_REG = NOC_CFG(NOC_ID_LOGICAL); +#else + uint32_t MY_NOC_ENCODING_REG = NOC_NODE_ID; +#endif // Ping all the Noc Nodes auto logical_grid_size = device->logical_grid_size(); for (size_t y = 0; y < logical_grid_size.y; y++) { @@ -115,7 +121,7 @@ TEST(NOC, TensixVerifyNocNodeIDs) { auto worker_core = device->worker_core_from_logical_core(CoreCoord(x, y)); // Read register from specific node uint32_t node_id_regval; - node_id_regval = unit_tests::basic::test_noc::read_reg(device, CoreCoord(x, y), NOC_NODE_ID); + node_id_regval = unit_tests::basic::test_noc::read_reg(device, CoreCoord(x, y), MY_NOC_ENCODING_REG); ASSERT_NE( node_id_regval, unit_tests::basic::test_noc::init_value); // Need to make sure we read in valid reg // Check it matches software translated xy diff --git a/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp index ce128a890012..881efb23d1e4 100644 --- a/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp +++ b/tests/tt_metal/tt_metal/api/test_simple_l1_buffer.cpp @@ -62,23 +62,26 @@ bool SimpleTiledL1WriteCBRead( tt_metal::CircularBufferConfig(byte_size, {{cb_index, 
tt::DataFormat::Float16_b}}) .set_page_size(cb_index, page_size); auto l1_cb = tt_metal::CreateCircularBuffer(program, core, l1_cb_config); - + std::map defines = {{"INTERFACE_WITH_L1", "1"}}; + uint32_t bank_id = device->bank_ids_from_logical_core(BufferType::L1, core)[0]; auto reader_kernel = tt_metal::CreateKernel( program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp", core, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::NOC_0, - .compile_args = {cb_index}}); + .compile_args = {cb_index}, + .defines = defines}); auto writer_kernel = tt_metal::CreateKernel( program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp", + "tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp", core, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::NOC_1, - .compile_args = {cb_index}}); + .compile_args = {cb_index}, + .defines = defines}); tt_metal::SetRuntimeArgs( program, @@ -86,8 +89,7 @@ bool SimpleTiledL1WriteCBRead( core, { (uint32_t)input_local_address, - (uint32_t)phys_core.x, - (uint32_t)phys_core.y, + bank_id, (uint32_t)num_tiles, }); tt_metal::SetRuntimeArgs( @@ -96,8 +98,7 @@ bool SimpleTiledL1WriteCBRead( core, { (uint32_t)output_local_address, - (uint32_t)phys_core.x, - (uint32_t)phys_core.y, + bank_id, (uint32_t)num_tiles, }); @@ -114,6 +115,7 @@ bool SimpleTiledL1WriteCBRead( } return pass; } + } // namespace tt::test::buffer::detail TEST_F(DeviceFixture, TestSimpleL1BufferReadOnlyLo) { diff --git a/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp index f24d3fee91f0..dcd236cd74a7 100644 --- a/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp +++ b/tests/tt_metal/tt_metal/api/test_soc_descriptor.cpp @@ -52,6 +52,7 @@ 
TEST(SOC, TensixValidateLogicalToPhysicalCoreCoordHostMapping) { for (int device_id = 0; device_id < num_devices; device_id++) { tt_metal::Device* device = tt_metal::CreateDevice(device_id); uint32_t harvested_rows_mask = tt::Cluster::instance().get_harvested_rows(device_id); + const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(device_id); log_info(LogTest, "Device {} harvesting mask {}", device_id, harvested_rows_mask); std::unordered_set harvested_rows = unit_tests::basic::soc_desc::get_harvested_rows(device_id); @@ -59,7 +60,7 @@ TEST(SOC, TensixValidateLogicalToPhysicalCoreCoordHostMapping) { for (int x = 0; x < logical_grid_size.x; x++) { for (int y = 0; y < logical_grid_size.y; y++) { CoreCoord logical_core_coord(x, y); - CoreCoord physical_core_coord = device->worker_core_from_logical_core(logical_core_coord); + CoreCoord physical_core_coord = soc_desc.get_physical_tensix_core_from_logical(logical_core_coord); ASSERT_TRUE(harvested_rows.find(physical_core_coord.y) == harvested_rows.end()); } } diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp index 7d8ec61dd309..ad0ef0534e93 100644 --- a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tensix_dest.cpp @@ -44,8 +44,7 @@ using DramBuffer = std::shared_ptr; static std::vector get_dram_kernel_runtime_arguments(const DramBuffer& dram_buffer, size_t num_tiles) { return { static_cast(dram_buffer->address()), - static_cast(dram_buffer->noc_coordinates().x), - static_cast(dram_buffer->noc_coordinates().y), + static_cast(0), static_cast(num_tiles), }; } diff --git a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp index 33b858331570..e324a6f44bc3 100644 --- a/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp +++ 
b/tests/tt_metal/tt_metal/debug_tools/dprint/test_print_tiles.cpp @@ -120,7 +120,6 @@ static void RunTest(DPrintFixture* fixture, Device* device, tt::DataFormat data_ .device = device, .size = tile_size, .page_size = tile_size, .buffer_type = tt_metal::BufferType::DRAM}; auto src_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); // Create kernels on device KernelHandle brisc_print_kernel_id = CreateKernel( @@ -143,11 +142,7 @@ static void RunTest(DPrintFixture* fixture, Device* device, tt::DataFormat data_ ); // BRISC kernel needs dram info via rtargs - tt_metal::SetRuntimeArgs( - program, - brisc_print_kernel_id, - core, - {dram_buffer_src_addr, (std::uint32_t)dram_src_noc_xy.x, (std::uint32_t)dram_src_noc_xy.y}); + tt_metal::SetRuntimeArgs(program, brisc_print_kernel_id, core, {dram_buffer_src_addr, (std::uint32_t)0}); // Create input tile std::vector u32_vec = GenerateInputTile(data_format); diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp index 25fd8be5c26d..ddd80f0a95dd 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_assert.cpp @@ -155,18 +155,19 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty // We should be able to find the expected watcher error in the log as well, // expected error message depends on the risc we're running on. string kernel = "tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp"; - int line_num = 56; + int line_num = 57; string expected = fmt::format( - "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} tripped an assert on line {}. Current kernel: {}.", + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} tripped an assert on line {}. Current kernel: {}.", device->id(), (riscv_type == DebugErisc) ? 
"ethnet" : "worker", - logical_core.x, logical_core.y, - phys_core.x, phys_core.y, + logical_core.x, + logical_core.y, + phys_core.x, + phys_core.y, risc, line_num, - kernel - ); + kernel); expected += " Note that file name reporting is not yet implemented, and the reported line number for the assert may be from a different file."; log_info(LogTest, "Expected error: {}", expected); @@ -179,7 +180,7 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty } } -TEST_F(WatcherFixture, TensixTestWatcherAssertBrisc) { +TEST_F(WatcherFixture, TestWatcherAssertBrisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -191,7 +192,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertBrisc) { ); } -TEST_F(WatcherFixture, TensixTestWatcherAssertNCrisc) { +TEST_F(WatcherFixture, TestWatcherAssertNCrisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -201,7 +202,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertNCrisc) { ); } -TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc0) { +TEST_F(WatcherFixture, TestWatcherAssertTrisc0) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -211,7 +212,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc0) { ); } -TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc1) { +TEST_F(WatcherFixture, TestWatcherAssertTrisc1) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -221,7 +222,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc1) { ); } -TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc2) { +TEST_F(WatcherFixture, TestWatcherAssertTrisc2) { using namespace CMAKE_UNIQUE_NAMESPACE; if (this->slow_dispatch_) GTEST_SKIP(); @@ -231,7 +232,7 @@ TEST_F(WatcherFixture, TensixTestWatcherAssertTrisc2) { ); } -TEST_F(WatcherFixture, ActiveEthTestWatcherAssertErisc) { +TEST_F(WatcherFixture, TestWatcherAssertErisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if 
(this->slow_dispatch_) GTEST_SKIP(); @@ -241,7 +242,7 @@ TEST_F(WatcherFixture, ActiveEthTestWatcherAssertErisc) { ); } -TEST_F(WatcherFixture, IdleEthTestWatcherAssertIErisc) { +TEST_F(WatcherFixture, TestWatcherAssertIErisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp index 8f656da7fd62..0ac4f6ce2670 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize.cpp @@ -21,8 +21,8 @@ using namespace tt::tt_metal; typedef enum sanitization_features { SanitizeAddress, - SanitizeAlignmentL1, - SanitizeAlignmentDRAM + SanitizeAlignmentL1Write, + SanitizeAlignmentL1Read } watcher_features_t; void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, bool is_eth_core, watcher_features_t feature, bool use_ncrisc = false) { @@ -38,94 +38,83 @@ void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, boo // Set up dram buffers uint32_t single_tile_size = 2 * 1024; uint32_t num_tiles = 50; - uint32_t dram_buffer_size = single_tile_size * num_tiles; + uint32_t l1_buffer_size = single_tile_size * num_tiles; uint32_t l1_buffer_addr = 400 * 1024; + tt_metal::InterleavedBufferConfig l1_config{ + .device = device, .size = l1_buffer_size, .page_size = l1_buffer_size, .buffer_type = tt_metal::BufferType::L1}; + auto input_l1_buffer = CreateBuffer(l1_config); + uint32_t input_l1_buffer_addr = input_l1_buffer->address(); - tt_metal::InterleavedBufferConfig dram_config{ - .device=device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = tt_metal::BufferType::DRAM - }; - auto input_dram_buffer = CreateBuffer(dram_config); - uint32_t input_dram_buffer_addr = input_dram_buffer->address(); + auto output_l1_buffer 
= CreateBuffer(l1_config); + uint32_t output_l1_buffer_addr = output_l1_buffer->address(); - auto output_dram_buffer = CreateBuffer(dram_config); - uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - log_info("Input DRAM: {}", input_dram_noc_xy); - log_info("Output DRAM: {}", output_dram_noc_xy); + auto input_buf_noc_xy = device->worker_core_from_logical_core(input_l1_buffer->logical_core_from_bank_id(0)); + auto output_buf_noc_xy = device->worker_core_from_logical_core(output_l1_buffer->logical_core_from_bank_id(0)); + log_info("Input DRAM: {}", input_buf_noc_xy); + log_info("Output DRAM: {}", output_buf_noc_xy); // A DRAM copy kernel, we'll feed it incorrect inputs to test sanitization. KernelHandle dram_copy_kernel; if (is_eth_core) { std::map dram_copy_kernel_defines = { - {"SIGNAL_COMPLETION_TO_DISPATCHER", "1"}, - }; - dram_copy_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - core, - tt_metal::EthernetConfig{ - .noc = tt_metal::NOC::NOC_0, - .defines=dram_copy_kernel_defines - } - ); + {"SIGNAL_COMPLETION_TO_DISPATCHER", "1"}, + }; + dram_copy_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp", + core, + tt_metal::EthernetConfig{.noc = tt_metal::NOC::NOC_0, .defines = dram_copy_kernel_defines}); } else { - std::map dram_copy_kernel_defines = { - {"SIGNAL_COMPLETION_TO_DISPATCHER", "1"}, - }; - dram_copy_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", - core, - tt_metal::DataMovementConfig{ - .processor = (use_ncrisc) ? tt_metal::DataMovementProcessor::RISCV_1 : tt_metal::DataMovementProcessor::RISCV_0, - .noc = (use_ncrisc) ? 
tt_metal::NOC::RISCV_1_default : tt_metal::NOC::RISCV_0_default, - .defines=dram_copy_kernel_defines - } - ); + std::map dram_copy_kernel_defines = { + {"SIGNAL_COMPLETION_TO_DISPATCHER", "1"}, + }; + dram_copy_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp", + core, + tt_metal::DataMovementConfig{ + .processor = + (use_ncrisc) ? tt_metal::DataMovementProcessor::RISCV_1 : tt_metal::DataMovementProcessor::RISCV_0, + .noc = (use_ncrisc) ? tt_metal::NOC::RISCV_1_default : tt_metal::NOC::RISCV_0_default, + .defines = dram_copy_kernel_defines}); } // Write to the input DRAM buffer std::vector input_vec = create_random_vector_of_bfloat16( - dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); - tt_metal::detail::WriteToBuffer(input_dram_buffer, input_vec); + l1_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); + tt_metal::detail::WriteToBuffer(input_l1_buffer, input_vec); // Write runtime args - update to a core that doesn't exist or an improperly aligned address, // depending on the flags passed in. switch(feature) { case SanitizeAddress: - output_dram_noc_xy.x = 16; - output_dram_noc_xy.y = 16; - break; - case SanitizeAlignmentL1: - l1_buffer_addr += 16; // This is illegal because reading DRAM->L1 needs DRAM alignment - // requirements (32 byte aligned). + output_buf_noc_xy.x = 16; + output_buf_noc_xy.y = 16; break; - case SanitizeAlignmentDRAM: - input_dram_buffer_addr++; + case SanitizeAlignmentL1Write: + output_l1_buffer_addr++; // This is illegal because reading DRAM->L1 needs DRAM alignment + // requirements (32 byte aligned). 
break; + case SanitizeAlignmentL1Read: input_l1_buffer_addr++; break; default: log_warning(LogTest, "Unrecognized feature to test ({}), skipping...", feature); GTEST_SKIP(); break; } + tt_metal::SetRuntimeArgs( program, dram_copy_kernel, core, {l1_buffer_addr, - input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - dram_buffer_size}); + input_l1_buffer_addr, + input_buf_noc_xy.x, + input_buf_noc_xy.y, + output_l1_buffer_addr, + (std::uint32_t)output_buf_noc_xy.x, + (std::uint32_t)output_buf_noc_xy.y, + l1_buffer_size}); // Run the kernel, expect an exception here try { @@ -140,40 +129,64 @@ void RunTestOnCore(WatcherFixture* fixture, Device* device, CoreCoord &core, boo // We should be able to find the expected watcher error in the log as well. string expected; + int noc = (use_ncrisc) ? 1 : 0; + CoreCoord target_core = device->virtual_noc_coordinate(noc, input_buf_noc_xy); + string risc_name = (is_eth_core) ? "erisc" : "brisc"; + if (use_ncrisc) { + risc_name = "ncrisc"; + } switch(feature) { case SanitizeAddress: expected = fmt::format( - "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} using noc0 tried to unicast write 102400 bytes from local L1[{:#08x}] to Unknown core w/ physical coords {} [addr=0x{:08x}] (NOC target address did not map to any known Tensix/Ethernet/DRAM/PCIE core).", + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} using noc0 tried to unicast write 102400 " + "bytes from local L1[{:#08x}] to Unknown core w/ physical coords {} [addr=0x{:08x}] (NOC target " + "address did not map to any known Tensix/Ethernet/DRAM/PCIE core).", device->id(), (is_eth_core) ? "ethnet" : "worker", - core.x, core.y, phys_core.x, phys_core.y, - (is_eth_core) ? 
"erisc" : "brisc", l1_buffer_addr, output_dram_noc_xy.str(), - output_dram_buffer_addr - ); + core.x, + core.y, + phys_core.x, + phys_core.y, + (is_eth_core) ? "erisc" : "brisc", + l1_buffer_addr, + output_buf_noc_xy.str(), + output_l1_buffer_addr); break; - case SanitizeAlignmentL1: - case SanitizeAlignmentDRAM: - { - // NoC-1 has a different coordinate for the same DRAM - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - int noc = (use_ncrisc) ? 1 : 0; - CoreCoord target_phys_core = { - tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, input_dram_noc_xy.x), - tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, input_dram_noc_xy.y) - }; - string risc_name = (is_eth_core) ? "erisc" : "brisc"; - if (use_ncrisc) - risc_name = "ncrisc"; + case SanitizeAlignmentL1Write: { expected = fmt::format( - "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2}): {} using noc{} tried to unicast read 102400 bytes to local L1[{:#08x}] from DRAM core w/ physical coords {} DRAM[addr=0x{:08x}] (invalid address alignment in NOC transaction).", + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} using noc{} tried to unicast write 102400 " + "bytes from local L1[{:#08x}] to Tensix core w/ physical coords {} L1[addr=0x{:08x}] (invalid address " + "alignment in NOC transaction).", device->id(), (is_eth_core) ? 
"ethnet" : "worker", - core.x, core.y, phys_core.x, phys_core.y, - risc_name, noc, l1_buffer_addr, target_phys_core, - input_dram_buffer_addr - ); - } + core.x, + core.y, + phys_core.x, + phys_core.y, + risc_name, + noc, + l1_buffer_addr, + target_core, + output_l1_buffer_addr); break; + } + case SanitizeAlignmentL1Read: { + expected = fmt::format( + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2}): {} using noc{} tried to unicast read 102400 " + "bytes to local L1[{:#08x}] from Tensix core w/ physical coords {} L1[addr=0x{:08x}] (invalid address " + "alignment in NOC transaction).", + device->id(), + (is_eth_core) ? "ethnet" : "worker", + core.x, + core.y, + phys_core.x, + phys_core.y, + risc_name, + noc, + l1_buffer_addr, + target_core, + input_l1_buffer_addr); + } break; default: log_warning(LogTest, "Unrecognized feature to test ({}), skipping...", feature); GTEST_SKIP(); @@ -244,37 +257,37 @@ TEST_F(WatcherFixture, TensixTestWatcherSanitize) { ); } -TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1Write) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){ CoreCoord core{0, 0}; - RunTestOnCore(fixture, device, core, false, SanitizeAlignmentL1); + RunTestOnCore(fixture, device, core, false, SanitizeAlignmentL1Write); }, this->devices_[0] ); } -TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentDRAM) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1Read) { if (this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){ CoreCoord core{0, 0}; - RunTestOnCore(fixture, device, core, false, SanitizeAlignmentDRAM); + RunTestOnCore(fixture, device, core, false, SanitizeAlignmentL1Read); }, this->devices_[0] ); } -TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentDRAMNCrisc) { +TEST_F(WatcherFixture, TensixTestWatcherSanitizeAlignmentL1ReadNCrisc) { if 
(this->slow_dispatch_) GTEST_SKIP(); this->RunTestOnDevice( [](WatcherFixture *fixture, Device *device){ CoreCoord core{0, 0}; - RunTestOnCore(fixture, device, core, false, SanitizeAlignmentDRAM, true); + RunTestOnCore(fixture, device, core, false, SanitizeAlignmentL1Read, true); }, this->devices_[0] ); diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp index 3ddd22c58cc2..67054c8c604e 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_noc_sanitize_delays.cpp @@ -52,10 +52,6 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig(num_input_tiles * SINGLE_TILE_SIZE, {{src0_cb_index, tt::DataFormat::Float16_b}}) @@ -118,19 +114,9 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord EnqueueWriteBuffer(cq, std::ref(src1_dram_buffer), src1_vec, false); vector reader_args = { - dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - NUM_TILES, - dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - NUM_TILES, - 0}; - - vector writer_args = { - dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, NUM_TILES - }; + dram_buffer_src0_addr, (std::uint32_t)0, NUM_TILES, dram_buffer_src1_addr, (std::uint32_t)0, NUM_TILES, 0}; + + vector writer_args = 
{dram_buffer_dst_addr, (std::uint32_t)0, NUM_TILES}; SetRuntimeArgs(program, unary_writer_kernel, core, writer_args); SetRuntimeArgs(program, binary_reader_kernel, core, reader_args); @@ -143,7 +129,7 @@ void RunDelayTestOnCore(WatcherDelayFixture* fixture, Device* device, CoreCoord std::vector read_vec; CoreCoord worker_core = fixture->delayed_cores[CoreType::WORKER][0]; // Just check that the first delayed core has the feedback set - CoreCoord phys_core = device->physical_core_from_logical_core({0,0}, CoreType::WORKER); + CoreCoord phys_core = device->virtual_core_from_logical_core({0, 0}, CoreType::WORKER); read_vec = tt::llrt::read_hex_vec_from_core ( device->id(), phys_core, diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp index fb70bc917003..46d0c1666388 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_pause.cpp @@ -65,7 +65,8 @@ static void RunTest(WatcherFixture* fixture, Device* device) { KernelHandle erisc_kid; std::set eth_core_ranges; for (const auto& core : device->get_active_ethernet_cores(true)) { - log_info(LogTest, "Running on eth core {}({})", core.str(), device->ethernet_core_from_logical_core(core).str()); + log_info( + LogTest, "Running on eth core {}({})", core.str(), device->ethernet_core_from_logical_core(core).str()); eth_core_ranges.insert(CoreRange(core, core)); } erisc_kid = CreateKernel( @@ -82,7 +83,11 @@ static void RunTest(WatcherFixture* fixture, Device* device) { KernelHandle ierisc_kid; std::set eth_core_ranges; for (const auto& core : device->get_inactive_ethernet_cores()) { - log_info(LogTest, "Running on inactive eth core {}({})", core.str(), device->ethernet_core_from_logical_core(core).str()); + log_info( + LogTest, + "Running on inactive eth core {}({})", + core.str(), + device->ethernet_core_from_logical_core(core).str()); eth_core_ranges.insert(CoreRange(core, 
core)); } ierisc_kid = CreateKernel( diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp index 97ed9adef752..c2e63a7e384a 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_ringbuf.cpp @@ -144,7 +144,7 @@ static void RunTest(WatcherFixture *fixture, Device *device, riscv_id_t riscv_ty } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferBrisc) { +TEST_F(WatcherFixture, TestWatcherRingBufferBrisc) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -153,7 +153,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferBrisc) { ); } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferNCrisc) { + +TEST_F(WatcherFixture, TestWatcherRingBufferNCrisc) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -162,7 +163,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferNCrisc) { ); } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc0) { + +TEST_F(WatcherFixture, TestWatcherRingBufferTrisc0) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -171,7 +173,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc0) { ); } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc1) { + +TEST_F(WatcherFixture, TestWatcherRingBufferTrisc1) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -180,7 +183,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc1) { ); } } -TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc2) { + +TEST_F(WatcherFixture, TestWatcherRingBufferTrisc2) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -189,7 +193,8 @@ TEST_F(WatcherFixture, TensixTestWatcherRingBufferTrisc2) { ); } } 
-TEST_F(WatcherFixture, ActiveEthTestWatcherRingBufferErisc) { + +TEST_F(WatcherFixture, TestWatcherRingBufferErisc) { using namespace CMAKE_UNIQUE_NAMESPACE; for (Device* device : this->devices_) { this->RunTestOnDevice( @@ -198,7 +203,8 @@ TEST_F(WatcherFixture, ActiveEthTestWatcherRingBufferErisc) { ); } } -TEST_F(WatcherFixture, IdleEthTestWatcherRingBufferIErisc) { + +TEST_F(WatcherFixture, TestWatcherRingBufferIErisc) { using namespace CMAKE_UNIQUE_NAMESPACE; if (!this->IsSlowDispatch()) { log_info(tt::LogTest, "FD-on-idle-eth not supported."); diff --git a/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp index 524d06c4f3e5..f411c0da6a36 100644 --- a/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp +++ b/tests/tt_metal/tt_metal/debug_tools/watcher/test_waypoint.cpp @@ -145,7 +145,8 @@ static void RunTest(WatcherFixture* fixture, Device* device) { k_id_s = ""; } expected = fmt::format( - "Device {} ethnet core(x={:2},y={:2}) phys(x={:2},y={:2}): {}, X, X, X, X rmsg:* h_id:0 " + "Device {} ethnet core(x={:2},y={:2}) virtual(x={:2},y={:2}): {}, X, X, X, X rmsg:* " + "h_id:0 " "k_id:{}", device->id(), logical_core.x, @@ -165,7 +166,7 @@ static void RunTest(WatcherFixture* fixture, Device* device) { k_id_s = ""; } expected = fmt::format( - "Device {} worker core(x={:2},y={:2}) phys(x={:2},y={:2}): {},{},{},{},{} rmsg:***|*** h_id:0 " + "Device {} worker core(x={:2},y={:2}) virtual(x={:2},y={:2}): {},{},{},{},{} rmsg:***|*** h_id:0 " "smsg:**** k_ids:{}", device->id(), logical_core.x, diff --git a/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp index 5968e1441386..0b9ba68dddca 100644 --- a/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp +++ b/tests/tt_metal/tt_metal/device/test_device_cluster_api.cpp @@ -45,7 +45,7 @@ TEST_F(N300DeviceFixture, EthValidateEthernetConnectivity) { } // Check 
conversion to noc coords - std::vector chip_0_eth_noc_coords_expected = {CoreCoord(9, 6), CoreCoord(1, 6)}; + std::vector chip_0_eth_noc_coords_expected = {CoreCoord(25, 17), CoreCoord(18, 17)}; std::vector chip_0_eth_logical_coords; std::copy( @@ -59,7 +59,7 @@ TEST_F(N300DeviceFixture, EthValidateEthernetConnectivity) { std::sort(chip_0_eth_noc_coords_returned.begin(), chip_0_eth_noc_coords_returned.end()); ASSERT_TRUE(chip_0_eth_noc_coords_returned == chip_0_eth_noc_coords_expected); - std::vector chip_1_eth_noc_coords_expected = {CoreCoord(9, 0), CoreCoord(1, 0)}; + std::vector chip_1_eth_noc_coords_expected = {CoreCoord(25, 16), CoreCoord(18, 16)}; std::vector chip_1_eth_logical_coords; std::copy( @@ -82,22 +82,22 @@ TEST_F(N300DeviceFixture, EthInvalidLogicalEthernetCore) { TEST_F(N300DeviceFixture, EthValidateAllEthernetCoreMapping) { static std::map expected_mapping_logical_to_physical = { - {CoreCoord(0, 0), CoreCoord(9, 0)}, - {CoreCoord(0, 1), CoreCoord(1, 0)}, - {CoreCoord(0, 2), CoreCoord(8, 0)}, - {CoreCoord(0, 3), CoreCoord(2, 0)}, - {CoreCoord(0, 4), CoreCoord(7, 0)}, - {CoreCoord(0, 5), CoreCoord(3, 0)}, - {CoreCoord(0, 6), CoreCoord(6, 0)}, - {CoreCoord(0, 7), CoreCoord(4, 0)}, - {CoreCoord(0, 8), CoreCoord(9, 6)}, - {CoreCoord(0, 9), CoreCoord(1, 6)}, - {CoreCoord(0, 10), CoreCoord(8, 6)}, - {CoreCoord(0, 11), CoreCoord(2, 6)}, - {CoreCoord(0, 12), CoreCoord(7, 6)}, - {CoreCoord(0, 13), CoreCoord(3, 6)}, - {CoreCoord(0, 14), CoreCoord(6, 6)}, - {CoreCoord(0, 15), CoreCoord(4, 6)}, + {CoreCoord(0, 0), CoreCoord(25, 16)}, + {CoreCoord(0, 1), CoreCoord(18, 16)}, + {CoreCoord(0, 2), CoreCoord(24, 16)}, + {CoreCoord(0, 3), CoreCoord(19, 16)}, + {CoreCoord(0, 4), CoreCoord(23, 16)}, + {CoreCoord(0, 5), CoreCoord(20, 16)}, + {CoreCoord(0, 6), CoreCoord(22, 16)}, + {CoreCoord(0, 7), CoreCoord(21, 16)}, + {CoreCoord(0, 8), CoreCoord(25, 17)}, + {CoreCoord(0, 9), CoreCoord(18, 17)}, + {CoreCoord(0, 10), CoreCoord(24, 17)}, + {CoreCoord(0, 11), 
CoreCoord(19, 17)}, + {CoreCoord(0, 12), CoreCoord(23, 17)}, + {CoreCoord(0, 13), CoreCoord(20, 17)}, + {CoreCoord(0, 14), CoreCoord(22, 17)}, + {CoreCoord(0, 15), CoreCoord(21, 17)}, }; const auto& device_0 = this->devices_.at(0); for (const auto& logical_core : device_0->ethernet_cores()) { @@ -109,31 +109,31 @@ TEST_F(N300DeviceFixture, EthValidateAllEthernetCoreMapping) { TEST_F(N300DeviceFixture, EthValidatePhysicalCoreConversion) { static std::map expected_mapping_logical_to_physical = { - {CoreCoord(0, 0), CoreCoord(9, 0)}, - {CoreCoord(0, 1), CoreCoord(1, 0)}, - {CoreCoord(0, 2), CoreCoord(8, 0)}, - {CoreCoord(0, 3), CoreCoord(2, 0)}, - {CoreCoord(0, 4), CoreCoord(7, 0)}, - {CoreCoord(0, 5), CoreCoord(3, 0)}, - {CoreCoord(0, 6), CoreCoord(6, 0)}, - {CoreCoord(0, 7), CoreCoord(4, 0)}, - {CoreCoord(0, 8), CoreCoord(9, 6)}, - {CoreCoord(0, 9), CoreCoord(1, 6)}, - {CoreCoord(0, 10), CoreCoord(8, 6)}, - {CoreCoord(0, 11), CoreCoord(2, 6)}, - {CoreCoord(0, 12), CoreCoord(7, 6)}, - {CoreCoord(0, 13), CoreCoord(3, 6)}, - {CoreCoord(0, 14), CoreCoord(6, 6)}, - {CoreCoord(0, 15), CoreCoord(4, 6)}, + {CoreCoord(0, 0), CoreCoord(25, 16)}, + {CoreCoord(0, 1), CoreCoord(18, 16)}, + {CoreCoord(0, 2), CoreCoord(24, 16)}, + {CoreCoord(0, 3), CoreCoord(19, 16)}, + {CoreCoord(0, 4), CoreCoord(23, 16)}, + {CoreCoord(0, 5), CoreCoord(20, 16)}, + {CoreCoord(0, 6), CoreCoord(22, 16)}, + {CoreCoord(0, 7), CoreCoord(21, 16)}, + {CoreCoord(0, 8), CoreCoord(25, 17)}, + {CoreCoord(0, 9), CoreCoord(18, 17)}, + {CoreCoord(0, 10), CoreCoord(24, 17)}, + {CoreCoord(0, 11), CoreCoord(19, 17)}, + {CoreCoord(0, 12), CoreCoord(23, 17)}, + {CoreCoord(0, 13), CoreCoord(20, 17)}, + {CoreCoord(0, 14), CoreCoord(22, 17)}, + {CoreCoord(0, 15), CoreCoord(21, 17)}, }; const auto& device_0 = this->devices_.at(0); for (const auto& logical_core : device_0->ethernet_cores()) { ASSERT_TRUE( - device_0->physical_core_from_logical_core(logical_core, CoreType::ETH) == + 
device_0->virtual_core_from_logical_core(logical_core, CoreType::ETH) == expected_mapping_logical_to_physical.at(logical_core)); } // Check an invalid core type - EXPECT_ANY_THROW(device_0->physical_core_from_logical_core(CoreCoord(0, 0), CoreType::PCIE)); + EXPECT_ANY_THROW(device_0->virtual_core_from_logical_core(CoreCoord(0, 0), CoreType::PCIE)); } TEST_F(N300DeviceFixture, ActiveEthValidateEthernetSockets) { diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp index 5dd7eea0042f..81b4a647c72b 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_program/test_EnqueueProgram.cpp @@ -598,19 +598,10 @@ bool verify_rt_args( tt::Cluster::instance().l1_barrier(device->id()); auto noc_xy = riscv == tt::RISCV::ERISC ? device->ethernet_core_from_logical_core(logical_core) : device->worker_core_from_logical_core(logical_core); - std::vector args_readback = - tt::llrt::read_hex_vec_from_core(device->id(), noc_xy, addr, expected_rt_args.size() * sizeof(uint32_t)); - log_debug( - tt::LogTest, - "Verifying {} {} RT args for {} (Logical: {}) at addr: 0x{:x} w/ incr_val: {}", - expected_rt_args.size(), - label, - noc_xy, - logical_core.str(), - addr, - incr_val); - - for (int i = 0; i < expected_rt_args.size(); i++) { + std::vector args_readback = tt::llrt::read_hex_vec_from_core(device->id(), noc_xy, addr, expected_rt_args.size() * sizeof(uint32_t)); + log_debug(tt::LogTest, "Verifying {} {} RT args for {} (Logical: {}) at addr: 0x{:x} w/ incr_val: {}", expected_rt_args.size(), label, noc_xy, logical_core.str(), addr, incr_val); + + for(int i=0; iphysical_core_from_logical_core(eth_core, CoreType::ETH); + CoreCoord phys_eth_core = device->virtual_core_from_logical_core(eth_core, CoreType::ETH); uint32_t eth_sem_id = CreateSemaphore(program, eth_core, eth_sem_init_val, CoreType::ETH); auto 
eth_kernel = CreateKernel( program, @@ -102,7 +102,7 @@ TEST_F(DispatchFixture, EthTestBlank) { if (eth_cores.size() > 0) { CoreCoord eth_core = *eth_cores.begin(); - CoreCoord phys_eth_core = device->physical_core_from_logical_core(eth_core, CoreType::ETH); + CoreCoord phys_eth_core = device->virtual_core_from_logical_core(eth_core, CoreType::ETH); CreateKernel( program, "tt_metal/kernels/dataflow/blank.cpp", @@ -156,7 +156,7 @@ TEST_F(DispatchFixture, EthTestInitLocalMemory) { if (eth_cores.size() > 0) { CoreCoord eth_core = *eth_cores.begin(); - CoreCoord phys_eth_core = device->physical_core_from_logical_core(eth_core, CoreType::ETH); + CoreCoord phys_eth_core = device->virtual_core_from_logical_core(eth_core, CoreType::ETH); CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/misc/local_mem.cpp", diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp index fcac08d6d488..a13fa760e2a9 100644 --- a/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp +++ b/tests/tt_metal/tt_metal/dispatch/dispatch_trace/test_EnqueueTrace.cpp @@ -57,10 +57,16 @@ Program create_simple_unary_program(Buffer& input, Buffer& output) { std::shared_ptr reader_runtime_args = std::make_shared(); *writer_runtime_args = { - &output, (uint32_t)output.noc_coordinates().x, (uint32_t)output.noc_coordinates().y, output.num_pages()}; + &output, + (uint32_t)0, + output.num_pages() + }; *reader_runtime_args = { - &input, (uint32_t)input.noc_coordinates().x, (uint32_t)input.noc_coordinates().y, input.num_pages()}; + &input, + (uint32_t)0, + input.num_pages() + }; SetRuntimeArgs(device, detail::GetKernel(program, writer_kernel), worker, writer_runtime_args); SetRuntimeArgs(device, detail::GetKernel(program, reader_kernel), worker, reader_runtime_args); diff --git a/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp index 
48ac0685e61b..87abacc67598 100644 --- a/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp +++ b/tests/tt_metal/tt_metal/eth/test_basic_eth.cpp @@ -57,14 +57,12 @@ bool reader_kernel_no_send( auto input_dram_buffer = CreateBuffer(dram_config); uint32_t dram_byte_address = input_dram_buffer->address(); - auto dram_noc_xy = input_dram_buffer->noc_coordinates(); auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_reader_core); log_debug( tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", + "Device {}: reading {} bytes from dram bank 0 addr {} to ethernet core {} addr {}", device->id(), byte_size, - dram_noc_xy.str(), dram_byte_address, eth_reader_core.str(), eth_l1_byte_address); @@ -92,8 +90,7 @@ bool reader_kernel_no_send( eth_reader_core, { (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, + 0, (uint32_t)byte_size, (uint32_t)eth_l1_byte_address, }); @@ -126,16 +123,14 @@ bool writer_kernel_no_receive( auto output_dram_buffer = CreateBuffer(dram_config); uint32_t dram_byte_address = output_dram_buffer->address(); - auto dram_noc_xy = output_dram_buffer->noc_coordinates(); auto eth_noc_xy = device->ethernet_core_from_logical_core(eth_writer_core); log_debug( tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + "Device {}: writing {} bytes from ethernet core {} addr {} to dram bank 0 addr {}", device->id(), byte_size, eth_writer_core.str(), eth_l1_byte_address, - dram_noc_xy.str(), dram_byte_address); auto eth_writer_kernel = tt_metal::CreateKernel( @@ -161,18 +156,18 @@ bool writer_kernel_no_receive( eth_writer_core, { (uint32_t)dram_byte_address, - (uint32_t)dram_noc_xy.x, - (uint32_t)dram_noc_xy.y, + 0, (uint32_t)byte_size, (uint32_t)eth_l1_byte_address, }); fixture->RunProgram(device, program); - auto readback_vec = llrt::read_hex_vec_from_core(device->id(), dram_noc_xy, dram_byte_address, byte_size); + std::vector readback_vec; + 
fixture->ReadBuffer(device, output_dram_buffer, readback_vec); pass &= (readback_vec == inputs); if (not pass) { - std::cout << "Mismatch at Core: " << dram_noc_xy.str() << std::endl; + std::cout << "Mismatch" << std::endl; } return pass; } @@ -195,26 +190,21 @@ bool noc_reader_and_writer_kernels( auto reader_dram_buffer = CreateBuffer(dram_config); auto writer_dram_buffer = CreateBuffer(dram_config); - auto reader_dram_noc_xy = reader_dram_buffer->noc_coordinates(); - auto writer_dram_noc_xy = writer_dram_buffer->noc_coordinates(); - log_debug( tt::LogTest, - "Device {}: reading {} bytes from dram {} addr {} to ethernet core {} addr {}", + "Device {}: reading {} bytes from dram bank 0 addr {} to ethernet core {} addr {}", device->id(), byte_size, - reader_dram_noc_xy.str(), reader_dram_buffer->address(), logical_eth_core.str(), eth_dst_l1_address); log_debug( tt::LogTest, - "Device {}: writing {} bytes from ethernet core {} addr {} to dram {} addr {}", + "Device {}: writing {} bytes from ethernet core {} addr {} to dram bank 0 addr {}", device->id(), byte_size, logical_eth_core.str(), eth_src_l1_address, - writer_dram_noc_xy.str(), writer_dram_buffer->address()); auto eth_noc_xy = device->ethernet_core_from_logical_core(logical_eth_core); @@ -231,8 +221,7 @@ bool noc_reader_and_writer_kernels( logical_eth_core, { (uint32_t)reader_dram_buffer->address(), - (uint32_t)reader_dram_noc_xy.x, - (uint32_t)reader_dram_noc_xy.y, + 0, (uint32_t)byte_size, (uint32_t)eth_dst_l1_address, }); @@ -249,8 +238,7 @@ bool noc_reader_and_writer_kernels( logical_eth_core, { (uint32_t)writer_dram_buffer->address(), - (uint32_t)writer_dram_noc_xy.x, - (uint32_t)writer_dram_noc_xy.y, + 0, (uint32_t)byte_size, (uint32_t)eth_src_l1_address, }); diff --git a/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp index cd1160969726..d2be295b74bd 100644 --- a/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp +++ 
b/tests/tt_metal/tt_metal/eth/test_buffer_movement_kernels.cpp @@ -62,22 +62,19 @@ bool chip_to_chip_dram_buffer_transfer( // Create source buffer on sender device auto input_dram_buffer = CreateBuffer(sender_dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); // Create dest buffer on receiver device auto output_dram_buffer = CreateBuffer(receiver_dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); log_info( tt::LogTest, - "Sending {} bytes from device {} dram {} addr {} to device {} dram {} addr {}, using eth core {} and {}", + "Sending {} bytes from device {} dram bank 0 addr {} to device {} dram bank 0 addr {}, using eth core {} and " + "{}", byte_size, sender_device->id(), - input_dram_noc_xy.str(), input_dram_byte_address, receiver_device->id(), - output_dram_noc_xy.str(), output_dram_byte_address, eth_sender_core.str(), eth_receiver_core.str()); @@ -113,8 +110,7 @@ bool chip_to_chip_dram_buffer_transfer( eth_sender_core, { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + 0, (uint32_t)remaining_bytes, (uint32_t)num_loops, (uint32_t)MAX_BUFFER, @@ -137,8 +133,7 @@ bool chip_to_chip_dram_buffer_transfer( eth_receiver_core, { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + 0, (uint32_t)remaining_bytes, (uint32_t)num_loops, (uint32_t)MAX_BUFFER, @@ -169,7 +164,7 @@ bool chip_to_chip_dram_buffer_transfer( fixture->ReadBuffer(receiver_device, output_dram_buffer, dest_dram_data); pass &= (dest_dram_data == inputs); if (not pass) { - std::cout << "Mismatch at Core: " << output_dram_noc_xy.str() << std::endl; + std::cout << "Mismatch" << std::endl; std::cout << dest_dram_data[0] << std::endl; } return pass; diff --git 
a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp index 4b93d2a1132d..dc064fbce058 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_X_tile.cpp @@ -120,10 +120,6 @@ void matmul_tile( uint32_t num_input_tiles = 2 * M; - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; tt_metal::CircularBufferConfig cb_src0_config = tt_metal::CircularBufferConfig( @@ -200,11 +196,9 @@ void matmul_tile( reader_l1_args = { src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + 0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + 0, (std::uint32_t)K, (std::uint32_t)M, (std::uint32_t)N, @@ -222,11 +216,9 @@ void matmul_tile( reader_l1_args = { src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + 0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + 0, 1, 1, 1, @@ -274,13 +266,8 @@ void matmul_tile( vector bias(N * 512, 0); fixture->WriteBuffer(device, src2_dram_buffer, bias); - auto dram_src2_noc_xy = src2_dram_buffer->noc_coordinates(); vector bias_args = { - src2_dram_buffer->address(), - (std::uint32_t)dram_src2_noc_xy.x, - (std::uint32_t)dram_src2_noc_xy.y, - (std::uint32_t)N, - (std::uint32_t)(N * single_tile_size_bfp16b)}; + src2_dram_buffer->address(), 0, (std::uint32_t)N, (std::uint32_t)(N * single_tile_size_bfp16b)}; for (uint32_t arg : bias_args) { reader_l1_args.push_back(arg); @@ -293,10 +280,7 @@ void matmul_tile( program, unary_writer_kernel, core, - {dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - 
(std::uint32_t)dram_dst_noc_xy.y, - num_tiles}); // this is M * N in the multi_tile case !! + {dst_dram_buffer->address(), 0, num_tiles}); // this is M * N in the multi_tile case !! fixture->RunProgram(device, program); diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp index a9c32b95c3d0..7044d93b772b 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_large_block.cpp @@ -228,17 +228,11 @@ bool matmul_large_block( auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - const std::array mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (std::uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks M * in0_block_w, // input 0 block num tiles N * in0_block_w, // input 1 block num tiles @@ -249,17 +243,12 @@ bool matmul_large_block( string writer_kernel; if (output_rm) { writer_kernel = "tt_metal/kernels/dataflow/writer_unary.cpp"; - writer_rt_args = { - dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - uint(M * N)}; + writer_rt_args = {dst_dram_buffer->address(), 0, uint(M * N)}; } else { writer_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp"; writer_rt_args = { dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + 0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num 
tiles per sub block n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp index acbc84f2f644..27cd8d1a0530 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_multi_core_X_dram.cpp @@ -242,10 +242,6 @@ bool matmul_multi_core_single_dram(tt_metal::Device* device) { dram_buffer_dst_addr, dram_buffer_size_out); - auto dram_src0_noc_xy = device->dram_core_from_dram_channel(dram_src0_channel_id); - auto dram_src1_noc_xy = device->dram_core_from_dram_channel(dram_src1_channel_id); - auto dram_dst_noc_xy = device->dram_core_from_dram_channel(dram_dst_channel_id); - auto activations_tilized = test_utils::tilize(activation_slice, per_core_M * 32, K * 32); auto activations_tile_layout = convert_to_tile_layout(activations_tilized); auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); @@ -261,11 +257,9 @@ bool matmul_multi_core_single_dram(tt_metal::Device* device) { const std::array mm_reader_args = { (std::uint32_t)dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks (std::uint32_t)per_core_M * in0_block_w, // input 0 block num tiles (std::uint32_t)per_core_N * in0_block_w, // input 1 block num tiles @@ -274,8 +268,7 @@ bool matmul_multi_core_single_dram(tt_metal::Device* device) { const std::array writer_args = { (std::uint32_t)dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles 
per sub block n (std::uint32_t)per_core_M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp index 95c7569ad4f4..775484b5cd2c 100644 --- a/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/integration/matmul/test_matmul_single_core.cpp @@ -85,10 +85,6 @@ bool matmul_single_core( auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * in0_block_w * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -118,11 +114,9 @@ bool matmul_single_core( std::vector mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + 0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + 0, (std::uint32_t)(K / in0_block_w), // num_blocks (std::uint32_t)(M * in0_block_w), // input 0 block num tiles (std::uint32_t)(N * in0_block_w), // input 1 block num tiles @@ -131,8 +125,7 @@ bool matmul_single_core( std::vector writer_rt_args{ dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + 0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp index 4c4077c4adcb..e5d03095211b 100644 --- a/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp +++ 
b/tests/tt_metal/tt_metal/integration/test_autonomous_relay_streams.cpp @@ -244,23 +244,23 @@ void build_and_run_autonomous_stream_test( log_trace( tt::LogTest, "sender_core: x={}, y={}", - device->physical_core_from_logical_core(sender_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(sender_core, CoreType::WORKER).y); + device->virtual_core_from_logical_core(sender_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(sender_core, CoreType::WORKER).y); log_trace( tt::LogTest, "first_relay_core: x={}, y={}", - device->physical_core_from_logical_core(first_relay_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(first_relay_core, CoreType::WORKER).y); + device->virtual_core_from_logical_core(first_relay_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(first_relay_core, CoreType::WORKER).y); log_trace( tt::LogTest, "second_relay_core: x={}, y={}", - device->physical_core_from_logical_core(second_relay_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(second_relay_core, CoreType::WORKER).y); + device->virtual_core_from_logical_core(second_relay_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(second_relay_core, CoreType::WORKER).y); log_trace( tt::LogTest, "receiver_core: x={}, y={}", - device->physical_core_from_logical_core(receiver_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(receiver_core, CoreType::WORKER).y); + device->virtual_core_from_logical_core(receiver_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(receiver_core, CoreType::WORKER).y); // Input DRAM buffer creation uint32_t buffer_size_bytes = num_messages * page_size; diff --git a/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp b/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp index 2fc6c5133a22..f41e3b2b53cf 100644 --- a/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp +++ 
b/tests/tt_metal/tt_metal/integration/test_basic_pipeline.cpp @@ -85,9 +85,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf } uint32_t src_address; - CoreCoord src_noc_xy; uint32_t dst_address; - CoreCoord dst_noc_xy; tt_metal::BufferType buff_type = test_config.IO_data_in_dram ? tt_metal::BufferType::DRAM : tt_metal::BufferType::L1; @@ -98,9 +96,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf auto dst_buffer = CreateBuffer(buff_config); src_address = src_buffer->address(); - src_noc_xy = src_buffer->noc_coordinates(); dst_address = dst_buffer->address(); - dst_noc_xy = dst_buffer->noc_coordinates(); // create kernels vector receiver_kernels; @@ -173,11 +169,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf program, receiver_kernels.at(core_id), core, - {src_address, - (uint32_t)src_noc_xy.x, - (uint32_t)src_noc_xy.y, - (uint32_t)num_tiles, - (uint32_t)num_repetitions}); + {src_address, 0, (uint32_t)num_tiles, (uint32_t)num_repetitions}); } else { SetRuntimeArgs( program, @@ -196,11 +188,7 @@ void create_and_run_row_pipeline(tt_metal::Device* device, const PipelineRowConf program, sender_kernels.at(core_id), core, - {dst_address, - (uint32_t)dst_noc_xy.x, - (uint32_t)dst_noc_xy.y, - (uint32_t)num_tiles, - (uint32_t)num_repetitions}); + {dst_address, 0, (uint32_t)num_tiles, (uint32_t)num_repetitions}); } else { SetRuntimeArgs( program, diff --git a/tests/tt_metal/tt_metal/integration/test_flatten.cpp b/tests/tt_metal/tt_metal/integration/test_flatten.cpp index f36c0d396c1a..a8740c477c29 100644 --- a/tests/tt_metal/tt_metal/integration/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/integration/test_flatten.cpp @@ -87,9 +87,6 @@ bool flatten(DispatchFixture* fixture, tt_metal::Device* device, uint32_t num_ti auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = 
src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input CB // CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math kernel, // input CB and reader @@ -146,21 +143,9 @@ bool flatten(DispatchFixture* fixture, tt_metal::Device* device, uint32_t num_ti fixture->WriteBuffer(device, src_dram_buffer, src_vec); tt_metal::SetRuntimeArgs( - program, - flatten_kernel, - core, - {dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles_r, - num_tiles_c, - num_bytes_per_tensor_row}); + program, flatten_kernel, core, {dram_buffer_src_addr, 0, num_tiles_r, num_tiles_c, num_bytes_per_tensor_row}); - tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles * 32}); + tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dram_buffer_dst_addr, 0, num_tiles * 32}); fixture->RunProgram(device, program); @@ -246,8 +231,6 @@ bool flatten_stress(Device* device, uint32_t num_tiles_r = 5, uint32_t num_tiles auto src_dram_buffer = CreateBuffer(dram_config); auto dst_dram_buffer = CreateBuffer(dram_config); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); // Create the source vector std::shared_ptr> src_vec = std::make_shared>(create_random_vector_of_bfloat16( @@ -258,14 +241,8 @@ bool flatten_stress(Device* device, uint32_t num_tiles_r = 5, uint32_t num_tiles std::shared_ptr writer_runtime_args = std::make_shared(); std::shared_ptr compute_runtime_args = std::make_shared(); *compute_runtime_args = { - src_dram_buffer.get(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles_r, - num_tiles_c, - num_bytes_per_tensor_row}; 
- *writer_runtime_args = { - dst_dram_buffer.get(), (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles * 32}; + src_dram_buffer.get(), (uint32_t)0, num_tiles_r, num_tiles_c, num_bytes_per_tensor_row}; + *writer_runtime_args = {dst_dram_buffer.get(), (uint32_t)0, num_tiles * 32}; SetRuntimeArgs(device, detail::GetKernel(program, flatten_kernel), core, compute_runtime_args); diff --git a/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp index d6f793332848..c048face44e0 100644 --- a/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/integration/test_sfpu_compute.cpp @@ -120,10 +120,8 @@ bool run_sfpu_all_same_buffer(CommandQueue& cq, const SfpuConfig& test_config) { .buffer_type = tt::tt_metal::BufferType::DRAM}; auto input_dram_buffer = CreateBuffer(dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); vector compute_kernel_args = { uint32_t(test_config.num_tiles), // per_core_block_cnt @@ -145,15 +143,13 @@ bool run_sfpu_all_same_buffer(CommandQueue& cq, const SfpuConfig& test_config) { // Same runtime args for every core vector reader_rt_args = { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }; vector writer_rt_args = { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + 0, (uint32_t)test_config.num_tiles, }; diff --git a/tests/tt_metal/tt_metal/llk/test_broadcast.cpp b/tests/tt_metal/tt_metal/llk/test_broadcast.cpp index 19baa1821845..e1231125fccd 100644 --- a/tests/tt_metal/tt_metal/llk/test_broadcast.cpp +++ 
b/tests/tt_metal/tt_metal/llk/test_broadcast.cpp @@ -178,26 +178,20 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& auto src_a_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_a_addr = src_a_dram_buffer->address(); - auto src_a_dram_noc_xy = src_a_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_src_a_cb_config = - tt_metal::CircularBufferConfig(single_tile_size, {{0, tt::DataFormat::Float16_b}}) - .set_page_size(0, single_tile_size); + tt_metal::CircularBufferConfig l1_src_a_cb_config = tt_metal::CircularBufferConfig(single_tile_size, {{0, tt::DataFormat::Float16_b}}) + .set_page_size(0, single_tile_size); auto l1_src_a_cb = tt_metal::CreateCircularBuffer(program, core, l1_src_a_cb_config); auto src_b_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_b_addr = src_b_dram_buffer->address(); - auto src_b_dram_noc_xy = src_b_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_src_b_cb_config = - tt_metal::CircularBufferConfig(single_tile_size, {{1, tt::DataFormat::Float16_b}}) - .set_page_size(1, single_tile_size); + tt_metal::CircularBufferConfig l1_src_b_cb_config = tt_metal::CircularBufferConfig(single_tile_size, {{1, tt::DataFormat::Float16_b}}) + .set_page_size(1, single_tile_size); auto l1_src_b_cb = tt_metal::CreateCircularBuffer(program, core, l1_src_b_cb_config); auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dst_dram_noc_xy = dst_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_dst_cb_config = - tt_metal::CircularBufferConfig(single_tile_size, {{16, tt::DataFormat::Float16_b}}) - .set_page_size(16, single_tile_size); + tt_metal::CircularBufferConfig l1_dst_cb_config = tt_metal::CircularBufferConfig(single_tile_size, {{16, tt::DataFormat::Float16_b}}) + .set_page_size(16, single_tile_size); auto l1_dst_cb = tt_metal::CreateCircularBuffer(program, core, l1_dst_cb_config); 
std::map defines = { @@ -259,12 +253,10 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& core, { (uint32_t)dram_buffer_src_a_addr, - (uint32_t)src_a_dram_noc_xy.x, - (uint32_t)src_a_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)dram_buffer_src_b_addr, - (uint32_t)src_b_dram_noc_xy.x, - (uint32_t)src_b_dram_noc_xy.y, - (uint32_t)1, + (uint32_t)0, // dram bank id + (uint32_t)1, // num tiles }); tt_metal::SetRuntimeArgs( @@ -273,9 +265,8 @@ void run_single_core_broadcast(tt_metal::Device* device, const BroadcastConfig& core, { (uint32_t)dram_buffer_dst_addr, - (uint32_t)dst_dram_noc_xy.x, - (uint32_t)dst_dram_noc_xy.y, - (uint32_t)1, + (uint32_t)0, // dram bank id + (uint32_t)1, // num tiles }); std::vector input0 = generate_uniform_random_vector( diff --git a/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp index 9b0b1b0a8bcf..7d77364008cb 100644 --- a/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp +++ b/tests/tt_metal/tt_metal/llk/test_copy_block_matmul_partials.cpp @@ -49,9 +49,6 @@ void run_single_core_copy_block_matmul_partials( auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer_bf16->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = test_config.src0_cb_index; uint32_t num_input_tiles = test_config.reader_ublock; @@ -134,8 +131,7 @@ void run_single_core_copy_block_matmul_partials( unary_reader_kernel, core, {dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, + (uint32_t)0, // dram bank id num_tiles, src0_cb_index, test_config.reader_ublock, @@ -146,8 +142,7 @@ void run_single_core_copy_block_matmul_partials( unary_writer_kernel, core, {dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, 
+ (uint32_t)0, // dram bank id num_tiles, ouput_cb_index, test_config.writer_ublock, diff --git a/tests/tt_metal/tt_metal/llk/test_cumsum.cpp b/tests/tt_metal/tt_metal/llk/test_cumsum.cpp index 378b435554c7..5d87a9f79b3b 100644 --- a/tests/tt_metal/tt_metal/llk/test_cumsum.cpp +++ b/tests/tt_metal/tt_metal/llk/test_cumsum.cpp @@ -71,18 +71,14 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c auto src_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto src_dram_noc_xy = src_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_src_cb_config = - tt_metal::CircularBufferConfig(dram_buffer_size, {{0, tt::DataFormat::Float16_b}}) - .set_page_size(0, single_tile_size); + tt_metal::CircularBufferConfig l1_src_cb_config = tt_metal::CircularBufferConfig(dram_buffer_size, {{0, tt::DataFormat::Float16_b}}) + .set_page_size(0, single_tile_size); auto l1_src_cb = tt_metal::CreateCircularBuffer(program, core, l1_src_cb_config); auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dst_dram_noc_xy = dst_dram_buffer->noc_coordinates(); - tt_metal::CircularBufferConfig l1_dst_cb_config = - tt_metal::CircularBufferConfig(dram_buffer_size, {{16, tt::DataFormat::Float16_b}}) - .set_page_size(16, single_tile_size); + tt_metal::CircularBufferConfig l1_dst_cb_config = tt_metal::CircularBufferConfig(dram_buffer_size, {{16, tt::DataFormat::Float16_b}}) + .set_page_size(16, single_tile_size); auto l1_dst_cb = tt_metal::CreateCircularBuffer(program, core, l1_dst_cb_config); string reader_kernel_name, writer_kernel_name; @@ -126,9 +122,9 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c core, { (uint32_t)dram_buffer_src_addr, - (uint32_t)src_dram_noc_xy.x, - (uint32_t)src_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)test_config.N * test_config.Ht * test_config.Wt, // Used for non 
transposing kernel + (uint32_t)0, // Unused (uint32_t)test_config.N, // Used for transposing kernel (uint32_t)test_config.Ht, // Used for transposing kernel (uint32_t)test_config.Wt, // Used for transposing kernel @@ -141,9 +137,9 @@ void run_single_core_cumsum(tt_metal::Device* device, const CumsumConfig& test_c core, { (uint32_t)dram_buffer_dst_addr, - (uint32_t)dst_dram_noc_xy.x, - (uint32_t)dst_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)test_config.N * test_config.Ht * test_config.Wt, // Used for non transposing kernel + (uint32_t)0, // Unused (uint32_t)test_config.N, // Used for transposing kernel (uint32_t)test_config.Ht, // Used for transposing kernel (uint32_t)test_config.Wt, // Used for transposing kernel diff --git a/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp b/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp index 3f588d713035..2222461d4c30 100644 --- a/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_dropout_sfpu_compute.cpp @@ -162,8 +162,7 @@ bool test_dropout_standalone( core, { src0_dram_buffer->address(), - static_cast(src0_dram_buffer->noc_coordinates().x), - static_cast(src0_dram_buffer->noc_coordinates().y), + 0, // dram bank id num_tiles, }); @@ -172,8 +171,7 @@ bool test_dropout_standalone( unary_writer_kernel_id, core, {dst_dram_buffer->address(), - static_cast(dst_dram_buffer->noc_coordinates().x), - static_cast(dst_dram_buffer->noc_coordinates().y), + 0, // dram bank id num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/llk/test_reconfig.cpp b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp index e15e209be5f7..a52064b1f674 100644 --- a/tests/tt_metal/tt_metal/llk/test_reconfig.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reconfig.cpp @@ -91,27 +91,22 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c // This will be srcB in Bfp8_b auto input0_dram_buffer = 
CreateBuffer(dram_config_bfp8b); uint32_t input0_dram_byte_address = input0_dram_buffer->address(); - auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); // This will be srcA in Float16_b auto input1_dram_buffer = CreateBuffer(dram_config_bfp16b); uint32_t input1_dram_byte_address = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); // This will be DEST in Float16_b auto input2_dram_buffer = CreateBuffer(dram_config_bfp16b); uint32_t input2_dram_byte_address = input2_dram_buffer->address(); - auto input2_dram_noc_xy = input2_dram_buffer->noc_coordinates(); // This will be Output0 in Float32 or Float16_b depending on fp32_dest_acc_en auto output0_dram_buffer = CreateBuffer(dram_config_out0); uint32_t output0_dram_byte_address = output0_dram_buffer->address(); - auto output0_dram_noc_xy = output0_dram_buffer->noc_coordinates(); // This will be Output1 in Bfp8_b auto output1_dram_buffer = CreateBuffer(dram_config_bfp8b); uint32_t output1_dram_byte_address = output1_dram_buffer->address(); - auto output1_dram_noc_xy = output1_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_input0_cb_config = tt_metal::CircularBufferConfig(dram_buffer_size_bfp8b, {{in0_id, tt::DataFormat::Bfp8_b}}) @@ -254,21 +249,24 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c tt_metal::detail::WriteToBuffer(input1_dram_buffer, src1_vec); tt_metal::detail::WriteToBuffer(input2_dram_buffer, src2_vec); + static constexpr uint32_t k_input0_dram_bank_id = 0; + static constexpr uint32_t k_input1_dram_bank_id = 0; + static constexpr uint32_t k_input2_dram_bank_id = 0; + static constexpr uint32_t k_output0_dram_bank_id = 0; + static constexpr uint32_t k_output1_dram_bank_id = 0; + tt_metal::SetRuntimeArgs( program, reader_kernel, core, { (uint32_t)input0_dram_byte_address, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + k_input0_dram_bank_id, // dram bank id 
(uint32_t)input1_dram_byte_address, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + k_input1_dram_bank_id, (uint32_t)test_config.num_tiles, (uint32_t)input2_dram_byte_address, - (uint32_t)input2_dram_noc_xy.x, - (uint32_t)input2_dram_noc_xy.y, + k_input2_dram_bank_id, }); tt_metal::SetRuntimeArgs( program, @@ -276,12 +274,10 @@ bool single_core_reconfig(tt_metal::Device* device, const ReconfigConfig& test_c core, { (uint32_t)output0_dram_byte_address, - (uint32_t)output0_dram_noc_xy.x, - (uint32_t)output0_dram_noc_xy.y, + k_output0_dram_bank_id, (uint32_t)out0_id, (uint32_t)output1_dram_byte_address, - (uint32_t)output1_dram_noc_xy.x, - (uint32_t)output1_dram_noc_xy.y, + k_output1_dram_bank_id, (uint32_t)out1_id, (uint32_t)test_config.num_tiles, (uint32_t)test_config.ublock_size_tiles, diff --git a/tests/tt_metal/tt_metal/llk/test_reduce.cpp b/tests/tt_metal/tt_metal/llk/test_reduce.cpp index 7d2d51556ad7..898827b074bb 100644 --- a/tests/tt_metal/tt_metal/llk/test_reduce.cpp +++ b/tests/tt_metal/tt_metal/llk/test_reduce.cpp @@ -145,10 +145,11 @@ void add_reader_writer_kernels( program, unary_writer_kernel, logical_core, - {dst_dram_buffer->address(), - (std::uint32_t)dst_dram_buffer->noc_coordinates().x, - (std::uint32_t)dst_dram_buffer->noc_coordinates().y, - num_tensor_tiles / Ht}); + { + dst_dram_buffer->address(), + (uint32_t)0, // dram bank id + num_tensor_tiles / Ht // num tiles + }); break; } @@ -176,8 +177,8 @@ void add_reader_writer_kernels( logical_core, { src_dram_buffer->address(), - (std::uint32_t)src_dram_buffer->noc_coordinates().x, - (std::uint32_t)src_dram_buffer->noc_coordinates().y, + (uint32_t)0, // dram bank id + (uint32_t)0, // unused num_tensor_tiles, NC, Ht, @@ -193,8 +194,7 @@ void add_reader_writer_kernels( unary_writer_kernel, logical_core, {dst_dram_buffer->address(), - (std::uint32_t)dst_dram_buffer->noc_coordinates().x, - (std::uint32_t)dst_dram_buffer->noc_coordinates().y, + (uint32_t)0, // dram bank id 
num_tiles}); break; diff --git a/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp b/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp index 0e2d99065b1d..ec74db1eeab7 100644 --- a/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_sfpu_compute.cpp @@ -124,10 +124,8 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c auto input_dram_buffer = CreateBuffer(dram_config); uint32_t input_dram_byte_address = input_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); vector compute_kernel_args = { uint32_t(test_config.num_tiles), // per_core_block_cnt @@ -149,15 +147,13 @@ bool run_sfpu_all_same_buffer(tt_metal::Device* device, const SfpuConfig& test_c // Same runtime args for every core vector reader_rt_args = { (uint32_t)input_dram_byte_address, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, + (uint32_t)0, (uint32_t)test_config.num_tiles, }; vector writer_rt_args = { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + (uint32_t)0, (uint32_t)test_config.num_tiles, }; diff --git a/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp index 19e2534d941f..39c9e4c91831 100644 --- a/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_binary_compute.cpp @@ -92,19 +92,15 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& .device = device, .size = byte_size, .page_size = byte_size, .buffer_type = tt::tt_metal::BufferType::DRAM}; auto input0_dram_buffer = CreateBuffer(dram_config); uint32_t input0_dram_byte_address = input0_dram_buffer->address(); - 
auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); auto input1_dram_buffer = CreateBuffer(dram_config); uint32_t input1_dram_byte_address = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); auto input2_dram_buffer = CreateBuffer(dram_config); uint32_t input2_dram_byte_address = input2_dram_buffer->address(); - auto input2_dram_noc_xy = input2_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_byte_address = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_cb_config = tt_metal::CircularBufferConfig(byte_size, {{0, test_config.l1_input_data_format}}) @@ -244,15 +240,12 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& test_config.core, { (uint32_t)input0_dram_byte_address, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)input1_dram_byte_address, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)test_config.num_tiles, (uint32_t)input2_dram_byte_address, - (uint32_t)input2_dram_noc_xy.x, - (uint32_t)input2_dram_noc_xy.y, + (uint32_t)0, // dram bank id }); tt_metal::SetRuntimeArgs( program, @@ -260,8 +253,7 @@ bool single_core_binary(tt_metal::Device* device, const SingleCoreBinaryConfig& test_config.core, { (uint32_t)output_dram_byte_address, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)test_config.num_tiles, }); diff --git a/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp index 8e624df6a67c..34ad0b088189 100644 --- a/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp +++ b/tests/tt_metal/tt_metal/llk/test_single_core_matmul_compute.cpp @@ -179,13 +179,10 @@ bool 
single_tile_matmul(tt_metal::Device* device) { tt_metal::Program program = tt_metal::CreateProgram(); auto input0_dram_buffer = CreateBuffer(dram_config); const uint32_t in0_dram_addr = input0_dram_buffer->address(); - auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); auto input1_dram_buffer = CreateBuffer(dram_config); const uint32_t in1_dram_addr = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config); const uint32_t out_dram_addr = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); tt_metal::CircularBufferConfig l1_input0_cb_config = tt_metal::CircularBufferConfig(byte_size, {{in0_cb_index, tt::DataFormat::Float16_b}}) @@ -256,11 +253,9 @@ bool single_tile_matmul(tt_metal::Device* device) { core, { (uint32_t)in0_dram_addr, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + (uint32_t)0, // in_0 dram bank id (uint32_t)in1_dram_addr, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + (uint32_t)0, (uint32_t)1, // num_tiles }); tt_metal::SetRuntimeArgs( @@ -269,9 +264,8 @@ bool single_tile_matmul(tt_metal::Device* device) { core, { (uint32_t)out_dram_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, - (uint32_t)1, + (uint32_t)0, + (uint32_t)1, // num_tiles }); tt_metal::detail::LaunchProgram(device, program); @@ -323,12 +317,9 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 tt_metal::Program program = tt_metal::CreateProgram(); auto input0_dram_buffer = CreateBuffer(dram_config_0); const uint32_t in0_dram_addr = input0_dram_buffer->address(); - auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); auto input1_dram_buffer = CreateBuffer(dram_config_1); const uint32_t in1_dram_addr = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); auto output_dram_buffer = 
CreateBuffer(dram_config_out); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); const uint32_t out_dram_addr = output_dram_buffer->address(); tt_metal::CircularBufferConfig l1_input0_cb_config = @@ -403,11 +394,9 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 core, { (uint32_t)in0_dram_addr, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + (uint32_t)0, (uint32_t)in1_dram_addr, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + (uint32_t)0, (uint32_t)1, // num_blocks (uint32_t)M * K, // in0_block_tile_cnt (uint32_t)K * N, // in1_block_tile_cnt @@ -420,8 +409,7 @@ bool single_block_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint3 core, { (uint32_t)out_dram_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + (uint32_t)0, (uint32_t)M * N, }); @@ -483,18 +471,13 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N tt_metal::Program program = tt_metal::CreateProgram(); auto input0_dram_buffer = CreateBuffer(dram_config_0); const uint32_t in0_dram_addr = input0_dram_buffer->address(); - auto input0_dram_noc_xy = input0_dram_buffer->noc_coordinates(); auto input1_dram_buffer = CreateBuffer(dram_config_1); const uint32_t in1_dram_addr = input1_dram_buffer->address(); - auto input1_dram_noc_xy = input1_dram_buffer->noc_coordinates(); auto output_dram_buffer = CreateBuffer(dram_config_out); const uint32_t out_dram_addr = output_dram_buffer->address(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - - tt_metal::CircularBufferConfig l1_input0_cb_config = - tt_metal::CircularBufferConfig(in0_byte_size, {{in0_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(in0_cb_index, cb_page_size); + tt_metal::CircularBufferConfig l1_input0_cb_config = tt_metal::CircularBufferConfig(in0_byte_size, {{in0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(in0_cb_index, cb_page_size); auto l1_input0_cb 
= tt_metal::CreateCircularBuffer(program, core, l1_input0_cb_config); tt_metal::CircularBufferConfig l1_input1_cb_config = @@ -580,11 +563,9 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N core, { (uint32_t)in0_dram_addr, - (uint32_t)input0_dram_noc_xy.x, - (uint32_t)input0_dram_noc_xy.y, + (uint32_t)0, (uint32_t)in1_dram_addr, - (uint32_t)input1_dram_noc_xy.x, - (uint32_t)input1_dram_noc_xy.y, + (uint32_t)0, (uint32_t)1, // num_blocks (uint32_t)M * K, // in0_block_tile_cnt (uint32_t)K * N, // in1_block_tile_cnt @@ -597,8 +578,7 @@ bool blocked_matmul(tt_metal::Device* device, uint32_t M, uint32_t K, uint32_t N core, { (uint32_t)out_dram_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, + (uint32_t)0, (uint32_t)M * N, }); diff --git a/tests/tt_metal/tt_metal/llk/test_transpose.cpp b/tests/tt_metal/tt_metal/llk/test_transpose.cpp index f8124dd3df16..d018f1ba74cd 100644 --- a/tests/tt_metal/tt_metal/llk/test_transpose.cpp +++ b/tests/tt_metal/tt_metal/llk/test_transpose.cpp @@ -108,9 +108,6 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - CoreCoord dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - CoreCoord dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_buffer_tiles = 32; tt_metal::CircularBufferConfig cb_src0_config = @@ -161,8 +158,8 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& core, { dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, + (uint32_t)0, // unused to maintain compat + (uint32_t)0, // unused to maintain compat num_tensor_tiles, NC, Ht, @@ -175,7 +172,9 @@ void run_single_core_transpose(tt_metal::Device* device, const TransposeConfig& program, unary_writer_kernel, core, - {dram_buffer_dst_addr, 
(std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tensor_tiles}); + {dram_buffer_dst_addr, + (uint32_t)0, // unused to maintain compat + num_tensor_tiles}); auto seed = std::chrono::system_clock::now().time_since_epoch().count(); vector src_vec = create_random_vector_of_bfloat16(dram_buffer_size, 100.0f, 0x1234); diff --git a/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp b/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp index 85328167e4b1..9a25538f9579 100644 --- a/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp +++ b/tests/tt_metal/tt_metal/llk/test_untilize_tilize.cpp @@ -87,9 +87,6 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& std::shared_ptr dst_dram_buffer = CreateBuffer(output_dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - CoreCoord dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - CoreCoord dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = num_tiles; tt_metal::CircularBufferConfig cb_src0_config = @@ -105,7 +102,6 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& if (test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) { src1_dram_buffer = CreateBuffer(input_dram_config); dram_buffer_src1_addr = src1_dram_buffer->address(); - dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); uint32_t src1_cb_index = tt::CBIndex::c_1; uint32_t num_input_tiles = num_tiles; @@ -195,18 +191,17 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& std::vector src1_vec; - if (test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) { + if (test_config.tilize_type.has_value() && test_config.tilize_type == TilizeType::UNPACK_A_B) { + // tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp tt_metal::SetRuntimeArgs( program, reader_kernel, core, {
dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, // dram bank id dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, // dram bank id (uint32_t)num_tiles, }); @@ -214,24 +209,20 @@ void run_single_core_tilize_program(tt_metal::Device* device, const TestConfig& tt_metal::detail::WriteToBuffer(src1_dram_buffer, src1_vec); } else { + // tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp tt_metal::SetRuntimeArgs( program, reader_kernel, core, {dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, // dram bank id num_tiles, src0_cb_index, test_config.num_tiles_c, false}); } - tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dram_buffer_dst_addr, (uint32_t)0, num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index 2e49aec6d0c8..16249c9f9f85 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -229,293 +229,14 @@ uint32_t get_dram_bandwidth(tt::ARCH arch) { return dram_bandwidth_gb_per_sec; } -void get_dram_reader_core_coords_grayskull( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for grayskull - uint32_t full_grid_size_y = 12; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = 
compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - uint32_t max_worker_y_physical = 0; - uint32_t min_worker_y_physical = 10000; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - if (core_phy.y > max_worker_y_physical) { - max_worker_y_physical = core_phy.y; - } - if (core_phy.y < min_worker_y_physical) { - min_worker_y_physical = core_phy.y; - } - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x; - uint32_t adj_core_y = dram_core.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto y = coord.y; - - // if row is harvested, move core down by 1 - while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and - y < 
(full_grid_size_y - 1)) { - y += 1; - } - - coord.y = y; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - -void get_dram_reader_core_coords( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - uint32_t full_grid_size_x = device->grid_size().x; - uint32_t full_grid_size_y = device->grid_size().y; - uint32_t x_step = 3; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - dram_coord_phy.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - all_worker_cores_logical.reserve(num_cores_x * num_cores_y); - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - all_worker_cores_y_physical.reserve(num_cores_y); - uint32_t max_worker_y_physical = 0; - uint32_t min_worker_y_physical = 10000; - 
for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - if (core_phy.y > max_worker_y_physical) { - max_worker_y_physical = core_phy.y; - } - if (core_phy.y < min_worker_y_physical) { - min_worker_y_physical = core_phy.y; - } - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - adj_core_physical.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // split the adjacent coords into two groups, because DRAM banks has two cols - std::vector adj_core_physical_g1; - adj_core_physical_g1.reserve(num_banks); - std::vector adj_core_physical_y_g1; - adj_core_physical_y_g1.reserve(num_banks); - std::vector adj_core_physical_g2; - adj_core_physical_g2.reserve(num_banks); - std::vector adj_core_physical_y_g2; - adj_core_physical_y_g2.reserve(num_banks); - for (auto core : adj_core_physical) { - if (core.x == adj_core_physical.front().x) { - adj_core_physical_g1.push_back(core); - } else { - adj_core_physical_g2.push_back(core); - } - } - std::vector indices_g1(adj_core_physical_g1.size()); - std::vector indices_g2(adj_core_physical_g2.size()); - std::iota(indices_g1.begin(), indices_g1.end(), 0); - std::iota(indices_g2.begin(), indices_g2.end(), 0); - std::sort(indices_g1.begin(), indices_g1.end(), [&adj_core_physical_g1](int i1, int i2) { - return adj_core_physical_g1[i1].y < adj_core_physical_g1[i2].y; - 
}); - std::sort(indices_g2.begin(), indices_g2.end(), [&adj_core_physical_g2](int i1, int i2) { - return adj_core_physical_g2[i1].y < adj_core_physical_g2[i2].y; - }); - std::rotate(indices_g1.begin(), indices_g1.end() - 1, indices_g1.end()); - std::rotate(indices_g2.begin(), indices_g2.end() - 1, indices_g2.end()); - - std::vector indices_g1_realloc(adj_core_physical_g1.size()); - std::vector indices_g2_realloc(adj_core_physical_g2.size()); - for (int new_index = 0; new_index < indices_g1.size(); ++new_index) { - indices_g1_realloc[indices_g1[new_index]] = new_index; - } - for (int new_index = 0; new_index < indices_g2.size(); ++new_index) { - indices_g2_realloc[indices_g2[new_index]] = new_index; - } - - std::sort(adj_core_physical_g1.begin(), adj_core_physical_g1.end(), [](const CoreCoord& a, const CoreCoord& b) { - return a.y < b.y; - }); - std::sort(adj_core_physical_g2.begin(), adj_core_physical_g2.end(), [](const CoreCoord& a, const CoreCoord& b) { - return a.y < b.y; - }); - std::rotate(adj_core_physical_g1.begin(), adj_core_physical_g1.end() - 1, adj_core_physical_g1.end()); - std::rotate(adj_core_physical_g2.begin(), adj_core_physical_g2.end() - 1, adj_core_physical_g2.end()); - - for (auto core : adj_core_physical_g1) { - adj_core_physical_y_g1.push_back(core.y); - } - for (auto core : adj_core_physical_g2) { - adj_core_physical_y_g2.push_back(core.y); - } - - // move the workers, if they are on harvested rows - auto process_group = [&](std::vector& group, std::vector& group_y, uint32_t x_step) { - for (auto& coord : group) { - auto y = coord.y; - - if (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() || - std::count(group_y.begin(), group_y.end(), y) >= 2) { - auto adjust_coord = [&](int start, int end, int step) { - bool found_new_row = false; - for (int j = start; step > 0 ? 
j <= end : j >= end; j += step) { - if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end() && - std::count(group_y.begin(), group_y.end(), j) == 0) { - coord.y = j; - coord.x += x_step; - x_step--; - found_new_row = true; - break; - } - } - if (not found_new_row) { - for (int j = start; step > 0 ? j <= end : j >= end; j += step) { - if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end()) { - coord.y = j; - coord.x += x_step; - x_step--; - found_new_row = true; - break; - } - } - } - }; - - if (y >= max_bank_id) { - adjust_coord(max_worker_y_physical, min_worker_y_physical, -1); - } else { - adjust_coord(min_worker_y_physical, max_worker_y_physical, 1); - } - } - } - }; - // move the workers, if they are on harvested rows - process_group(adj_core_physical_g1, adj_core_physical_y_g1, x_step); - process_group(adj_core_physical_g2, adj_core_physical_y_g2, x_step); - - // merge two group into one - std::vector adj_core_physical_realloc; - adj_core_physical_realloc.reserve(num_banks); - for (int i = 0; i < indices_g1_realloc.size(); ++i) { - adj_core_physical_realloc.push_back(adj_core_physical_g1[indices_g1_realloc[i]]); - } - for (int i = 0; i < indices_g2_realloc.size(); ++i) { - adj_core_physical_realloc.push_back(adj_core_physical_g2[indices_g2_realloc[i]]); - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - adj_core_logical_realloc.reserve(num_banks); - for (int i = 0; i < adj_core_physical_realloc.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical_realloc[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets +void get_optimal_dram_bank_to_reader_assignment( + Device* device, std::vector& all_worker_cores_ordered, CoreRangeSet& all_worker_cores) { + 
all_worker_cores_ordered = device->get_optimal_dram_bank_to_logical_worker_assignment(); std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (const auto& worker_core : all_worker_cores_ordered) { + all_cores_set.insert(CoreRange(worker_core)); } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; + all_worker_cores = CoreRangeSet(all_cores_set); } int main(int argc, char** argv) { @@ -640,18 +361,14 @@ int main(int argc, char** argv) { CoreRangeSet all_cores; std::vector all_cores_list; - if (device->arch() == tt::ARCH::GRAYSKULL) { - get_dram_reader_core_coords_grayskull(device, all_cores, all_cores_list); - } else { - get_dram_reader_core_coords(device, all_cores, all_cores_list); - } + get_optimal_dram_bank_to_reader_assignment(device, all_cores_list, all_cores); uint32_t num_tiles_per_core = num_tiles / num_cores; uint32_t num_tiles_cb = num_tiles_per_core / num_blocks; for (auto core : all_cores_list) { - auto phys_core = device->worker_core_from_logical_core(core); - log_info("logical core: {}, physical coer: {}", core, phys_core); + auto virtual_core = device->worker_core_from_logical_core(core); + log_info("logical core: {}, virtual core: {}", core, virtual_core); } log_info( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp index 03d8cce586b0..d5abe69e1a6c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp @@ -336,453 +336,59 @@ uint32_t get_dram_bandwidth(tt::ARCH arch) { return dram_bandwidth_gb_per_sec; } -void get_dram_reader_core_coords_blackhole( - tt_metal::Device* 
device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - uint32_t full_grid_size_x = soc_d.grid_size.x; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get x coords of the workers - std::vector all_worker_cores_x_physical; - for (int i = 0; i < num_cores_x; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(i, 0)); - all_worker_cores_x_physical.push_back(core_phy.x); - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_cols; - for (int i = 0; i < full_grid_size_x; ++i) { - auto x = i; - - if (std::find(all_worker_cores_x_physical.begin(), all_worker_cores_x_physical.end(), x) == - all_worker_cores_x_physical.end()) { - harvested_cols.push_back(x); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested cols - for (auto& coord : adj_core_physical) { - auto x = coord.x; - - // if row is harvested, move core down by 1 - while 
(std::find(harvested_cols.begin(), harvested_cols.end(), x) != harvested_cols.end() and - x < (full_grid_size_x - 1)) { - x += 1; - } - - coord.x = x; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets +void get_optimal_dram_bank_to_reader_assignment( + Device* device, std::vector& all_worker_cores_ordered, CoreRangeSet& all_worker_cores) { + all_worker_cores_ordered = device->get_optimal_dram_bank_to_logical_worker_assignment(); std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (const auto& worker_core : all_worker_cores_ordered) { + all_cores_set.insert(CoreRange(worker_core)); } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; + all_worker_cores = CoreRangeSet(all_cores_set); } -void get_l1_writer_core_coords_blackhole( - tt_metal::Device* device, - std::vector& all_dram_reader_cores, - CoreRangeSet& all_cores, - std::vector& all_cores_ordered) { - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - uint32_t full_grid_size_x = soc_d.grid_size.x; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get x coords of the workers - 
std::vector all_worker_cores_x_physical; - for (int i = 0; i < num_cores_x; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(i, 0)); - all_worker_cores_x_physical.push_back(core_phy.x); - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_cols; - for (int i = 0; i < full_grid_size_x; ++i) { - auto x = i; - - if (std::find(all_worker_cores_x_physical.begin(), all_worker_cores_x_physical.end(), x) == - all_worker_cores_x_physical.end()) { - harvested_cols.push_back(x); - } - } - - // get the ajacent cores of DRAM readers, for grayskull the l1 writers are below DRAM readers - std::vector adj_core_physical; +void get_l1_writer_core_coords_wormhole_b0( + std::vector& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { + // Place writers horizontally next to DRAM readers in logical space (no column harvesting for WH) for (int i = 0; i < all_dram_reader_cores.size(); ++i) { auto dram_reader_core = all_dram_reader_cores[i]; - auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core); - uint32_t adj_core_x = dram_reader_core_phy.x + 1; - uint32_t adj_core_y = dram_reader_core_phy.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - uint32_t adj_core_x2 = dram_reader_core_phy.x + 2; - uint32_t adj_core_y2 = dram_reader_core_phy.y; - adj_core_physical.push_back(CoreCoord(adj_core_x2, adj_core_y2)); + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 1, dram_reader_core.y)); + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 2, dram_reader_core.y)); } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto x = coord.x; - - // if row is harvested, move core down by 1 - while (std::find(harvested_cols.begin(), harvested_cols.end(), x) != harvested_cols.end() and - x < (full_grid_size_x - 1)) { - x += 1; - } - - coord.x = x; - } - - // find the logical coord from 
physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets std::set all_cores_set; - for (int i = 0; i < adj_core_logical_realloc.size(); ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (int i = 0; i < all_cores_ordered.size(); ++i) { + all_cores_set.insert(CoreRange(all_cores_ordered[i])); } all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; } -void get_dram_reader_core_coords_grayskull( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - uint32_t full_grid_size_y = soc_d.grid_size.y; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - } - - // get 
the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x; - uint32_t adj_core_y = dram_core.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto y = coord.y; - - // if row is harvested, move core down by 1 - while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and - y < (full_grid_size_y - 1)) { - y += 1; - } - - coord.y = y; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - -void get_l1_writer_core_coords_grayskull( - tt_metal::Device* device, - std::vector& all_dram_reader_cores, - CoreRangeSet& all_cores, - std::vector& all_cores_ordered) { - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - uint32_t full_grid_size_y = soc_d.grid_size.y; - - // get all the logical coord - auto compute_with_storage_grid_size = 
device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM readers, for grayskull the l1 writers are below DRAM readers - std::vector adj_core_physical; +void get_l1_writer_core_coords_blackhole( + std::vector& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { + // Place writers horizontally next to DRAM readers in logical space (column harvesting enabled for BH incrementing + // in logical space can lead to physical physical columns being skipped when placing writers next to readers) for (int i = 0; i < all_dram_reader_cores.size(); ++i) { auto dram_reader_core = all_dram_reader_cores[i]; - auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core); - uint32_t adj_core_x = dram_reader_core_phy.x; - uint32_t adj_core_y = dram_reader_core_phy.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - uint32_t adj_core_x2 = dram_reader_core_phy.x + 1; - uint32_t adj_core_y2 = dram_reader_core_phy.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x2, 
adj_core_y2)); - } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto y = coord.y; - - // if row is harvested, move core down by 1 - while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and - y < (full_grid_size_y - 1)) { - y += 1; - } - - coord.y = y; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 1, dram_reader_core.y)); + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 2, dram_reader_core.y)); } - - // create sets std::set all_cores_set; - for (int i = 0; i < adj_core_logical_realloc.size(); ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (int i = 0; i < all_cores_ordered.size(); ++i) { + all_cores_set.insert(CoreRange(all_cores_ordered[i])); } all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; } -void get_dram_reader_core_coords_wormhole_b0( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - dram_coord_phy.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - 
std::vector all_worker_cores_logical; - all_worker_cores_logical.reserve(num_cores_x * num_cores_y); - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - adj_core_physical.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // find the logical coord from physical coord - std::vector adj_core_logical; - adj_core_logical.reserve(num_banks); - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical; -} - -void get_l1_writer_core_coords_wormhole_b0( - tt_metal::Device* device, - std::vector& all_dram_reader_cores, - CoreRangeSet& all_cores, - std::vector& all_cores_ordered) { - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get the ajacent cores of DRAM readers, for wormhole the l1 writers are on the left or right DRAM readers - 
std::vector adj_core_physical; +void get_l1_writer_core_coords_grayskull( + std::vector& all_dram_reader_cores, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { for (int i = 0; i < all_dram_reader_cores.size(); ++i) { auto dram_reader_core = all_dram_reader_cores[i]; - auto dram_reader_core_phy = device->worker_core_from_logical_core(dram_reader_core); - uint32_t adj_core_x1 = dram_reader_core_phy.x + 1; - uint32_t adj_core_y1 = dram_reader_core_phy.y; - adj_core_physical.push_back(CoreCoord(adj_core_x1, adj_core_y1)); - uint32_t adj_core_x2 = dram_reader_core_phy.x + 2; - uint32_t adj_core_y2 = dram_reader_core_phy.y; - adj_core_physical.push_back(CoreCoord(adj_core_x2, adj_core_y2)); - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x, dram_reader_core.y + 1)); + all_cores_ordered.push_back(CoreCoord(dram_reader_core.x + 1, dram_reader_core.y + 1)); } - - // create sets std::set all_cores_set; - for (int i = 0; i < adj_core_logical_realloc.size(); ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); + for (int i = 0; i < all_cores_ordered.size(); ++i) { + all_cores_set.insert(CoreRange(all_cores_ordered[i])); } all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; } int main(int argc, char** argv) { @@ -804,39 +410,38 @@ int main(int argc, char** argv) { log_info("start DRAM benchmark"); + // try { + //////////////////////////////////////////////////////////////////////////// + // Initial Runtime Args Parse + //////////////////////////////////////////////////////////////////////////// + 
std::vector input_args(argv, argv + argc); try { - //////////////////////////////////////////////////////////////////////////// - // Initial Runtime Args Parse - //////////////////////////////////////////////////////////////////////////// - std::vector input_args(argv, argv + argc); - try { - std::tie(k, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192); + std::tie(k, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192); - std::tie(n, input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12 * 128); + std::tie(n, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12 * 128); - std::tie(num_blocks, input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8); + std::tie(num_blocks, input_args) = + test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8); - std::tie(num_tests, input_args) = - test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-tests", 1); + std::tie(num_tests, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-tests", 1); - std::tie(use_device_profiler, input_args) = - test_args::has_command_option_and_remaining_args(input_args, "--use-device-profiler"); + std::tie(use_device_profiler, input_args) = + test_args::has_command_option_and_remaining_args(input_args, "--use-device-profiler"); - std::tie(bypass_check, input_args) = - test_args::has_command_option_and_remaining_args(input_args, "--bypass-check"); + std::tie(bypass_check, input_args) = + test_args::has_command_option_and_remaining_args(input_args, "--bypass-check"); - std::tie(df, input_args) = - test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 2); + std::tie(df, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--data-type", 2); - 
std::tie(num_banks, input_args) = - test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-banks", 12); + std::tie(num_banks, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--num-banks", 12); - std::tie(bank_start_id, input_args) = - test_args::get_command_option_uint32_and_remaining_args(input_args, "--bank-start-id", 0); + std::tie(bank_start_id, input_args) = + test_args::get_command_option_uint32_and_remaining_args(input_args, "--bank-start-id", 0); - test_args::validate_remaining_args(input_args); + test_args::validate_remaining_args(input_args); } catch (const std::exception& e) { log_error(tt::LogTest, "Command line arguments found exception", e.what()); TT_ASSERT(false); @@ -901,7 +506,7 @@ int main(int argc, char** argv) { tt_metal::DispatchCoreConfig{tt_metal::DispatchCoreType::WORKER, tt_metal::DispatchCoreAxis::ROW}; } else { dispatch_core_config = - tt_metal::DispatchCoreConfig{tt_metal::DispatchCoreType::WORKER, tt_metal::DispatchCoreAxis::COL}; + tt_metal::DispatchCoreConfig{tt_metal::DispatchCoreType::WORKER, tt_metal::DispatchCoreAxis::ROW}; } tt_metal::Device* device = tt_metal::CreateDevice(device_id, 1, 0, 0, dispatch_core_config); dram_bandwidth_spec = get_dram_bandwidth(device->arch()); @@ -921,18 +526,17 @@ int main(int argc, char** argv) { std::vector all_dram_reader_cores_ordered; CoreRangeSet all_l1_receiver_cores; std::vector all_l1_writer_cores_ordered; + get_optimal_dram_bank_to_reader_assignment(device, all_dram_reader_cores_ordered, all_dram_reader_cores); + if (device->arch() == tt::ARCH::BLACKHOLE) { - get_dram_reader_core_coords_blackhole(device, all_dram_reader_cores, all_dram_reader_cores_ordered); get_l1_writer_core_coords_blackhole( - device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); + all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); } else if (device->arch() == tt::ARCH::WORMHOLE_B0) { - 
get_dram_reader_core_coords_wormhole_b0(device, all_dram_reader_cores, all_dram_reader_cores_ordered); get_l1_writer_core_coords_wormhole_b0( - device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); + all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); } else { - get_dram_reader_core_coords_grayskull(device, all_dram_reader_cores, all_dram_reader_cores_ordered); get_l1_writer_core_coords_grayskull( - device, all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); + all_dram_reader_cores_ordered, all_l1_receiver_cores, all_l1_writer_cores_ordered); } uint32_t num_tiles_per_core = num_tiles / num_cores; @@ -941,12 +545,12 @@ int main(int argc, char** argv) { log_info("all_dram_reader_cores"); for (auto core : all_dram_reader_cores_ordered) { auto phys_core = device->worker_core_from_logical_core(core); - log_info("logical core: {}, physical core: {}", core, phys_core); + log_info("logical core: {}, virtual core: {}", core, phys_core); } log_info("all_l1_writer_cores"); for (auto core : all_l1_writer_cores_ordered) { auto phys_core = device->worker_core_from_logical_core(core); - log_info("logical core: {}, physical core: {}", core, phys_core); + log_info("logical core: {}, virtual core: {}", core, phys_core); } log_info( @@ -1048,29 +652,29 @@ int main(int argc, char** argv) { } pass &= tt_metal::CloseDevice(device); - } catch (const std::exception& e) { - pass = false; - // Capture the exception error message - log_error(LogTest, "{}", e.what()); - // Capture system call errors that may have returned from driver/kernel - log_error(LogTest, "System error message: {}", std::strerror(errno)); - } - - // Determine if it passes performance goal - auto avg_dram_bandwidth = calculate_average(dram_bandwidth); - if (pass && bypass_check == false) { - // goal is 90% of peak DRAM bandwidth performance - double target_bandwidth = static_cast(dram_bandwidth_spec) * 0.9; - if 
(avg_dram_bandwidth < target_bandwidth) { - pass = false; - log_error( - LogTest, - "The DRAM bandwidth does not meet the criteria. " - "Current: {:.3f}GB/s, goal: {:.3f}GB/s", - avg_dram_bandwidth, - target_bandwidth); + // } catch (const std::exception& e) { + // pass = false; + // // Capture the exception error message + // log_error(LogTest, "{}", e.what()); + // // Capture system call errors that may have returned from driver/kernel + // log_error(LogTest, "System error message: {}", std::strerror(errno)); + // } + + // Determine if it passes performance goal + auto avg_dram_bandwidth = calculate_average(dram_bandwidth); + if (pass && bypass_check == false) { + // goal is 90% of peak DRAM bandwidth performance + double target_bandwidth = static_cast(dram_bandwidth_spec) * 0.9; + if (avg_dram_bandwidth < target_bandwidth) { + pass = false; + log_error( + LogTest, + "The DRAM bandwidth does not meet the criteria. " + "Current: {:.3f}GB/s, goal: {:.3f}GB/s", + avg_dram_bandwidth, + target_bandwidth); + } } - } if (pass) { log_info(LogTest, "Test Passed"); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index 5edfc3fa4b10..08661f7d616c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -138,7 +138,7 @@ DeviceData::DeviceData( auto num_banks = device->num_banks(BufferType::DRAM); for (int bank_id = 0; bank_id < num_banks; bank_id++) { auto dram_channel = device->dram_channel_from_bank_id(bank_id); - CoreCoord phys_core = device->dram_core_from_dram_channel(dram_channel); + CoreCoord phys_core = device->logical_core_from_dram_channel(dram_channel); int32_t bank_offset = device->bank_offset(BufferType::DRAM, bank_id); this->all_data[phys_core][bank_id] = one_core_data_t(); this->all_data[phys_core][bank_id].logical_core = phys_core; @@ -187,7 +187,7 @@ void 
DeviceData::prepopulate_dram(Device* device, uint32_t size_words) { for (int bank_id = 0; bank_id < num_dram_banks; bank_id++) { auto offset = device->bank_offset(BufferType::DRAM, bank_id); auto dram_channel = device->dram_channel_from_bank_id(bank_id); - auto bank_core = device->dram_core_from_dram_channel(dram_channel); + auto bank_core = device->logical_core_from_dram_channel(dram_channel); one_core_data_t& data = this->all_data[bank_core][bank_id]; // Generate random or coherent data per bank of specific size. @@ -212,12 +212,7 @@ void DeviceData::prepopulate_dram(Device* device, uint32_t size_words) { } // Write to device once per bank (appropriate core and offset) - tt::Cluster::instance().write_core( - static_cast(&data.data[0]), - data.data.size() * sizeof(uint32_t), - tt_cxy_pair(device->id(), bank_core), - this->base_data_addr[static_cast(CoreType::DRAM)] + offset); - ; + tt::tt_metal::detail::WriteToDeviceDRAMChannel(device, bank_id, this->base_data_addr[static_cast(CoreType::DRAM)], data.data); this->base_result_data_addr[static_cast(CoreType::DRAM)] = this->base_data_addr[static_cast(CoreType::DRAM)] + data.data.size() * sizeof(uint32_t); @@ -386,8 +381,13 @@ inline bool DeviceData::validate_one_core( } // Read results from device and compare to expected for this core. 
- result_addr += bank_offset; - std::vector results = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, result_addr, size_bytes); + std::vector results; + if (core_type == CoreType::DRAM) { + tt::tt_metal::detail::ReadFromDeviceDRAMChannel(device, bank_id, result_addr, size_bytes, results); + } else { + result_addr += bank_offset; + results = tt::llrt::read_hex_vec_from_core(device->id(), phys_core, result_addr, size_bytes); + } log_info( tt::LogTest, @@ -534,28 +534,22 @@ void configure_kernel_variant( NOC my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index) { - const auto& grid_size = device->grid_size(); + auto my_virtual_noc_coords = device->virtual_noc_coordinate(my_noc_index, phys_my_core); + auto upstream_virtual_noc_coords = device->virtual_noc_coordinate(upstream_noc_index, phys_upstream_core); + auto downstream_virtual_noc_coords = device->virtual_noc_coordinate(downstream_noc_index, phys_downstream_core); std::map defines = { - {"MY_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(my_noc_index, grid_size.x, phys_my_core.x))}, - {"MY_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(my_noc_index, grid_size.y, phys_my_core.y))}, + {"DISPATCH_KERNEL", "1"}, + {"MY_NOC_X", std::to_string(my_virtual_noc_coords.x)}, + {"MY_NOC_Y", std::to_string(my_virtual_noc_coords.y)}, {"UPSTREAM_NOC_INDEX", std::to_string(upstream_noc_index)}, - {"UPSTREAM_NOC_X", - std::to_string(tt::tt_metal::hal.noc_coordinate(upstream_noc_index, grid_size.x, phys_upstream_core.x))}, - {"UPSTREAM_NOC_Y", - std::to_string(tt::tt_metal::hal.noc_coordinate(upstream_noc_index, grid_size.y, phys_upstream_core.y))}, - {"DOWNSTREAM_NOC_X", - std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.x, phys_downstream_core.x))}, - {"DOWNSTREAM_NOC_Y", - std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.y, phys_downstream_core.y))}, - {"DOWNSTREAM_SLAVE_NOC_X", - 
std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.x, 0xff))}, - {"DOWNSTREAM_SLAVE_NOC_Y", - std::to_string(tt::tt_metal::hal.noc_coordinate( - downstream_noc_index, - grid_size.y, - 0xff))}, // todo, add testing with dispatch_s once it processes more than go signals - {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth + {"UPSTREAM_NOC_X", std::to_string(upstream_virtual_noc_coords.x)}, + {"UPSTREAM_NOC_Y", std::to_string(upstream_virtual_noc_coords.y)}, + {"DOWNSTREAM_NOC_X", std::to_string(downstream_virtual_noc_coords.x)}, + {"DOWNSTREAM_NOC_Y", std::to_string(downstream_virtual_noc_coords.y)}, + {"DOWNSTREAM_SLAVE_NOC_X", std::to_string(0xff)}, + {"DOWNSTREAM_SLAVE_NOC_Y", std::to_string(0xff)}, // todo, add dispatch_s testing + {"FD_CORE_TYPE", std::to_string(0)}, // todo, support dispatch on eth }; compile_args.push_back(is_dram_variant); compile_args.push_back(is_host_variant); @@ -663,7 +657,7 @@ inline void generate_random_paged_payload( if (is_dram) { auto dram_channel = device->dram_channel_from_bank_id(bank_id); - bank_core = device->dram_core_from_dram_channel(dram_channel); + bank_core = device->logical_core_from_dram_channel(dram_channel); } else { bank_core = device->logical_core_from_bank_id(bank_id); } @@ -913,8 +907,8 @@ inline void gen_dispatcher_multicast_write_cmd( CQDispatchCmd cmd; memset(&cmd, 0, sizeof(CQDispatchCmd)); - CoreCoord physical_start = device->physical_core_from_logical_core(worker_core_range.start_coord, CoreType::WORKER); - CoreCoord physical_end = device->physical_core_from_logical_core(worker_core_range.end_coord, CoreType::WORKER); + CoreCoord physical_start = device->worker_core_from_logical_core(worker_core_range.start_coord); + CoreCoord physical_end = device->worker_core_from_logical_core(worker_core_range.end_coord); const uint32_t bank_id = 0; // No interleaved pages here. 
cmd.base.cmd_id = CQ_DISPATCH_CMD_WRITE_LINEAR; @@ -1119,8 +1113,8 @@ inline bool gen_rnd_dispatcher_packed_write_large_cmd( device_data.relevel(range); CQDispatchWritePackedLargeSubCmd sub_cmd; - CoreCoord physical_start = device->physical_core_from_logical_core(range.start_coord, CoreType::WORKER); - CoreCoord physical_end = device->physical_core_from_logical_core(range.end_coord, CoreType::WORKER); + CoreCoord physical_start = device->worker_core_from_logical_core(range.start_coord); + CoreCoord physical_end = device->worker_core_from_logical_core(range.end_coord); sub_cmd.noc_xy_addr = NOC_MULTICAST_ENCODING(physical_start.x, physical_start.y, physical_end.x, physical_end.y); sub_cmd.addr = device_data.get_result_data_addr(range.start_coord); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 030d3597b56d..2c1409ab608e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -219,7 +219,7 @@ int main(int argc, char** argv) { } break; case 2: { src_mem = test_write ? 
"TO_L1" : "FROM_L1"; - CoreCoord w = device->physical_core_from_logical_core(src_worker_g, CoreType::WORKER); + CoreCoord w = device->worker_core_from_logical_core(src_worker_g); noc_addr_x = w.x; noc_addr_y = w.y; } break; @@ -233,7 +233,7 @@ int main(int argc, char** argv) { case 4: { src_mem = "FROM_L1_TO_HOST"; log_info(LogTest, "Host bw test overriding page_count to 1"); - CoreCoord w = device->physical_core_from_logical_core(src_worker_g, CoreType::WORKER); + CoreCoord w = device->worker_core_from_logical_core(src_worker_g); page_count_g = 1; noc_addr_x = w.x; noc_addr_y = w.y; @@ -241,7 +241,7 @@ int main(int argc, char** argv) { case 5: { src_mem = "FROM_HOST_TO_L1"; log_info(LogTest, "Host bw test overriding page_count to 1"); - CoreCoord w = device->physical_core_from_logical_core(src_worker_g, CoreType::WORKER); + CoreCoord w = device->worker_core_from_logical_core(src_worker_g); page_count_g = 1; noc_addr_x = w.x; noc_addr_y = w.y; @@ -249,10 +249,8 @@ int main(int argc, char** argv) { case 6: { src_mem = "FROM_L1_TO_MCAST"; issue_mcast = 1; - CoreCoord start = - device->physical_core_from_logical_core(mcast_src_workers_g.start_coord, CoreType::WORKER); - CoreCoord end = - device->physical_core_from_logical_core(mcast_src_workers_g.end_coord, CoreType::WORKER); + CoreCoord start = device->worker_core_from_logical_core(mcast_src_workers_g.start_coord); + CoreCoord end = device->worker_core_from_logical_core(mcast_src_workers_g.end_coord); noc_addr_x = start.x; noc_addr_y = start.y; mcast_noc_addr_end_x = end.x; @@ -299,7 +297,7 @@ int main(int argc, char** argv) { std::shared_ptr sync_event = std::make_shared(); - CoreCoord w = device->physical_core_from_logical_core(worker_g.start_coord, CoreType::WORKER); + CoreCoord w = device->worker_core_from_logical_core(worker_g.start_coord); log_info(LogTest, "Master core: {}", w.str()); string direction = test_write ? 
"Writing" : "Reading"; if (source_mem_g == 3) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index e0b477046414..d22dd0bb90ee 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -390,21 +390,12 @@ void initialize_dram_banks(Device* device) { auto fill = std::vector(bank_size / sizeof(uint32_t), 0xBADDF00D); for (int bank_id = 0; bank_id < num_banks; bank_id++) { - auto offset = device->bank_offset(BufferType::DRAM, bank_id); - auto dram_channel = device->dram_channel_from_bank_id(bank_id); - auto bank_core = device->dram_core_from_dram_channel(dram_channel); log_info( tt::LogTest, - "Initializing DRAM {} bytes for bank_id: {} core: {} at addr: 0x{:x}", + "Initializing DRAM {} bytes for bank_id: {}", bank_size, - bank_id, - bank_core, - offset); - tt::Cluster::instance().write_core( - static_cast(fill.data()), - fill.size() * sizeof(uint32_t), - tt_cxy_pair(device->id(), bank_core), - offset); + bank_id); + tt::tt_metal::detail::WriteToDeviceDRAMChannel(device, bank_id, 0, fill); } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index 5d482863c55f..1b6748e7768f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -300,13 +300,12 @@ void add_prefetcher_paged_read_cmd( add_bare_prefetcher_cmd(cmds, cmd, true); } -void add_prefetcher_linear_read_cmd( - Device* device, - vector& cmds, - vector& sizes, - CoreCoord worker_core, - uint32_t addr, - uint32_t length) { +void add_prefetcher_linear_read_cmd(Device *device, + vector& cmds, + vector& sizes, + CoreCoord worker_core, + uint32_t addr, + uint32_t length) { 
CoreCoord phys_worker_core = device->worker_core_from_logical_core(worker_core); CQPrefetchCmd cmd; @@ -444,7 +443,7 @@ void add_paged_dram_data_to_device_data( for (uint32_t page_idx = start_page; page_idx < last_page; page_idx++) { uint32_t dram_bank_id = page_idx % num_dram_banks_g; auto dram_channel = device->dram_channel_from_bank_id(dram_bank_id); - CoreCoord bank_core = device->dram_core_from_dram_channel(dram_channel); + CoreCoord bank_core = device->logical_core_from_dram_channel(dram_channel); uint32_t bank_offset = base_addr_words + page_size_words * (page_idx / num_dram_banks_g); if (page_idx == last_page - 1) { @@ -500,7 +499,7 @@ void gen_dram_packed_read_cmd( for (uint32_t i = 0; i < length_words; i += page_size_words) { uint32_t dram_bank_id = page_idx % num_dram_banks_g; auto dram_channel = device->dram_channel_from_bank_id(dram_bank_id); - CoreCoord bank_core = device->dram_core_from_dram_channel(dram_channel); + CoreCoord bank_core = device->logical_core_from_dram_channel(dram_channel); uint32_t bank_offset = base_addr_words + page_size_words * (page_idx / num_dram_banks_g); uint32_t words = (page_size_words > length_words - i) ? 
length_words - i : page_size_words; @@ -1050,15 +1049,8 @@ void gen_prefetcher_exec_buf_cmd_and_write_to_dram( uint32_t index = 0; for (uint32_t page_id = 0; page_id < pages; page_id++) { uint32_t bank_id = page_id % num_dram_banks_g; - auto offset = device->bank_offset(BufferType::DRAM, bank_id); - auto dram_channel = device->dram_channel_from_bank_id(bank_id); - auto bank_core = device->dram_core_from_dram_channel(dram_channel); - - tt::Cluster::instance().write_core( - static_cast(&exec_buf_cmds[index / sizeof(uint32_t)]), - page_size, - tt_cxy_pair(device->id(), bank_core), - DRAM_EXEC_BUF_DEFAULT_BASE_ADDR + offset + (page_id / num_dram_banks_g) * page_size); + std::vector exec_buf_page(exec_buf_cmds.begin() + index / sizeof(uint32_t), exec_buf_cmds.begin() + (index + page_size) / sizeof(uint32_t)); + tt::tt_metal::detail::WriteToDeviceDRAMChannel(device, bank_id, DRAM_EXEC_BUF_DEFAULT_BASE_ADDR + (page_id / num_dram_banks_g) * page_size, exec_buf_page); index += page_size; } @@ -1629,22 +1621,8 @@ void initialize_dram_banks(Device* device) { auto fill = std::vector(bank_size / sizeof(uint32_t), 0xBADDF00D); for (int bank_id = 0; bank_id < num_banks; bank_id++) { - auto offset = device->bank_offset(BufferType::DRAM, bank_id); - auto dram_channel = device->dram_channel_from_bank_id(bank_id); - auto bank_core = device->dram_core_from_dram_channel(dram_channel); - - log_info( - tt::LogTest, - "Initializing DRAM {} bytes for bank_id: {} core: {} at addr: 0x{:x}", - bank_size, - bank_id, - bank_core.str(), - offset); - tt::Cluster::instance().write_core( - static_cast(fill.data()), - fill.size() * sizeof(uint32_t), - tt_cxy_pair(device->id(), bank_core), - offset); + log_info(tt::LogTest, "Initializing DRAM {} bytes for bank_id: {}", bank_size, bank_id); + tt::tt_metal::detail::WriteToDeviceDRAMChannel(device, bank_id, 0, fill); } } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp index 3123ea1736ae..476e9890797a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_hop_latencies_no_edm.cpp @@ -224,8 +224,8 @@ void build_and_run_roundtrip_latency_test( sample_page_size, eth_sender_core, receiver_start_semaphore, - device->physical_core_from_logical_core(init_worker_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(init_worker_core, CoreType::WORKER).y, + device->virtual_core_from_logical_core(init_worker_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(init_worker_core, CoreType::WORKER).y, worker_sem0); std::vector const& sender_eth_rt_args = get_eth_sender_rt_args( device, @@ -233,15 +233,15 @@ void build_and_run_roundtrip_latency_test( num_samples, max_concurrent_samples, sample_page_size, - device->physical_core_from_logical_core(init_worker_core, CoreType::WORKER).x, - device->physical_core_from_logical_core(init_worker_core, CoreType::WORKER).y, + device->virtual_core_from_logical_core(init_worker_core, CoreType::WORKER).x, + device->virtual_core_from_logical_core(init_worker_core, CoreType::WORKER).y, worker_sem1); std::vector worker_init_rt_args = { worker_sem0, worker_sem1, - static_cast(device->physical_core_from_logical_core(eth_receiver_core, CoreType::ETH).x), - static_cast(device->physical_core_from_logical_core(eth_receiver_core, CoreType::ETH).y), + static_cast(device->virtual_core_from_logical_core(eth_receiver_core, CoreType::ETH).x), + static_cast(device->virtual_core_from_logical_core(eth_receiver_core, CoreType::ETH).y), receiver_start_semaphore}; auto receiver_kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp index d8d4896badf9..ccce74010490 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_workers_and_erisc_datamover_unidirectional.cpp @@ -253,7 +253,7 @@ bool RunWriteBWTest( for (uint32_t w = 0; w < chip0_num_workers_on_channel; w++) { // 10) worker_coord(s) auto worker_noc_coord = - sender_device->physical_core_from_logical_core(chip0_sender_worker_core, CoreType::WORKER); + sender_device->virtual_core_from_logical_core(chip0_sender_worker_core, CoreType::WORKER); chip0_edm_args.push_back( KernelXY{static_cast(worker_noc_coord.x), static_cast(worker_noc_coord.y)} .to_uint32()); @@ -400,7 +400,7 @@ bool RunWriteBWTest( for (uint32_t w = 0; w < chip1_num_workers_on_channel; w++) { // 10) worker_coord(s) auto worker_noc_coord = - receiver_device->physical_core_from_logical_core(chip1_sender_noc_xy, CoreType::WORKER); + receiver_device->virtual_core_from_logical_core(chip1_sender_noc_xy, CoreType::WORKER); chip1_edm_args.push_back( KernelXY{static_cast(worker_noc_coord.x), static_cast(worker_noc_coord.y)} .to_uint32()); @@ -448,7 +448,7 @@ bool RunWriteBWTest( for (uint32_t w = 0; w < chip1_num_workers_on_channel; w++) { // 10) worker_coord(s) auto worker_noc_coord = - receiver_device->physical_core_from_logical_core(chip1_receiver_worker_core, CoreType::WORKER); + receiver_device->virtual_core_from_logical_core(chip1_receiver_worker_core, CoreType::WORKER); chip1_edm_args.push_back( KernelXY{static_cast(worker_noc_coord.x), static_cast(worker_noc_coord.y)} .to_uint32()); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp index 87db470f9a6e..afbd7ae3038f 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_bi_tunnel.cpp @@ -221,28 +221,29 @@ int main(int argc, char** argv) { std::vector tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: 
tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -454,28 +455,29 @@ int main(int argc, char** argv) { std::vector rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i + 1, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + 
(uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i + 1, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp index 694919bf9a7a..547472f20f13 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_1cq.cpp @@ -216,28 +216,29 @@ int main(int argc, char** argv) { // tx on left chip std::vector l_tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; l_tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - 
test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (l_to_r) { log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -255,28 +256,29 @@ int main(int argc, char** argv) { // tx on right chip std::vector r_tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; r_tx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - 
(tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (r_to_l) { log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -441,28 +443,29 @@ int main(int argc, char** argv) { // Rx Right std::vector r_rx_phys_core; for 
(uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; r_rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, 
// 17: timeout_cycles + }; if (l_to_r) { log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -480,28 +483,29 @@ int main(int argc, char** argv) { // Rx Left std::vector l_rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; l_rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + 
max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (r_to_l) { log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp index 6aa65a7b28a6..5562d50100d7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tunnel_2cq.cpp @@ -216,28 +216,29 @@ int main(int argc, char** argv) { // tx on left chip std::vector l_tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; l_tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 
1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (l_to_r) { log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -255,28 +256,29 @@ int main(int argc, char** argv) { // tx on right chip std::vector r_tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; r_tx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - 
(uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (r_to_l) { log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -449,28 +451,29 @@ int main(int argc, char** argv) { // Rx Right std::vector r_rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; r_rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - 
num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (l_to_r) { log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -488,28 +491,29 @@ int main(int argc, char** argv) { // Rx Left std::vector l_rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + 
i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; l_rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; if (r_to_l) { log_info(LogTest, "run 
traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp index 552e32a31df0..b48b418822a5 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel.cpp @@ -213,28 +213,29 @@ int main(int argc, char** argv) { std::vector tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), 
// 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -376,28 +377,29 @@ int main(int argc, char** argv) { std::vector rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; rx_phys_core.push_back(device_r->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i + 1, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles - }; + 
std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i + 1, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000 * 4, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp index 570b9ed24870..6c084f08576d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_uni_tunnel_single_chip.cpp @@ -207,28 +207,29 @@ int main(int argc, char** argv) { std::vector tx_phys_core; for (uint32_t i = 0; i < num_src_endpoints; i++) { - CoreCoord core = {tx_x + i, tx_y}; + CoreCoord core = {tx_x+i, tx_y}; tx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - src_endpoint_start_id + i, // 0: src_endpoint_id - num_dest_endpoints, // 1: num_dest_endpoints - (tx_queue_start_addr >> 4), // 2: queue_start_addr_words - (tx_queue_size_bytes >> 4), // 3: queue_size_words - ((mux_queue_start_addr + i * mux_queue_size_bytes) 
>> 4), // 4: remote_rx_queue_start_addr_words - (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words - (uint32_t)mux_phys_core.x, // 6: remote_rx_x - (uint32_t)mux_phys_core.y, // 7: remote_rx_y - i, // 8: remote_rx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type - test_results_addr, // 10: test_results_addr - test_results_size, // 11: test_results_size - prng_seed, // 12: prng_seed - data_kb_per_tx, // 13: total_data_kb - max_packet_size_words, // 14: max_packet_size_words - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - }; + std::vector compile_args = + { + src_endpoint_start_id + i, // 0: src_endpoint_id + num_dest_endpoints, // 1: num_dest_endpoints + (tx_queue_start_addr >> 4), // 2: queue_start_addr_words + (tx_queue_size_bytes >> 4), // 3: queue_size_words + ((mux_queue_start_addr + i*mux_queue_size_bytes) >> 4), // 4: remote_rx_queue_start_addr_words + (mux_queue_size_bytes >> 4), // 5: remote_rx_queue_size_words + (uint32_t)mux_phys_core.x, // 6: remote_rx_x + (uint32_t)mux_phys_core.y, // 7: remote_rx_y + i, // 8: remote_rx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 9: tx_network_type + test_results_addr, // 10: test_results_addr + test_results_size, // 11: test_results_size + prng_seed, // 12: prng_seed + data_kb_per_tx, // 13: total_data_kb + max_packet_size_words, // 14: max_packet_size_words + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_tx at x={},y={}", core.x, core.y); auto kernel = tt_metal::CreateKernel( @@ -366,28 +367,29 @@ int main(int argc, char** argv) { std::vector rx_phys_core; for (uint32_t i = 0; i < num_dest_endpoints; i++) { - CoreCoord core = {rx_x + i, rx_y}; + CoreCoord core = {rx_x+i, rx_y}; 
rx_phys_core.push_back(device->worker_core_from_logical_core(core)); - std::vector compile_args = { - dest_endpoint_start_id + i, // 0: dest_endpoint_id - num_src_endpoints, // 1: num_src_endpoints - num_dest_endpoints, // 2: num_dest_endpoints - (rx_queue_start_addr >> 4), // 3: queue_start_addr_words - (rx_queue_size_bytes >> 4), // 4: queue_size_words - (uint32_t)demux_phys_core.x, // 5: remote_tx_x - (uint32_t)demux_phys_core.y, // 6: remote_tx_y - i, // 7: remote_tx_queue_id - (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type - test_results_addr, // 9: test_results_addr - test_results_size, // 10: test_results_size - prng_seed, // 11: prng_seed - 0, // 12: reserved - max_packet_size_words, // 13: max_packet_size_words - rx_disable_data_check, // 14: disable data check - src_endpoint_start_id, // 15: src_endpoint_start_id - dest_endpoint_start_id, // 16: dest_endpoint_start_id - timeout_mcycles * 1000 * 1000, // 17: timeout_cycles - }; + std::vector compile_args = + { + dest_endpoint_start_id + i, // 0: dest_endpoint_id + num_src_endpoints, // 1: num_src_endpoints + num_dest_endpoints, // 2: num_dest_endpoints + (rx_queue_start_addr >> 4), // 3: queue_start_addr_words + (rx_queue_size_bytes >> 4), // 4: queue_size_words + (uint32_t)demux_phys_core.x, // 5: remote_tx_x + (uint32_t)demux_phys_core.y, // 6: remote_tx_y + i, // 7: remote_tx_queue_id + (uint32_t)DispatchRemoteNetworkType::NOC0, // 8: rx_rptr_update_network_type + test_results_addr, // 9: test_results_addr + test_results_size, // 10: test_results_size + prng_seed, // 11: prng_seed + 0, // 12: reserved + max_packet_size_words, // 13: max_packet_size_words + rx_disable_data_check, // 14: disable data check + src_endpoint_start_id, // 15: src_endpoint_start_id + dest_endpoint_start_id, // 16: dest_endpoint_start_id + timeout_mcycles * 1000 * 1000, // 17: timeout_cycles + }; log_info(LogTest, "run traffic_gen_rx at x={},y={}", core.x, core.y); auto kernel = 
tt_metal::CreateKernel( diff --git a/tests/tt_metal/tt_metal/test_bcast.cpp b/tests/tt_metal/tt_metal/test_bcast.cpp index 9796de5d1b4e..e0f9241b57f0 100644 --- a/tests/tt_metal/tt_metal/test_bcast.cpp +++ b/tests/tt_metal/tt_metal/test_bcast.cpp @@ -124,8 +124,6 @@ int main(int argc, char** argv) { uint32_t dram_buffer_src0_addr = src0_dram_buffer->address(); auto dst_dram_buffer = CreateBuffer(buff_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); uint32_t src0_cb_index = 0; uint32_t num_buffer_tiles = 2; @@ -238,7 +236,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(src1_config); uint32_t dram_buffer_src1_addr = src1_dram_buffer->address(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); tt_metal::detail::WriteToBuffer(src1_dram_buffer, bcast_tiled_u32); bool src0_is_dram = true; @@ -268,28 +265,20 @@ int main(int argc, char** argv) { program, binary_reader_kernel, core, - {dram_buffer_src0_addr, // 0 - (std::uint32_t)dram_src0_noc_xy.x, // 1 - (std::uint32_t)dram_src0_noc_xy.y, // 2 - num_tensor_tiles, // 3 - dram_buffer_src1_addr, // 4 - (std::uint32_t)dram_src1_noc_xy.x, // 5 - (std::uint32_t)dram_src1_noc_xy.y, // 6 + {dram_buffer_src0_addr, // 0 + (std::uint32_t)0, // 1 + num_tensor_tiles, // 2 + dram_buffer_src1_addr, // 3 + (std::uint32_t)0, // 4 num_bcast_tiles, NC * Ht * Wt, NC, Ht, Wt, - nc1}); // 7 8 9 10 11 12 + nc1}); // 5 6 7 8 9 10 tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tensor_tiles}); + program, unary_writer_kernel, core, {dram_buffer_dst_addr, (std::uint32_t)0, num_tensor_tiles}); std::map compute_defines = { {"BCAST_DIM", bdim_to_llkdim_define[bcast_dim]}, diff --git a/tests/tt_metal/tt_metal/test_clean_init.cpp 
b/tests/tt_metal/tt_metal/test_clean_init.cpp index 5f72467a2ef2..cd3118ec3e89 100644 --- a/tests/tt_metal/tt_metal/test_clean_init.cpp +++ b/tests/tt_metal/tt_metal/test_clean_init.cpp @@ -44,13 +44,13 @@ int main(int argc, char** argv) { for (int device_id = 0; device_id < num_devices; device_id++) { try { /* - * Silicon accelerator setup - */ - Device* device = devices[device_id]; + * Silicon accelerator setup + */ + Device *device = devices[device_id]; /* - * Setup program and command queue to execute along with its buffers and kernels to use - */ + * Setup program and command queue to execute along with its buffers and kernels to use + */ CommandQueue& cq = device->command_queue(); Program program = CreateProgram(); @@ -60,22 +60,25 @@ int main(int argc, char** argv) { program, "tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp", core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default} + ); constexpr uint32_t single_tile_size = 2 * (32 * 32); constexpr uint32_t num_tiles = 50; constexpr uint32_t dram_buffer_size = single_tile_size * num_tiles; tt::tt_metal::InterleavedBufferConfig dram_config{ - .device = device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = tt::tt_metal::BufferType::DRAM}; + .device= device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = tt::tt_metal::BufferType::DRAM + }; tt::tt_metal::InterleavedBufferConfig l1_config{ - .device = device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = tt::tt_metal::BufferType::L1}; + .device= device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = tt::tt_metal::BufferType::L1 + }; auto l1_buffer = CreateBuffer(l1_config); @@ -86,8 +89,8 @@ int main(int argc, char** argv) { const uint32_t output_dram_buffer_addr = 
output_dram_buffer->address(); /* - * Create input data and runtime arguments, then execute - */ + * Create input data and runtime arguments, then execute + */ std::vector input_vec = create_random_vector_of_bfloat16( dram_buffer_size, 100, std::chrono::system_clock::now().time_since_epoch().count()); EnqueueWriteBuffer(cq, input_dram_buffer, input_vec, false); @@ -95,14 +98,18 @@ int main(int argc, char** argv) { const std::array runtime_args = { l1_buffer->address(), input_dram_buffer->address(), - static_cast(input_dram_buffer->noc_coordinates().x), - static_cast(input_dram_buffer->noc_coordinates().y), + 0, output_dram_buffer->address(), - static_cast(output_dram_buffer->noc_coordinates().x), - static_cast(output_dram_buffer->noc_coordinates().y), - l1_buffer->size()}; + 0, + l1_buffer->size() + }; - SetRuntimeArgs(program, dram_copy_kernel_id, core, runtime_args); + SetRuntimeArgs( + program, + dram_copy_kernel_id, + core, + runtime_args + ); EnqueueProgram(cq, program, false); tt::log_info("Started program"); @@ -110,14 +117,14 @@ int main(int argc, char** argv) { tt::log_info("Finished program"); /* - * Validation & Teardown - */ + * Validation & Teardown + */ std::vector result_vec; EnqueueReadBuffer(cq, output_dram_buffer, result_vec, true); pass &= input_vec == result_vec; - } catch (const std::exception& e) { + } catch (const std::exception &e) { tt::log_error(tt::LogTest, "Test failed with exception!"); tt::log_error(tt::LogTest, "{}", e.what()); diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index 5a543ff644d3..6a338ac73580 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -58,8 +58,6 @@ void construct_program(Program& program, Device* device, CoreCoord& core) { auto dst_dram_buffer = CreateBuffer(buff_config); uint32_t dram_buffer_dst_addr = 
dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the // input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to diff --git a/tests/tt_metal/tt_metal/test_core_range_set.cpp b/tests/tt_metal/tt_metal/test_core_range_set.cpp index 5ebac48ca2dd..0dc99f6c46b1 100644 --- a/tests/tt_metal/tt_metal/test_core_range_set.cpp +++ b/tests/tt_metal/tt_metal/test_core_range_set.cpp @@ -92,7 +92,6 @@ bool test_program_specified_with_core_range_set( .device = device, .size = buffer_size, .page_size = buffer_size, .buffer_type = tt_metal::BufferType::DRAM}; auto src_dram_buffer = CreateBuffer(dram_config); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); std::map> core_to_l1_buffer; for (auto core_range : core_range_set.ranges()) { @@ -139,7 +138,7 @@ bool test_program_specified_with_core_range_set( auto unary_writer_kernel = tt_metal::CreateKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tt_metal/kernels/dataflow/writer_unary_1.cpp", core_range_set, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -172,12 +171,14 @@ bool test_program_specified_with_core_range_set( // Reader kernel on all cores reads from same location in DRAM const std::array reader_rt_args = { - src_dram_buffer->address(), (std::uint32_t)dram_src_noc_xy.x, (std::uint32_t)dram_src_noc_xy.y, num_tiles}; - + src_dram_buffer->address(), uint(0), num_tiles}; for (const auto& [core, dst_l1_buffer] : core_to_l1_buffer) { tt_metal::SetRuntimeArgs(program, unary_reader_kernel, core, reader_rt_args); - auto l1_dst_noc_xy = dst_l1_buffer->noc_coordinates(); + auto bank_id = 0; + auto l1_dst_noc_xy = + 
device->virtual_core_from_logical_core(dst_l1_buffer->logical_core_from_bank_id(0), CoreType::WORKER); + tt_metal::SetRuntimeArgs( program, unary_writer_kernel, diff --git a/tests/tt_metal/tt_metal/test_datacopy.cpp b/tests/tt_metal/tt_metal/test_datacopy.cpp index 9caaced9a95b..92f31b304660 100644 --- a/tests/tt_metal/tt_metal/test_datacopy.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy.cpp @@ -60,9 +60,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math // kernel, input CB and reader @@ -118,13 +115,17 @@ int main(int argc, char** argv) { program, unary_reader_kernel, core, - {dram_buffer_src_addr, (std::uint32_t)dram_src_noc_xy.x, (std::uint32_t)dram_src_noc_xy.y, num_tiles}); + {dram_buffer_src_addr, + 0, + num_tiles}); tt_metal::SetRuntimeArgs( program, unary_writer_kernel, core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + {dram_buffer_dst_addr, + 0, + num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp index 329cbbf4d877..ad11169d38a1 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_bfp8b.cpp @@ -53,9 +53,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t 
src0_cb_index = 0; uint32_t num_input_tiles = 1; tt_metal::CircularBufferConfig cb_src0_config = @@ -114,13 +111,17 @@ int main(int argc, char** argv) { program, unary_reader_kernel, core, - {dram_buffer_src_addr, (std::uint32_t)dram_src_noc_xy.x, (std::uint32_t)dram_src_noc_xy.y, num_tiles}); + {dram_buffer_src_addr, + 0, + num_tiles}); tt_metal::SetRuntimeArgs( program, unary_writer_kernel, core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + {dram_buffer_dst_addr, + 0, + num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp index a352719772b3..7156cc15c030 100644 --- a/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_datacopy_output_in_l1.cpp @@ -60,8 +60,8 @@ int main(int argc, char** argv) { auto dst_l1_buffer = CreateBuffer(l1_config); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto l1_dst_noc_xy = dst_l1_buffer->noc_coordinates(); + auto l1_dst_noc_xy = + device->virtual_core_from_logical_core(dst_l1_buffer->logical_core_from_bank_id(0), CoreType::WORKER); // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math @@ -91,7 +91,7 @@ int main(int argc, char** argv) { auto unary_writer_kernel = tt_metal::CreateKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tt_metal/kernels/dataflow/writer_unary_1.cpp", core, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -122,9 +122,8 @@ int main(int argc, char** argv) { unary_reader_kernel, core, {src_dram_buffer->address(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles}); + 0, 
+ num_tiles}); tt_metal::SetRuntimeArgs( program, diff --git a/tests/tt_metal/tt_metal/test_dataflow_cb.cpp b/tests/tt_metal/tt_metal/test_dataflow_cb.cpp index 969d5ebe6599..277424bfaac9 100644 --- a/tests/tt_metal/tt_metal/test_dataflow_cb.cpp +++ b/tests/tt_metal/tt_metal/test_dataflow_cb.cpp @@ -51,9 +51,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - int num_cbs = 1; // works at the moment assert(num_tiles % num_cbs == 0); int num_tiles_per_cb = num_tiles / num_cbs; @@ -116,18 +113,16 @@ int main(int argc, char** argv) { reader_cb_kernel, core, {dram_buffer_src_addr, - (uint32_t)dram_src_noc_xy.x, - (uint32_t)dram_src_noc_xy.y, - (uint32_t)num_tiles_per_cb}); + 0, + (uint32_t)num_tiles_per_cb}); tt_metal::SetRuntimeArgs( program, writer_cb_kernel, core, {dram_buffer_dst_addr, - (uint32_t)dram_dst_noc_xy.x, - (uint32_t)dram_dst_noc_xy.y, - (uint32_t)num_tiles_per_cb}); + 0, + (uint32_t)num_tiles_per_cb}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp index 6ffc8727174f..a4700ecbe0b6 100644 --- a/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_copy_sticks_multi_core.cpp @@ -64,7 +64,6 @@ int main(int argc, char** argv) { auto src_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); assert(src_dram_buffer->size() % (num_cores_r * num_cores_c) == 0); uint32_t per_core_l1_size = src_dram_buffer->size() / (num_cores_r * num_cores_c); std::unordered_map core_to_l1_addr; @@ -108,11 +107,10 @@ int main(int argc, char** argv) { unary_reader_kernel, 
core, {core_to_l1_addr.at(core), - dram_buffer_src_addr + (core_index * stick_size), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - (std::uint32_t)1, - (std::uint32_t)stick_size}); + dram_buffer_src_addr + (core_index * stick_size), + 0, + (std::uint32_t) 1, + (std::uint32_t) stick_size}); core_index++; } } diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp index fb7f2eac3c83..53c268c30800 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core.cpp @@ -80,9 +80,6 @@ int main(int argc, char** argv) { auto output_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - // Loader (producer kernel) running on BRISC on logical core {0, 0} auto producer_kernel = tt_metal::CreateKernel( program, @@ -113,31 +110,29 @@ int main(int argc, char** argv) { producer_kernel, loader_logical_core, {dram_buffer_src_addr, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, - loader_buffer_address, - (uint32_t)writer_worker_core.x, - (uint32_t)writer_worker_core.y, - stream_register_address, - num_output_tiles, - transient_buffer_size_tiles, - transient_buffer_size_bytes}); + 0, + loader_buffer_address, + (uint32_t)writer_worker_core.x, + (uint32_t)writer_worker_core.y, + stream_register_address, + num_output_tiles, + transient_buffer_size_tiles, + transient_buffer_size_bytes}); tt_metal::SetRuntimeArgs( program, consumer_kernel, writer_logical_core, {loader_buffer_address, - (uint32_t)loader_worker_core.x, - (uint32_t)loader_worker_core.y, - dram_buffer_dst_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, - writer_buffer_address, - stream_register_address, - num_output_tiles, - 
transient_buffer_size_tiles, - transient_buffer_size_bytes}); + (uint32_t)loader_worker_core.x, + (uint32_t)loader_worker_core.y, + dram_buffer_dst_addr, + 0, + writer_buffer_address, + stream_register_address, + num_output_tiles, + transient_buffer_size_tiles, + transient_buffer_size_bytes}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp index a8a1b47ea5ac..a432cfb39d6c 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_multi_core_db.cpp @@ -100,9 +100,6 @@ int main(int argc, char** argv) { // auto output_dram_buffer = tt_metal::CreateDramBuffer(device, dram_channel_id, dram_buffer_size, // dram_buffer_dst_addr); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - // Loader (producer kernel) running on BRISC on logical core {0, 0} auto producer_kernel = tt_metal::CreateKernel( program, @@ -133,36 +130,36 @@ int main(int argc, char** argv) { producer_kernel, loader_logical_core, {dram_buffer_src_addr, - (uint32_t)input_dram_noc_xy.x, - (uint32_t)input_dram_noc_xy.y, - loader_buffer_address1, - loader_buffer_address2, - (uint32_t)writer_worker_core.x, - (uint32_t)writer_worker_core.y, - stream_register_address1, - stream_register_address2, - num_output_tiles, - transient_buffer_size_tiles, - transient_buffer_size_bytes}); + 0, + loader_buffer_address1, + loader_buffer_address2, + (uint32_t)writer_worker_core.x, + (uint32_t)writer_worker_core.y, + stream_register_address1, + stream_register_address2, + num_output_tiles, + transient_buffer_size_tiles, + transient_buffer_size_bytes} + ); tt_metal::SetRuntimeArgs( program, consumer_kernel, writer_logical_core, {loader_buffer_address1, - loader_buffer_address2, - (uint32_t)loader_worker_core.x, - (uint32_t)loader_worker_core.y, - 
dram_buffer_dst_addr, - (uint32_t)output_dram_noc_xy.x, - (uint32_t)output_dram_noc_xy.y, - writer_buffer_address1, - writer_buffer_address2, - stream_register_address1, - stream_register_address2, - num_output_tiles, - transient_buffer_size_tiles, - transient_buffer_size_bytes}); + loader_buffer_address2, + (uint32_t)loader_worker_core.x, + (uint32_t)loader_worker_core.y, + dram_buffer_dst_addr, + 0, + writer_buffer_address1, + writer_buffer_address2, + stream_register_address1, + stream_register_address2, + num_output_tiles, + transient_buffer_size_tiles, + transient_buffer_size_bytes} + ); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp index eee7394ae90d..f4dc47278ed2 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_single_core.cpp @@ -54,9 +54,6 @@ int main(int argc, char** argv) { auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto dram_copy_kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp", @@ -80,13 +77,11 @@ int main(int argc, char** argv) { dram_copy_kernel, core, {l1_buffer_addr, - input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - dram_buffer_size}); + input_dram_buffer_addr, + 0, + output_dram_buffer_addr, + 0, + dram_buffer_size}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp b/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp index 
ccbe3b14287d..261c1611a1f2 100644 --- a/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp +++ b/tests/tt_metal/tt_metal/test_dram_loopback_single_core_db.cpp @@ -57,9 +57,6 @@ int main(int argc, char** argv) { auto output_dram_buffer = CreateBuffer(dram_config); uint32_t output_dram_buffer_addr = output_dram_buffer->address(); - auto input_dram_noc_xy = input_dram_buffer->noc_coordinates(); - auto output_dram_noc_xy = output_dram_buffer->noc_coordinates(); - auto dram_copy_kernel = tt_metal::CreateKernel( program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp", @@ -83,16 +80,14 @@ int main(int argc, char** argv) { dram_copy_kernel, core, {input_dram_buffer_addr, - (std::uint32_t)input_dram_noc_xy.x, - (std::uint32_t)input_dram_noc_xy.y, - output_dram_buffer_addr, - (std::uint32_t)output_dram_noc_xy.x, - (std::uint32_t)output_dram_noc_xy.y, - dram_buffer_size_bytes, - num_tiles, - l1_buffer_addr, - total_l1_buffer_size_tiles, - total_l1_buffer_size_bytes}); + 0, + output_dram_buffer_addr, + 0, + dram_buffer_size_bytes, + num_tiles, + l1_buffer_addr, + total_l1_buffer_size_tiles, + total_l1_buffer_size_bytes}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp index a7cfe0466e24..76df1018d741 100644 --- a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast.cpp @@ -55,8 +55,6 @@ int main(int argc, char** argv) { auto dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_addr = dram_buffer->address(); - auto dram_noc_xy = dram_buffer->noc_coordinates(); - CoreCoord core_start = {0, 0}; CoreCoord grid_size = device->logical_grid_size(); CoreCoord core_end = {core_start.x + (grid_size.x - 1), core_start.y + (grid_size.y - 1)}; @@ -64,8 +62,7 @@ int main(int argc, char** argv) { auto core_end_physical = device->worker_core_from_logical_core(core_end); const 
std::array mcast_reader_args = { (std::uint32_t)dram_buffer_addr, - (std::uint32_t)dram_noc_xy.x, - (std::uint32_t)dram_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)dram_buffer_size, (std::uint32_t)local_buffer_addr, (std::uint32_t)dest_buffer_addr, diff --git a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp index 28030b22d97c..767218752f9a 100644 --- a/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp +++ b/tests/tt_metal/tt_metal/test_dram_to_l1_multicast_loopback_src.cpp @@ -51,7 +51,6 @@ int main(int argc, char** argv) { .buffer_type = tt_metal::BufferType::DRAM}; auto dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_addr = dram_buffer->address(); - auto dram_noc_xy = dram_buffer->noc_coordinates(); CoreCoord core_start = {0, 0}; CoreCoord grid_size = device->logical_grid_size(); @@ -60,8 +59,7 @@ int main(int argc, char** argv) { auto core_end_physical = device->worker_core_from_logical_core(core_end); const std::array mcast_reader_args = { (std::uint32_t)dram_buffer_addr, - (std::uint32_t)dram_noc_xy.x, - (std::uint32_t)dram_noc_xy.y, + 0, (std::uint32_t)dram_buffer_size, (std::uint32_t)local_buffer_addr, (std::uint32_t)dest_buffer_addr, diff --git a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp index b6d0d8564a62..60cdf9df7e44 100644 --- a/tests/tt_metal/tt_metal/test_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_eltwise_binary.cpp @@ -77,10 +77,6 @@ int main(int argc, char** argv) { uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = tt::CBIndex::c_0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -158,19 +154,17 @@ int main(int 
argc, char** argv) { EnqueueWriteBuffer(cq, std::ref(src1_dram_buffer), src1_vec, false); - const std::array reader_args = { + const std::array reader_args = { dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + 0, num_tiles, dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + 0, num_tiles, 0}; - const std::array writer_args = { - dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}; + const std::array writer_args = { + dram_buffer_dst_addr, 0, num_tiles}; SetRuntimeArgs(program, unary_writer_kernel, core, writer_args); SetRuntimeArgs(program, binary_reader_kernel, core, reader_args); diff --git a/tests/tt_metal/tt_metal/test_enqueue_program.cpp b/tests/tt_metal/tt_metal/test_enqueue_program.cpp index d31f07dee625..d685a0443028 100644 --- a/tests/tt_metal/tt_metal/test_enqueue_program.cpp +++ b/tests/tt_metal/tt_metal/test_enqueue_program.cpp @@ -35,9 +35,6 @@ tt_metal::Program generate_eltwise_unary_program(Device* device) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig src_cb_config = @@ -99,9 +96,9 @@ void test_enqueue_program(std::functionriscv0_id.value(), worker_core, {out.address(), 0, 0, NUM_TILES}); - SetRuntimeArgs(program, kernel_group->riscv1_id.value(), worker_core, {buf.address(), 0, 0, NUM_TILES}); + const KernelGroup *kernel_group = program.kernels_on_core(worker_core, CoreType::WORKER); + SetRuntimeArgs(program, kernel_group->riscv0_id.value(), worker_core, {out.address(), 0, NUM_TILES}); + SetRuntimeArgs(program, kernel_group->riscv1_id.value(), worker_core, {buf.address(), 0, NUM_TILES}); EnqueueWriteBuffer(cq, std::ref(buf), inp, 
false); EnqueueProgram(cq, program, false); diff --git a/tests/tt_metal/tt_metal/test_flatten.cpp b/tests/tt_metal/tt_metal/test_flatten.cpp index 27353d8e45e5..508d5429efc1 100644 --- a/tests/tt_metal/tt_metal/test_flatten.cpp +++ b/tests/tt_metal/tt_metal/test_flatten.cpp @@ -104,8 +104,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math @@ -168,17 +166,18 @@ int main(int argc, char** argv) { flatten_kernel, core, {dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - num_tiles_r, - num_tiles_c, - num_bytes_per_tensor_row}); + 0, + num_tiles_r, + num_tiles_c, + num_bytes_per_tensor_row}); tt_metal::SetRuntimeArgs( program, unary_writer_kernel, core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles * 32}); + {dram_buffer_dst_addr, + 0, + num_tiles * 32}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp index f1beb8c21e88..b42e28099cfe 100644 --- a/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_generic_binary_reader_matmul_large_block.cpp @@ -185,10 +185,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); 
- auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * in0_block_w * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -247,11 +243,9 @@ int main(int argc, char** argv) { const std::array generic_binary_reader_args{ src0_dram_buffer->address(), - (uint32_t)dram_src0_noc_xy.x, - (uint32_t)dram_src0_noc_xy.y, + (uint32_t) 0, src1_dram_buffer->address(), - (uint32_t)dram_src1_noc_xy.x, - (uint32_t)dram_src1_noc_xy.y, + (uint32_t) 0, (uint32_t)source_addresses.size(), (uint32_t)source_addresses_in_l1_addr, (uint32_t)num_blocks, @@ -270,17 +264,14 @@ int main(int argc, char** argv) { const std::array writer_rt_args{ dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - (std::uint32_t)out_subblock_h, // num tiles per sub block m - (std::uint32_t)out_subblock_w, // num tiles per sub block n - (std::uint32_t)M / out_subblock_h, // num sub blocks m - (std::uint32_t)N / out_subblock_w, // num sub blocks n - (std::uint32_t)out_subblock_w * single_tile_size * - (N / out_subblock_w), // bytes offset to next row within sub-block - (std::uint32_t)out_subblock_h * out_subblock_w * single_tile_size * - (N / out_subblock_w), // bytes offset to next row of sub-blocks - (std::uint32_t)out_subblock_w * single_tile_size}; // bytes offset to next sub-block + (std::uint32_t) 0, + (std::uint32_t)out_subblock_h, // num tiles per sub block m + (std::uint32_t)out_subblock_w, // num tiles per sub block n + (std::uint32_t)M/out_subblock_h, // num sub blocks m + (std::uint32_t)N/out_subblock_w, // num sub blocks n + (std::uint32_t)out_subblock_w * single_tile_size * (N/out_subblock_w), // bytes offset to next row within sub-block + (std::uint32_t)out_subblock_h * out_subblock_w * single_tile_size * (N/out_subblock_w), // bytes offset to next row of sub-blocks + (std::uint32_t)out_subblock_w*single_tile_size}; // bytes offset to next sub-block auto unary_writer_kernel = 
tt_metal::CreateKernel( program, diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index 749ec5c2c19f..7d5a5e632d66 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -121,8 +121,6 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: auto dst_dram_buffer = CreateBuffer(dst_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math // kernel, input CB and reader @@ -186,9 +184,8 @@ bool interleaved_stick_reader_single_bank_tilized_writer_datacopy_test(const tt: unary_writer_kernel, core, {dram_buffer_dst_addr, - (uint32_t)dram_dst_noc_xy.x, - (uint32_t)dram_dst_noc_xy.y, - (uint32_t)num_output_tiles}); + (uint32_t) 0, + (uint32_t) num_output_tiles}); CoreCoord debug_core = {1, 1}; @@ -288,8 +285,6 @@ bool interleaved_tilized_reader_interleaved_stick_writer_datacopy_test(const tt: auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the input // CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to math // kernel, input CB and reader @@ -475,7 +470,11 @@ bool test_interleaved_l1_datacopy(const tt::ARCH& arch) { if constexpr (dst_is_in_l1) { dst = CreateBuffer(l1_config); - tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dst->address(), 0, 0, num_pages}); + tt_metal::SetRuntimeArgs( + program, + unary_writer_kernel, + core, + 
{dst->address(), 0, num_pages}); tt_metal::detail::LaunchProgram(device, program); @@ -484,7 +483,11 @@ bool test_interleaved_l1_datacopy(const tt::ARCH& arch) { } else { dst = CreateBuffer(dram_config); - tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dst->address(), 0, 0, num_pages}); + tt_metal::SetRuntimeArgs( + program, + unary_writer_kernel, + core, + {dst->address(), 0, num_pages}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp index 60105fd134cc..d3d7774760b3 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary.cpp @@ -9,13 +9,12 @@ void kernel_main() { const uint32_t in0_cb = get_compile_time_arg_val(0); const uint32_t in1_cb = get_compile_time_arg_val(1); + uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_tiles = get_arg_val(6); + uint32_t src0_dram_bank_id = get_arg_val(1); + uint32_t src1_addr = get_arg_val(2); + uint32_t src1_dram_bank_id = get_arg_val(3); + uint32_t num_tiles = get_arg_val(4); // single-tile ublocks uint32_t ublock_size_bytes_0 = get_tile_size(in0_cb); @@ -27,8 +26,8 @@ void kernel_main() { // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_dram_bank_id, src0_addr); + uint64_t src1_noc_addr = 
get_noc_addr_from_bank_id(src1_dram_bank_id, src1_addr); cb_reserve_back(in0_cb, ublock_size_tiles); cb_reserve_back(in1_cb, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp index 4beec5f136e9..4987633d1b9b 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/reader_binary_blocked.cpp @@ -9,24 +9,23 @@ void kernel_main() { const uint32_t in0_cb = get_compile_time_arg_val(0); const uint32_t in1_cb = get_compile_time_arg_val(1); + uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_blocks = get_arg_val(6); - uint32_t in0_block_tile_cnt = get_arg_val(7); - uint32_t in1_block_tile_cnt = get_arg_val(8); - uint32_t in0_block_size_bytes = get_arg_val(9); - uint32_t in1_block_size_bytes = get_arg_val(10); + uint32_t src0_dram_bank_id = get_arg_val(1); + uint32_t src1_addr = get_arg_val(2); + uint32_t src1_dram_bank_id = get_arg_val(3); + uint32_t num_blocks = get_arg_val(4); + uint32_t in0_block_tile_cnt = get_arg_val(5); + uint32_t in1_block_tile_cnt = get_arg_val(6); + uint32_t in0_block_size_bytes = get_arg_val(7); + uint32_t in1_block_size_bytes = get_arg_val(8); uint32_t l1_write_addr_in0; uint32_t l1_write_addr_in1; for (uint32_t i = 0; i < num_blocks; i++) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_dram_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_dram_bank_id, src1_addr); 
cb_reserve_back(in0_cb, in0_block_tile_cnt); cb_reserve_back(in1_cb, in1_block_tile_cnt); diff --git a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp index 3caec0ae5675..9efe017fd3f0 100644 --- a/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/compute/unit_tests/matmul/writer_unary.cpp @@ -7,16 +7,15 @@ void kernel_main() { const uint32_t out_cb = get_compile_time_arg_val(0); uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t dst_dram_bank_id_addr = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); // single-tile ublocks uint32_t ublock_size_bytes = get_tile_size(out_cb); uint32_t ublock_size_tiles = 1; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_dram_bank_id_addr, dst_addr); cb_wait_front(out_cb, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(out_cb); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp index df99b8def697..4d2c3c3c4f0e 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_reader_unary.cpp @@ -9,21 +9,24 @@ void kernel_main() { const uint32_t cb_id = get_compile_time_arg_val(0); uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); - + uint32_t src_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); +#if INTERFACE_WITH_L1 == 1 + constexpr bool read_from_dram = false; +#else + constexpr bool 
read_from_dram = true; +#endif // ublocks size defined in tiles constexpr uint32_t ublock_size_tiles = 1; uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; // read a ublock of tiles from src to CB, and then push the ublock to unpacker for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); cb_reserve_back(cb_id, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id); - noc_async_read(src_noc_addr, l1_write_addr, ublock_size_bytes); + noc_async_read(src_buffer_noc_addr, l1_write_addr, ublock_size_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp index 110fc2a85991..d89d10eae54a 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/direct_writer_unary.cpp @@ -7,21 +7,26 @@ void kernel_main() { const uint32_t cb_id = get_compile_time_arg_val(0); uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + +#if INTERFACE_WITH_L1 == 1 + constexpr bool write_to_dram = false; +#else + constexpr bool write_to_dram = true; +#endif // single-tile ublocks uint32_t ublock_size_bytes = get_tile_size(cb_id); uint32_t ublock_size_tiles = 1; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, dst_addr); cb_wait_front(cb_id, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id); - noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + 
noc_async_write(l1_read_addr, dst_buffer_noc_addr, ublock_size_bytes); noc_async_write_barrier(); cb_pop_front(cb_id, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp index e0f12a23715a..fff8cc55a870 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy.cpp @@ -12,15 +12,13 @@ void kernel_main() { std::uint32_t l1_buffer_addr = get_arg_val(0); - std::uint32_t dram_buffer_src_addr = get_arg_val(1); - std::uint32_t dram_src_noc_x = get_arg_val(2); - std::uint32_t dram_src_noc_y = get_arg_val(3); + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t dram_src_bank_id = get_arg_val(2); - std::uint32_t dram_buffer_dst_addr = get_arg_val(4); - std::uint32_t dram_dst_noc_x = get_arg_val(5); - std::uint32_t dram_dst_noc_y = get_arg_val(6); + std::uint32_t dram_buffer_dst_addr = get_arg_val(3); + std::uint32_t dram_dst_bank_id = get_arg_val(4); - std::uint32_t dram_buffer_size = get_arg_val(7); + std::uint32_t dram_buffer_size = get_arg_val(5); #if defined(SIGNAL_COMPLETION_TO_DISPATCHER) // We will assert later. This kernel will hang. 
@@ -40,12 +38,12 @@ void kernel_main() { #endif // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_src_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, dram_buffer_size); noc_async_read_barrier(); // DRAM NOC dst address - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_dst_bank_id, dram_buffer_dst_addr); noc_async_write(l1_buffer_addr, dram_buffer_dst_noc_addr, dram_buffer_size); noc_async_write_barrier(); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp index b8a2756fbae0..4305e7de0eb8 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_db.cpp @@ -11,20 +11,18 @@ * APIs explicit flushes need to be used since the calls are non-blocking * */ void kernel_main() { - std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t dram_src_bank_id = get_arg_val(1); - std::uint32_t dram_buffer_dst_addr_base = get_arg_val(3); - std::uint32_t dram_dst_noc_x = get_arg_val(4); - std::uint32_t dram_dst_noc_y = get_arg_val(5); + std::uint32_t dram_buffer_dst_addr_base = get_arg_val(2); + std::uint32_t dram_dst_bank_id = get_arg_val(3); - std::uint32_t dram_buffer_size = get_arg_val(6); - std::uint32_t num_tiles = get_arg_val(7); + std::uint32_t dram_buffer_size = get_arg_val(4); + std::uint32_t num_tiles = get_arg_val(5); - std::uint32_t l1_buffer_addr = get_arg_val(8); - std::uint32_t l1_buffer_size_tiles = 
get_arg_val(9); - std::uint32_t l1_buffer_size_bytes = get_arg_val(10); + std::uint32_t l1_buffer_addr = get_arg_val(6); + std::uint32_t l1_buffer_size_tiles = get_arg_val(7); + std::uint32_t l1_buffer_size_bytes = get_arg_val(8); std::uint32_t rd_wr_l1_buffer_size_tiles = l1_buffer_size_tiles / 2; std::uint32_t rd_wr_l1_buffer_size_bytes = l1_buffer_size_bytes / 2; @@ -41,7 +39,7 @@ void kernel_main() { std::uint32_t l1_addr2 = l1_buffer_addr + rd_wr_l1_buffer_size_bytes; // DRAM NOC src address - dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_src_bank_id, dram_buffer_src_addr); // Copy data from DRAM into destination L1 buffer noc_async_read(dram_buffer_src_noc_addr, l1_addr1, rd_wr_l1_buffer_size_bytes); @@ -50,9 +48,9 @@ void kernel_main() { while (num_tiles_read < num_tiles) { // DRAM NOC src address - dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_src_bank_id, dram_buffer_src_addr); // DRAM NOC dst address - dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_dst_bank_id, dram_buffer_dst_addr); noc_async_read(dram_buffer_src_noc_addr, l1_addr2, rd_wr_l1_buffer_size_bytes); dram_buffer_src_addr += rd_wr_l1_buffer_size_bytes; @@ -77,7 +75,7 @@ void kernel_main() { } // DRAM NOC dst address - dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_dst_bank_id, dram_buffer_dst_addr); noc_async_write(l1_addr2, dram_buffer_dst_noc_addr, rd_wr_l1_buffer_size_bytes); // Wait for all the writes to complete (ie acked) noc_async_write_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp index fec10af9f428..ce07d238d134 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_sticks.cpp @@ -12,16 +12,15 @@ void kernel_main() { std::uint32_t l1_buffer_addr = get_arg_val(0); - std::uint32_t dram_buffer_src_addr = get_arg_val(1); - std::uint32_t dram_src_noc_x = get_arg_val(2); - std::uint32_t dram_src_noc_y = get_arg_val(3); + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t bank_id = get_arg_val(2); - std::uint32_t num_sticks = get_arg_val(4); - std::uint32_t stick_size = get_arg_val(5); - for (uint32_t i = 0; i < 1; i++) { - for (uint32_t stick_id = 0; stick_id < num_sticks; stick_id++) { + std::uint32_t num_sticks = get_arg_val(3); + std::uint32_t stick_size = get_arg_val(4); + for(uint32_t i = 0; i < 1; i++) { + for(uint32_t stick_id = 0; stick_id < num_sticks; stick_id++) { // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, stick_size); noc_async_read_barrier(); l1_buffer_addr += stick_size; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp new file mode 100644 index 000000000000..95c7566cbfbe --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_copy_to_noc_coord.cpp @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +/** + * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or + * other RISCs Any two RISC processors cannot use the same CMD_BUF non_blocking APIs shouldn't be mixed with slow noc.h + * APIs explicit flushes need to be used since the calls are non-blocking + * */ +void kernel_main() { + std::uint32_t local_l1_buffer_addr = get_arg_val(0); + + std::uint32_t l1_buffer_src_addr = get_arg_val(1); + std::uint32_t l1_src_noc_x = get_arg_val(2); + std::uint32_t l1_src_noc_y = get_arg_val(3); + + std::uint32_t l1_buffer_dst_addr = get_arg_val(4); + std::uint32_t l1_dst_noc_x = get_arg_val(5); + std::uint32_t l1_dst_noc_y = get_arg_val(6); + + std::uint32_t l1_buffer_size = get_arg_val(7); + +#if defined(SIGNAL_COMPLETION_TO_DISPATCHER) + // We will assert later. This kernel will hang. + // Need to signal completion to dispatcher before hanging so that + // Dispatcher Kernel is able to finish. + // Device Close () requires fast dispatch kernels to finish. 
+#if defined(COMPILE_FOR_ERISC) + tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(eth_l1_mem::address_map::ERISC_MEM_MAILBOX_BASE); +#else + tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); +#endif + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31, false); +#endif + + // DRAM NOC src address + std::uint64_t l1_buffer_src_noc_addr = get_noc_addr(l1_src_noc_x, l1_src_noc_y, l1_buffer_src_addr); + noc_async_read(l1_buffer_src_noc_addr, local_l1_buffer_addr, l1_buffer_size); + noc_async_read_barrier(); + + // DRAM NOC dst address + std::uint64_t l1_buffer_dst_noc_addr = get_noc_addr(l1_dst_noc_x, l1_dst_noc_y, l1_buffer_dst_addr); + noc_async_write(local_l1_buffer_addr, l1_buffer_dst_noc_addr, l1_buffer_size); + noc_async_write_barrier(); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp index 02f7104aed63..ad3296e7fd4c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync.cpp @@ -12,16 +12,15 @@ constexpr static std::uint32_t VALID_VAL = 0x1234; constexpr static std::uint32_t INVALID_VAL = 0x4321; void kernel_main() { - std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t local_buffer_addr = get_arg_val(3); - std::uint32_t consumer_core_noc_x = get_arg_val(4); - std::uint32_t consumer_core_noc_y = get_arg_val(5); - std::uint32_t stream_register_address = get_arg_val(6); - std::uint32_t num_tiles = get_arg_val(7); - std::uint32_t transient_buffer_size_tiles = 
get_arg_val(8); - std::uint32_t transient_buffer_size_bytes = get_arg_val(9); + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t bank_id = get_arg_val(1); + std::uint32_t local_buffer_addr = get_arg_val(2); + std::uint32_t consumer_core_noc_x = get_arg_val(3); + std::uint32_t consumer_core_noc_y = get_arg_val(4); + std::uint32_t stream_register_address = get_arg_val(5); + std::uint32_t num_tiles = get_arg_val(6); + std::uint32_t transient_buffer_size_tiles = get_arg_val(7); + std::uint32_t transient_buffer_size_bytes = get_arg_val(8); // Scratch address in L1, to write register value before we copy it to into local/remote registers volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); @@ -35,7 +34,7 @@ void kernel_main() { std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; while (counter < num_tiles) { // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_src_addr); // Wait until sync register is INVALID_VAL (means its safe to corrupt destination buffer) wait_for_sync_register_value(stream_register_address, INVALID_VAL); // Copy data from dram into destination buffer diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp index fe07a8d15a06..22804b2bcd67 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_loader_sync_db.cpp @@ -20,18 +20,17 @@ inline std::uint32_t ping_pong_address(std::uint32_t addr1, std::uint32_t addr2, } } void kernel_main() { - std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t local_buffer_addr1 = 
get_arg_val(3); - std::uint32_t local_buffer_addr2 = get_arg_val(4); - std::uint32_t consumer_core_noc_x = get_arg_val(5); - std::uint32_t consumer_core_noc_y = get_arg_val(6); - std::uint32_t stream_register_address1 = get_arg_val(7); - std::uint32_t stream_register_address2 = get_arg_val(8); - std::uint32_t num_tiles = get_arg_val(9); - std::uint32_t transient_buffer_size_tiles = get_arg_val(10); - std::uint32_t transient_buffer_size_bytes = get_arg_val(11); + std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); + std::uint32_t bank_id = get_arg_val(1); + std::uint32_t local_buffer_addr1 = get_arg_val(2); + std::uint32_t local_buffer_addr2 = get_arg_val(3); + std::uint32_t consumer_core_noc_x = get_arg_val(4); + std::uint32_t consumer_core_noc_y = get_arg_val(5); + std::uint32_t stream_register_address1 = get_arg_val(6); + std::uint32_t stream_register_address2 = get_arg_val(7); + std::uint32_t num_tiles = get_arg_val(8); + std::uint32_t transient_buffer_size_tiles = get_arg_val(9); + std::uint32_t transient_buffer_size_bytes = get_arg_val(10); // Scratch address in L1, to write register value before we copy it to into local/remote registers volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); @@ -48,7 +47,7 @@ void kernel_main() { std::uint32_t local_buffer_address = ping_pong_address(local_buffer_addr1, local_buffer_addr2, counter); // DRAM NOC src address - dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_src_addr); // Wait until sync register is INVALID_VAL (means its safe to corrupt destination buffer) wait_for_sync_register_value(reg_addr, INVALID_VAL); // Copy data from dram into destination buffer diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp index 57ed1e1fa13d..60cb59310fc9 100644 --- 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast.cpp @@ -5,22 +5,21 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t src_buffer_size = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t src_buffer_size = get_arg_val(2); - uint32_t local_addr = get_arg_val(4); + uint32_t local_addr = get_arg_val(3); - uint32_t dst_addr = get_arg_val(5); - uint32_t dst_noc_x_start = get_arg_val(6); - uint32_t dst_noc_y_start = get_arg_val(7); - uint32_t dst_noc_x_end = get_arg_val(8); - uint32_t dst_noc_y_end = get_arg_val(9); - uint32_t num_dests = get_arg_val(10); + uint32_t dst_addr = get_arg_val(4); + uint32_t dst_noc_x_start = get_arg_val(5); + uint32_t dst_noc_y_start = get_arg_val(6); + uint32_t dst_noc_x_end = get_arg_val(7); + uint32_t dst_noc_y_end = get_arg_val(8); + uint32_t num_dests = get_arg_val(9); // Read src buffer into local L1 buffer - uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp index 376a6335d196..279977ce9737 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_exclude_region.cpp @@ -1,3 +1,4 @@ + // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 @@ -5,26 +6,26 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t src_buffer_size = get_arg_val(3); - - uint32_t local_addr = get_arg_val(4); - - uint32_t dst_addr = get_arg_val(5); - uint32_t dst_noc_x_start = get_arg_val(6); - uint32_t dst_noc_y_start = get_arg_val(7); - uint32_t dst_noc_x_end = get_arg_val(8); - uint32_t dst_noc_y_end = get_arg_val(9); - uint32_t num_dests = get_arg_val(10); - uint32_t exclude_start_x = get_arg_val(11); - uint32_t exclude_start_y = get_arg_val(12); - uint32_t exclude_dir_x = get_arg_val(13); - uint32_t exclude_dir_y = get_arg_val(14); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t src_buffer_size = get_arg_val(2); + + uint32_t local_addr = get_arg_val(3); + + uint32_t dst_addr = get_arg_val(4); + uint32_t dst_noc_x_start = get_arg_val(5); + uint32_t dst_noc_y_start = get_arg_val(6); + uint32_t dst_noc_x_end = get_arg_val(7); + uint32_t dst_noc_y_end = get_arg_val(8); + uint32_t num_dests = get_arg_val(9); + uint32_t exclude_start_x = get_arg_val(10); + uint32_t exclude_start_y = get_arg_val(11); + uint32_t exclude_dir_x = get_arg_val(12); + uint32_t exclude_dir_y = get_arg_val(13); + // Read src buffer into local L1 buffer - uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp index 48feaea150e3..a6cc85e70bb9 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp +++ 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/dram_to_l1_multicast_include_src.cpp @@ -5,22 +5,21 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t src_buffer_size = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t src_buffer_size = get_arg_val(2); - uint32_t local_addr = get_arg_val(4); + uint32_t local_addr = get_arg_val(3); - uint32_t dst_addr = get_arg_val(5); - uint32_t dst_noc_x_start = get_arg_val(6); - uint32_t dst_noc_y_start = get_arg_val(7); - uint32_t dst_noc_x_end = get_arg_val(8); - uint32_t dst_noc_y_end = get_arg_val(9); - uint32_t num_dests = get_arg_val(10); + uint32_t dst_addr = get_arg_val(4); + uint32_t dst_noc_x_start = get_arg_val(5); + uint32_t dst_noc_y_start = get_arg_val(6); + uint32_t dst_noc_x_end = get_arg_val(7); + uint32_t dst_noc_y_end = get_arg_val(8); + uint32_t num_dests = get_arg_val(9); // Read src buffer into local L1 buffer - uint64_t src_buffer_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); noc_async_read(src_buffer_noc_addr, local_addr, src_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp index 58c40e74cdd3..38fe828aa94b 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/flatten.cpp @@ -7,14 +7,13 @@ void kernel_main() { // Kernel args - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles_r = get_arg_val(3); - uint32_t num_tiles_c = get_arg_val(4); + uint32_t src_addr = get_arg_val(0); + uint32_t src_bank_id = get_arg_val(1); + uint32_t num_tiles_r = get_arg_val(2); + uint32_t 
num_tiles_c = get_arg_val(3); // How many bytes along a row in the original tensor - uint32_t num_bytes_per_tensor_row = get_arg_val(5); + uint32_t num_bytes_per_tensor_row = get_arg_val(4); /* Constants @@ -43,7 +42,7 @@ void kernel_main() { uint32_t src_addr_ = src_addr + start_dram_addr_offset_for_tensor_row; for (uint32_t k = 0; k < num_tiles_c; k++) { cb_reserve_back(cb_id_in0, 1); - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr_); + uint64_t src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr_); // Read one row of data uint32_t l1_write_addr = get_write_ptr(cb_id_in0); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp index 3cfaf979f602..4a70af98b12a 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/generic_binary_reader_blocked.cpp @@ -12,20 +12,18 @@ // addresses must be provided in the order in which tiles are generated. It expects src1 data to already be tilized and // it simply copies it to L1. 
void kernel_main() { - std::uint32_t dram_buffer_src0_addr = get_arg_val(0); - std::uint32_t dram_src0_noc_x = get_arg_val(1); - std::uint32_t dram_src0_noc_y = get_arg_val(2); - std::uint32_t dram_buffer_src1_addr = get_arg_val(3); - std::uint32_t dram_src1_noc_x = get_arg_val(4); - std::uint32_t dram_src1_noc_y = get_arg_val(5); - std::uint32_t address_map_size = get_arg_val(6); - std::uint32_t address_map_l1_addr = get_arg_val(7); - std::uint32_t num_blocks = get_arg_val(8); - std::uint32_t src0_num_reads_per_block = get_arg_val(9); - std::uint32_t src0_dram_read_size_bytes = get_arg_val(10); - std::uint32_t src1_num_bytes_per_block = get_arg_val(11); - std::uint32_t src0_num_tiles_per_block = get_arg_val(12); - std::uint32_t src1_num_tiles_per_block = get_arg_val(13); + std::uint32_t dram_buffer_src0_addr = get_arg_val(0); + std::uint32_t dram_src0_bank_id = get_arg_val(1); + std::uint32_t dram_buffer_src1_addr = get_arg_val(2); + std::uint32_t dram_src1_bank_id = get_arg_val(3); + std::uint32_t address_map_size = get_arg_val(4); + std::uint32_t address_map_l1_addr = get_arg_val(5); + std::uint32_t num_blocks = get_arg_val(6); + std::uint32_t src0_num_reads_per_block = get_arg_val(7); + std::uint32_t src0_dram_read_size_bytes = get_arg_val(8); + std::uint32_t src1_num_bytes_per_block = get_arg_val(9); + std::uint32_t src0_num_tiles_per_block = get_arg_val(10); + std::uint32_t src1_num_tiles_per_block = get_arg_val(11); constexpr uint32_t cb0_id = 0; constexpr uint32_t cb1_id = 1; @@ -40,7 +38,7 @@ void kernel_main() { cb_reserve_back(cb1_id, src1_num_tiles_per_block); uint32_t l1_write0_addr = get_write_ptr(cb0_id); uint32_t l1_write1_addr = get_write_ptr(cb1_id); - std::uint64_t dram_buffer_src1_noc_addr = get_noc_addr(dram_src1_noc_x, dram_src1_noc_y, dram_buffer_src1_addr); + std::uint64_t dram_buffer_src1_noc_addr = get_noc_addr_from_bank_id(dram_src1_bank_id, dram_buffer_src1_addr); // src1 is already tilized in DRAM. 
Read the whole block of tiles in a single DRAM read access. noc_async_read(dram_buffer_src1_noc_addr, l1_write1_addr, src1_num_bytes_per_block); // src0 is not tilized in DRAM. @@ -49,7 +47,7 @@ void kernel_main() { for (uint32_t i = 0; i < src0_num_reads_per_block; i++) { uint32_t src_addr = source_addresses[source_addresses_list_index]; std::uint64_t dram_buffer_src0_noc_addr = - get_noc_addr(dram_src0_noc_x, dram_src0_noc_y, dram_buffer_src0_addr + src_addr); + get_noc_addr_from_bank_id(dram_src0_bank_id, dram_buffer_src0_addr + src_addr); noc_async_read(dram_buffer_src0_noc_addr, l1_write0_addr, src0_dram_read_size_bytes); l1_write0_addr += src0_dram_read_size_bytes; source_addresses_list_index += 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp index 8722a7aef9de..ce0c8c6c81c9 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/l1_to_l1.cpp @@ -5,19 +5,18 @@ #include void kernel_main() { - std::uint32_t dram_buffer_src_addr = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t l1_buffer_src_addr_base = get_arg_val(3); - std::uint32_t l1_buffer_dst_addr_base = get_arg_val(4); - std::uint32_t l1_dst_noc_x = get_arg_val(5); - std::uint32_t l1_dst_noc_y = get_arg_val(6); - std::uint32_t num_tiles = get_arg_val(7); - std::uint32_t single_tile_size_bytes = get_arg_val(8); - std::uint32_t total_tiles_size_bytes = get_arg_val(9); + std::uint32_t dram_buffer_src_addr = get_arg_val(0); + std::uint32_t src_bank_id = get_arg_val(1); + std::uint32_t l1_buffer_src_addr_base = get_arg_val(2); + std::uint32_t l1_buffer_dst_addr_base = get_arg_val(3); + std::uint32_t l1_dst_noc_x = get_arg_val(4); + std::uint32_t l1_dst_noc_y = get_arg_val(5); + std::uint32_t num_tiles = get_arg_val(6); + std::uint32_t single_tile_size_bytes = get_arg_val(7); + 
std::uint32_t total_tiles_size_bytes = get_arg_val(8); - // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + // DRAM NOC src address + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_src_addr_base, total_tiles_size_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp index f9b35831b77a..fbdea334c76e 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h.cpp @@ -7,17 +7,15 @@ void kernel_main() { uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); - uint32_t src1_noc_x = get_arg_val(5); - uint32_t src1_noc_y = get_arg_val(6); - // skip arg 7 for compat with reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); + uint32_t src0_bank_id = get_arg_val(1); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_bank_id = get_arg_val(4); + // skip arg 5 for compat with reader_diff_lengths + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; @@ -32,7 +30,7 @@ void kernel_main() { uint32_t num_tiles = src0_num_tiles; uint32_t i1 = 0; for (uint32_t i = 0; i < NCHtWt; i += onetile) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_addr); 
cb_reserve_back(cb_id_in0, onetile); l1_write_addr_in0 = get_write_ptr(cb_id_in0); noc_async_read(src0_noc_addr, l1_write_addr_in0, tile_bytes); @@ -43,7 +41,7 @@ void kernel_main() { // for each W-tile of the first tensor we push one tile from the second arg tile list // but we loop the second list around cb_reserve_back(cb_id_in1, onetile); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); l1_write_addr_in1 = get_write_ptr(cb_id_in1); noc_async_read(src1_noc_addr, l1_write_addr_in1, tile_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp index 1433de4f3226..95c84b7e2ce6 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_h_8bank.cpp @@ -6,15 +6,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); - uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); + uint32_t nc1 = get_arg_val(10); // if 1 we expect the bcast tensor to have NC=1 constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; diff --git 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp index c99e2f8be698..e1396e60662d 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_hw_8bank.cpp @@ -6,15 +6,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); // skip args 1,2,5,6,7 for compat with single bank readers and reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); - uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 and wrap around in NC + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); + uint32_t nc1 = get_arg_val(10); // if 1 we expect the bcast tensor to have NC=1 and wrap around in NC constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp index c438c2051809..c1af622df03c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w.cpp @@ -7,17 +7,16 @@ void kernel_main() { uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); - uint32_t src1_noc_x = get_arg_val(5); - uint32_t src1_noc_y = get_arg_val(6); + uint32_t 
src0_bank_id = get_arg_val(1); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_bank_id = get_arg_val(4); + uint32_t src1_noc_y = get_arg_val(5); // skip arg 7 for compat with reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp index 38cff5068ae1..fb4b2e89e743 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_bcast_w_8bank.cpp @@ -6,15 +6,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); // skip args 1,2,5,6,7 for compat with single-bank readers and reader_diff_lengths - uint32_t NCHtWt = get_arg_val(8); - uint32_t NC = get_arg_val(9); - uint32_t Ht = get_arg_val(10); - uint32_t Wt = get_arg_val(11); - uint32_t nc1 = get_arg_val(12); // if 1 we expect the bcast tensor to have NC=1 + uint32_t NCHtWt = get_arg_val(6); + uint32_t NC = get_arg_val(7); + uint32_t Ht = get_arg_val(8); + uint32_t Wt = get_arg_val(9); + uint32_t nc1 = get_arg_val(10); // if 1 we expect the bcast tensor to have NC=1 constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; constexpr bool src1_is_dram = get_compile_time_arg_val(1) == 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp index f2c631499a97..34de8015346f 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_binary.cpp @@ -7,12 +7,10 @@ void kernel_main() { uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_tiles = get_arg_val(6); + uint32_t src0_bank_id = get_arg_val(1); + uint32_t src1_addr = get_arg_val(2); + uint32_t src1_bank_id = get_arg_val(3); + uint32_t num_tiles = get_arg_val(4); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; @@ -26,9 +24,9 @@ void kernel_main() { uint32_t l1_write_addr_in1; // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + for (uint32_t i=0; i(src0_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); cb_reserve_back(cb_id_in1, ublock_size_tiles); @@ -47,19 +45,20 @@ void kernel_main() { src1_addr += ublock_size_bytes_1; } -// This input populates dest with values before binary operation -// executes, this is used to test eltwise binary with dest re-use -// and eltwise binary with dest accumulation -#if defined(DST_ACCUM_MODE) || defined(ELTWISE_DEST_REUSE_TYPE) - uint32_t src2_addr = get_arg_val(7); - uint32_t src2_noc_x = get_arg_val(8); - uint32_t src2_noc_y = get_arg_val(9); + + // This input populates dest with values before binary operation + // executes, this is used to test eltwise binary with dest re-use + // and eltwise binary with dest 
accumulation + #if defined(DST_ACCUM_MODE) || defined(ELTWISE_DEST_REUSE_TYPE) + uint32_t src2_addr = get_arg_val(5); + uint32_t src2_bank_id = get_arg_val(6); + constexpr uint32_t cb_id_in2 = 2; uint32_t ublock_size_bytes_2 = get_tile_size(cb_id_in2); uint32_t l1_write_addr_in2; - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src2_noc_addr = get_noc_addr(src2_noc_x, src2_noc_y, src2_addr); + for (uint32_t i=0; i(src2_bank_id, src2_addr); cb_reserve_back(cb_id_in2, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp index 5094e3e2ce48..28b68964bbd3 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_cb_test.cpp @@ -5,18 +5,13 @@ #include #include "dataflow_api.h" -inline __attribute__((always_inline)) void read_and_push_to_cb( - const uint32_t cb_id, - uint32_t num_tiles_per_cb, - uint32_t ublock_size_tiles, - uint32_t ublock_size_bytes, - uint32_t dram_src_noc_x, - uint32_t dram_src_noc_y, - uint32_t& dram_buffer_src_addr) { +inline __attribute__((always_inline)) +void read_and_push_to_cb(const uint32_t cb_id, uint32_t num_tiles_per_cb, uint32_t ublock_size_tiles, uint32_t ublock_size_bytes, + uint32_t bank_id, uint32_t& dram_buffer_src_addr) { // read a ublock of tiles at the time from DRAM to L1 buffer, and push a ublock at the time to unpacker for (uint32_t i = 0; i < num_tiles_per_cb; i += ublock_size_tiles) { // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_src_addr); cb_reserve_back(cb_id, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id); @@ -29,21 +24,14 @@ inline __attribute__((always_inline)) void read_and_push_to_cb( } void kernel_main() { - 
std::uint32_t dram_buffer_src_addr = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t num_tiles_per_cb = get_arg_val(3); + std::uint32_t dram_buffer_src_addr = get_arg_val(0); + std::uint32_t bank_id = get_arg_val(1); + std::uint32_t num_tiles_per_cb = get_arg_val(2); constexpr uint32_t cb_id = get_compile_time_arg_val(0); constexpr uint32_t ublock_size_tiles = get_compile_time_arg_val(1); uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; - read_and_push_to_cb( - cb_id, - num_tiles_per_cb, - ublock_size_tiles, - ublock_size_bytes, - dram_src_noc_x, - dram_src_noc_y, - dram_buffer_src_addr); + read_and_push_to_cb(cb_id, num_tiles_per_cb, ublock_size_tiles, ublock_size_bytes, + bank_id, dram_buffer_src_addr); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp index bd6ae074e7f1..a2db955dc177 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp @@ -7,10 +7,10 @@ void kernel_main() { // same arg indices as in reader_binary_diff_lenghts for compat - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); - uint32_t src1_num_tiles = get_arg_val(7); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_num_tiles = get_arg_val(5); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp index ff484517aa0c..b316eb012524 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp +++ 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_first_stage.cpp @@ -8,10 +8,9 @@ void kernel_main() { std::uint32_t buffer_src_addr = get_arg_val(0); - std::uint32_t src_noc_x = get_arg_val(1); - std::uint32_t src_noc_y = get_arg_val(2); - std::uint32_t num_tiles = get_arg_val(3); - std::uint32_t num_repetitions = get_arg_val(4); + std::uint32_t src_bank_id = get_arg_val(1); + std::uint32_t num_tiles = get_arg_val(2); + std::uint32_t num_repetitions = get_arg_val(3); constexpr uint32_t cb_id = get_compile_time_arg_val(0); constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); @@ -20,7 +19,7 @@ void kernel_main() { for (uint32_t j = 0; j < num_repetitions; j++) { uint32_t src_addr = buffer_src_addr; for (uint32_t i = 0; i < num_tiles; i += block_size_tiles) { - std::uint64_t buffer_src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + std::uint64_t buffer_src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); cb_reserve_back(cb_id, block_size_tiles); if (j == 0) { diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp index 3511bd19d3b9..ae86ef804e82 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_blocked.cpp @@ -6,17 +6,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_blocks = get_arg_val(6); - uint32_t in0_block_tile_cnt = get_arg_val(7); - uint32_t in1_block_tile_cnt = get_arg_val(8); - uint32_t in0_block_size_bytes = get_arg_val(9); - uint32_t in1_block_size_bytes = get_arg_val(10); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_bank_id = get_arg_val(1); + 
uint32_t src1_addr = get_arg_val(2); + uint32_t src1_bank_id = get_arg_val(3); + uint32_t num_blocks = get_arg_val(4); + uint32_t in0_block_tile_cnt = get_arg_val(5); + uint32_t in1_block_tile_cnt = get_arg_val(6); + uint32_t in0_block_size_bytes = get_arg_val(7); + uint32_t in1_block_size_bytes = get_arg_val(8); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; @@ -24,9 +22,9 @@ void kernel_main() { uint32_t l1_write_addr_in0; uint32_t l1_write_addr_in1; - for (uint32_t i = 0; i < num_blocks; i++) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + for(uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); cb_reserve_back(cb_id_in0, in0_block_tile_cnt); cb_reserve_back(cb_id_in1, in1_block_tile_cnt); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp index 82e8b919bd3f..81e12d58c519 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_small_block.cpp @@ -7,14 +7,12 @@ void kernel_main() { std::uint32_t dram_buffer_src0_addr = get_arg_val(0); - std::uint32_t dram_src0_noc_x = get_arg_val(1); - std::uint32_t dram_src0_noc_y = get_arg_val(2); + std::uint32_t dram_src0_bank_id = get_arg_val(1); - std::uint32_t dram_buffer_src1_addr = get_arg_val(3); - std::uint32_t dram_src1_noc_x = get_arg_val(4); - std::uint32_t dram_src1_noc_y = get_arg_val(5); + std::uint32_t dram_buffer_src1_addr = get_arg_val(2); + std::uint32_t dram_src1_bank_id = get_arg_val(3); - std::uint32_t num_tiles = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(4); // single-tile chunks uint32_t chunk_size_bytes_0 = 
get_tile_size(0); @@ -27,8 +25,10 @@ void kernel_main() { // read a chunk of tiles at the time from DRAM to L1 buffer, and push a chunk at the time to unpacker for (uint32_t i = 0; i < num_tiles; i += chunk_size_tiles) { // DRAM NOC src address - std::uint64_t dram_buffer_src0_noc_addr = get_noc_addr(dram_src0_noc_x, dram_src0_noc_y, dram_buffer_src0_addr); - std::uint64_t dram_buffer_src1_noc_addr = get_noc_addr(dram_src1_noc_x, dram_src1_noc_y, dram_buffer_src1_addr); + std::uint64_t dram_buffer_src0_noc_addr = + get_noc_addr_from_bank_id(dram_src0_bank_id, dram_buffer_src0_addr); + std::uint64_t dram_buffer_src1_noc_addr = + get_noc_addr_from_bank_id(dram_src1_bank_id, dram_buffer_src1_addr); cb_reserve_back(0, chunk_size_tiles); cb_reserve_back(1, chunk_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp index 56ed2b2823cd..9c4693d04507 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_matmul_with_bias_blocked.cpp @@ -6,32 +6,28 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src1_addr = get_arg_val(3); - uint32_t src1_noc_x = get_arg_val(4); - uint32_t src1_noc_y = get_arg_val(5); - uint32_t num_blocks = get_arg_val(6); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_dram_bank_id = get_arg_val(1); + uint32_t src1_addr = get_arg_val(2); + uint32_t src1_dram_bank_id = get_arg_val(3); + uint32_t num_blocks = get_arg_val(4); - uint32_t in0_block_tile_cnt = get_arg_val(7); - uint32_t in1_block_tile_cnt = get_arg_val(8); - uint32_t in0_block_size_bytes = get_arg_val(9); - uint32_t in1_block_size_bytes = get_arg_val(10); + uint32_t in0_block_tile_cnt = get_arg_val(5); + uint32_t 
in1_block_tile_cnt = get_arg_val(6); + uint32_t in0_block_size_bytes = get_arg_val(7); + uint32_t in1_block_size_bytes = get_arg_val(8); - uint32_t with_bias = get_arg_val(11); + uint32_t with_bias = get_arg_val(9); uint32_t src2_addr; - uint32_t src2_noc_x; - uint32_t src2_noc_y; + uint32_t src2_dram_bank_id; uint32_t in2_block_tile_cnt; uint32_t in2_block_size_bytes; if (with_bias) { - src2_addr = get_arg_val(12); - src2_noc_x = get_arg_val(13); - src2_noc_y = get_arg_val(14); - in2_block_tile_cnt = get_arg_val(15); - in2_block_size_bytes = get_arg_val(16); + src2_addr = get_arg_val(10); + src2_dram_bank_id = get_arg_val(11); + in2_block_tile_cnt = get_arg_val(12); + in2_block_size_bytes = get_arg_val(13); } constexpr uint32_t cb_id_in0 = 0; @@ -42,9 +38,9 @@ void kernel_main() { uint32_t l1_write_addr_in1; uint32_t l1_write_addr_in2; - for (uint32_t i = 0; i < num_blocks; i++) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + for(uint32_t i = 0; i < num_blocks; i++) { + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_dram_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_dram_bank_id, src1_addr); cb_reserve_back(cb_id_in0, in0_block_tile_cnt); cb_reserve_back(cb_id_in1, in1_block_tile_cnt); @@ -65,7 +61,7 @@ void kernel_main() { } if (with_bias) { - uint64_t src2_noc_addr = get_noc_addr(src2_noc_x, src2_noc_y, src2_addr); + uint64_t src2_noc_addr = get_noc_addr_from_bank_id(src2_dram_bank_id, src2_addr); l1_write_addr_in2 = get_write_ptr(cb_id_in2); cb_reserve_back(cb_id_in2, in2_block_tile_cnt); noc_async_read(src2_noc_addr, l1_write_addr_in2, in2_block_size_bytes); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp index 6e5f737b34d0..df96a7dd048c 100644 --- 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp @@ -6,10 +6,9 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; @@ -19,12 +18,12 @@ void kernel_main() { // read a ublock of tiles from src to CB, and then push the ublock to unpacker for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_buffer_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id_in0); - noc_async_read(src_noc_addr, l1_write_addr, ublock_size_bytes); + noc_async_read(src_buffer_noc_addr, l1_write_addr, ublock_size_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp index c191ef7a83c5..707db8698543 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_n.cpp @@ -5,18 +5,17 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); - uint32_t cb_id_in0 = get_arg_val(4); - uint32_t ublock_size_tiles = get_arg_val(5); - bool reader_only = get_arg_val(6); + uint32_t src_addr = get_arg_val(0); + uint32_t src_dram_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t cb_id_in0 = get_arg_val(3); + uint32_t 
ublock_size_tiles = get_arg_val(4); + bool reader_only = get_arg_val(5); uint32_t ublock_size_bytes = get_tile_size(cb_id_in0) * ublock_size_tiles; - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src_noc_addr = get_noc_addr_from_bank_id(src_dram_bank_id, src_addr); if (reader_only == false) { cb_reserve_back(cb_id_in0, ublock_size_tiles); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp index a7f37fda7d0c..b866fd166327 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_transpose_wh.cpp @@ -6,9 +6,10 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); + uint32_t src_addr = get_arg_val(0); + uint32_t src_dram_bank_id = get_arg_val(1); + // uint32_t unused = get_arg_val(2); + // uint32_t unused = get_arg_val(3); // skip 3 for compat with reader_unary_8bank, reader_unary uint32_t N = get_arg_val(4); uint32_t Ht = get_arg_val(5); @@ -27,9 +28,9 @@ void kernel_main() { // this reader will read a NHW tensor in NWH order for (uint32_t n = 0; n < N; n++) { src_addr = src_addrN; - for (uint32_t w = 0; w < Wt; w++) { - for (uint32_t h = 0; h < Ht; h++) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t w = 0; w < Wt; w++) { + for (uint32_t h = 0; h < Ht; h++) { + uint64_t src_noc_addr = get_noc_addr_from_bank_id(src_dram_bank_id, src_addr); cb_reserve_back(cb_id_in0, onetile); uint32_t l1_write_addr = get_write_ptr(cb_id_in0); noc_async_read(src_noc_addr, l1_write_addr, tile_bytes); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp index 59efcbcb26e0..89391089a7ea 100644 --- 
a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync.cpp @@ -12,17 +12,16 @@ constexpr static std::uint32_t VALID_VAL = 0x1234; constexpr static std::uint32_t INVALID_VAL = 0x4321; void kernel_main() { - std::uint32_t buffer_src_addr = get_arg_val(0); - std::uint32_t src_noc_x = get_arg_val(1); - std::uint32_t src_noc_y = get_arg_val(2); - std::uint32_t buffer_dst_addr = get_arg_val(3); - std::uint32_t dst_noc_x = get_arg_val(4); - std::uint32_t dst_noc_y = get_arg_val(5); - std::uint32_t l1_buffer_address = get_arg_val(6); - std::uint32_t stream_register_address = get_arg_val(7); - std::uint32_t num_tiles = get_arg_val(8); - std::uint32_t transient_buffer_size_tiles = get_arg_val(9); - std::uint32_t transient_buffer_size_bytes = get_arg_val(10); + std::uint32_t buffer_src_addr = get_arg_val(0); + std::uint32_t src_noc_x = get_arg_val(1); + std::uint32_t src_noc_y = get_arg_val(2); + std::uint32_t buffer_dst_addr = get_arg_val(3); + std::uint32_t bank_id = get_arg_val(4); + std::uint32_t l1_buffer_address = get_arg_val(5); + std::uint32_t stream_register_address = get_arg_val(6); + std::uint32_t num_tiles = get_arg_val(7); + std::uint32_t transient_buffer_size_tiles = get_arg_val(8); + std::uint32_t transient_buffer_size_bytes = get_arg_val(9); // Scratch address in L1, two write register value before we copy it to into local/remote registers volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp index 5400fc6a2254..32dfa63f0436 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/remote_read_remote_write_sync_db.cpp @@ -21,20 +21,19 @@ inline 
std::uint32_t ping_pong_address(std::uint32_t addr1, std::uint32_t addr2, } void kernel_main() { - std::uint32_t buffer_src_addr1 = get_arg_val(0); - std::uint32_t buffer_src_addr2 = get_arg_val(1); - std::uint32_t src_noc_x = get_arg_val(2); - std::uint32_t src_noc_y = get_arg_val(3); - std::uint32_t buffer_dst_addr = get_arg_val(4); - std::uint32_t dst_noc_x = get_arg_val(5); - std::uint32_t dst_noc_y = get_arg_val(6); - std::uint32_t local_buffer_addr1 = get_arg_val(7); - std::uint32_t local_buffer_addr2 = get_arg_val(8); - std::uint32_t stream_register_address1 = get_arg_val(9); - std::uint32_t stream_register_address2 = get_arg_val(10); - std::uint32_t num_tiles = get_arg_val(11); - std::uint32_t transient_buffer_size_tiles = get_arg_val(12); - std::uint32_t transient_buffer_size_bytes = get_arg_val(13); + std::uint32_t buffer_src_addr1 = get_arg_val(0); + std::uint32_t buffer_src_addr2 = get_arg_val(1); + std::uint32_t src_noc_x = get_arg_val(2); + std::uint32_t src_noc_y = get_arg_val(3); + std::uint32_t buffer_dst_addr = get_arg_val(4); + std::uint32_t bank_id = get_arg_val(5); + std::uint32_t local_buffer_addr1 = get_arg_val(6); + std::uint32_t local_buffer_addr2 = get_arg_val(7); + std::uint32_t stream_register_address1 = get_arg_val(8); + std::uint32_t stream_register_address2 = get_arg_val(9); + std::uint32_t num_tiles = get_arg_val(10); + std::uint32_t transient_buffer_size_tiles = get_arg_val(11); + std::uint32_t transient_buffer_size_bytes = get_arg_val(12); // Scratch address in L1, two write register value before we copy it to into local/remote registers volatile tt_l1_ptr uint32_t* constant_ptr = reinterpret_cast(CONSTANT_REGISTER_VALUE); @@ -57,7 +56,7 @@ void kernel_main() { noc_async_read_barrier(); // DRAM NOC dst address - dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_buffer_addr); + dst_noc_addr = get_noc_addr_from_bank_id(bank_id, dst_buffer_addr); noc_async_write(local_buffer_address, dst_noc_addr, transient_buffer_size_bytes); 
dst_buffer_addr += transient_buffer_size_bytes; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp index 92af4e921780..160f086849c6 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc.cpp @@ -14,14 +14,13 @@ inline uint32_t TADDR(uint32_t ti) { return ti << 11; } void kernel_main() { uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t W = get_arg_val(3); - uint32_t H = get_arg_val(4); - uint32_t C = get_arg_val(5); - uint32_t HW = get_arg_val(6); - uint32_t N = get_arg_val(7); - uint32_t CHW = get_arg_val(8); + uint32_t src_bank_id = get_arg_val(1); + uint32_t W = get_arg_val(2); + uint32_t H = get_arg_val(3); + uint32_t C = get_arg_val(4); + uint32_t HW = get_arg_val(5); + uint32_t N = get_arg_val(6); + uint32_t CHW = get_arg_val(7); auto WT = (W >> 5); // number of tiles in W auto HT = (H >> 5); // number of tiles in H @@ -36,7 +35,7 @@ void kernel_main() { // The basic idea here is to iterate over output tiles (that will be over CT,WT) and H // this will generate a linearly incremented output address in the inner loop // we then reverse map this linear dest address to src address - uint64_t batch_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t batch_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); for (uint32_t n = 0; n < N; n++) { uint32_t htWT = 0; for (uint32_t h = 0; h < H; h++) { diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp index a5a5d5602459..2a459afdaea1 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/transpose_hc_8bank.cpp @@ -14,14 +14,13 @@ inline uint32_t TADDR(uint32_t ti) { return ti << 
11; } void kernel_main() { uint32_t src0_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t W = get_arg_val(3); - uint32_t H = get_arg_val(4); - uint32_t C = get_arg_val(5); - uint32_t HW = get_arg_val(6); - uint32_t N = get_arg_val(7); - uint32_t CHW = get_arg_val(8); + uint32_t src_bank_id = get_arg_val(1); + uint32_t W = get_arg_val(2); + uint32_t H = get_arg_val(3); + uint32_t C = get_arg_val(4); + uint32_t HW = get_arg_val(5); + uint32_t N = get_arg_val(6); + uint32_t CHW = get_arg_val(7); auto WT = (W >> 5); // number of tiles in W auto HT = (H >> 5); // number of tiles in H diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp index a279088cd9d2..5c2b0ee98767 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_dram_to_l1.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "dataflow_api.h" /** * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or @@ -11,15 +12,14 @@ * */ void kernel_main() { std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); + std::uint32_t src_dram_bank_id = get_arg_val(1); - std::uint32_t l1_buffer_dst_addr_base = get_arg_val(3); - std::uint32_t dram_buffer_size = get_arg_val(4); + std::uint32_t l1_buffer_dst_addr_base = get_arg_val(2); + std::uint32_t dram_buffer_size = get_arg_val(3); std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t 
dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(src_dram_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_dst_addr_base, dram_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp index df99b8def697..dc562bb8080b 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_reader_unary.cpp @@ -8,18 +8,17 @@ void kernel_main() { const uint32_t cb_id = get_compile_time_arg_val(0); - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t src_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); // ublocks size defined in tiles constexpr uint32_t ublock_size_tiles = 1; uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; // read a ublock of tiles from src to CB, and then push the ublock to unpacker - for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); cb_reserve_back(cb_id, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp index 79b373a241cd..e7e156f701a3 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_l1_to_dram.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include <cstdint> +#include 
"dataflow_api.h" /** * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on NCRISC or @@ -11,16 +12,14 @@ * */ void kernel_main() { std::uint32_t dram_buffer_dst_addr_base = get_arg_val(0); - std::uint32_t dram_dst_noc_x = get_arg_val(1); - std::uint32_t dram_dst_noc_y = get_arg_val(2); - - std::uint32_t l1_buffer_src_addr_base = get_arg_val(3); - std::uint32_t dram_buffer_size = get_arg_val(4); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t l1_buffer_src_addr_base = get_arg_val(2); + std::uint32_t dram_buffer_size = get_arg_val(3); std::uint32_t dram_buffer_dst_addr = dram_buffer_dst_addr_base; // DRAM NOC dst address - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_dst_addr); noc_async_write(l1_buffer_src_addr_base, dram_buffer_dst_noc_addr, dram_buffer_size); noc_async_write_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp index b535b7bd4bf9..ec8faac45c7c 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/dram/direct_writer_unary.cpp @@ -6,17 +6,16 @@ void kernel_main() { const uint32_t cb_id = get_compile_time_arg_val(0); - uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); // single-tile ublocks uint32_t ublock_size_bytes = get_tile_size(cb_id); uint32_t ublock_size_tiles = 1; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = 
get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_addr); cb_wait_front(cb_id, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp index c021e07b2de5..775cb2e30e9d 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_receiver.cpp @@ -6,17 +6,16 @@ void kernel_main() { std::uint32_t dram_buffer_dst_addr = get_arg_val(0); - std::uint32_t dram_dst_noc_x = get_arg_val(1); - std::uint32_t dram_dst_noc_y = get_arg_val(2); - std::uint32_t remaining_bytes = get_arg_val(3); - std::uint32_t num_loops = get_arg_val(4); - std::uint32_t num_bytes = get_arg_val(5); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t remaining_bytes = get_arg_val(2); + std::uint32_t num_loops = get_arg_val(3); + std::uint32_t num_bytes = get_arg_val(4); // DRAM NOC dst address for (uint32_t i = 0; i < num_loops; i++) { eth_wait_for_bytes(num_bytes); - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_dst_addr); noc_async_write(eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, dram_buffer_dst_noc_addr, num_bytes); noc_async_write_barrier(); @@ -25,7 +24,7 @@ void kernel_main() { } eth_wait_for_bytes(remaining_bytes); - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_dst_addr); noc_async_write(eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, 
dram_buffer_dst_noc_addr, remaining_bytes); noc_async_write_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp index d35795495357..c4acbf759349 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_dram_to_dram_sender.cpp @@ -6,15 +6,14 @@ void kernel_main() { std::uint32_t dram_buffer_src_addr = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t remaining_bytes = get_arg_val(3); - std::uint32_t num_loops = get_arg_val(4); - std::uint32_t num_bytes = get_arg_val(5); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t remaining_bytes = get_arg_val(2); + std::uint32_t num_loops = get_arg_val(3); + std::uint32_t num_bytes = get_arg_val(4); // DRAM NOC src address for (uint32_t i = 0; i < num_loops; i++) { - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, num_bytes); noc_async_read_barrier(); @@ -26,7 +25,7 @@ void kernel_main() { dram_buffer_src_addr += num_bytes; } - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, remaining_bytes); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp 
b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp index 9b744d4051cd..e626f76f96af 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_reader_dram_to_l1.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "dataflow_api.h" /** * NOC APIs are prefixed w/ "ncrisc" (legacy name) but there's nothing NCRISC specific, they can be used on BRISC or @@ -12,14 +13,13 @@ void kernel_main() { std::uint32_t dram_buffer_src_addr_base = get_arg_val(0); - std::uint32_t dram_src_noc_x = get_arg_val(1); - std::uint32_t dram_src_noc_y = get_arg_val(2); - std::uint32_t dram_buffer_size = get_arg_val(3); - std::uint32_t local_eth_l1_addr_base = get_arg_val(4); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t dram_buffer_size = get_arg_val(2); + std::uint32_t local_eth_l1_addr_base = get_arg_val(3); std::uint32_t dram_buffer_src_addr = dram_buffer_src_addr_base; // DRAM NOC src address - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, local_eth_l1_addr_base, dram_buffer_size); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp index 8ac1f1502e93..cf2c0b3087bc 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/direct_writer_l1_to_dram.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include "dataflow_api.h" /** * NOC APIs are prefixed w/ "ncrisc" (legacy name) but 
there's nothing NCRISC specific, they can be used on NCRISC or @@ -11,15 +12,14 @@ * */ void kernel_main() { std::uint32_t dram_buffer_dst_addr_base = get_arg_val(0); - std::uint32_t dram_dst_noc_x = get_arg_val(1); - std::uint32_t dram_dst_noc_y = get_arg_val(2); - std::uint32_t dram_buffer_size = get_arg_val(3); - std::uint32_t local_eth_l1_addr_base = get_arg_val(4); + std::uint32_t dram_bank_id = get_arg_val(1); + std::uint32_t dram_buffer_size = get_arg_val(2); + std::uint32_t local_eth_l1_addr_base = get_arg_val(3); std::uint32_t dram_buffer_dst_addr = dram_buffer_dst_addr_base; // DRAM NOC dst address - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_bank_id, dram_buffer_dst_addr); noc_async_write(local_eth_l1_addr_base, dram_buffer_dst_noc_addr, dram_buffer_size); noc_async_write_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_binary.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_binary.cpp index f788e0a7141d..d427c9490a14 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_binary.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_binary.cpp @@ -5,23 +5,21 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst0_addr = get_arg_val(0); - uint32_t dst0_noc_x = get_arg_val(1); - uint32_t dst0_noc_y = get_arg_val(2); - uint32_t cb_id_out0 = get_arg_val(3); - uint32_t dst1_addr = get_arg_val(4); - uint32_t dst1_noc_x = get_arg_val(5); - uint32_t dst1_noc_y = get_arg_val(6); - uint32_t cb_id_out1 = get_arg_val(7); - uint32_t num_tiles = get_arg_val(8); - uint32_t ublock_size_tiles = get_arg_val(9); + uint32_t dst0_addr = get_arg_val(0); + uint32_t dst0_dram_bank_id = get_arg_val(1); + uint32_t cb_id_out0 = get_arg_val(2); + uint32_t dst1_addr = get_arg_val(3); + uint32_t dst1_dram_bank_id = get_arg_val(4); + uint32_t cb_id_out1 = get_arg_val(5); + uint32_t 
num_tiles = get_arg_val(6); + uint32_t ublock_size_tiles = get_arg_val(7); uint32_t ublock0_size_bytes = get_tile_size(cb_id_out0) * ublock_size_tiles; uint32_t ublock1_size_bytes = get_tile_size(cb_id_out1) * ublock_size_tiles; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst0_noc_addr = get_noc_addr(dst0_noc_x, dst0_noc_y, dst0_addr); - uint64_t dst1_noc_addr = get_noc_addr(dst1_noc_x, dst1_noc_y, dst1_addr); + uint64_t dst0_noc_addr = get_noc_addr_from_bank_id(dst0_dram_bank_id, dst0_addr); + uint64_t dst1_noc_addr = get_noc_addr_from_bank_id(dst1_dram_bank_id, dst1_addr); cb_wait_front(cb_id_out0, ublock_size_tiles); cb_wait_front(cb_id_out1, ublock_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp index ccbac8d1439b..76c0fd42dcd9 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_cb_test.cpp @@ -4,17 +4,12 @@ #include "dataflow_api.h" -inline __attribute__((always_inline)) void pop_from_cb_and_write( - const uint32_t cb_id, - uint32_t num_tiles_per_cb, - uint32_t ublock_size_tiles, - uint32_t ublock_size_bytes, - uint32_t dram_dst_noc_x, - uint32_t dram_dst_noc_y, - uint32_t& dram_buffer_dst_addr) { +inline __attribute__((always_inline)) +void pop_from_cb_and_write(const uint32_t cb_id, uint32_t num_tiles_per_cb, uint32_t ublock_size_tiles, uint32_t ublock_size_bytes, + uint32_t bank_id, uint32_t& dram_buffer_dst_addr) { for (uint32_t i = 0; i < num_tiles_per_cb; i += ublock_size_tiles) { // DRAM NOC dst address - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(bank_id, dram_buffer_dst_addr); cb_wait_front(cb_id, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id); @@ -27,21 +22,14 @@ inline 
__attribute__((always_inline)) void pop_from_cb_and_write( } void kernel_main() { - std::uint32_t dram_buffer_dst_addr = get_arg_val(0); - std::uint32_t dram_dst_noc_x = get_arg_val(1); - std::uint32_t dram_dst_noc_y = get_arg_val(2); - std::uint32_t num_tiles_per_cb = get_arg_val(3); + std::uint32_t dram_buffer_dst_addr = get_arg_val(0); + std::uint32_t bank_id = get_arg_val(1); + std::uint32_t num_tiles_per_cb = get_arg_val(2); constexpr uint32_t cb_id = get_compile_time_arg_val(0); constexpr uint32_t ublock_size_tiles = get_compile_time_arg_val(1); uint32_t ublock_size_bytes = get_tile_size(cb_id) * ublock_size_tiles; - pop_from_cb_and_write( - cb_id, - num_tiles_per_cb, - ublock_size_tiles, - ublock_size_bytes, - dram_dst_noc_x, - dram_dst_noc_y, - dram_buffer_dst_addr); + pop_from_cb_and_write(cb_id, num_tiles_per_cb, ublock_size_tiles, ublock_size_bytes, + bank_id, dram_buffer_dst_addr); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp index 260382f223f9..5e5f7583d7e7 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_last_stage.cpp @@ -12,6 +12,5 @@ void kernel_main() { - std::uint32_t buffer_dst_addr = get_arg_val(0); - std::uint32_t dst_noc_x = get_arg_val(1); - std::uint32_t dst_noc_y = get_arg_val(2); - std::uint32_t num_tiles = get_arg_val(3); - std::uint32_t num_repetitions = get_arg_val(4); + std::uint32_t buffer_dst_addr = get_arg_val(0); + std::uint32_t dst_bank_id = get_arg_val(1); + std::uint32_t num_tiles = get_arg_val(2); + std::uint32_t num_repetitions = get_arg_val(3); constexpr uint32_t cb_id = get_compile_time_arg_val(0); constexpr uint32_t block_size_tiles = get_compile_time_arg_val(1); @@ -20,7 +19,7 @@ void kernel_main() { for (uint32_t j = 0; j < num_repetitions; j++) { uint32_t dst_addr = buffer_dst_addr; for (uint32_t i = 0; i < num_tiles; i += block_size_tiles) { - std::uint64_t buffer_dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + std::uint64_t buffer_dst_noc_addr 
= get_noc_addr_from_bank_id(dst_bank_id, dst_addr); cb_wait_front(cb_id, block_size_tiles); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp index 4b8041fb5329..ffd7832242ca 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp @@ -5,8 +5,8 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t num_tiles = get_arg_val(3); // Index 3 to match with regular writer_unary + uint32_t dst_addr = get_arg_val(0); + uint32_t num_tiles = get_arg_val(2); // Index 2 to match with regular writer_unary constexpr uint32_t cb_id_out0 = 16; constexpr uint32_t onetile = 1; diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_pop_n.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_pop_n.cpp index 23a0016c3118..67a502304573 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_pop_n.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_pop_n.cpp @@ -5,18 +5,17 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); - uint32_t cb_id_out0 = get_arg_val(4); - uint32_t ublock_size_tiles = get_arg_val(5); - bool writer_only = get_arg_val(6); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_dram_bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); + uint32_t cb_id_out0 = get_arg_val(3); + uint32_t ublock_size_tiles = get_arg_val(4); + bool writer_only = get_arg_val(5); uint32_t ublock_size_bytes = get_tile_size(cb_id_out0) * ublock_size_tiles; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_noc_addr = 
get_noc_addr_from_bank_id(dst_dram_bank_id, dst_addr); if (writer_only == false) { cb_wait_front(cb_id_out0, ublock_size_tiles); } diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_transpose_wh.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_transpose_wh.cpp index ebd23e453e32..b0c9cfcb1460 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_transpose_wh.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_transpose_wh.cpp @@ -6,8 +6,8 @@ void kernel_main() { uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); + uint32_t dst_dram_bank_id = get_arg_val(1); + // uint32_t unused = get_arg_val(2); // uint32_t num_tiles = get_arg_val(3); uint32_t N = get_arg_val(4); uint32_t Ht = get_arg_val(5); @@ -26,9 +26,9 @@ void kernel_main() { // this writer will write a NWH tensor in NHW order for (uint32_t n = 0; n < N; n++) { dst_addr = dst_addrN; - for (uint32_t w = 0; w < Wt; w++) { - for (uint32_t h = 0; h < Ht; h++) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + for (uint32_t w = 0; w < Wt; w++) { + for (uint32_t h = 0; h < Ht; h++) { + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_dram_bank_id, dst_addr); cb_wait_front(cb_id_out0, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id_out0); noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp index 1bdbe5b2de4f..77cbac915db8 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp @@ -5,16 +5,15 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); - uint32_t inner_r = get_arg_val(3); - uint32_t inner_c = get_arg_val(4); - uint32_t num_sub_blocks_m = get_arg_val(5); - uint32_t 
num_sub_blocks_n = get_arg_val(6); - uint32_t stride_r = get_arg_val(7); - uint32_t stride_subblock_r = get_arg_val(8); - uint32_t stride_subblock_c = get_arg_val(9); + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_bank_id = get_arg_val(1); + uint32_t inner_r = get_arg_val(2); + uint32_t inner_c = get_arg_val(3); + uint32_t num_sub_blocks_m = get_arg_val(4); + uint32_t num_sub_blocks_n = get_arg_val(5); + uint32_t stride_r = get_arg_val(6); + uint32_t stride_subblock_r = get_arg_val(7); + uint32_t stride_subblock_c = get_arg_val(8); constexpr uint32_t cb_id_out0 = 16; @@ -29,8 +28,8 @@ void kernel_main() { uint32_t dram_address_r = dram_address_block_beginning; for (uint32_t r = 0; r < inner_r; r++) { uint32_t dram_address_c = dram_address_r; - for (uint32_t c = 0; c < inner_c; c++) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dram_address_c); + for(uint32_t c = 0; c < inner_c; c++) { + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dram_address_c); cb_wait_front(cb_id_out0, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id_out0); diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp index e8fb86620973..800f924f6e43 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/print_tile.cpp @@ -33,11 +33,9 @@ void MAIN { // Read out the tile we want to print using BRISC, put it in c_in0 constexpr uint32_t cb_id = tt::CBIndex::c_0; #if defined(COMPILE_FOR_BRISC) - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint32_t src_addr = get_arg_val(0); + uint32_t src_bank_id = get_arg_val(1); + uint64_t src_noc_addr = get_noc_addr_from_bank_id(src_bank_id, src_addr); cb_reserve_back(cb_id, 1); noc_async_read(src_noc_addr, get_write_ptr(cb_id), 
get_tile_size(cb_id)); noc_async_read_barrier(); diff --git a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp index 13406c2423b1..753fdfbe4961 100644 --- a/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/misc/watcher_asserts.cpp @@ -39,9 +39,10 @@ void MAIN { #else tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); #endif - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/); } #else diff --git a/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp b/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp index 7031c71c949e..e034b925d25d 100644 --- a/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp +++ b/tests/tt_metal/tt_metal/test_l1_to_l1_multi_core.cpp @@ -69,7 +69,6 @@ int main(int argc, char** argv) { .buffer_type = tt_metal::BufferType::DRAM}; auto src_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); tt_metal::detail::WriteToBuffer(src_dram_buffer, src_vec); auto l1_to_l1_kernel = tt_metal::CreateKernel( @@ -84,15 +83,14 @@ int main(int argc, char** argv) { l1_to_l1_kernel, core, {dram_buffer_src_addr, - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - l1_buffer_addr, - l1_buffer_addr, - (uint32_t)dst_soc_core.x, - (uint32_t)dst_soc_core.y, - num_tiles, - tile_size_bytes, - total_tiles_size_bytes}); + 0, 
+ l1_buffer_addr, + l1_buffer_addr, + (uint32_t)dst_soc_core.x, + (uint32_t)dst_soc_core.y, + num_tiles, + tile_size_bytes, + total_tiles_size_bytes}); } } diff --git a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp index b7f4dc8a7608..3d870b8fa378 100644 --- a/tests/tt_metal/tt_metal/test_matmul_large_block.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_large_block.cpp @@ -279,17 +279,11 @@ bool test_matmul_large_block(tt_metal::Device* device, bool activations_rm, bool auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - const std::array mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks M * in0_block_w, // input 0 block num tiles N * in0_block_w, // input 1 block num tiles @@ -300,17 +294,12 @@ bool test_matmul_large_block(tt_metal::Device* device, bool activations_rm, bool string writer_kernel; if (output_rm) { writer_kernel = "tt_metal/kernels/dataflow/writer_unary.cpp"; - writer_rt_args = { - dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - uint(M * N)}; + writer_rt_args = {dst_dram_buffer->address(), (uint32_t)0, uint(M * N)}; } else { writer_kernel = "tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unswizzle.cpp"; writer_rt_args = { dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block 
n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp index e26fd37047a9..63a1f3cc9721 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_single_dram.cpp @@ -383,10 +383,6 @@ int main(int argc, char** argv) { TT_FATAL(dram_buffer_src1_addr + dram_buffer_size_weights < 1024 * 1024 * 1024, "Error"); TT_FATAL(dram_buffer_dst_addr + dram_buffer_size_out < 1024 * 1024 * 1024, "Error"); - auto dram_src0_noc_xy = device->dram_core_from_dram_channel(dram_src0_channel_id); - auto dram_src1_noc_xy = device->dram_core_from_dram_channel(dram_src1_channel_id); - auto dram_dst_noc_xy = device->dram_core_from_dram_channel(dram_dst_channel_id); - auto activations_tilized = tilize(activation_slice, per_core_M * 32, K * 32); auto activations_tile_layout = convert_to_tile_layout(activations_tilized); auto activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); @@ -402,11 +398,9 @@ int main(int argc, char** argv) { const std::array mm_reader_args = { (std::uint32_t)dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks (std::uint32_t)per_core_M * in0_block_w, // input 0 block num tiles (std::uint32_t)per_core_N * in0_block_w, // input 1 block num tiles @@ -415,8 +409,7 @@ int main(int argc, char** argv) { const std::array writer_args = { (std::uint32_t)dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block n (std::uint32_t)per_core_M / 
out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 4cf23e123cf5..5d849d447ede 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -147,11 +147,6 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -257,11 +252,9 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { vector reader_l1_args = { src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, K, M, N, @@ -270,13 +263,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { with_bias}; if (with_bias) { - auto dram_src2_noc_xy = src2_dram_buffer->noc_coordinates(); - vector bias_args = { - src2_dram_buffer->address(), - (std::uint32_t)dram_src2_noc_xy.x, - (std::uint32_t)dram_src2_noc_xy.y, - N, - N * single_tile_size}; + vector bias_args = {src2_dram_buffer->address(), (uint32_t)0, N, N * single_tile_size}; for (uint32_t arg : bias_args) { reader_l1_args.push_back(arg); @@ -285,11 +272,7 @@ bool run_matmul(const tt::ARCH& arch, const bool with_bias) { tt_metal::SetRuntimeArgs(program, mm_reader_kernel, core, reader_l1_args); - tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dst_dram_buffer->address(), (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, M * N}); + tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, 
{dst_dram_buffer->address(), (uint32_t)0, M * N}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp index 32f4b2fdffed..f45d6a460bbe 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_core.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_core.cpp @@ -217,10 +217,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * in0_block_w * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -249,11 +245,9 @@ int main(int argc, char** argv) { const std::array mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks M * in0_block_w, // input 0 block num tiles N * in0_block_w, // input 1 block num tiles @@ -262,8 +256,7 @@ int main(int argc, char** argv) { const std::array writer_rt_args{ dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp index 78bdbc88de09..8264184a457b 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_core_small.cpp @@ -218,10 +218,6 
@@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(weights_config); auto dst_dram_buffer = CreateBuffer(dst_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t cb0_tiles = M * in0_block_w * 2; tt_metal::CircularBufferConfig cb_src0_config = @@ -251,11 +247,9 @@ int main(int argc, char** argv) { const std::array mm_reader_rt_args{ src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, (std::uint32_t)(K / in0_block_w), // num_blocks M * in0_block_w, // input 0 block num tiles N * in0_block_w, // input 1 block num tiles @@ -264,8 +258,7 @@ int main(int argc, char** argv) { const std::array writer_rt_args{ dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (uint32_t)0, (std::uint32_t)out_subblock_h, // num tiles per sub block m (std::uint32_t)out_subblock_w, // num tiles per sub block n (std::uint32_t)M / out_subblock_h, // num sub blocks m diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp index de499d3528c9..42f5511eef98 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile.cpp @@ -52,10 +52,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(dram_config); auto dst_dram_buffer = CreateBuffer(dram_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 2; tt_metal::CircularBufferConfig 
cb_src0_config = @@ -136,11 +132,9 @@ int main(int argc, char** argv) { mm_reader_kernel, core, {src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, + (uint32_t)0, src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, + (uint32_t)0, 1, 1, 1, @@ -148,13 +142,7 @@ int main(int argc, char** argv) { 1 * single_tile_size}); tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tiles}); + program, unary_writer_kernel, core, {dst_dram_buffer->address(), (uint32_t)0, num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp index 8d508209c3ec..7966c969bc25 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_bfp8b.cpp @@ -52,10 +52,6 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(dram_config); auto dst_dram_buffer = CreateBuffer(dram_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 1; tt_metal::CircularBufferConfig cb_src0_config = @@ -137,25 +133,22 @@ int main(int argc, char** argv) { mm_reader_kernel, core, {src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - 1, - 1, - 1, - 1 * single_tile_size, - 1 * single_tile_size}); + 0, + src1_dram_buffer->address(), + 0, + 1, + 1, + 1, + 1 * single_tile_size, + 1 * single_tile_size}); tt_metal::SetRuntimeArgs( program, 
unary_writer_kernel, core, {dst_dram_buffer->address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tiles}); + 0, + num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp index baf73e2b4bac..1a8a77bd6dec 100644 --- a/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_single_tile_output_in_l1.cpp @@ -58,9 +58,9 @@ int main(int argc, char** argv) { auto src1_dram_buffer = CreateBuffer(dram_config); auto dst_l1_buffer = CreateBuffer(l1_config); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto l1_dst_noc_xy = dst_l1_buffer->noc_coordinates(); + auto l1_dst_noc_xy = + device->virtual_core_from_logical_core(dst_l1_buffer->logical_core_from_bank_id(0), CoreType::WORKER); + ; uint32_t src0_cb_index = 0; uint32_t num_input_tiles = 1; @@ -94,7 +94,7 @@ int main(int argc, char** argv) { auto unary_writer_kernel = tt_metal::CreateKernel( program, - "tt_metal/kernels/dataflow/writer_unary.cpp", + "tt_metal/kernels/dataflow/writer_unary_1.cpp", core, tt_metal::DataMovementConfig{ .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); @@ -143,16 +143,14 @@ int main(int argc, char** argv) { mm_reader_kernel, core, {src0_dram_buffer->address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - src1_dram_buffer->address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - 1, - 1, - 1, - 1 * single_tile_size, - 1 * single_tile_size}); + 0, + src1_dram_buffer->address(), + 0, + 1, + 1, + 1, + 1 * single_tile_size, + 1 * single_tile_size}); tt_metal::SetRuntimeArgs( program, diff --git a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp 
b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp index 194e2d9a4bb4..f3a6cd35a308 100644 --- a/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp +++ b/tests/tt_metal/tt_metal/test_multi_core_kernel.cpp @@ -89,11 +89,7 @@ void compile_and_configure_program( tt_metal::detail::WriteToBuffer(src_dram_buffer, src_vec); } -void set_rt_args( - tt_metal::Program& program, - tt_metal::KernelHandle kernel, - const CoreRange& core_range, - const std::array& rt_args) { +void set_rt_args(tt_metal::Program &program, tt_metal::KernelHandle kernel, const CoreRange &core_range, const std::array &rt_args) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord core = CoreCoord(x, y); @@ -109,22 +105,19 @@ void write_same_runtime_args_to_device( tt_metal::KernelHandle writer_kernel_id, const CoreRange& core_range, int32_t num_tiles, - tt_metal::Buffer& src_dram_buffer, - tt_metal::Buffer& dst_dram_buffer) { - auto dram_src_noc_xy = src_dram_buffer.noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer.noc_coordinates(); + tt_metal::Buffer &src_dram_buffer, + tt_metal::Buffer &dst_dram_buffer) +{ const std::array unary_reader_args{ - (std::uint32_t)src_dram_buffer.address(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, - (std::uint32_t)num_tiles}; + (std::uint32_t)src_dram_buffer.address(), + (std::uint32_t) 0, + (std::uint32_t)num_tiles}; const std::array unary_writer_args{ - (std::uint32_t)dst_dram_buffer.address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - (std::uint32_t)num_tiles}; + (std::uint32_t)dst_dram_buffer.address(), + (std::uint32_t) 0, + (std::uint32_t)num_tiles}; set_rt_args(program, reader_kernel_id, core_range, unary_reader_args); set_rt_args(program, writer_kernel_id, core_range, unary_writer_args); @@ -138,37 +131,31 @@ void write_unique_writer_runtime_args_to_device( const CoreRange& 
core_range, const CoreRangeSet& core_blocks, int32_t num_tiles, - tt_metal::Buffer& src_dram_buffer, - tt_metal::Buffer& dst_dram_buffer_1, - tt_metal::Buffer& dst_dram_buffer_2, - tt_metal::Buffer& dst_dram_buffer_3) { - auto dram_src_noc_xy = src_dram_buffer.noc_coordinates(); - // All dst buffers use the same DRAM channel - auto dram_dst_noc_xy = dst_dram_buffer_1.noc_coordinates(); + tt_metal::Buffer &src_dram_buffer, + tt_metal::Buffer &dst_dram_buffer_1, + tt_metal::Buffer &dst_dram_buffer_2, + tt_metal::Buffer &dst_dram_buffer_3 +) { // Same readers args because all kernels read from same src const std::array unary_reader_args{ (std::uint32_t)src_dram_buffer.address(), - (std::uint32_t)dram_src_noc_xy.x, - (std::uint32_t)dram_src_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)num_tiles}; const std::array unary_writer_args_1{ dst_dram_buffer_1.address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)num_tiles}; const std::array unary_writer_args_2{ dst_dram_buffer_2.address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)num_tiles}; const std::array unary_writer_args_3{ dst_dram_buffer_3.address(), - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, + (std::uint32_t) 0, (std::uint32_t)num_tiles}; set_rt_args(program, reader_kernel_id, core_range, unary_reader_args); diff --git a/tests/tt_metal/tt_metal/test_multiple_programs.cpp b/tests/tt_metal/tt_metal/test_multiple_programs.cpp index c92fb35bc7b3..e73e7f423ed4 100644 --- a/tests/tt_metal/tt_metal/test_multiple_programs.cpp +++ b/tests/tt_metal/tt_metal/test_multiple_programs.cpp @@ -165,27 +165,13 @@ void write_program_runtime_args_to_device( tt_metal::Buffer& src0_dram_buffer, tt_metal::Buffer& src1_dram_buffer, tt_metal::Buffer& dst_dram_buffer) { - auto dram_src0_noc_xy = src0_dram_buffer.noc_coordinates(); - auto dram_src1_noc_xy = 
src1_dram_buffer.noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer.noc_coordinates(); - tt_metal::SetRuntimeArgs( program, reader_kernel_id, core, - {src0_dram_buffer.address(), - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - src1_dram_buffer.address(), - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - num_tiles}); + {src0_dram_buffer.address(), (uint32_t)0, src1_dram_buffer.address(), (uint32_t)0, num_tiles}); - tt_metal::SetRuntimeArgs( - program, - writer_kernel_id, - core, - {dst_dram_buffer.address(), (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, {dst_dram_buffer.address(), (uint32_t)0, num_tiles}); } ////////////////////////////////////////////////////////////////////////////////////////// // 1. First program runs eltwise binary on logical core {0, 0} diff --git a/tests/tt_metal/tt_metal/test_transpose_hc.cpp b/tests/tt_metal/tt_metal/test_transpose_hc.cpp index fc242dd9379a..825bf5a8e568 100644 --- a/tests/tt_metal/tt_metal/test_transpose_hc.cpp +++ b/tests/tt_metal/tt_metal/test_transpose_hc.cpp @@ -68,10 +68,8 @@ int main(int argc, char** argv) { auto src0_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_src0_addr = src0_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); uint32_t src0_cb_index = 0; uint32_t num_buffer_tiles = 2; @@ -130,27 +128,10 @@ int main(int argc, char** argv) { tt_metal::detail::WriteToBuffer(src0_dram_buffer, src0_vec); tt_metal::SetRuntimeArgs( - program, - reader_kernel, - core, - {dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - W, - H, - C, - HW, - N, - CHW}); + program, reader_kernel, core, 
{dram_buffer_src0_addr, (uint32_t)0, W, H, C, HW, N, CHW}); tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, - (std::uint32_t)dram_dst_noc_xy.x, - (std::uint32_t)dram_dst_noc_xy.y, - num_tensor_tiles}); + program, unary_writer_kernel, core, {dram_buffer_dst_addr, (uint32_t)0, num_tensor_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp index 890315de6fec..91ea7efc41bb 100644 --- a/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp +++ b/tests/tt_metal/tt_metal/test_untilize_eltwise_binary.cpp @@ -122,10 +122,6 @@ int main(int argc, char** argv) { auto dst_dram_buffer = CreateBuffer(dram_config); uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - auto dram_src0_noc_xy = src0_dram_buffer->noc_coordinates(); - auto dram_src1_noc_xy = src1_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - uint32_t src0_cb_index = 0; uint32_t num_input_tiles = num_tiles_c; tt_metal::CircularBufferConfig cb_src0_config = @@ -200,21 +196,9 @@ int main(int argc, char** argv) { program, binary_reader_kernel, core, - {dram_buffer_src0_addr, - (std::uint32_t)dram_src0_noc_xy.x, - (std::uint32_t)dram_src0_noc_xy.y, - num_tiles, - dram_buffer_src1_addr, - (std::uint32_t)dram_src1_noc_xy.x, - (std::uint32_t)dram_src1_noc_xy.y, - num_tiles, - 0}); + {dram_buffer_src0_addr, (uint32_t)0, num_tiles, dram_buffer_src1_addr, (uint32_t)0, num_tiles, 0}); - tt_metal::SetRuntimeArgs( - program, - unary_writer_kernel, - core, - {dram_buffer_dst_addr, (std::uint32_t)dram_dst_noc_xy.x, (std::uint32_t)dram_dst_noc_xy.y, num_tiles}); + tt_metal::SetRuntimeArgs(program, unary_writer_kernel, core, {dram_buffer_dst_addr, (uint32_t)0, num_tiles}); tt_metal::detail::LaunchProgram(device, program); diff --git a/tt_metal/common/CMakeLists.txt b/tt_metal/common/CMakeLists.txt 
index f7be3f8b1be8..bf3bd0068ab7 100644 --- a/tt_metal/common/CMakeLists.txt +++ b/tt_metal/common/CMakeLists.txt @@ -1,4 +1,5 @@ set(COMMON_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/core_assignment.cpp ${CMAKE_CURRENT_SOURCE_DIR}/core_coord.cpp ${CMAKE_CURRENT_SOURCE_DIR}/core_descriptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/metal_soc_descriptor.cpp diff --git a/tt_metal/common/core_assignment.cpp b/tt_metal/common/core_assignment.cpp new file mode 100644 index 000000000000..6131b31c9d87 --- /dev/null +++ b/tt_metal/common/core_assignment.cpp @@ -0,0 +1,230 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "core_assignment.hpp" + +namespace tt { +namespace tt_metal { + +void reassign_dram_interface_cores_for_grayskull( + const std::vector& non_worker_rows, + std::vector& dram_interface_workers, + uint32_t full_grid_size_y) { + // Reassign optimally placed DRAM Interface worker cores based on harvesting for GS + for (auto& coord : dram_interface_workers) { + // if row is harvested, move core down by 1 + while (std::find(non_worker_rows.begin(), non_worker_rows.end(), coord.y) != non_worker_rows.end() and + coord.y < (full_grid_size_y - 1)) { + coord.y += 1; + } + } +} + +std::vector reassign_dram_interface_cores_for_wormhole( + const std::vector& non_worker_rows, + const std::vector& dram_interface_workers, + uint32_t num_dram_banks, + uint32_t max_worker_y_physical, + uint32_t min_worker_y_physical) { + // Reassign optimally placed DRAM Interface worker cores based on harvesting for WH + std::vector dram_interface_workers_g1; + std::vector dram_interface_workers_g2; + std::vector dram_interface_worker_y_coords_g1; + std::vector dram_interface_worker_y_coords_g2; + + dram_interface_workers_g1.reserve(num_dram_banks); + dram_interface_worker_y_coords_g1.reserve(num_dram_banks); + dram_interface_workers_g2.reserve(num_dram_banks); + dram_interface_worker_y_coords_g2.reserve(num_dram_banks); + + // Separate Workers into 
2 groups based on which DRAM column they are meant to interface with + for (const auto& core : dram_interface_workers) { + if (core.x == dram_interface_workers.front().x) { + dram_interface_workers_g1.push_back(core); + } else { + dram_interface_workers_g2.push_back(core); + } + } + + // Track the indices of the workers inside each group + std::vector indices_g1(dram_interface_workers_g1.size()); + std::vector indices_g2(dram_interface_workers_g2.size()); + std::iota(indices_g1.begin(), indices_g1.end(), 0); + std::iota(indices_g2.begin(), indices_g2.end(), 0); + + // Sort workers and associated group indices based on y coord + std::sort(indices_g1.begin(), indices_g1.end(), [&dram_interface_workers_g1](int i1, int i2) { + return dram_interface_workers_g1[i1].y < dram_interface_workers_g1[i2].y; + }); + std::sort(indices_g2.begin(), indices_g2.end(), [&dram_interface_workers_g2](int i1, int i2) { + return dram_interface_workers_g2[i1].y < dram_interface_workers_g2[i2].y; + }); + std::sort( + dram_interface_workers_g1.begin(), dram_interface_workers_g1.end(), [](const CoreCoord& a, const CoreCoord& b) { + return a.y < b.y; + }); + std::sort( + dram_interface_workers_g2.begin(), dram_interface_workers_g2.end(), [](const CoreCoord& a, const CoreCoord& b) { + return a.y < b.y; + }); + // Place the bottom-most worker and associated index at the start of the group + std::rotate( + dram_interface_workers_g1.begin(), dram_interface_workers_g1.end() - 1, dram_interface_workers_g1.end()); + std::rotate( + dram_interface_workers_g2.begin(), dram_interface_workers_g2.end() - 1, dram_interface_workers_g2.end()); + std::rotate(indices_g1.begin(), indices_g1.end() - 1, indices_g1.end()); + std::rotate(indices_g2.begin(), indices_g2.end() - 1, indices_g2.end()); + + // Track the shuffled indices + std::vector indices_g1_realloc(dram_interface_workers_g1.size()); + std::vector indices_g2_realloc(dram_interface_workers_g2.size()); + for (int new_index = 0; new_index < 
indices_g1.size(); ++new_index) { + indices_g1_realloc[indices_g1[new_index]] = new_index; + } + for (int new_index = 0; new_index < indices_g2.size(); ++new_index) { + indices_g2_realloc[indices_g2[new_index]] = new_index; + } + // Extract worker y coordinates per group + for (auto core : dram_interface_workers_g1) { + dram_interface_worker_y_coords_g1.push_back(core.y); + } + for (auto core : dram_interface_workers_g2) { + dram_interface_worker_y_coords_g2.push_back(core.y); + } + uint32_t x_step = 3; + // Helper function to shift harvested workers + auto shift_group_based_on_harvesting = [&](std::vector& group, + std::vector& group_y, + uint32_t x_step) { + for (auto& coord : group) { + auto y = coord.y; + + if (std::find(non_worker_rows.begin(), non_worker_rows.end(), y) != non_worker_rows.end() || + std::count(group_y.begin(), group_y.end(), y) >= 2) { + auto shift_coord_based_on_harvesting = [&](int start, int end, int step) { + bool found_new_row = false; + for (int j = start; step > 0 ? j <= end : j >= end; j += step) { + if (std::find(non_worker_rows.begin(), non_worker_rows.end(), j) == non_worker_rows.end() && + std::count(group_y.begin(), group_y.end(), j) == 0) { + coord.y = j; + coord.x += x_step; + x_step--; + found_new_row = true; + break; + } + } + if (not found_new_row) { + for (int j = start; step > 0 ? 
j <= end : j >= end; j += step) { + if (std::find(non_worker_rows.begin(), non_worker_rows.end(), j) == non_worker_rows.end()) { + coord.y = j; + coord.x += x_step; + x_step--; + found_new_row = true; + break; + } + } + } + }; + + if (y >= num_dram_banks - 1) { + shift_coord_based_on_harvesting(max_worker_y_physical, min_worker_y_physical, -1); + } else { + shift_coord_based_on_harvesting(min_worker_y_physical, max_worker_y_physical, 1); + } + } + } + }; + // Shift harvested workers + shift_group_based_on_harvesting(dram_interface_workers_g1, dram_interface_worker_y_coords_g1, x_step); + shift_group_based_on_harvesting(dram_interface_workers_g2, dram_interface_worker_y_coords_g2, x_step); + + // Merge both groups based on original indices (maintain ordering by dram bank_id here) + std::vector shifted_dram_interface_workers; + shifted_dram_interface_workers.reserve(num_dram_banks); + for (int i = 0; i < indices_g1_realloc.size(); ++i) { + shifted_dram_interface_workers.push_back(dram_interface_workers_g1[indices_g1_realloc[i]]); + } + for (int i = 0; i < indices_g2_realloc.size(); ++i) { + shifted_dram_interface_workers.push_back(dram_interface_workers_g2[indices_g2_realloc[i]]); + } + return shifted_dram_interface_workers; +} + +void reassign_dram_interface_cores_for_blackhole( + const std::vector& harvested_cols, + std::vector& dram_interface_workers, + uint32_t full_grid_size_x) { + for (auto& coord : dram_interface_workers) { + // if col is harvested, move core right by 1 + while (std::find(harvested_cols.begin(), harvested_cols.end(), coord.x) != harvested_cols.end() and + coord.x < (full_grid_size_x - 1)) { + coord.x += 1; + } + } +} + +std::vector get_optimal_dram_to_physical_worker_assignment( + ARCH arch, + const std::vector& dram_phy_coords, + uint32_t full_grid_size_x, + uint32_t full_grid_size_y, + std::vector worker_phy_x, + std::vector worker_phy_y) { + // Reassign optimally placed DRAM Interface worker cores based on harvesting for BH + std::vector 
non_worker_rows; + std::vector non_worker_cols; + uint32_t max_worker_y_physical = 0; + uint32_t min_worker_y_physical = std::numeric_limits::max(); + // For GS and WH, rows are harvested. Track them here. + if (arch == ARCH::GRAYSKULL or arch == ARCH::WORMHOLE_B0) { + for (int y_coord = 0; y_coord < full_grid_size_y; ++y_coord) { + if (std::find(worker_phy_y.begin(), worker_phy_y.end(), y_coord) == worker_phy_y.end()) { + non_worker_rows.push_back(y_coord); + } + if (y_coord > max_worker_y_physical) { + max_worker_y_physical = y_coord; + } + if (y_coord < min_worker_y_physical) { + min_worker_y_physical = y_coord; + } + } + } + std::vector dram_interface_workers; + uint32_t num_dram_banks = dram_phy_coords.size(); + // Get the optimal dram -> worker configuration here. + // For GS, worker cores are placed below the DRAM Controller. + // For WH, worker cores are placed to the right of the DRAM Controller. + for (int i = 0; i < num_dram_banks; ++i) { + auto dram_core = dram_phy_coords[i]; + if (arch == ARCH::GRAYSKULL) { + dram_interface_workers.push_back(CoreCoord(dram_core.x, dram_core.y + 1)); + } else if (arch == ARCH::WORMHOLE_B0 or arch == ARCH::BLACKHOLE) { + dram_interface_workers.push_back(CoreCoord(dram_core.x + 1, dram_core.y)); + } + } + + if (arch == ARCH::GRAYSKULL) { + // Reassign worker cores based on harvesting for GS. + reassign_dram_interface_cores_for_grayskull(non_worker_rows, dram_interface_workers, full_grid_size_y); + return dram_interface_workers; + } else if (arch == ARCH::WORMHOLE_B0) { + // Reassign worker cores based on harvesting for WH. + return reassign_dram_interface_cores_for_wormhole( + non_worker_rows, dram_interface_workers, num_dram_banks, max_worker_y_physical, min_worker_y_physical); + } else if (arch == ARCH::BLACKHOLE) { + // Reassign worker cores based on harvesting for BH. + // Need to account for column harvesting here. 
+ for (int x_coord = 0; x_coord < full_grid_size_x; ++x_coord) { + if (std::find(worker_phy_x.begin(), worker_phy_x.end(), x_coord) == worker_phy_x.end()) { + non_worker_cols.push_back(x_coord); + } + } + reassign_dram_interface_cores_for_blackhole(non_worker_cols, dram_interface_workers, full_grid_size_x); + return dram_interface_workers; + } + TT_THROW("Invalid Arch Name specified"); +} + +} // namespace tt_metal +} // namespace tt diff --git a/tt_metal/common/core_assignment.hpp b/tt_metal/common/core_assignment.hpp new file mode 100644 index 000000000000..d10bcdd3a10c --- /dev/null +++ b/tt_metal/common/core_assignment.hpp @@ -0,0 +1,23 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "core_coord.hpp" +#include "tt_metal/llrt/tt_cluster.hpp" + +namespace tt { +namespace tt_metal { +// Returns an ordered list of DRAM Bank ID to optimally placed worker cores. Placing DRAM reader or writer +// kernels on these worker cores will minimize NOC congestion and the number of NOC hops required to complete +// a DRAM read or write. +// Worker cores are derived based on architecture, harvesting configurations and DRAM Controller placement. 
+std::vector<CoreCoord> get_optimal_dram_to_physical_worker_assignment( + ARCH arch, + const std::vector<CoreCoord>& dram_phy_coords, + uint32_t full_grid_size_x, + uint32_t full_grid_size_y, + std::vector<uint32_t> worker_phy_x, + std::vector<uint32_t> worker_phy_y); + +} // namespace tt_metal +} // namespace tt diff --git a/tt_metal/common/core_descriptor.hpp b/tt_metal/common/core_descriptor.hpp index d14d9f9a4456..7a7dc9b848d6 100644 --- a/tt_metal/common/core_descriptor.hpp +++ b/tt_metal/common/core_descriptor.hpp @@ -171,13 +171,4 @@ inline const std::vector<CoreCoord>& get_logical_dispatch_cores( return logical_dispatch_cores; } -/// @brief Get physical core coordinate from a logical location (device ID + core coordinate) -/// @param logical_location tt_cxy_pair describing chip and logical location core coordinate -/// @param core_type CoreType of core to translate -/// @return physical CoreCoord on the same chip as `logical_location` -inline CoreCoord get_physical_core_coordinate(const tt_cxy_pair& logical_location, const CoreType& core_type) { - const metal_SocDescriptor& soc_desc = tt::Cluster::instance().get_soc_desc(logical_location.chip); - return soc_desc.get_physical_core_from_logical_core(CoreCoord(logical_location.x, logical_location.y), core_type); -} - } // namespace tt diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 5554f2edcf37..f376b9746e7d 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -392,9 +392,10 @@ int main() { // Querying the noc_index is safe here, since the RUN_MSG_RESET_READ_PTR go signal is currently guaranteed // to only be seen after a RUN_MSG_GO signal, which will set the noc_index to a valid value. // For future proofing, the noc_index value is initialized to 0, to ensure an invalid NOC txn is not issued. 
- uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); mailboxes->go_message.signal = RUN_MSG_DONE; // Notify dispatcher that this has been done DEBUG_SANITIZE_NOC_ADDR(noc_index, dispatch_addr, 4); @@ -512,9 +513,10 @@ int main() { if (launch_msg_address->kernel_config.mode == DISPATCH_MODE_DEV) { // Set launch message to invalid, so that the next time this slot is encountered, kernels are only run if a valid launch message is sent. launch_msg_address->kernel_config.enables = 0; - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); DEBUG_SANITIZE_NOC_ADDR(noc_index, dispatch_addr, 4); // Only executed if watcher is enabled. Ensures that we don't report stale data due to invalid launch // messages in the ring buffer. 
Must be executed before the atomic increment, as after that the launch diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index 44d760a069c2..883b615c9c7e 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -92,9 +92,10 @@ void __attribute__((noinline)) Application(void) { if (launch_msg_address->kernel_config.mode == DISPATCH_MODE_DEV) { launch_msg_address->kernel_config.enables = 0; - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); CLEAR_PREVIOUS_LAUNCH_MESSAGE_ENTRY_FOR_WATCHER(); internal_::notify_dispatch_core_done(dispatch_addr); mailboxes->launch_msg_rd_ptr = (launch_msg_rd_ptr + 1) & (launch_msg_buffer_num_entries - 1); @@ -105,9 +106,10 @@ void __attribute__((noinline)) Application(void) { } else if (go_message_signal == RUN_MSG_RESET_READ_PTR) { // Reset the launch message buffer read ptr mailboxes->launch_msg_rd_ptr = 0; - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_y), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); mailboxes->go_message.signal = RUN_MSG_DONE; internal_::notify_dispatch_core_done(dispatch_addr); } else { diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index 455629e95c7b..a425dd5c49d7 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -170,9 +170,10 @@ int 
main() { // Notify dispatcher core that it has completed if (launch_msg_address->kernel_config.mode == DISPATCH_MODE_DEV) { launch_msg_address->kernel_config.enables = 0; - uint64_t dispatch_addr = - NOC_XY_ADDR(NOC_X(mailboxes->go_message.master_x), - NOC_Y(mailboxes->go_message.master_x), DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); DEBUG_SANITIZE_NOC_ADDR(noc_index, dispatch_addr, 4); CLEAR_PREVIOUS_LAUNCH_MESSAGE_ENTRY_FOR_WATCHER(); noc_fast_atomic_increment(noc_index, NCRISC_AT_CMD_BUF, dispatch_addr, NOC_UNICAST_WRITE_VC, 1, 31 /*wrap*/, false /*linked*/); diff --git a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h index 265466d2f282..d32631565f0b 100644 --- a/tt_metal/hw/inc/blackhole/noc/noc_parameters.h +++ b/tt_metal/hw/inc/blackhole/noc/noc_parameters.h @@ -5,6 +5,11 @@ #ifndef _NOC_PARAMETERS_H_ #define _NOC_PARAMETERS_H_ +// Coordinate Virtualization is not currently supported on BH (requires syseng support for updating FW). +#define VIRTUAL_TENSIX_START_X 0 +#define VIRTUAL_TENSIX_START_Y 0 +#define COORDINATE_VIRTUALIZATION_ENABLED 0 + #define NUM_NOCS 2 #define NUM_TENSIXES 140 diff --git a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h index e21d47417c6f..27e4217f00da 100644 --- a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h @@ -14,6 +14,10 @@ #define NOC_0_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) #define NOC_0_Y(noc_index, noc_size_y, y) (noc_index == 0 ? 
(y) : (noc_size_y - 1 - (y))) +#define NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) NOC_0_X(noc_index, noc_size_x, x) +#define NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) NOC_0_Y(noc_index, noc_size_y, y) +#define MY_NOC_ENCODING(noc_index) NOC_CMD_BUF_READ_REG(noc_index, 0, NOC_NODE_ID) + //// /*TODO: RT review this file, currently using wormhole b0 copy, check if any changes needed for BH*/ constexpr uint32_t DYNAMIC_NOC_NCRISC_WR_CMD_BUF = 2; // all writes share cmd buf diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index c6877db3e0a1..bd256d6a9ad1 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -2006,3 +2006,17 @@ void noc_async_read_barrier_with_trid(uint32_t trid, uint8_t noc = noc_index) { #endif WAYPOINT("NBTD"); } + +template +FORCE_INLINE +uint64_t get_noc_addr_from_bank_id(uint32_t bank_id, uint32_t bank_address_offset, uint8_t noc = noc_index) { + // Use addrgen tables to convert bank_ids to physical NOC coordinates + uint64_t noc_addr = 0; + if constexpr (DRAM) { + noc_addr = dram_bank_to_noc_xy[noc_index][bank_id]; + bank_address_offset += bank_to_dram_offset[bank_id]; + } else { + noc_addr = l1_bank_to_noc_xy[noc_index][bank_id]; + } + return (noc_addr << NOC_ADDR_COORD_SHIFT) | (bank_address_offset); +} diff --git a/tt_metal/hw/inc/debug/sanitize_noc.h b/tt_metal/hw/inc/debug/sanitize_noc.h index f01427da4921..3a1cc835f025 100644 --- a/tt_metal/hw/inc/debug/sanitize_noc.h +++ b/tt_metal/hw/inc/debug/sanitize_noc.h @@ -41,38 +41,77 @@ typedef bool debug_sanitize_noc_cast_t; typedef bool debug_sanitize_noc_which_core_t; // Helper function to get the core type from noc coords. 
-AddressableCoreType get_core_type(uint8_t noc_id, uint8_t x, uint8_t y) { +AddressableCoreType get_core_type(uint8_t noc_id, uint8_t x, uint8_t y, bool& is_virtual_coord) { core_info_msg_t tt_l1_ptr* core_info = GET_MAILBOX_ADDRESS_DEV(core_info); - + // Check if the target NOC endpoint is a valid non-Tensix core in the Physical Coordinate Space for (uint32_t idx = 0; idx < MAX_NON_WORKER_CORES; idx++) { uint8_t core_x = core_info->non_worker_cores[idx].x; uint8_t core_y = core_info->non_worker_cores[idx].y; - if (x == NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)core_x) && - y == NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)core_y)) { + if (x == NOC_0_X_PHYS_COORD(noc_id, core_info->noc_size_x, (uint32_t)core_x) && + y == NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)core_y)) { + is_virtual_coord = false; return core_info->non_worker_cores[idx].type; } } + if constexpr (COORDINATE_VIRTUALIZATION_ENABLED) { + // Was not a valid non-Tensix Physical Coordinate. Check if endpoint maps to a valid non-worker Virtual + // Coordinate. + for (uint32_t idx = 0; idx < MAX_VIRTUAL_NON_WORKER_CORES; idx++) { + uint8_t core_x = core_info->virtual_non_worker_cores[idx].x; + uint8_t core_y = core_info->virtual_non_worker_cores[idx].y; + + if (x == NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)core_x) && + y == NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)core_y)) { + is_virtual_coord = true; + return core_info->virtual_non_worker_cores[idx].type; + } + } + } + // Check if coordinate maps to a harvested row in the physical space. 
for (uint32_t idx = 0; idx < MAX_HARVESTED_ROWS; idx++) { uint16_t harvested_y = core_info->harvested_y[idx]; - if (y == NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)harvested_y)) { + if (y == NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)harvested_y)) { + is_virtual_coord = false; return AddressableCoreType::HARVESTED; } } + if constexpr (COORDINATE_VIRTUALIZATION_ENABLED) { + // Check if coordinate maps to a harvested row in the virtual space. + for (uint32_t idx = 0; idx < MAX_HARVESTED_ROWS; idx++) { + uint16_t virtual_harvested_y = core_info->virtual_harvested_y[idx]; + if (y == NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)virtual_harvested_y)) { + is_virtual_coord = true; + return AddressableCoreType::HARVESTED; + } + } + } - // Tensix + // Check if NOC endpoint is valid in the Tensix Physical Coordinate Space. if (noc_id == 0) { - if (x >= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)1) && - x <= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)core_info->noc_size_x - 1) && - y >= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)1) && - y <= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)core_info->noc_size_y - 1)) { + if (x >= NOC_0_X_PHYS_COORD(noc_id, core_info->noc_size_x, (uint32_t)0) && + x <= NOC_0_X_PHYS_COORD(noc_id, core_info->noc_size_x, (uint32_t)core_info->noc_size_x - 1) && + y >= NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)0) && + y <= NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)core_info->noc_size_y - 1)) { + is_virtual_coord = false; return AddressableCoreType::TENSIX; } } else { - if (x <= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)1) && - x >= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)core_info->noc_size_x - 1) && - y <= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)1) && - y >= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)core_info->noc_size_y - 1)) { + if (x <= NOC_0_X_PHYS_COORD(noc_id, core_info->noc_size_x, (uint32_t)0) && + x >= NOC_0_X_PHYS_COORD(noc_id, 
core_info->noc_size_x, (uint32_t)core_info->noc_size_x - 1) && + y <= NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)0) && + y >= NOC_0_Y_PHYS_COORD(noc_id, core_info->noc_size_y, (uint32_t)core_info->noc_size_y - 1)) { + is_virtual_coord = false; + return AddressableCoreType::TENSIX; + } + } + if constexpr (COORDINATE_VIRTUALIZATION_ENABLED) { + // Check if NOC endpoint is valid in the Tensix Virtual Coordinate Space. + if (x >= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)VIRTUAL_TENSIX_START_X) && + x <= NOC_0_X(noc_id, core_info->noc_size_x, (uint32_t)VIRTUAL_TENSIX_START_X + core_info->noc_size_x - 1) && + y >= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)VIRTUAL_TENSIX_START_Y) && + y <= NOC_0_Y(noc_id, core_info->noc_size_y, (uint32_t)VIRTUAL_TENSIX_START_Y + core_info->noc_size_y - 1)) { + is_virtual_coord = true; return AddressableCoreType::TENSIX; } } @@ -210,22 +249,40 @@ uint32_t debug_sanitize_noc_addr( y = (uint8_t)NOC_UNICAST_ADDR_Y(noc_addr); } uint64_t noc_local_addr = NOC_LOCAL_ADDR(noc_addr); - AddressableCoreType core_type = get_core_type(noc_id, x, y); - + bool is_virtual_coord = false; + AddressableCoreType core_type = get_core_type(noc_id, x, y, is_virtual_coord); // Extra check for multicast if (multicast) { uint8_t x_end = (uint8_t)NOC_MCAST_ADDR_END_X(noc_addr); uint8_t y_end = (uint8_t)NOC_MCAST_ADDR_END_Y(noc_addr); - - AddressableCoreType end_core_type = get_core_type(noc_id, x_end, y_end); + bool is_virtual_coord_end = false; + AddressableCoreType end_core_type = get_core_type(noc_id, x_end, y_end, is_virtual_coord_end); // Multicast supports workers only uint16_t return_code = DebugSanitizeNocOK; if (core_type != AddressableCoreType::TENSIX || end_core_type != AddressableCoreType::TENSIX) { return_code = DebugSanitizeNocMulticastNonWorker; } - if (x > x_end || y > y_end) { - return_code = DebugSanitizeNocMulticastInvalidRange; + if (is_virtual_coord != is_virtual_coord_end) { + return_code = 
DebugSanitizeNocMixedVirtualandPhysical; + } + if (is_virtual_coord && is_virtual_coord_end) { + // If coordinates are in virtual space, start can be greater than end, when using NOC1. + // This is because NOC0 and NOC1 endpoints are identical in virtual space, but order of + // start and end coords is still flipped between NOC0 and NOC1. + if (noc_id == 0) { + if (x > x_end || y > y_end) { + return_code = DebugSanitizeNocMulticastInvalidRange; + } + } else { + if (x_end > x || y_end > y) { + return_code = DebugSanitizeNocMulticastInvalidRange; + } + } + } else { + if (x > x_end || y > y_end) { + return_code = DebugSanitizeNocMulticastInvalidRange; + } } debug_sanitize_post_noc_addr_and_hang( noc_id, noc_addr, l1_addr, noc_len, multicast, dir, DEBUG_SANITIZE_NOC_TARGET, return_code); diff --git a/tt_metal/hw/inc/dev_msgs.h b/tt_metal/hw/inc/dev_msgs.h index 40d1cacf48f7..09c28fa93a04 100644 --- a/tt_metal/hw/inc/dev_msgs.h +++ b/tt_metal/hw/inc/dev_msgs.h @@ -184,6 +184,7 @@ enum debug_sanitize_noc_return_code_enum { DebugSanitizeNocMulticastNonWorker = 7, DebugSanitizeNocMulticastInvalidRange = 8, DebugSanitizeNocAlignment = 9, + DebugSanitizeNocMixedVirtualandPhysical = 10, }; struct debug_assert_msg_t { @@ -299,7 +300,12 @@ struct addressable_core_t { }; // TODO: This can move into the hal eventually, currently sized for WH. -constexpr static std::uint32_t MAX_NON_WORKER_CORES = 36 + 1 + 16; +// This is the number of Ethernet cores on WH (Ethernet cores can be queried through Virtual Coordinates). +// All other Non Worker Cores are not accessible through virtual coordinates. Subject to change, depending on the arch. +constexpr static std::uint32_t MAX_VIRTUAL_NON_WORKER_CORES = 18; +// This is the total number of Non Worker Cores on WH (first term is Ethernet, second term is PCIe and last term is +// DRAM). 
+constexpr static std::uint32_t MAX_NON_WORKER_CORES = MAX_VIRTUAL_NON_WORKER_CORES + 1 + 16; constexpr static std::uint32_t MAX_HARVESTED_ROWS = 2; constexpr static std::uint8_t CORE_COORD_INVALID = 0xFF; struct core_info_msg_t { @@ -308,10 +314,12 @@ struct core_info_msg_t { volatile uint64_t noc_dram_addr_base; volatile uint64_t noc_dram_addr_end; addressable_core_t non_worker_cores[MAX_NON_WORKER_CORES]; + addressable_core_t virtual_non_worker_cores[MAX_VIRTUAL_NON_WORKER_CORES]; volatile uint8_t harvested_y[MAX_HARVESTED_ROWS]; + volatile uint8_t virtual_harvested_y[MAX_HARVESTED_ROWS]; volatile uint8_t noc_size_x; volatile uint8_t noc_size_y; - volatile uint8_t pad[29]; + volatile uint8_t pad[27]; }; constexpr uint32_t launch_msg_buffer_num_entries = 4; diff --git a/tt_metal/hw/inc/grayskull/noc/noc_parameters.h b/tt_metal/hw/inc/grayskull/noc/noc_parameters.h index 7eff21e6dbd1..dbba51a83efd 100644 --- a/tt_metal/hw/inc/grayskull/noc/noc_parameters.h +++ b/tt_metal/hw/inc/grayskull/noc/noc_parameters.h @@ -13,6 +13,11 @@ #define NOC_Y_SIZE 1 #endif +// Coordinate Virtualization is not supported on GS (feature does not exist in NOC Hardware). +#define VIRTUAL_TENSIX_START_X 0 +#define VIRTUAL_TENSIX_START_Y 0 +#define COORDINATE_VIRTUALIZATION_ENABLED 0 + #define NUM_NOCS 2 #define NUM_TENSIXES 120 diff --git a/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h b/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h index d9279dbbb521..1f709b8a5527 100644 --- a/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h @@ -14,6 +14,9 @@ #define NOC_0_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) #define NOC_0_Y(noc_index, noc_size_y, y) (noc_index == 0 ? 
(y) : (noc_size_y - 1 - (y))) +#define NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) NOC_0_X(noc_index, noc_size_x, x) +#define NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) NOC_0_Y(noc_index, noc_size_y, y) +#define MY_NOC_ENCODING(noc_index) NOC_CMD_BUF_READ_REG(noc_index, 0, NOC_NODE_ID) //// constexpr uint32_t DYNAMIC_NOC_NCRISC_WR_CMD_BUF = 2; // all writes share cmd buf diff --git a/tt_metal/hw/inc/risc_common.h b/tt_metal/hw/inc/risc_common.h index 2407324dd539..a68e5c9cb4d8 100644 --- a/tt_metal/hw/inc/risc_common.h +++ b/tt_metal/hw/inc/risc_common.h @@ -21,6 +21,8 @@ #define NOC_Y(y) NOC_0_Y(noc_index, noc_size_y, (y)) #define DYNAMIC_NOC_X(noc, x) NOC_0_X(noc, noc_size_x, (x)) #define DYNAMIC_NOC_Y(noc, y) NOC_0_Y(noc, noc_size_y, (y)) +#define NOC_X_PHYS_COORD(x) NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) +#define NOC_Y_PHYS_COORD(y) NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) #define TILE_WORD_2_BIT ((256 + 64 + 32) >> 4) #define TILE_WORD_4_BIT ((512 + 64 + 32) >> 4) @@ -139,7 +141,7 @@ inline uint32_t special_mult(uint32_t a, uint32_t special_b) { inline void risc_init() { for (uint32_t n = 0; n < NUM_NOCS; n++) { - uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(n, 0, NOC_NODE_ID); + uint32_t noc_id_reg = MY_NOC_ENCODING(n); my_x[n] = noc_id_reg & NOC_NODE_ID_MASK; my_y[n] = (noc_id_reg >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK; } diff --git a/tt_metal/hw/inc/wormhole/noc/noc_parameters.h b/tt_metal/hw/inc/wormhole/noc/noc_parameters.h index 34c899447cfb..43d8c3428f36 100644 --- a/tt_metal/hw/inc/wormhole/noc/noc_parameters.h +++ b/tt_metal/hw/inc/wormhole/noc/noc_parameters.h @@ -13,6 +13,12 @@ #define NOC_Y_SIZE 1 #endif +// Coordinate Virtualization is fully supported by WH NOC Hardware and Firmware. +// Tensix cores start at coordinate in Virtual Space and are contiguous. 
+#define VIRTUAL_TENSIX_START_X 18 +#define VIRTUAL_TENSIX_START_Y 18 +#define COORDINATE_VIRTUALIZATION_ENABLED 1 + #define NUM_NOCS 2 #define NUM_TENSIXES 80 diff --git a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h index f7a399670eb4..a1030a010992 100644 --- a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h @@ -11,9 +11,11 @@ #include "noc_overlay_parameters.h" // Helper functions to convert NoC coordinates to NoC-0 coordinates, used in metal as "physical" coordinates. -#define NOC_0_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) -#define NOC_0_Y(noc_index, noc_size_y, y) (noc_index == 0 ? (y) : (noc_size_y - 1 - (y))) - +#define NOC_0_X(noc_index, noc_size_x, x) x +#define NOC_0_Y(noc_index, noc_size_y, y) y +#define NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) +#define NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) (noc_index == 0 ? 
(y) : (noc_size_y - 1 - (y))) +#define MY_NOC_ENCODING(noc_index) NOC_CMD_BUF_READ_REG(noc_index, 0, NOC_CFG(NOC_ID_LOGICAL)) //// // Use VC 1 for unicast writes, and VC 4 for mcast writes diff --git a/tt_metal/impl/allocator/allocator.cpp b/tt_metal/impl/allocator/allocator.cpp index bdbbe8dd3f92..aa43d84043f6 100644 --- a/tt_metal/impl/allocator/allocator.cpp +++ b/tt_metal/impl/allocator/allocator.cpp @@ -537,8 +537,8 @@ void Allocator::reset() { void AllocatorConfig::reset() { dram_bank_offsets.clear(); core_type_from_noc_coord_table.clear(); - worker_log_to_physical_routing_x.clear(); - worker_log_to_physical_routing_y.clear(); + worker_log_to_virtual_routing_x.clear(); + worker_log_to_virtual_routing_y.clear(); l1_bank_remap.clear(); } diff --git a/tt_metal/impl/allocator/allocator_types.hpp b/tt_metal/impl/allocator/allocator_types.hpp index 2dc0e92816e5..b4ad6bf960c1 100644 --- a/tt_metal/impl/allocator/allocator_types.hpp +++ b/tt_metal/impl/allocator/allocator_types.hpp @@ -44,8 +44,8 @@ struct AllocatorConfig { size_t l1_small_size = 0; size_t trace_region_size = 0; std::unordered_map<CoreCoord, AllocCoreType> core_type_from_noc_coord_table = {}; - std::unordered_map<uint32_t, uint32_t> worker_log_to_physical_routing_x = {}; - std::unordered_map<uint32_t, uint32_t> worker_log_to_physical_routing_y = {}; + std::unordered_map<uint32_t, uint32_t> worker_log_to_virtual_routing_x = {}; + std::unordered_map<uint32_t, uint32_t> worker_log_to_virtual_routing_y = {}; BankMapping l1_bank_remap = {}; // for remapping which l1 bank points to which bank if we assume normal row-major assignment CoreRangeSet compute_grid = {}; diff --git a/tt_metal/impl/allocator/l1_banking_allocator.cpp b/tt_metal/impl/allocator/l1_banking_allocator.cpp index 0cd0c38d984a..c79cd949a440 100644 --- a/tt_metal/impl/allocator/l1_banking_allocator.cpp +++ b/tt_metal/impl/allocator/l1_banking_allocator.cpp @@ -75,17 +75,17 @@ void init_compute_and_storage_l1_bank_manager(Allocator& allocator, const Alloca num_banks_t num_banks = compute_total_and_storage_only_num_l1_banks(alloc_config); auto 
logical_to_noc_coord = [&alloc_config](CoreCoord logical_core) { TT_ASSERT( - alloc_config.worker_log_to_physical_routing_x.find(logical_core.x) != - alloc_config.worker_log_to_physical_routing_x.end() and - alloc_config.worker_log_to_physical_routing_y.find(logical_core.y) != - alloc_config.worker_log_to_physical_routing_y.end(), + alloc_config.worker_log_to_virtual_routing_x.find(logical_core.x) != + alloc_config.worker_log_to_virtual_routing_x.end() and + alloc_config.worker_log_to_virtual_routing_y.find(logical_core.y) != + alloc_config.worker_log_to_virtual_routing_y.end(), "Cannot find log_coord=[.y={}, .x={}] in logical to routing coord lookup tables... invalid AllocatorConfig " "setup", logical_core.y, logical_core.x); CoreCoord noc_core({ - static_cast(alloc_config.worker_log_to_physical_routing_x.at(logical_core.x)), - static_cast(alloc_config.worker_log_to_physical_routing_y.at(logical_core.y)), + static_cast(alloc_config.worker_log_to_virtual_routing_x.at(logical_core.x)), + static_cast(alloc_config.worker_log_to_virtual_routing_y.at(logical_core.y)), }); TT_ASSERT( alloc_config.core_type_from_noc_coord_table.find(noc_core) != diff --git a/tt_metal/impl/buffers/buffer.cpp b/tt_metal/impl/buffers/buffer.cpp index b1e5ec3e3374..3502254d8d53 100644 --- a/tt_metal/impl/buffers/buffer.cpp +++ b/tt_metal/impl/buffers/buffer.cpp @@ -425,27 +425,6 @@ CoreCoord Buffer::logical_core_from_bank_id(uint32_t bank_id) const { return allocator::logical_core_from_bank_id(*this->allocator_, bank_id); } -CoreCoord Buffer::noc_coordinates(uint32_t bank_id) const { - switch (this->buffer_type_) { - case BufferType::DRAM: - case BufferType::TRACE: { - auto dram_channel = this->dram_channel_from_bank_id(bank_id); - return this->device_->dram_core_from_dram_channel(dram_channel); - } - case BufferType::L1: // fallthrough - case BufferType::L1_SMALL: { - auto logical_core = this->logical_core_from_bank_id(bank_id); - return 
this->device_->worker_core_from_logical_core(logical_core); - } - case BufferType::SYSTEM_MEMORY: { - TT_THROW("Host buffer is located in system memory! Cannot retrieve NoC coordinates for it"); - } break; - default: TT_THROW("Unsupported buffer type!"); - } -} - -CoreCoord Buffer::noc_coordinates() const { return this->noc_coordinates(0); } - DeviceAddr Buffer::page_address(uint32_t bank_id, uint32_t page_index) const { uint32_t num_banks = allocator::num_banks(*this->allocator_, this->buffer_type_); TT_FATAL(bank_id < num_banks, "Invalid Bank ID: {} exceeds total numbers of banks ({})!", bank_id, num_banks); @@ -454,6 +433,21 @@ DeviceAddr Buffer::page_address(uint32_t bank_id, uint32_t page_index) const { return translate_page_address(offset, bank_id); } +DeviceAddr Buffer::bank_local_page_address(uint32_t bank_id, uint32_t page_index) const { + uint32_t num_banks = allocator::num_banks(*this->allocator_, this->buffer_type_); + TT_FATAL(bank_id < num_banks, "Invalid Bank ID: {} exceeds total numbers of banks ({})!", bank_id, num_banks); + uint32_t offset; + if (is_sharded(this->buffer_layout())) { + auto shard_spec = this->shard_spec(); + uint32_t pages_offset_within_bank = page_index % shard_spec.size(); + offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); + } else { + uint32_t pages_offset_within_bank = page_index / num_banks; + offset = (round_up(this->page_size(), this->alignment()) * pages_offset_within_bank); + } + return this->address() + offset; +} + uint32_t Buffer::alignment() const { return this->allocator_->config.alignment; } diff --git a/tt_metal/impl/buffers/buffer.hpp b/tt_metal/impl/buffers/buffer.hpp index d3fdc9f60aa3..2866dbcceb6a 100644 --- a/tt_metal/impl/buffers/buffer.hpp +++ b/tt_metal/impl/buffers/buffer.hpp @@ -221,13 +221,9 @@ class Buffer final { CoreCoord logical_core_from_bank_id(uint32_t bank_id) const; - CoreCoord noc_coordinates(uint32_t bank_id) const; - - // returns NoC coordinates of first 
bank buffer is in - CoreCoord noc_coordinates() const; - DeviceAddr page_address(uint32_t bank_id, uint32_t page_index) const; + DeviceAddr bank_local_page_address(uint32_t bank_id, uint32_t page_index) const; uint32_t alignment() const; DeviceAddr aligned_page_size() const; DeviceAddr aligned_size() const; diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index aac472a1d1a2..7a72c8758b6b 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -551,7 +551,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // skip prints entirely to prevent kernel code from hanging waiting for the print buffer to be // flushed from the host. for (auto& logical_core : all_cores) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { WriteInitMagic(device, phys_core, hart_index, false); } @@ -623,7 +623,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { CoreCoord phys_core; bool valid_logical_core = true; try { - phys_core = device->physical_core_from_logical_core(logical_core, core_type); + phys_core = device->virtual_core_from_logical_core(logical_core, core_type); } catch (std::runtime_error& error) { valid_logical_core = false; } @@ -653,7 +653,7 @@ void DebugPrintServerContext::AttachDevice(Device* device) { // Write print enable magic for the cores the user specified. 
uint32_t hart_mask = tt::llrt::OptionsG.get_feature_riscv_mask(tt::llrt::RunTimeDebugFeatureDprint); for (auto& logical_core : print_cores_sanitized) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { if (hart_mask & (1 << hart_index)) { WriteInitMagic(device, phys_core, hart_index, true); @@ -700,7 +700,7 @@ void DebugPrintServerContext::DetachDevice(Device* device) { // Check all dprint-enabled cores on this device for outstanding prints. outstanding_prints = false; for (auto& logical_core : device_to_core_range_.at(device)) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { if (risc_mask & (1 << risc_id)) { // No need to check if risc is not dprint-enabled. @@ -762,7 +762,7 @@ void DebugPrintServerContext::DetachDevice(Device* device) { // When detaching a device, disable prints on it. CoreDescriptorSet all_cores = GetAllCores(device); for (auto& logical_core : all_cores) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int hart_index = 0; hart_index < GetNumRiscs(logical_core); hart_index++) { WriteInitMagic(device, phys_core, hart_index, false); } @@ -791,7 +791,7 @@ void DebugPrintServerContext::ClearSignals() { bool DebugPrintServerContext::PeekOneHartNonBlocking( Device* device, const CoreDescriptor& logical_core, int hart_id, bool new_data_this_iter) { // If init magic isn't cleared for this risc, then dprint isn't enabled on it, don't read it. 
- CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); if (!CheckInitMagicCleared(device, phys_core, hart_id)) { return false; } diff --git a/tt_metal/impl/debug/noc_logging.cpp b/tt_metal/impl/debug/noc_logging.cpp index bd73b0a26715..ea14307f2379 100644 --- a/tt_metal/impl/debug/noc_logging.cpp +++ b/tt_metal/impl/debug/noc_logging.cpp @@ -39,7 +39,7 @@ void PrintNocData(noc_data_t noc_data, const string& file_name) { } void DumpCoreNocData(Device* device, const CoreDescriptor& logical_core, noc_data_t& noc_data) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { // Read out the DPRINT buffer, we stored our data in the "data field" uint64_t addr = GetDprintBufAddr(device, phys_core, risc_id); @@ -98,7 +98,7 @@ void ClearNocData(Device* device) { CoreDescriptorSet all_cores = GetAllCores(device); for (const CoreDescriptor& logical_core : all_cores) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core); + CoreCoord phys_core = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); for (int risc_id = 0; risc_id < GetNumRiscs(logical_core); risc_id++) { uint64_t addr = GetDprintBufAddr(device, phys_core, risc_id); std::vector initbuf = std::vector(DPRINT_BUFFER_SIZE / sizeof(uint32_t), 0); diff --git a/tt_metal/impl/debug/sanitize_noc_host.hpp b/tt_metal/impl/debug/sanitize_noc_host.hpp index cea090ff1974..860bb2fca591 100644 --- a/tt_metal/impl/debug/sanitize_noc_host.hpp +++ b/tt_metal/impl/debug/sanitize_noc_host.hpp @@ -32,6 +32,10 @@ static bool coord_found_p(CoreCoord range, CoreCoord core) { return core.x >= 1 && core.x <= range.x && core.y >= 1 && core.y <= range.y; } +static bool 
coord_found_p(std::unordered_set coords, CoreCoord core) { + return coords.find(core) != coords.end(); +} + static string noc_address(CoreCoord core, uint64_t a, uint32_t l) { std::stringstream ss; ss << "noc{" << core.str() << ", 0x" << std::setfill('0') << std::setw(8) << std::hex << a << ", " << std::dec << l @@ -55,7 +59,13 @@ static void print_stack_trace(void) { } static void watcher_sanitize_host_noc( - const char* what, const metal_SocDescriptor& soc_d, const CoreCoord& core, uint64_t addr, uint32_t lbytes) { + const char* what, + const metal_SocDescriptor& soc_d, + const std::unordered_set& virtual_worker_cores, + const std::unordered_set& virtual_eth_cores, + const CoreCoord& core, + uint64_t addr, + uint32_t lbytes) { if (coord_found_p(soc_d.get_pcie_cores(), core)) { TT_THROW("Host watcher: bad {} NOC coord {}", what, core.str()); } else if (coord_found_p(soc_d.get_dram_cores(), core)) { @@ -66,12 +76,12 @@ static void watcher_sanitize_host_noc( print_stack_trace(); TT_THROW("Host watcher: bad {} dram address {}", what, noc_address(core, addr, lbytes)); } - } else if (coord_found_p(soc_d.get_physical_ethernet_cores(), core)) { + } else if (coord_found_p(virtual_eth_cores, core)) { if (!DEBUG_VALID_ETH_ADDR(addr, lbytes)) { print_stack_trace(); TT_THROW("Host watcher: bad {} eth address {}", what, noc_address(core, addr, lbytes)); } - } else if (coord_found_p(soc_d.grid_size, core)) { + } else if (coord_found_p(virtual_worker_cores, core)) { if (!DEBUG_VALID_WORKER_ADDR(addr, lbytes)) { print_stack_trace(); TT_THROW("Host watcher: bad {} worker address {}", what, noc_address(core, addr, lbytes)); @@ -84,13 +94,23 @@ static void watcher_sanitize_host_noc( } void watcher_sanitize_host_noc_read( - const metal_SocDescriptor& soc_d, const CoreCoord& core, uint64_t addr, uint32_t lbytes) { - watcher_sanitize_host_noc("read", soc_d, core, addr, lbytes); + const metal_SocDescriptor& soc_d, + const std::unordered_set& virtual_worker_cores, + const 
std::unordered_set& virtual_eth_cores, + const CoreCoord& core, + uint64_t addr, + uint32_t lbytes) { + watcher_sanitize_host_noc("read", soc_d, virtual_worker_cores, virtual_eth_cores, core, addr, lbytes); } void watcher_sanitize_host_noc_write( - const metal_SocDescriptor& soc_d, const CoreCoord& core, uint64_t addr, uint32_t lbytes) { - watcher_sanitize_host_noc("write", soc_d, core, addr, lbytes); + const metal_SocDescriptor& soc_d, + const std::unordered_set& virtual_worker_cores, + const std::unordered_set& virtual_eth_cores, + const CoreCoord& core, + uint64_t addr, + uint32_t lbytes) { + watcher_sanitize_host_noc("write", soc_d, virtual_worker_cores, virtual_eth_cores, core, addr, lbytes); } } // namespace tt diff --git a/tt_metal/impl/debug/watcher_device_reader.cpp b/tt_metal/impl/debug/watcher_device_reader.cpp index d764bf8a2a62..c03b7548b79c 100644 --- a/tt_metal/impl/debug/watcher_device_reader.cpp +++ b/tt_metal/impl/debug/watcher_device_reader.cpp @@ -76,15 +76,11 @@ static uint32_t get_riscv_stack_size(const CoreDescriptor& core, uint32_t type) static string get_noc_target_str( Device* device, CoreDescriptor& core, int noc, const debug_sanitize_noc_addr_msg_t* san) { auto get_core_and_mem_type = [](Device* device, CoreCoord& noc_coord, int noc) -> std::pair { - // Get the physical coord from the noc coord - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device->id()); - CoreCoord phys_core = { - tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, noc_coord.x), - tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, noc_coord.y)}; - + // Get the virtual coord from the noc coord + CoreCoord virtual_core = device->virtual_noc_coordinate(noc, noc_coord); CoreType core_type; try { - core_type = device->core_type_from_physical_core(phys_core); + core_type = device->core_type_from_virtual_core(virtual_core); } catch (std::runtime_error& e) { // We may not be able to get a core type if the physical coords are bad. 
return {"Unknown", ""}; @@ -304,13 +300,13 @@ void WatcherDeviceReader::DumpCore(CoreDescriptor& logical_core, bool is_active_ // Watcher only treats ethernet + worker cores. bool is_eth_core = (logical_core.type == CoreType::ETH); CoreDescriptor core; - core.coord = device->physical_core_from_logical_core(logical_core.coord, logical_core.type); + core.coord = device->virtual_core_from_logical_core(logical_core.coord, logical_core.type); core.type = logical_core.type; // Print device id, core coords (logical) string core_type = is_eth_core ? "ethnet" : "worker"; string core_str = fmt::format( - "Device {} {} core(x={:2},y={:2}) phys(x={:2},y={:2})", + "Device {} {} core(x={:2},y={:2}) virtual(x={:2},y={:2})", device->id(), core_type, logical_core.coord.x, @@ -476,6 +472,10 @@ void WatcherDeviceReader::DumpNocSanitizeStatus( error_msg = get_noc_target_str(device, core, noc, san); error_msg += " (invalid address alignment in NOC transaction)."; break; + case DebugSanitizeNocMixedVirtualandPhysical: + error_msg = get_noc_target_str(device, core, noc, san); + error_msg += " (mixing virtual and physical coordinates in Mcast)."; + break; default: error_msg = fmt::format( "Watcher unexpected data corruption, noc debug state on core {}, unknown failure code: {}", diff --git a/tt_metal/impl/debug/watcher_server.cpp b/tt_metal/impl/debug/watcher_server.cpp index 8b92836a1fbd..9b81e7d13b01 100644 --- a/tt_metal/impl/debug/watcher_server.cpp +++ b/tt_metal/impl/debug/watcher_server.cpp @@ -283,7 +283,7 @@ void watcher_init(Device* device) { CoreCoord phys_core; bool valid_logical_core = true; try { - phys_core = device->physical_core_from_logical_core(logical_core, core_type); + phys_core = device->virtual_core_from_logical_core(logical_core, core_type); } catch (std::runtime_error& error) { valid_logical_core = false; } diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 2f8196d7c05e..f55271e3679b 100644 --- a/tt_metal/impl/device/device.cpp 
+++ b/tt_metal/impl/device/device.cpp @@ -5,7 +5,7 @@ #include #include #include "tt_metal/device.hpp" -#include "common/core_coord.hpp" +#include "common/core_assignment.hpp" #include "tt_metal/host_api.hpp" #include "tt_metal/impl/device/device.hpp" #include "tt_metal/impl/trace/trace.hpp" @@ -70,21 +70,12 @@ uint32_t Device::num_worker_cores(HalProgrammableCoreType core_type, SubDeviceId return this->active_sub_device_manager_->sub_device(sub_device_id).num_cores(core_type); } -std::vector Device::get_noc_encoding_for_active_eth_cores(NOC noc_index) { - auto active_ethernet_cores = this->get_active_ethernet_cores(true); - std::vector noc_encodings = {}; - noc_encodings.reserve(active_ethernet_cores.size()); - for (const auto& core : active_ethernet_cores) { - noc_encodings.push_back(this->get_noc_unicast_encoding(noc_index, ethernet_core_from_logical_core(core))); - } - return noc_encodings; -} /* Get all dispatch cores associated with this device. On return, my_dispatch_cores contains dispatch cores used by * this device (split between cores on this device itself and if this is a remote device, the mmio device dispatch * cores being used by this device). On return, other_dispatch_cores contains dispatch cores on this device that are * used by other (remote) devices. */ -void Device::get_associated_dispatch_phys_cores( +void Device::get_associated_dispatch_virtual_cores( std::unordered_map> &my_dispatch_cores, std::unordered_map> &other_dispatch_cores) { if (this->is_mmio_capable()) { @@ -96,54 +87,54 @@ void Device::get_associated_dispatch_phys_cores( if (device_id == this->id_) { //mmio device. 
bool dispatch_hd_allocated = false; - CoreCoord phys_core_dispatch_hd; + CoreCoord virtual_core_dispatch_hd; if (dispatch_core_manager::instance().is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, curr_channel, cq_id); - phys_core_dispatch_hd = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - my_dispatch_cores[this->id_].insert(phys_core_dispatch_hd); + virtual_core_dispatch_hd = this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + my_dispatch_cores[this->id_].insert(virtual_core_dispatch_hd); dispatch_hd_allocated = true; - log_debug(tt::LogMetal, "MMIO Device Dispatch core: Logical: {} - Physical: {}", dispatch_location.str(), phys_core_dispatch_hd.str()); + log_debug(tt::LogMetal, "MMIO Device Dispatch core: Logical: {} - Physical: {}", dispatch_location.str(), virtual_core_dispatch_hd.str()); } // Include dispatch_s in the dispatch core location set, if its not on the same core as dispatch_hd if (dispatch_core_manager::instance().is_dispatcher_s_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair dispatch_s_location = dispatch_core_manager::instance().dispatcher_s_core(device_id, curr_channel, cq_id); - CoreCoord phys_core_dispatch_s = get_physical_core_coordinate(dispatch_s_location, dispatch_core_type); - if ((!dispatch_hd_allocated) or (phys_core_dispatch_s != phys_core_dispatch_hd)) { - my_dispatch_cores[dispatch_s_location.chip].insert(phys_core_dispatch_s); + CoreCoord virtual_core_dispatch_s = this->virtual_core_from_logical_core(dispatch_s_location, dispatch_core_type); + if ((!dispatch_hd_allocated) or (virtual_core_dispatch_s != virtual_core_dispatch_hd)) { + my_dispatch_cores[dispatch_s_location.chip].insert(virtual_core_dispatch_s); } } if (dispatch_core_manager::instance().is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair prefetch_location = 
dispatch_core_manager::instance().prefetcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type); - my_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "MMIO Device Prefetch core: Logical: {} - Physical: {}", prefetch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type); + my_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "MMIO Device Prefetch core: Logical: {} - Physical: {}", prefetch_location.str(), virtual_core.str()); } } else if (tt::DevicePool::instance().is_device_active(device_id)) { //non mmio devices serviced by this mmio capable device. //skip remote dispatch cores only if respective remote device is active. if (dispatch_core_manager::instance().is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - other_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will keep running on MMIO Device.", dispatch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + other_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will keep running on MMIO Device.", dispatch_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, 
dispatch_core_type); - other_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will keep running on MMIO Device.", prefetch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type); + other_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will keep running on MMIO Device.", prefetch_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_mux_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type); - other_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will keep running on MMIO Device.", mux_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(mux_location, dispatch_core_type); + other_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will keep running on MMIO Device.", mux_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_demux_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type); - other_dispatch_cores[this->id_].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will keep running on MMIO Device.", demux_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(demux_location, dispatch_core_type); + 
other_dispatch_cores[this->id_].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will keep running on MMIO Device.", demux_location.str(), virtual_core.str()); } } } @@ -157,47 +148,47 @@ void Device::get_associated_dispatch_phys_cores( for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { if (dispatch_core_manager::instance().is_dispatcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - my_dispatch_cores[dispatch_location.chip].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will be reset on MMIO Device.", dispatch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Dispatch core: Logical: {} - Physical: {} will be reset on MMIO Device.", dispatch_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_prefetcher_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type); - my_dispatch_cores[prefetch_location.chip].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will be reset on MMIO Device.", prefetch_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type); + my_dispatch_cores[prefetch_location.chip].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Prefetch core: Logical: {} - Physical: {} will 
be reset on MMIO Device.", prefetch_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_mux_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type); - my_dispatch_cores[mux_location.chip].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will be reset on MMIO Device.", mux_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(mux_location, dispatch_core_type); + my_dispatch_cores[mux_location.chip].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Mux core: Logical: {} - Physical: {} will be reset on MMIO Device.", mux_location.str(), virtual_core.str()); } if (dispatch_core_manager::instance().is_demux_core_allocated(device_id, curr_channel, cq_id)) { tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, curr_channel, cq_id); - CoreCoord phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type); - my_dispatch_cores[demux_location.chip].insert(phys_core); - log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will be reset on MMIO Device.", demux_location.str(), phys_core.str()); + CoreCoord virtual_core = this->virtual_core_from_logical_core(demux_location, dispatch_core_type); + my_dispatch_cores[demux_location.chip].insert(virtual_core); + log_debug(tt::LogMetal, "Remote Device Demux core: Logical: {} - Physical: {} will be reset on MMIO Device.", demux_location.str(), virtual_core.str()); } - CoreCoord phys_core; + CoreCoord virtual_core; tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_d_core(device_id, curr_channel, cq_id); - phys_core = get_physical_core_coordinate(dispatch_location, dispatch_core_type); - 
my_dispatch_cores[dispatch_location.chip].insert(phys_core); + virtual_core = this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); // Include dispatch_s in the dispatch core location set, if its not on the same core as dispatch_d tt_cxy_pair dispatch_s_location = dispatch_core_manager::instance().dispatcher_s_core(device_id, curr_channel, cq_id); - CoreCoord phys_core_dispatch_s = get_physical_core_coordinate(dispatch_s_location, dispatch_core_type); - if (phys_core_dispatch_s != phys_core) { - my_dispatch_cores[dispatch_s_location.chip].insert(phys_core_dispatch_s); + CoreCoord virtual_core_dispatch_s = this->virtual_core_from_logical_core(dispatch_s_location, dispatch_core_type); + if (virtual_core_dispatch_s != virtual_core) { + my_dispatch_cores[dispatch_s_location.chip].insert(virtual_core_dispatch_s); } tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_d_core(device_id, curr_channel, cq_id); - phys_core = get_physical_core_coordinate(prefetch_location, dispatch_core_type); - my_dispatch_cores[dispatch_location.chip].insert(phys_core); + virtual_core = this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_d_core(device_id, curr_channel, cq_id); - phys_core = get_physical_core_coordinate(mux_location, dispatch_core_type); - my_dispatch_cores[dispatch_location.chip].insert(phys_core); + virtual_core = this->virtual_core_from_logical_core(mux_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_d_core(device_id, curr_channel, cq_id); - phys_core = get_physical_core_coordinate(demux_location, dispatch_core_type); - my_dispatch_cores[dispatch_location.chip].insert(phys_core); + 
virtual_core = this->virtual_core_from_logical_core(demux_location, dispatch_core_type); + my_dispatch_cores[dispatch_location.chip].insert(virtual_core); } } } @@ -248,8 +239,8 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si .l1_small_size = align(l1_small_size, hal.get_alignment(HalMemType::L1)), .trace_region_size = align(trace_region_size, hal.get_alignment(HalMemType::DRAM)), .core_type_from_noc_coord_table = {}, // Populated later - .worker_log_to_physical_routing_x = soc_desc.worker_log_to_physical_routing_x, - .worker_log_to_physical_routing_y = soc_desc.worker_log_to_physical_routing_y, + .worker_log_to_virtual_routing_x = tt::Cluster::instance().get_worker_logical_to_virtual_x(this->id()), + .worker_log_to_virtual_routing_y = tt::Cluster::instance().get_worker_logical_to_virtual_y(this->id()), .l1_bank_remap = {l1_bank_remap.begin(), l1_bank_remap.end()}, .compute_grid = CoreRangeSet(CoreRange(CoreCoord(0, 0), CoreCoord(compute_size.x - 1, compute_size.y - 1))), .alignment = std::max(hal.get_alignment(HalMemType::DRAM), hal.get_alignment(HalMemType::L1)), @@ -266,7 +257,7 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si } // Initialize core_type_from_noc_coord_table table for (const auto& core: soc_desc.physical_cores) { - config.core_type_from_noc_coord_table.insert({core.first, AllocCoreType::Invalid}); + config.core_type_from_noc_coord_table.insert({this->virtual_core_from_physical_core(core.first, core.second.type), AllocCoreType::Invalid}); } for (const CoreCoord& core : tt::get_logical_compute_cores(id_, num_hw_cqs_, dispatch_core_config)) { @@ -280,7 +271,7 @@ std::unique_ptr Device::initialize_allocator(size_t l1_small_size, si config.core_type_from_noc_coord_table[noc_coord] = AllocCoreType::StorageOnly; } for (const CoreCoord &core : tt::get_logical_dispatch_cores(id_, num_hw_cqs_, dispatch_core_config)) { - const auto noc_coord = this->physical_core_from_logical_core(core, dispatch_core_type); + 
const auto noc_coord = this->virtual_core_from_logical_core(core, dispatch_core_type); config.core_type_from_noc_coord_table[noc_coord] = AllocCoreType::Dispatch; } for (const auto &core : soc_desc.get_logical_ethernet_cores()) { @@ -410,7 +401,7 @@ void Device::build_firmware() { jit_build_set(this->firmware_build_states_, nullptr); } -void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord phys_core) +void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core) { const uint32_t dram_to_noc_sz_in_bytes = dram_bank_to_noc_xy_.size() * sizeof(uint16_t); const uint32_t l1_to_noc_sz_in_bytes = l1_bank_to_noc_xy_.size() * sizeof(uint16_t); @@ -423,26 +414,26 @@ void Device::initialize_device_bank_to_noc_tables(const HalProgrammableCoreType TT_ASSERT((dram_to_noc_sz_in_bytes + l1_to_noc_sz_in_bytes + dram_offset_sz_in_bytes + l1_offset_sz_in_bytes) <= mem_bank_to_noc_size, "Size of bank_to_noc table is greater than available space"); - tt::Cluster::instance().write_core(&dram_bank_to_noc_xy_[0], dram_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), mem_bank_to_noc_addr); + tt::Cluster::instance().write_core(&dram_bank_to_noc_xy_[0], dram_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), virtual_core), mem_bank_to_noc_addr); uint64_t l1_noc_addr = mem_bank_to_noc_addr + dram_to_noc_sz_in_bytes; - tt::Cluster::instance().write_core(&l1_bank_to_noc_xy_[0], l1_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), l1_noc_addr); + tt::Cluster::instance().write_core(&l1_bank_to_noc_xy_[0], l1_to_noc_sz_in_bytes, tt_cxy_pair(this->id(), virtual_core), l1_noc_addr); uint64_t dram_offset_addr = l1_noc_addr + l1_to_noc_sz_in_bytes; - tt::Cluster::instance().write_core(&dram_bank_offset_map_[0], dram_offset_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), dram_offset_addr); + tt::Cluster::instance().write_core(&dram_bank_offset_map_[0], dram_offset_sz_in_bytes, 
tt_cxy_pair(this->id(), virtual_core), dram_offset_addr); uint64_t l1_offset_addr = dram_offset_addr + dram_offset_sz_in_bytes; - tt::Cluster::instance().write_core(&l1_bank_offset_map_[0], l1_offset_sz_in_bytes, tt_cxy_pair(this->id(), phys_core), l1_offset_addr); + tt::Cluster::instance().write_core(&l1_bank_offset_map_[0], l1_offset_sz_in_bytes, tt_cxy_pair(this->id(), virtual_core), l1_offset_addr); } -void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg) { +void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg) { ZoneScoped; - this->initialize_device_bank_to_noc_tables(core_type, phys_core); + this->initialize_device_bank_to_noc_tables(core_type, virtual_core); uint32_t core_type_idx = hal.get_programmable_core_type_index(core_type); uint32_t processor_class_count = hal.get_processor_classes_count(core_type); switch (core_type) { case HalProgrammableCoreType::TENSIX: { - llrt::program_risc_startup_addr(this->id(), phys_core); + llrt::program_risc_startup_addr(this->id(), virtual_core); for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { auto [build_idx, num_build_states] = this->build_processor_type_to_index(core_type_idx, processor_class); for (uint32_t riscv_id = build_idx; riscv_id < (build_idx + num_build_states); riscv_id++) { @@ -458,7 +449,7 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC } log_debug(LogDevice, "RISC {} fw binary size: {} in bytes", riscv_id, fw_size); if (not llrt::OptionsG.get_skip_loading_fw()) { - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, core_type_idx, processor_class, (riscv_id - build_idx)); + llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (riscv_id - build_idx)); } } } @@ -471,7 +462,7 
@@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC if (dispatch_core_manager::instance().get_dispatch_core_type(this->id()) == CoreType::WORKER) { physical_dispatch_cores = this->worker_cores_from_logical_cores(dispatch_core_manager::instance().get_all_logical_dispatch_cores(this->id())); } - if (std::find(physical_dispatch_cores.begin(), physical_dispatch_cores.end(), phys_core) != physical_dispatch_cores.end()) { + if (std::find(physical_dispatch_cores.begin(), physical_dispatch_cores.end(), virtual_core) != physical_dispatch_cores.end()) { // Dispatch cores - Host writes launch messages launch_msg->kernel_config.mode = DISPATCH_MODE_HOST; } else { @@ -486,7 +477,7 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC case HalProgrammableCoreType::IDLE_ETH: { bool is_idle_eth = core_type == HalProgrammableCoreType::IDLE_ETH; if (is_idle_eth) { - tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), phys_core)); + tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(this->id(), virtual_core)); } if (not llrt::OptionsG.get_skip_loading_fw()) { for (uint32_t processor_class = 0; processor_class < processor_class_count; processor_class++) { @@ -499,14 +490,14 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC (eriscv_id - build_idx)); uint32_t fw_size = binary_mem.get_text_size(); log_debug(LogDevice, "ERISC fw binary size: {} in bytes", fw_size); - llrt::test_load_write_read_risc_binary(binary_mem, this->id(), phys_core, core_type_idx, processor_class, (eriscv_id - build_idx)); + llrt::test_load_write_read_risc_binary(binary_mem, this->id(), virtual_core, core_type_idx, processor_class, (eriscv_id - build_idx)); } } } if (is_idle_eth) { - llrt::program_risc_startup_addr(this->id(), phys_core); + llrt::program_risc_startup_addr(this->id(), virtual_core); } else { - llrt::launch_erisc_app_fw_on_core(this->id(), phys_core); + 
llrt::launch_erisc_app_fw_on_core(this->id(), virtual_core); } // Ethernet worker core. Launch messages will be sent by FD infra if it's enabled // Idle ethernet core. Used by FD infra. Host will write launch messages during init. @@ -528,12 +519,12 @@ void Device::initialize_firmware(const HalProgrammableCoreType &core_type, CoreC // worker cores (Tensix and active eth) configured with DISPATCH_MODE_DEV // When using Slow Dispatch, all cores initialized with DISPATCH_MODE_HOST std::vector init_launch_msg_data(launch_msg_buffer_num_entries, *launch_msg); - tt::Cluster::instance().write_core(init_launch_msg_data.data(), launch_msg_buffer_num_entries * sizeof(launch_msg_t), tt_cxy_pair(this->id(), phys_core), this->get_dev_addr(phys_core, HalL1MemAddrType::LAUNCH)); - uint32_t go_addr = this->get_dev_addr(phys_core, HalL1MemAddrType::GO_MSG); - tt::Cluster::instance().write_core(go_msg, sizeof(go_msg_t), tt_cxy_pair(this->id(), phys_core), go_addr); - uint64_t launch_msg_buffer_read_ptr_addr = this->get_dev_addr(phys_core, HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR); + tt::Cluster::instance().write_core(init_launch_msg_data.data(), launch_msg_buffer_num_entries * sizeof(launch_msg_t), tt_cxy_pair(this->id(), virtual_core), this->get_dev_addr(virtual_core, HalL1MemAddrType::LAUNCH)); + uint32_t go_addr = this->get_dev_addr(virtual_core, HalL1MemAddrType::GO_MSG); + tt::Cluster::instance().write_core(go_msg, sizeof(go_msg_t), tt_cxy_pair(this->id(), virtual_core), go_addr); + uint64_t launch_msg_buffer_read_ptr_addr = this->get_dev_addr(virtual_core, HalL1MemAddrType::LAUNCH_MSG_BUFFER_RD_PTR); uint32_t zero = 0; - tt::Cluster::instance().write_core(&zero, sizeof(uint32_t), tt_cxy_pair(this->id(), phys_core), launch_msg_buffer_read_ptr_addr); + tt::Cluster::instance().write_core(&zero, sizeof(uint32_t), tt_cxy_pair(this->id(), virtual_core), launch_msg_buffer_read_ptr_addr); } void Device::reset_cores() { @@ -549,16 +540,16 @@ void Device::reset_cores() { go_msg_t 
go_msg; std::memset(&go_msg, 0, sizeof(go_msg_t)); for (const auto ð_core : this->get_active_ethernet_cores()) { - CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core); + CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); std::vector data(sizeof(launch_msg_t) / sizeof(uint32_t)); std::vector go_signal_data(sizeof(go_msg_t) / sizeof(uint32_t)); DeviceAddr launch_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::LAUNCH); DeviceAddr go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::GO_MSG); data = tt::llrt::read_hex_vec_from_core( - this->id(), physical_core, launch_addr, sizeof(launch_msg_t)); + this->id(), virtual_core, launch_addr, sizeof(launch_msg_t)); go_signal_data = tt::llrt::read_hex_vec_from_core( - this->id(), physical_core, go_signal_addr, sizeof(go_msg_t)); + this->id(), virtual_core, go_signal_addr, sizeof(go_msg_t)); launch_msg_t *launch_msg = (launch_msg_t *)(&data[0]); go_msg_t * go_signal = (go_msg_t *)(&go_signal_data[0]); if (kernel_still_running(launch_msg, go_signal)) { @@ -566,30 +557,30 @@ void Device::reset_cores() { tt::LogMetal, "While initializing Device {}, ethernet tunneler core {} on Device {} detected as still running, issuing exit signal.", this->id(), - physical_core.str(), + virtual_core.str(), this->id()); launch_msg->kernel_config.exit_erisc_kernel = 1; - llrt::write_launch_msg_to_core(this->id(), physical_core, launch_msg, &go_msg, launch_addr, false); - device_to_early_exit_cores[this->id()].insert(physical_core); + llrt::write_launch_msg_to_core(this->id(), virtual_core, launch_msg, &go_msg, launch_addr, false); + device_to_early_exit_cores[this->id()].insert(virtual_core); } } - this->get_associated_dispatch_phys_cores(dispatch_cores, other_dispatch_cores); + this->get_associated_dispatch_virtual_cores(dispatch_cores, other_dispatch_cores); // Ignore other_dispatch_cores, they will be reset by the devices that use 
them. for (auto &id_and_cores : dispatch_cores) { for (auto it = id_and_cores.second.begin(); it != id_and_cores.second.end(); it++) { - const auto &phys_core = *it; + const auto &virtual_core = *it; // Only need to manually reset ethernet dispatch cores, tensix cores are all reset below. - if (llrt::is_ethernet_core(phys_core, id_and_cores.first)) { + if (tt::Cluster::instance().is_ethernet_core(virtual_core, id_and_cores.first)) { // Ethernet cores won't be reset, so just signal the dispatch cores to early exit. std::vector data(sizeof(launch_msg_t) / sizeof(uint32_t)); std::vector go_signal_data(sizeof(go_msg_t) / sizeof(uint32_t)); DeviceAddr launch_addr = hal.get_dev_addr(HalProgrammableCoreType::IDLE_ETH, HalL1MemAddrType::LAUNCH); DeviceAddr go_signal_addr = hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::GO_MSG); data = tt::llrt::read_hex_vec_from_core( - id_and_cores.first, phys_core, launch_addr, sizeof(launch_msg_t)); + id_and_cores.first, virtual_core, launch_addr, sizeof(launch_msg_t)); go_signal_data = tt::llrt::read_hex_vec_from_core( - this->id(), phys_core, go_signal_addr, sizeof(go_msg_t)); + this->id(), virtual_core, go_signal_addr, sizeof(go_msg_t)); launch_msg_t *launch_msg = (launch_msg_t *)(&data[0]); go_msg_t * go_signal = (go_msg_t *)(&go_signal_data[0]); if (kernel_still_running(launch_msg, go_signal)) { @@ -597,11 +588,11 @@ void Device::reset_cores() { tt::LogMetal, "While initializing device {}, ethernet dispatch core {} on Device {} detected as still running, issuing exit signal.", this->id(), - phys_core.str(), + virtual_core.str(), id_and_cores.first); launch_msg->kernel_config.exit_erisc_kernel = 1; - llrt::write_launch_msg_to_core(id_and_cores.first, phys_core, launch_msg, &go_msg, launch_addr, false); - device_to_early_exit_cores[id_and_cores.first].insert(phys_core); + llrt::write_launch_msg_to_core(id_and_cores.first, virtual_core, launch_msg, &go_msg, launch_addr, false); + 
device_to_early_exit_cores[id_and_cores.first].insert(virtual_core); } } } @@ -665,22 +656,40 @@ void Device::initialize_and_launch_firmware() { const std::vector &pcie_cores = soc_d.get_pcie_cores(); const std::vector &dram_cores = soc_d.get_dram_cores(); const std::vector ð_cores = soc_d.get_physical_ethernet_cores(); + // The SOC descriptor can list a dram core multiple times, depending on how GDDR is assigned to banks + // Get a list of unique DRAM cores. + std::unordered_set unique_dram_cores(dram_cores.begin(), dram_cores.end()); TT_ASSERT( - pcie_cores.size() + dram_cores.size() + eth_cores.size() <= MAX_NON_WORKER_CORES, + pcie_cores.size() + unique_dram_cores.size() + eth_cores.size() <= MAX_NON_WORKER_CORES, "Detected more pcie/dram/eth cores than fit in the device mailbox."); + TT_ASSERT( + eth_cores.size() <= MAX_VIRTUAL_NON_WORKER_CORES, + "Detected more eth cores (virtual non-workers) than can fit in device mailbox."); for (int idx = 0; idx < MAX_NON_WORKER_CORES; idx++) { core_info->non_worker_cores[idx] = {CORE_COORD_INVALID, CORE_COORD_INVALID, AddressableCoreType::UNKNOWN}; } + for (int idx = 0; idx < MAX_VIRTUAL_NON_WORKER_CORES; idx++) { + core_info->virtual_non_worker_cores[idx] = {CORE_COORD_INVALID, CORE_COORD_INVALID, AddressableCoreType::UNKNOWN}; + } + int non_worker_cores_idx = 0; for (const CoreCoord &core : pcie_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::PCIE}; } - for (const CoreCoord &core : dram_cores) { + for (const CoreCoord &core : unique_dram_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::DRAM}; } for (const CoreCoord &core : eth_cores) { core_info->non_worker_cores[non_worker_cores_idx++] = {core.x, core.y, AddressableCoreType::ETH}; } + if (hal.is_coordinate_virtualization_enabled()) { + // Track Virtual Non Worker Cores (In this case only Eth) separately + uint32_t virtual_non_worker_cores_idx = 0; + for (const 
CoreCoord &core : eth_cores) { + auto virtual_core = this->virtual_core_from_physical_core(core, CoreType::ETH); + core_info->virtual_non_worker_cores[virtual_non_worker_cores_idx++] = {virtual_core.x, virtual_core.y, AddressableCoreType::ETH}; + } + } // Determine which noc-coords are harvested // TODO(PGK/Almeet): fix this w/ new UMD @@ -695,6 +704,13 @@ void Device::initialize_and_launch_firmware() { TT_ASSERT(harvested_rows.size() <= MAX_HARVESTED_ROWS, "Detected more harvested rows than fit in mailbox."); for (int idx = 0; idx < MAX_HARVESTED_ROWS; idx++) { core_info->harvested_y[idx] = (idx < harvested_rows.size()) ? harvested_rows[idx] : CORE_COORD_INVALID; + // Populate harvested rows in virtual coordinate space if virtualization is supported by HW. + // Harvested rows in the virtual space are placed at the end of the worker grid, + if (hal.is_coordinate_virtualization_enabled() and idx < harvested_rows.size()) { + core_info->virtual_harvested_y[idx] = (hal.get_virtual_worker_start_y() + this->logical_grid_size().y + harvested_rows.size() - (idx + 1)); + } else { + core_info->virtual_harvested_y[idx] = CORE_COORD_INVALID; + } } core_info->noc_size_x = soc_d.grid_size.x; @@ -721,10 +737,10 @@ void Device::initialize_and_launch_firmware() { // Clear erisc sync info std::vector zero_vec_erisc_init(eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_SIZE / sizeof(uint32_t), 0); for (const auto ð_core : this->get_active_ethernet_cores()) { - CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core); + CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); llrt::write_hex_vec_to_core( - this->id(), physical_core, zero_vec_erisc_init, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); + this->id(), virtual_core, zero_vec_erisc_init, eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE); } // Load erisc app base FW to eth cores @@ -784,11 +800,11 @@ void Device::clear_l1_state() { // Clear erisc sync info for (const auto ð_core : 
this->get_active_ethernet_cores()) { - CoreCoord physical_core = this->ethernet_core_from_logical_core(eth_core); + CoreCoord virtual_core = this->ethernet_core_from_logical_core(eth_core); llrt::write_hex_vec_to_core( this->id(), - physical_core, + virtual_core, zero_vec_above_tile_header_buffer, eth_l1_mem::address_map::TILE_HEADER_BUFFER_BASE); @@ -801,11 +817,11 @@ void Device::configure_kernel_variant( const string& path, const std::vector& compile_args, CoreCoord kernel_core, - CoreCoord kernel_physical_core, + CoreCoord kernel_virtual_core, CoreType dispatch_core_type, - CoreCoord upstream_physical_core, - CoreCoord downstream_physical_core, - CoreCoord downstream_slave_physical_core, + CoreCoord upstream_virtual_core, + CoreCoord downstream_virtual_core, + CoreCoord downstream_slave_virtual_core, std::map defines_in, NOC my_noc_index, NOC upstream_noc_index, @@ -814,25 +830,28 @@ void Device::configure_kernel_variant( bool send_to_brisc, bool force_watcher_no_inline) { - const auto& grid_size = this->grid_size(); - // TODO: just pass in the programmable index uint32_t programmable_core_type_index = (dispatch_core_type == CoreType::WORKER) ? hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX) : is_active_eth_core ? 
hal.get_programmable_core_type_index(HalProgrammableCoreType::ACTIVE_ETH) : hal.get_programmable_core_type_index(HalProgrammableCoreType::IDLE_ETH); + auto my_virtual_noc_coords = this->virtual_noc_coordinate(my_noc_index, kernel_virtual_core); + auto upstream_virtual_noc_coords = this->virtual_noc_coordinate(upstream_noc_index, upstream_virtual_core); + auto downstream_virtual_noc_coords = this->virtual_noc_coordinate(downstream_noc_index, downstream_virtual_core); + auto downstream_slave_virtual_noc_coords = this->virtual_noc_coordinate(downstream_noc_index, downstream_slave_virtual_core); + std::map defines = { {"DISPATCH_KERNEL", "1"}, - {"MY_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(my_noc_index, grid_size.x, kernel_physical_core.x))}, - {"MY_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(my_noc_index, grid_size.y, kernel_physical_core.y))}, + {"MY_NOC_X", std::to_string(my_virtual_noc_coords.x)}, + {"MY_NOC_Y", std::to_string(my_virtual_noc_coords.y)}, {"UPSTREAM_NOC_INDEX", std::to_string(upstream_noc_index)}, - {"UPSTREAM_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(upstream_noc_index, grid_size.x, upstream_physical_core.x))}, - {"UPSTREAM_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(upstream_noc_index, grid_size.y, upstream_physical_core.y))}, - {"DOWNSTREAM_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.x, downstream_physical_core.x))}, - {"DOWNSTREAM_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.y, downstream_physical_core.y))}, - {"DOWNSTREAM_SLAVE_NOC_X", std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.x, downstream_slave_physical_core.x))}, - {"DOWNSTREAM_SLAVE_NOC_Y", std::to_string(tt::tt_metal::hal.noc_coordinate(downstream_noc_index, grid_size.y, downstream_slave_physical_core.y))}, + {"UPSTREAM_NOC_X", std::to_string(upstream_virtual_noc_coords.x)}, + {"UPSTREAM_NOC_Y", 
std::to_string(upstream_virtual_noc_coords.y)}, + {"DOWNSTREAM_NOC_X", std::to_string(downstream_virtual_noc_coords.x)}, + {"DOWNSTREAM_NOC_Y", std::to_string(downstream_virtual_noc_coords.y)}, + {"DOWNSTREAM_SLAVE_NOC_X", std::to_string(downstream_slave_virtual_noc_coords.x)}, + {"DOWNSTREAM_SLAVE_NOC_Y", std::to_string(downstream_slave_virtual_noc_coords.y)}, {"FD_CORE_TYPE", std::to_string(programmable_core_type_index)}, }; if (force_watcher_no_inline) { @@ -896,7 +915,7 @@ void Device::update_workers_build_settings(std::vector> 4; @@ -971,8 +990,8 @@ void Device::update_workers_build_settings(std::vector> 4; // 3: rx_queue_size_words for (uint32_t i = 0; i < fwd_vc_count; i++) { - compile_args[4 + i] = packet_switch_4B_pack(tunneler_settings.eth_partner_physical_core.x, - tunneler_settings.eth_partner_physical_core.y, + compile_args[4 + i] = packet_switch_4B_pack(tunneler_settings.eth_partner_virtual_core.x, + tunneler_settings.eth_partner_virtual_core.y, i, (uint32_t)DispatchRemoteNetworkType::ETH); // 4 - 13: remote_receiver fwd vcs @@ -1031,8 +1050,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DEMUX][0]); - compile_args[4 + return_vc] = packet_switch_4B_pack(demux_settings.worker_physical_core.x, - demux_settings.worker_physical_core.y, + compile_args[4 + return_vc] = packet_switch_4B_pack(demux_settings.worker_virtual_core.x, + demux_settings.worker_virtual_core.y, 0,//demux input, (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver return vc compile_args[14 + return_vc * 2] = demux_settings.cb_start_address >> 4; // 8: remote_receiver_queue_start_addr_words return vc @@ -1042,8 +1061,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::MUX_D][0]); uint32_t prefetch_d_count = device_worker_variants[DispatchWorkerType::PREFETCH_D].size(); - compile_args[4 + return_vc] = packet_switch_4B_pack(mux_d_settings.worker_physical_core.x, - 
mux_d_settings.worker_physical_core.y, + compile_args[4 + return_vc] = packet_switch_4B_pack(mux_d_settings.worker_virtual_core.x, + mux_d_settings.worker_virtual_core.y, mux_d_settings.semaphores.size(),//mux_d input. This is return path from next tunnel stop towards mmio device. //mux_d iput 0 is driven by local Dispatch D (uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_receiver return vc @@ -1066,8 +1085,8 @@ void Device::update_workers_build_settings(std::vector demux input, 1=> demux_d output to local Prefetch D, 2=> demux_d output to tunneler (to next tunnel stop) (uint32_t)DispatchRemoteNetworkType::NOC0); // 10: remote_sender fwd vcs } @@ -1080,8 +1099,8 @@ void Device::update_workers_build_settings(std::vector> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x compile_args[arg_index++] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x } - compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x - compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[16] = tunneler_settings.worker_virtual_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_virtual_core.y; // 17: remote_rx_y compile_args[18] = tunneler_settings.vc_count * 2 - 1; // 18: remote_rx_queue_id compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type uint32_t dest_map_array[4] = {0, 1, 2, 3}; @@ -1153,12 +1172,12 @@ void Device::update_workers_build_settings(std::vector> 4; // 2: rx_queue_size_words compile_args[3] = 2; // 3: demux_fan_out - compile_args[4] = packet_switch_4B_pack((uint32_t)demux_1_settings.worker_physical_core.x, - (uint32_t)demux_1_settings.worker_physical_core.y, + compile_args[4] = packet_switch_4B_pack((uint32_t)demux_1_settings.worker_virtual_core.x, + (uint32_t)demux_1_settings.worker_virtual_core.y, 0, (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info - compile_args[5] = 
packet_switch_4B_pack((uint32_t)demux_2_settings.worker_physical_core.x, - (uint32_t)demux_2_settings.worker_physical_core.y, + compile_args[5] = packet_switch_4B_pack((uint32_t)demux_2_settings.worker_virtual_core.x, + (uint32_t)demux_2_settings.worker_virtual_core.y, 0, (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info @@ -1167,8 +1186,8 @@ void Device::update_workers_build_settings(std::vector> 4; // 10: remote_tx_queue_start_addr_words x compile_args[11] = demux_2_settings.cb_size_bytes >> 4; // 11: remote_tx_queue_size_words x - compile_args[16] = tunneler_settings.worker_physical_core.x; // 16: remote_rx_x - compile_args[17] = tunneler_settings.worker_physical_core.y; // 17: remote_rx_y + compile_args[16] = tunneler_settings.worker_virtual_core.x; // 16: remote_rx_x + compile_args[17] = tunneler_settings.worker_virtual_core.y; // 17: remote_rx_y compile_args[18] = tunneler_settings.vc_count * 2 - 1; // 18: remote_rx_queue_id compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type @@ -1194,16 +1213,16 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DISPATCH][i]); - demux_1_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, - (uint32_t)settings.worker_physical_core.y, + demux_1_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_virtual_core.x, + (uint32_t)settings.worker_virtual_core.y, 0, (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info demux_1_compile_args[8 + i * 2] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x demux_1_compile_args[9 + i * 2] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x } - demux_1_compile_args[16] = demux_settings.worker_physical_core.x; // 16: remote_rx_x - demux_1_compile_args[17] = demux_settings.worker_physical_core.y; // 17: remote_rx_y + demux_1_compile_args[16] = 
demux_settings.worker_virtual_core.x; // 16: remote_rx_x + demux_1_compile_args[17] = demux_settings.worker_virtual_core.y; // 17: remote_rx_y demux_1_compile_args[18] = 1; // 18: remote_rx_queue_id demux_1_compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type uint32_t dest_map_array[4] = {0, 1, 2, 3}; @@ -1236,16 +1255,16 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DISPATCH][i + demux_1_fanout]); - demux_2_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_physical_core.x, - (uint32_t)settings.worker_physical_core.y, + demux_2_compile_args[4 + i] = packet_switch_4B_pack((uint32_t)settings.worker_virtual_core.x, + (uint32_t)settings.worker_virtual_core.y, 0, (uint32_t)DispatchRemoteNetworkType::NOC0); // 4,5,6,7: remote_tx_x_info demux_2_compile_args[8 + i * 2] = settings.cb_start_address >> 4; // 8, 10, 12, 14: remote_tx_queue_start_addr_words x demux_2_compile_args[9 + i * 2] = settings.cb_size_bytes >> 4; // 9, 11, 13, 15: remote_tx_queue_size_words x } - demux_2_compile_args[16] = demux_settings.worker_physical_core.x; // 16: remote_rx_x - demux_2_compile_args[17] = demux_settings.worker_physical_core.y; // 17: remote_rx_y + demux_2_compile_args[16] = demux_settings.worker_virtual_core.x; // 16: remote_rx_x + demux_2_compile_args[17] = demux_settings.worker_virtual_core.y; // 17: remote_rx_y demux_2_compile_args[18] = 2; // 18: remote_rx_queue_id demux_2_compile_args[19] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 19: tx_network_type dest_endpoint_output_map = packet_switch_dest_pack(dest_map_array, 4); @@ -1282,12 +1301,12 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::PREFETCH][dispatch_idx]); - auto prefetch_physical_core = prefetch_h_settings.worker_physical_core; + auto prefetch_virtual_core = prefetch_h_settings.worker_virtual_core; auto dispatch_core_type = settings.dispatch_core_type; 
uint32_t host_completion_queue_wr_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_wr_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_rd_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); - settings.upstream_cores.push_back(demux_settings.worker_physical_core); + settings.upstream_cores.push_back(demux_settings.worker_virtual_core); settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); settings.compile_args.resize(31); auto& compile_args = settings.compile_args; @@ -1307,7 +1326,7 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DISPATCH][dispatch_idx]); auto prefetch_h_settings = std::get<1>(device_worker_variants[DispatchWorkerType::PREFETCH][dispatch_idx]); - auto prefetch_physical_core = prefetch_h_settings.worker_physical_core; + auto prefetch_virtual_core = prefetch_h_settings.worker_virtual_core; auto dispatch_core_type = settings.dispatch_core_type; uint32_t host_completion_queue_wr_ptr = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_wr_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR); uint32_t dev_completion_queue_rd_ptr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD); - settings.upstream_cores.push_back(demux_settings.worker_physical_core); + settings.upstream_cores.push_back(demux_settings.worker_virtual_core); settings.downstream_cores.push_back(tt_cxy_pair(0, 0, 0)); settings.compile_args.resize(31); auto& compile_args = settings.compile_args; @@ -1362,7 +1381,7 @@ void 
Device::update_workers_build_settings(std::vector 1) { auto &us_tunneler_remote_settings = std::get<1>(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); - auto mux_d_sender = us_tunneler_remote_settings.worker_physical_core; + auto mux_d_sender = us_tunneler_remote_settings.worker_virtual_core; compile_args[47] = (return_vc << 24) | ((us_tunneler_remote_settings.vc_count * 2 - 1) << 16) | (mux_d_sender.y << 8) | (mux_d_sender.x); - log_debug(tt::LogMetal, "Tunnelr Inner Device {} will send done to {}", tunneler_settings.worker_physical_core.str(), mux_d_sender.str()); + log_debug(tt::LogMetal, "Tunnelr Inner Device {} will send done to {}", tunneler_settings.worker_virtual_core.str(), mux_d_sender.str()); } break; @@ -1497,8 +1516,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::PREFETCH_D][p + prefetch_d_connected]); - compile_args[4 + demux_output_idx] = packet_switch_4B_pack(prefetch_d_setting.worker_physical_core.x, - prefetch_d_setting.worker_physical_core.y, + compile_args[4 + demux_output_idx] = packet_switch_4B_pack(prefetch_d_setting.worker_virtual_core.x, + prefetch_d_setting.worker_virtual_core.y, 0, // prefetch_d input queue id (uint32_t)DispatchRemoteNetworkType::NOC0); // 4: remote_tx_0_info compile_args[8 + demux_output_cb_info_idx] = prefetch_d_setting.cb_start_address >> 4; @@ -1511,8 +1530,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::US_TUNNELER_REMOTE][0]); for (int i = 0; i < vcs_per_demux_d; i++) { - compile_args[4 + demux_output_idx + i] = packet_switch_4B_pack((uint32_t)us_tunneler_remote_settings.worker_physical_core.x, - (uint32_t)us_tunneler_remote_settings.worker_physical_core.y, + compile_args[4 + demux_output_idx + i] = packet_switch_4B_pack((uint32_t)us_tunneler_remote_settings.worker_virtual_core.x, + (uint32_t)us_tunneler_remote_settings.worker_virtual_core.y, remote_tunneler_vcs_connected, 
(uint32_t)DispatchRemoteNetworkType::NOC0); // 5: remote_tx_1_info compile_args[8 + (demux_output_idx + i) * 2] = (us_tunneler_remote_settings.cb_start_address + remote_tunneler_vcs_connected * us_tunneler_remote_settings.cb_size_bytes) >> 4; // 10: remote_tx_queue_start_addr_words 1 @@ -1527,8 +1546,8 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::DISPATCH_D][prefetch_d_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_d auto dispatch_s_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH_S][prefetch_d_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_s auto dispatch_core_type = prefetch_d_settings.dispatch_core_type; - prefetch_d_settings.upstream_cores.push_back(demux_d_settings.worker_physical_core); - prefetch_d_settings.downstream_cores.push_back(dispatch_d_settings.worker_physical_core); - prefetch_d_settings.downstream_cores.push_back(dispatch_s_settings.worker_physical_core); + prefetch_d_settings.upstream_cores.push_back(demux_d_settings.worker_virtual_core); + prefetch_d_settings.downstream_cores.push_back(dispatch_d_settings.worker_virtual_core); + prefetch_d_settings.downstream_cores.push_back(dispatch_s_settings.worker_virtual_core); uint32_t pcie_alignment = hal.get_alignment(HalMemType::HOST); uint32_t scratch_db_base = (prefetch_d_settings.cb_start_address + prefetch_d_settings.cb_size_bytes + pcie_alignment - 1) & (~(pcie_alignment - 1)); uint32_t scratch_db_size = dispatch_constants::get(dispatch_core_type).scratch_db_size(); @@ -1668,9 +1687,9 @@ void Device::update_workers_build_settings(std::vector(device_worker_variants[DispatchWorkerType::PREFETCH_D][dispatch_s_idx]); // 1 to 1 mapping bw prefetch_d and dispatch_s auto dispatch_d_settings = std::get<1>(device_worker_variants[DispatchWorkerType::DISPATCH_D][dispatch_s_idx]); // 1 to 1 mapping bw dispatch_d and dispatch_s - dispatch_s_settings.upstream_cores.push_back(prefetch_d_settings.worker_physical_core); - 
dispatch_s_settings.downstream_cores.push_back(dispatch_d_settings.worker_physical_core); + dispatch_s_settings.upstream_cores.push_back(prefetch_d_settings.worker_virtual_core); + dispatch_s_settings.downstream_cores.push_back(dispatch_d_settings.worker_virtual_core); auto dispatch_core_type = dispatch_s_settings.dispatch_core_type; uint32_t dispatch_message_base_addr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_MESSAGE); uint32_t dispatch_s_sync_sem_base_addr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_S_SYNC_SEM); @@ -1762,8 +1781,8 @@ void Device::update_workers_build_settings(std::vector(dispatch_d_settings); - compile_args[4 + mux_d_input_idx] = packet_switch_4B_pack(dispatch_d_setting.worker_physical_core.x, - dispatch_d_setting.worker_physical_core.y, + compile_args[4 + mux_d_input_idx] = packet_switch_4B_pack(dispatch_d_setting.worker_virtual_core.x, + dispatch_d_setting.worker_virtual_core.y, 1, DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info mux_d_input_idx++; @@ -1773,8 +1792,8 @@ void Device::update_workers_build_settings(std::vector(us_tunneler_remote_settings); - compile_args[4 + mux_d_input_idx] = packet_switch_4B_pack(us_tunneler_remote_setting.worker_physical_core.x, - us_tunneler_remote_setting.worker_physical_core.y, + compile_args[4 + mux_d_input_idx] = packet_switch_4B_pack(us_tunneler_remote_setting.worker_virtual_core.x, + us_tunneler_remote_setting.worker_virtual_core.y, us_tunneler_remote_setting.vc_count * 2 - 1, DispatchRemoteNetworkType::NOC0); // 4,5,6,7: src x info mux_d_input_idx++; @@ -1785,8 +1804,8 @@ void Device::update_workers_build_settings(std::vector> 4; // 8: remote_tx_queue_start_addr_words compile_args[9] = tunneler_settings.cb_size_bytes >> 4; // 9: remote_tx_queue_size_words - compile_args[10] = tunneler_settings.worker_physical_core.x; // 10: remote_tx_x - compile_args[11] = 
tunneler_settings.worker_physical_core.y; // 11: remote_tx_y + compile_args[10] = tunneler_settings.worker_virtual_core.x; // 10: remote_tx_x + compile_args[11] = tunneler_settings.worker_virtual_core.y; // 11: remote_tx_y compile_args[12] = tunneler_settings.vc_count - 1; // 12: remote_tx_queue_id compile_args[13] = (uint32_t)DispatchRemoteNetworkType::NOC0; // 13: tx_network_type compile_args[14] = 0; // 14: test_results_addr (disabled) @@ -1867,7 +1886,7 @@ void Device::setup_tunnel_for_remote_devices() { settings.dispatch_core_type = dispatch_core_type; tt_cxy_pair prefetch_location = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(prefetch_location.chip, get_physical_core_coordinate(prefetch_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(prefetch_location.chip, this->virtual_core_from_logical_core(prefetch_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; //prefetch needs three semaphores. settings.semaphores.push_back(0); @@ -1885,7 +1904,7 @@ void Device::setup_tunnel_for_remote_devices() { for (uint32_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { tt_cxy_pair dispatch_location = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(dispatch_location.chip, get_physical_core_coordinate(dispatch_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(dispatch_location.chip, this->virtual_core_from_logical_core(dispatch_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; //dispatch needs one semaphore. 
settings.semaphores.push_back(0); @@ -1935,7 +1954,7 @@ void Device::setup_tunnel_for_remote_devices() { //N300, T3K 1, 2 CQ case settings.semaphores = std::vector(num_prefetchers); tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(mux_location.chip, this->virtual_core_from_logical_core(mux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).mux_buffer_size(num_hw_cqs); settings.cb_start_address = dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(); @@ -1943,7 +1962,7 @@ void Device::setup_tunnel_for_remote_devices() { tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(demux_location.chip, this->virtual_core_from_logical_core(demux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); settings.cb_size_bytes = 0x10000; @@ -1952,7 +1971,7 @@ void Device::setup_tunnel_for_remote_devices() { //TG, TGG 1, 2 CQ case settings.semaphores = std::vector(MAX_SWITCH_FAN_IN); tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(mux_location.chip, 
this->virtual_core_from_logical_core(mux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.cb_start_address = dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(); settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).mux_buffer_size(1); @@ -1960,12 +1979,12 @@ void Device::setup_tunnel_for_remote_devices() { tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); if (num_prefetchers == 8) { tt_cxy_pair mux_location = dispatch_core_manager::instance().mux_core(device_id, channel, 1); - settings.worker_physical_core = tt_cxy_pair(mux_location.chip, get_physical_core_coordinate(mux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(mux_location.chip, this->virtual_core_from_logical_core(mux_location, dispatch_core_type)); tunnel_core_allocations[MUX].push_back(std::make_tuple(mux_location, settings)); } tt_cxy_pair demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(demux_location.chip, this->virtual_core_from_logical_core(demux_location, dispatch_core_type)); settings.semaphores.clear(); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); @@ -1974,14 +1993,14 @@ void Device::setup_tunnel_for_remote_devices() { settings.semaphores = std::vector(num_prefetchers / 2); demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 1); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(demux_location.chip, 
this->virtual_core_from_logical_core(demux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); settings.cb_size_bytes = 0x10000; tunnel_core_allocations[DEMUX].push_back(std::make_tuple(demux_location, settings)); demux_location = dispatch_core_manager::instance().demux_core(device_id, channel, 2); - settings.worker_physical_core = tt_cxy_pair(demux_location.chip, get_physical_core_coordinate(demux_location, dispatch_core_type)); + settings.worker_virtual_core = tt_cxy_pair(demux_location.chip, this->virtual_core_from_logical_core(demux_location, dispatch_core_type)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_demux.cpp"; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); settings.cb_size_bytes = 0x10000; @@ -1996,8 +2015,8 @@ void Device::setup_tunnel_for_remote_devices() { tt_cxy_pair us_location = dispatch_core_manager::instance().tunneler_core(us_device, device_id, channel, cq_id); tt_cxy_pair local_location = dispatch_core_manager::instance().us_tunneler_core_local(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(us_location.chip, get_physical_core_coordinate(us_location, CoreType::ETH)); - settings.eth_partner_physical_core = tt_cxy_pair(local_location.chip, get_physical_core_coordinate(local_location, CoreType::ETH)); + settings.worker_virtual_core = tt_cxy_pair(us_location.chip, this->virtual_core_from_logical_core(us_location, CoreType::ETH)); + settings.eth_partner_virtual_core = tt_cxy_pair(local_location.chip, this->virtual_core_from_logical_core(local_location, CoreType::ETH)); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp"; settings.cb_start_address = 0x19000; settings.cb_size_bytes = 0x4000; @@ -2007,9 +2026,9 @@ void Device::setup_tunnel_for_remote_devices() { 
settings.tunnel_stop = tunnel_stop; //swap the two etnernet link pair cores for downstream chip on the link pair. - tt_cxy_pair temp = settings.worker_physical_core; - settings.worker_physical_core = settings.eth_partner_physical_core; - settings.eth_partner_physical_core = temp; + tt_cxy_pair temp = settings.worker_virtual_core; + settings.worker_virtual_core = settings.eth_partner_virtual_core; + settings.eth_partner_virtual_core = temp; settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_eth_tunneler.cpp"; tunnel_core_allocations[US_TUNNELER_LOCAL].push_back(std::make_tuple(local_location, settings)); TT_ASSERT(us_location.chip == us_device, @@ -2021,7 +2040,7 @@ void Device::setup_tunnel_for_remote_devices() { settings.dispatch_core_type = dispatch_core_type; tt_cxy_pair mux_d_location = dispatch_core_manager::instance().mux_d_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(mux_d_location.chip, get_physical_core_coordinate(mux_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(mux_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/packet_mux.cpp"; settings.semaphores = std::vector(num_hw_cqs); settings.consumer_semaphore_id = 0; @@ -2033,7 +2052,7 @@ void Device::setup_tunnel_for_remote_devices() { uint32_t demux_vcs = settings.vc_count - 1; tt_cxy_pair demux_d_location = dispatch_core_manager::instance().demux_d_core(device_id, channel, 0); - settings.worker_physical_core = tt_cxy_pair(demux_d_location.chip, get_physical_core_coordinate(demux_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(demux_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.producer_semaphore_id = 0; settings.cb_start_address = 
hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); @@ -2045,7 +2064,7 @@ void Device::setup_tunnel_for_remote_devices() { if (tunnel.size() > 2 && demux_vcs > 1) { //TG/TGG 1-2 CQs demux_d_location = dispatch_core_manager::instance().demux_d_core(device_id, channel, 1); - settings.worker_physical_core = tt_cxy_pair(demux_d_location.chip, get_physical_core_coordinate(demux_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(demux_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/vc_packet_router.cpp"; settings.producer_semaphore_id = 0; settings.cb_start_address = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::UNRESERVED); @@ -2063,7 +2082,7 @@ void Device::setup_tunnel_for_remote_devices() { settings.producer_semaphore_id = 2; settings.consumer_slave_semaphore_id = 3; tt_cxy_pair prefetch_d_location = dispatch_core_manager::instance().prefetcher_d_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(prefetch_d_location.chip, get_physical_core_coordinate(prefetch_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(prefetch_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp"; settings.cb_start_address = dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(); settings.cb_size_bytes = dispatch_constants::get(dispatch_core_type).prefetch_d_buffer_size(); @@ -2084,7 +2103,7 @@ void Device::setup_tunnel_for_remote_devices() { CoreCoord compute_grid_size = this->compute_with_storage_grid_size(); settings.num_compute_cores = uint32_t(compute_grid_size.x * compute_grid_size.y); tt_cxy_pair dispatch_d_location = dispatch_core_manager::instance().dispatcher_d_core(device_id, channel, cq_id); - settings.worker_physical_core = 
tt_cxy_pair(dispatch_d_location.chip, get_physical_core_coordinate(dispatch_d_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(dispatch_d_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp"; tunnel_core_allocations[DISPATCH_D].push_back(std::make_tuple(dispatch_d_location, settings)); settings.semaphores.clear(); @@ -2107,7 +2126,7 @@ void Device::setup_tunnel_for_remote_devices() { settings.producer_semaphore_id = 0; // sync with producer (prefetcher) } tt_cxy_pair dispatch_s_location = dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id); - settings.worker_physical_core = tt_cxy_pair(dispatch_s_location.chip, get_physical_core_coordinate(dispatch_s_location, dispatch_core_type)); + settings.worker_virtual_core = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(dispatch_s_location, dispatch_core_type); settings.kernel_file = "tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp"; tunnel_core_allocations[DISPATCH_S].push_back(std::make_tuple(dispatch_s_location, settings)); settings.semaphores.clear(); @@ -2172,7 +2192,7 @@ void Device::setup_tunnel_for_remote_devices() { { if (device_worker_variants[dwv].size()) { for (auto &[core, settings] : device_worker_variants[dwv]) { - log_debug(LogMetal, "Tunnel {} Stop {} is Device {}. Core {} - Physical {} will run {}.", t, tunnel_stop, tunnel_device, core.str(), settings.worker_physical_core.str(), magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)dwv)); + log_debug(LogMetal, "Tunnel {} Stop {} is Device {}. 
Core {} - Virtual {} will run {}.", t, tunnel_stop, tunnel_device, core.str(), settings.worker_virtual_core.str(), magic_enum::enum_name((tt::tt_metal::DispatchWorkerType)dwv)); for (uint32_t arg = 0; arg < settings.compile_args.size(); arg++) { log_debug(LogMetal, "CompileArgs[{}] = {}", arg, settings.compile_args[arg]); } @@ -2226,8 +2246,9 @@ void Device::compile_command_queue_programs() { CoreType dispatch_core_type = dispatch_core_config.get_core_type(); tt_cxy_pair prefetch_core = dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); tt_cxy_pair dispatch_core = dispatch_core_manager::instance().dispatcher_core(device_id, channel, cq_id); - CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_core, dispatch_core_type); - CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_core, dispatch_core_type); + CoreCoord prefetch_virtual_core = this->virtual_core_from_logical_core(prefetch_core, dispatch_core_type); + CoreCoord dispatch_virtual_core = this->virtual_core_from_logical_core(dispatch_core, dispatch_core_type); + uint32_t cq_start = dispatch_constants::get(dispatch_core_type).get_host_command_queue_addr(CommandQueueHostAddrType::UNRESERVED); uint32_t command_queue_start_addr = get_absolute_cq_offset(channel, cq_id, cq_size); @@ -2249,14 +2270,14 @@ void Device::compile_command_queue_programs() { // dispatch_s location and flow control vars initialized as invalid. Will be set if dispatch_s is enabled for the given configuration. 
tt_cxy_pair dispatch_s_core = tt_cxy_pair(0xff, 0xff, 0xff); - CoreCoord dispatch_s_physical_core = {0xff, 0xff}; + CoreCoord dispatch_s_virtual_core = {0xff, 0xff}; uint32_t dispatch_s_buffer_base = 0xff; uint32_t dispatch_s_sem = 0xff; // used by dispatch_s to sync with prefetch uint32_t dispatch_s_sync_sem_base_addr = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::DISPATCH_S_SYNC_SEM);; // used by dispatch_d to signal that dispatch_s can send go signal if (this->dispatch_s_enabled()) { // Skip allocating dispatch_s for multi-CQ configurations with ethernet dispatch dispatch_s_core = dispatch_core_manager::instance().dispatcher_s_core(device_id, channel, cq_id); - dispatch_s_physical_core = get_physical_core_coordinate(dispatch_s_core, dispatch_core_type); + dispatch_s_virtual_core = this->virtual_core_from_logical_core(dispatch_s_core, dispatch_core_type); uint32_t dispatch_buffer_base = dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(); if (dispatch_core_type == CoreType::WORKER) { // dispatch_s is on the same Tensix core as dispatch_d. Shared resources. Offset CB start idx. 
@@ -2270,9 +2291,9 @@ void Device::compile_command_queue_programs() { } log_debug(LogDevice, "Dispatching out of {} cores", magic_enum::enum_name(dispatch_core_type)); - log_debug(LogDevice, "Prefetch HD logical location: {} physical core: {}", prefetch_core.str(), prefetch_physical_core.str()); - log_debug(LogDevice, "Dispatch HD logical location: {} physical core {}", dispatch_core.str(), dispatch_physical_core.str()); - log_debug(LogDevice, "Dispatch S logical location: {} physical core {}", dispatch_s_core.str(), dispatch_s_physical_core.str()); + log_debug(LogDevice, "Prefetch HD logical location: {} virtual core: {}", prefetch_core.str(), prefetch_virtual_core.str()); + log_debug(LogDevice, "Dispatch HD logical location: {} virtual core {}", dispatch_core.str(), dispatch_virtual_core.str()); + log_debug(LogDevice, "Dispatch S logical location: {} virtual core {}", dispatch_s_core.str(), dispatch_s_virtual_core.str()); std::vector prefetch_compile_args = { dispatch_constants::get(dispatch_core_type).dispatch_buffer_base(), @@ -2310,11 +2331,11 @@ void Device::compile_command_queue_programs() { "tt_metal/impl/dispatch/kernels/cq_prefetch.cpp", prefetch_compile_args, prefetch_core, - prefetch_physical_core, + prefetch_virtual_core, dispatch_core_type, CoreCoord{0, 0}, - dispatch_physical_core, - dispatch_s_physical_core, + dispatch_virtual_core, + dispatch_s_virtual_core, std::map {}, my_noc_index, my_noc_index, @@ -2369,11 +2390,11 @@ void Device::compile_command_queue_programs() { "tt_metal/impl/dispatch/kernels/cq_dispatch.cpp", dispatch_compile_args, dispatch_core, - dispatch_physical_core, + dispatch_virtual_core, dispatch_core_type, - prefetch_physical_core, + prefetch_virtual_core, CoreCoord{0, 0}, - dispatch_s_physical_core, + dispatch_s_virtual_core, std::map {}, my_noc_index, dispatch_upstream_noc_index, @@ -2399,10 +2420,10 @@ void Device::compile_command_queue_programs() { "tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp", 
dispatch_s_compile_args, dispatch_s_core, - dispatch_s_physical_core, + dispatch_s_virtual_core, dispatch_core_type, - prefetch_physical_core, - dispatch_physical_core, + prefetch_virtual_core, + dispatch_virtual_core, CoreCoord{0, 0}, std::map {}, dispatch_s_noc_index, @@ -2466,7 +2487,7 @@ void Device::compile_command_queue_programs() { prefetch_settings.kernel_file, prefetch_settings.compile_args, prefetch_core, - prefetch_settings.worker_physical_core, + prefetch_settings.worker_virtual_core, prefetch_settings.dispatch_core_type, prefetch_settings.upstream_cores[0], prefetch_settings.downstream_cores[0], @@ -2558,7 +2579,7 @@ void Device::compile_command_queue_programs() { dispatch_settings.kernel_file, dispatch_settings.compile_args, dispatch_core, - dispatch_settings.worker_physical_core, + dispatch_settings.worker_virtual_core, dispatch_settings.dispatch_core_type, dispatch_settings.upstream_cores[0], CoreCoord{0xffffffff, 0xffffffff}, @@ -2647,7 +2668,7 @@ void Device::compile_command_queue_programs() { prefetch_d_settings.kernel_file, prefetch_d_settings.compile_args, prefetch_d_core, - prefetch_d_settings.worker_physical_core, + prefetch_d_settings.worker_virtual_core, prefetch_d_settings.dispatch_core_type, prefetch_d_settings.upstream_cores[0], prefetch_d_settings.downstream_cores[0], @@ -2675,7 +2696,7 @@ void Device::compile_command_queue_programs() { dispatch_d_settings.kernel_file, dispatch_d_settings.compile_args, dispatch_d_core, - dispatch_d_settings.worker_physical_core, + dispatch_d_settings.worker_virtual_core, dispatch_d_settings.dispatch_core_type, dispatch_d_settings.upstream_cores[0], dispatch_d_settings.downstream_cores[0], @@ -2698,7 +2719,7 @@ void Device::compile_command_queue_programs() { dispatch_s_settings.kernel_file, dispatch_s_settings.compile_args, dispatch_s_core, - dispatch_s_settings.worker_physical_core, + dispatch_s_settings.worker_virtual_core, dispatch_s_settings.dispatch_core_type, dispatch_s_settings.upstream_cores[0], 
dispatch_s_settings.downstream_cores[0], @@ -2919,8 +2940,8 @@ void Device::init_command_queue_device() { for (const CoreCoord &logical_dispatch_core : logical_dispatch_cores) { launch_msg_t msg = command_queue_program.kernels_on_core(logical_dispatch_core, index)->launch_msg; go_msg_t go_msg = command_queue_program.kernels_on_core(logical_dispatch_core, index)->go_msg; - CoreCoord phys_core = this->physical_core_from_logical_core(logical_dispatch_core, core_type); - tt::llrt::write_launch_msg_to_core(this->id(), phys_core, &msg, &go_msg, this->get_dev_addr(phys_core, HalL1MemAddrType::LAUNCH)); + CoreCoord virtual_core = this->virtual_core_from_logical_core(logical_dispatch_core, core_type); + tt::llrt::write_launch_msg_to_core(this->id(), virtual_core, &msg, &go_msg, this->get_dev_addr(virtual_core, HalL1MemAddrType::LAUNCH)); } } @@ -2936,8 +2957,8 @@ void Device::init_command_queue_device() { for (const CoreCoord &logical_dispatch_core : logical_dispatch_cores) { launch_msg_t msg = mmio_command_queue_program.kernels_on_core(logical_dispatch_core, index)->launch_msg; go_msg_t go_msg = mmio_command_queue_program.kernels_on_core(logical_dispatch_core, index)->go_msg; - CoreCoord phys_core = mmio_device->physical_core_from_logical_core(logical_dispatch_core, core_type); - tt::llrt::write_launch_msg_to_core(mmio_device_id, phys_core, &msg, &go_msg, mmio_device->get_dev_addr(phys_core, HalL1MemAddrType::LAUNCH)); + CoreCoord virtual_core = mmio_device->virtual_core_from_logical_core(logical_dispatch_core, core_type); + tt::llrt::write_launch_msg_to_core(mmio_device_id, virtual_core, &msg, &go_msg, mmio_device->get_dev_addr(virtual_core, HalL1MemAddrType::LAUNCH)); } } } @@ -3008,7 +3029,7 @@ bool Device::close() { std::unordered_map> not_done_dispatch_cores; std::unordered_map> cores_to_skip; - this->get_associated_dispatch_phys_cores(not_done_dispatch_cores, cores_to_skip); + this->get_associated_dispatch_virtual_cores(not_done_dispatch_cores, cores_to_skip); auto 
mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->id_); std::unordered_set wait_for_cores = not_done_dispatch_cores[mmio_device_id]; @@ -3037,12 +3058,12 @@ bool Device::close() { if (this->id_ != mmio_device_id) { for (auto it = not_done_dispatch_cores[mmio_device_id].begin(); it != not_done_dispatch_cores[mmio_device_id].end(); it++) { - const auto &phys_core = *it; - if(llrt::is_ethernet_core(phys_core, this->id_)) { - log_debug(tt::LogMetal, "Ethernet dispatch core {} on Device {} is idle. Closing Device {}", phys_core.str(), mmio_device_id, this->id()); + const auto &virtual_core = *it; + if(tt::Cluster::instance().is_ethernet_core(virtual_core, this->id_)) { + log_debug(tt::LogMetal, "Ethernet dispatch core {} on Device {} is idle. Closing Device {}", virtual_core.str(), mmio_device_id, this->id()); } else { - log_debug(tt::LogMetal, "Resetting core {} on Device {} when closing Device {}", phys_core.str(), mmio_device_id, this->id()); - tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(mmio_device_id, phys_core)); + log_debug(tt::LogMetal, "Resetting core {} on Device {} when closing Device {}", virtual_core.str(), mmio_device_id, this->id()); + tt::Cluster::instance().assert_risc_reset_at_core(tt_cxy_pair(mmio_device_id, virtual_core)); } } } @@ -3102,15 +3123,6 @@ CoreCoord Device::dram_grid_size() const { return tt::Cluster::instance().get_soc_desc(id_).get_dram_grid_size(); } -CoreCoord Device::physical_core_from_logical_core(const CoreCoord &logical_coord, const CoreType &core_type) const { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); - return soc_desc.get_physical_core_from_logical_core(logical_coord, core_type); -} - -CoreCoord Device::physical_core_from_logical_core(const CoreDescriptor &logical_core) const { - return physical_core_from_logical_core(logical_core.coord, logical_core.type); -} - CoreType Device::core_type_from_physical_core(const CoreCoord &physical_coord) 
const { const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); if (soc_desc.physical_cores.find(physical_coord) == soc_desc.physical_cores.end()) @@ -3119,7 +3131,32 @@ CoreType Device::core_type_from_physical_core(const CoreCoord &physical_coord) c return soc_desc.physical_cores.at(physical_coord).type; } -CoreCoord Device::worker_core_from_logical_core(const CoreCoord &logical_core) const { +CoreType Device::core_type_from_virtual_core(const CoreCoord &virtual_coord) const { + if (tt::Cluster::instance().is_worker_core(virtual_coord, this->id_)) { + return CoreType::WORKER; + } else if (tt::Cluster::instance().is_ethernet_core(virtual_coord, this->id_)) { + return CoreType::ETH; + } + return this->core_type_from_physical_core(virtual_coord); +} + + +CoreCoord Device::virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const { + if (coord.x >= this->grid_size().x || coord.y >= this->grid_size().y) { + // Coordinate already in virtual space: NOC0 and NOC1 are the same + return coord; + } else { + const auto& grid_size = this->grid_size(); + // Coordinate in Physical Space. Convert to Virtual. 
+ CoreCoord phys_coord = { + hal.noc_coordinate(noc_index, grid_size.x, coord.x), + hal.noc_coordinate(noc_index, grid_size.y, coord.y) + }; + return this->virtual_core_from_physical_core(phys_coord, this->core_type_from_physical_core(phys_coord)); + } +} + +CoreCoord Device::physical_worker_core_from_logical_core(const CoreCoord &logical_core) const { const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); return soc_desc.get_physical_tensix_core_from_logical(logical_core); } @@ -3127,66 +3164,64 @@ CoreCoord Device::worker_core_from_logical_core(const CoreCoord &logical_core) c std::vector Device::worker_cores_from_logical_cores(const std::vector &logical_cores) const { std::vector worker_cores(logical_cores.size()); for (std::size_t idx = 0; idx < logical_cores.size(); idx++) - worker_cores[idx] = worker_core_from_logical_core(logical_cores[idx]); + worker_cores[idx] = this->worker_core_from_logical_core(logical_cores[idx]); return worker_cores; } -CoreCoord Device::dram_core_from_logical_core(const CoreCoord &logical_core) const { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); - return soc_desc.get_physical_dram_core_from_logical(logical_core); +std::vector Device::ethernet_cores_from_logical_cores(const std::vector &logical_cores) const { + std::vector eth_cores(logical_cores.size()); + for (std::size_t idx = 0; idx < logical_cores.size(); idx++) { + eth_cores[idx] = this->ethernet_core_from_logical_core(logical_cores[idx]); + } + return eth_cores; } - -std::vector Device::dram_cores_from_logical_cores(const std::vector &logical_cores) const { - std::vector dram_cores(logical_cores.size()); - for (std::size_t idx = 0; idx < logical_cores.size(); idx++) - dram_cores[idx] = dram_core_from_logical_core(logical_cores[idx]); - - return dram_cores; +CoreCoord Device::virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const { + return 
tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(this->id_, logical_coord, core_type); } -CoreCoord Device::ethernet_core_from_logical_core(const CoreCoord &logical_core) const { - return tt::Cluster::instance().ethernet_core_from_logical_core(id_, logical_core); +CoreCoord Device::virtual_core_from_physical_core(const CoreCoord &physical_coord, const CoreType& core_type) const { + return tt::Cluster::instance().get_virtual_coordinate_from_physical_coordinates(this->id_, physical_coord, core_type); } -CoreCoord Device::logical_core_from_ethernet_core(const CoreCoord &physical_core) const { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(this->id_); - return soc_desc.get_logical_ethernet_core_from_physical(physical_core); +CoreCoord Device::worker_core_from_logical_core(const CoreCoord &logical_core) const { + return this->virtual_core_from_logical_core(logical_core, CoreType::WORKER); } -std::vector Device::ethernet_cores_from_logical_cores(const std::vector &logical_cores) const { - std::vector ethernet_cores(logical_cores.size()); +CoreCoord Device::ethernet_core_from_logical_core(const CoreCoord &logical_core) const { + return this->virtual_core_from_logical_core(logical_core, CoreType::ETH); +} - for (std::size_t idx = 0; idx < logical_cores.size(); idx++) - ethernet_cores[idx] = ethernet_core_from_logical_core(logical_cores[idx]); - return ethernet_cores; +CoreCoord Device::logical_core_from_ethernet_core(const CoreCoord ðernet_core) const { + return tt::Cluster::instance().get_logical_ethernet_core_from_virtual(this->id(), ethernet_core); } -uint32_t Device::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& physical_core) const { - const auto& grid_size = this->grid_size(); +uint32_t Device::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const { + auto virtual_noc_coord = this->virtual_noc_coordinate(noc_index, core); return tt::tt_metal::hal.noc_xy_encoding( - 
tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_core.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_core.y) + virtual_noc_coord.x, + virtual_noc_coord.y ); } -uint32_t Device::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const { - const auto& grid_size = this->grid_size(); +uint32_t Device::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const { + auto virtual_noc_start = this->virtual_noc_coordinate(noc_index, cores.start_coord); + auto virtual_noc_end = this->virtual_noc_coordinate(noc_index, cores.end_coord); // NOC 1 mcasts from bottom left to top right, so we need to reverse the coords if (noc_index == 0) { return tt::tt_metal::hal.noc_multicast_encoding( - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_cores.start_coord.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_cores.start_coord.y), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_cores.end_coord.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_cores.end_coord.y) + virtual_noc_start.x, + virtual_noc_start.y, + virtual_noc_end.x, + virtual_noc_end.y ); } else { return tt::tt_metal::hal.noc_multicast_encoding( - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_cores.end_coord.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_cores.end_coord.y), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.x, physical_cores.start_coord.x), - tt::tt_metal::hal.noc_coordinate(noc_index, grid_size.y, physical_cores.start_coord.y) + virtual_noc_end.x, + virtual_noc_end.y, + virtual_noc_start.x, + virtual_noc_start.y ); } } @@ -3613,8 +3648,9 @@ void Device::generate_device_bank_to_noc_tables() l1_bank_to_noc_xy_.reserve(tt::tt_metal::hal.get_num_nocs() * l1_noc_coord_per_bank.size()); for (unsigned int noc = 0; noc < tt::tt_metal::hal.get_num_nocs(); noc++) { for (unsigned int bank_id = 0; bank_id 
< l1_noc_coord_per_bank.size(); bank_id++) { - uint16_t noc_x = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.x, l1_noc_coord_per_bank[bank_id].x); - uint16_t noc_y = tt::tt_metal::hal.noc_coordinate(noc, soc_d.grid_size.y, l1_noc_coord_per_bank[bank_id].y); + auto l1_noc_coords = this->virtual_noc_coordinate(noc, l1_noc_coord_per_bank[bank_id]); + uint16_t noc_x = l1_noc_coords.x; + uint16_t noc_y = l1_noc_coords.y; uint16_t xy = ((noc_y << NOC_ADDR_NODE_ID_BITS) | noc_x) << NOC_COORD_REG_OFFSET; l1_bank_to_noc_xy_.push_back(xy); } @@ -3706,6 +3742,60 @@ const std::vector &Device::get_sub_device_ids() const { return this->active_sub_device_manager_->get_sub_device_ids(); } +std::vector Device::get_optimal_dram_bank_to_logical_worker_assignment() { + // Top level function that users (ex: Op Writers) can use to assign Tensix Worker cores + // as DRAM readers or writers. Returns logical coordinates of optimally placed workers. + // This function queries Physical Coordinates (only exposed directly to the Device class) + // and passes them to logic in core_assignment.cpp to derive the most optimal core placement + // based on architecture specific logic and Physical Grid configuration. 
+ if (not this->optimal_dram_bank_to_logical_worker_assignment_.size()) { + uint32_t full_grid_size_x = this->grid_size().x; + uint32_t full_grid_size_y = this->grid_size().y; + + auto compute_with_storage_grid_size = this->compute_with_storage_grid_size(); + uint32_t num_cores_x = compute_with_storage_grid_size.x; + uint32_t num_cores_y = compute_with_storage_grid_size.y; + // Get physical coordinates of DRAM Controller NOC end-points + uint32_t num_dram_banks = this->num_dram_channels(); + std::vector dram_phy_coords; + for (int i = 0; i < num_dram_banks; ++i) { + dram_phy_coords.push_back(dram_core_from_dram_channel(i)); + } + // Get all logical cores in the worker grid + std::vector all_worker_cores_logical; + for (int i = 0; i < num_cores_x; ++i) { + for (int j = 0; j < num_cores_y; ++j) { + all_worker_cores_logical.push_back(CoreCoord(i, j)); + } + } + // Get the physical rows and cols (y, x) in the worker grid + std::vector worker_phy_y = std::vector(num_cores_y); + for (int i = 0; i < num_cores_y; ++i) { + auto core_phy = this->physical_worker_core_from_logical_core(CoreCoord(0, i)); + worker_phy_y.at(i) = core_phy.y; + } + std::vector worker_phy_x = std::vector(num_cores_x); + for (int i = 0; i < num_cores_x; ++i) { + auto core_phy = this->physical_worker_core_from_logical_core(CoreCoord(i, 0)); + worker_phy_x.at(i) = core_phy.x; + } + // Get optimal placement of worker cores interfacing with DRAM Controllers in physical coordinate space + auto physical_worker_cores = get_optimal_dram_to_physical_worker_assignment(this->arch(), dram_phy_coords, full_grid_size_x, full_grid_size_y, worker_phy_x, worker_phy_y); + // Convert physical worker coordinates to logical. This gets returned to the user. 
+ for (int i = 0; i < physical_worker_cores.size(); ++i) { + for (int j = 0; j < all_worker_cores_logical.size(); ++j) { + auto core = this->physical_worker_core_from_logical_core(all_worker_cores_logical[j]); + if (physical_worker_cores[i] == core) { + this->optimal_dram_bank_to_logical_worker_assignment_.push_back(all_worker_cores_logical[j]); + } + } + } + } + return this->optimal_dram_bank_to_logical_worker_assignment_; +} + + + size_t v1::GetNumAvailableDevices() { return tt::Cluster::instance().number_of_user_devices(); } size_t v1::GetNumPCIeDevices() { return tt::Cluster::instance().number_of_pci_devices(); } diff --git a/tt_metal/impl/device/device.hpp b/tt_metal/impl/device/device.hpp index 616a831e0462..f2ef56a3a830 100644 --- a/tt_metal/impl/device/device.hpp +++ b/tt_metal/impl/device/device.hpp @@ -64,6 +64,12 @@ class Device { private: static_assert(detail::SubDeviceManager::MAX_NUM_SUB_DEVICES <= dispatch_constants::DISPATCH_MESSAGE_ENTRIES, "MAX_NUM_SUB_DEVICES must be less than or equal to dispatch_constants::DISPATCH_MESSAGE_ENTRIES"); static constexpr uint32_t DEFAULT_NUM_SUB_DEVICES = 1; + + CoreCoord physical_worker_core_from_logical_core(const CoreCoord &logical_core) const; + CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const; + CoreType core_type_from_physical_core(const CoreCoord &physical_core) const; + CoreCoord virtual_core_from_physical_core(const CoreCoord &physical_coord, const CoreType& core_type) const; + public: // friend void tt_gdb(Device* device, int chip_id, const vector cores, vector ops); Device () = delete; @@ -109,22 +115,21 @@ class Device { CoreCoord dram_grid_size() const; - CoreCoord physical_core_from_logical_core(const CoreCoord &logical_core, const CoreType &core_type) const; - CoreCoord physical_core_from_logical_core(const CoreDescriptor &logical_core) const; - CoreType core_type_from_physical_core(const CoreCoord &physical_core) const; + CoreType core_type_from_virtual_core(const CoreCoord& 
virtual_coord) const; + + CoreCoord virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const; - CoreCoord worker_core_from_logical_core(const CoreCoord &logical_core) const; std::vector worker_cores_from_logical_cores(const std::vector &logical_cores) const; + std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const; + std::vector get_optimal_dram_bank_to_logical_worker_assignment(); - CoreCoord dram_core_from_logical_core(const CoreCoord &logical_core) const; - std::vector dram_cores_from_logical_cores(const std::vector &logical_cores) const; + CoreCoord virtual_core_from_logical_core(const CoreCoord &logical_coord, const CoreType& core_type) const; + + CoreCoord worker_core_from_logical_core(const CoreCoord &logical_core) const; // Ethernet API CoreCoord ethernet_core_from_logical_core(const CoreCoord &logical_core) const; - CoreCoord logical_core_from_ethernet_core(const CoreCoord &physical_core) const; - - std::vector ethernet_cores_from_logical_cores(const std::vector &logical_cores) const; - std::vector get_noc_encoding_for_active_eth_cores(NOC noc_index); + CoreCoord logical_core_from_ethernet_core(const CoreCoord ðernet_core) const; std::unordered_set get_ethernet_connected_device_ids() const { return tt::Cluster::instance().get_ethernet_connected_device_ids(this->id_); @@ -167,7 +172,6 @@ class Device { uint32_t dram_channel_from_bank_id(uint32_t bank_id) const; uint32_t dram_channel_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const; - CoreCoord dram_core_from_dram_channel(uint32_t dram_channel) const; CoreCoord logical_core_from_dram_channel(uint32_t dram_channel) const; uint32_t dram_channel_from_logical_core(const CoreCoord& logical_core) const; @@ -210,8 +214,9 @@ class Device { // core.y represents different channels along one const std::set ðernet_cores() const { return this->ethernet_cores_; } - uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& physical_core) const; - uint32_t 
get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& physical_cores) const; + + uint32_t get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const; + uint32_t get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const; const std::unordered_set &get_allocated_buffers() const; const std::unordered_set &get_allocated_buffers(SubDeviceId sub_device_id) const; @@ -259,19 +264,19 @@ class Device { void initialize_build(); void initialize_device_kernel_defines(); void build_firmware(); - void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord phys_core); - void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord phys_core, launch_msg_t *launch_msg, go_msg_t* go_msg); + void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core); + void initialize_firmware(const HalProgrammableCoreType &core_type, CoreCoord virtual_core, launch_msg_t *launch_msg, go_msg_t* go_msg); void reset_cores(); void initialize_and_launch_firmware(); void init_command_queue_host(); void init_command_queue_device(); void initialize_synchronous_sw_cmd_queue(); - void configure_kernel_variant(Program& program, const string& path, const std::vector& compile_args, CoreCoord kernel_core, CoreCoord Kernel_physical_core, - CoreType dispatch_core_type, CoreCoord upstream_physical_core, CoreCoord downstream_physical_core, CoreCoord downstream_slave_physical_core, std::map defines_in, NOC my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index, bool is_active_eth_core = false, bool send_to_brisc = false, bool force_watcher_no_inline = false); + void configure_kernel_variant(Program& program, const string& path, const std::vector& compile_args, CoreCoord kernel_core, CoreCoord kernel_virtual_core, + CoreType dispatch_core_type, CoreCoord upstream_virtual_core, CoreCoord downstream_virtual_core, CoreCoord downstream_slave_virtual_core, std::map defines_in, NOC 
my_noc_index, NOC upstream_noc_index, NOC downstream_noc_index, bool is_active_eth_core = false, bool send_to_brisc = false, bool force_watcher_no_inline = false); void compile_command_queue_programs(); void configure_command_queue_programs(); void clear_l1_state(); - void get_associated_dispatch_phys_cores( + void get_associated_dispatch_virtual_cores( std::unordered_map> &my_dispatch_cores, std::unordered_map> &other_dispatch_cores); std::pair build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const; @@ -315,7 +320,7 @@ class Device { std::set compute_cores_; std::set storage_only_cores_; std::set ethernet_cores_; - + std::vector optimal_dram_bank_to_logical_worker_assignment_; // SystemMemoryManager is the interface to the hardware command queue std::vector> hw_command_queues_; std::vector> sw_command_queues_; @@ -354,9 +359,9 @@ class Device { uint32_t trace_buffers_size = 0; void update_dispatch_cores_for_multi_cq_eth_dispatch(); - HalProgrammableCoreType get_programmable_core_type(CoreCoord phys_core) const; + HalProgrammableCoreType get_programmable_core_type(CoreCoord virtual_core) const; template - T get_dev_addr(CoreCoord phys_core, HalL1MemAddrType addr_type) const; + T get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const; // Returns address where allocator starts allocating buffer template T get_base_allocator_addr(const HalMemType &mem_type) const; @@ -406,12 +411,12 @@ class Device { } // namespace v0 -inline HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord phys_core) const { +inline HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord virtual_core) const { HalProgrammableCoreType programmable_core_type = HalProgrammableCoreType::TENSIX; - if (tt::llrt::is_ethernet_core(phys_core, this->id_)) { + if (tt::Cluster::instance().is_ethernet_core(virtual_core, this->id_)) { // Eth pcores have a different address, but only active ones. 
- CoreCoord logical_core = this->logical_core_from_ethernet_core(phys_core); + CoreCoord logical_core = this->logical_core_from_ethernet_core(virtual_core); if (this->is_active_ethernet_core(logical_core)) { programmable_core_type = HalProgrammableCoreType::ACTIVE_ETH; } else { @@ -423,8 +428,8 @@ inline HalProgrammableCoreType Device::get_programmable_core_type(CoreCoord phys } template -inline T Device::get_dev_addr(CoreCoord phys_core, HalL1MemAddrType addr_type) const { - return hal.get_dev_addr(this->get_programmable_core_type(phys_core), addr_type); +inline T Device::get_dev_addr(CoreCoord virtual_core, HalL1MemAddrType addr_type) const { + return hal.get_dev_addr(this->get_programmable_core_type(virtual_core), addr_type); } template @@ -446,11 +451,11 @@ std::vector> Device::extract_dst_noc_mu std::vector> dst_noc_multicast_info; dst_noc_multicast_info.reserve(ranges.size()); for (const CoreRange& core_range : ranges) { - CoreCoord physical_start = this->physical_core_from_logical_core(core_range.start_coord, core_type); - CoreCoord physical_end = this->physical_core_from_logical_core(core_range.end_coord, core_type); + CoreCoord virtual_start = this->virtual_core_from_logical_core(core_range.start_coord, core_type); + CoreCoord virtual_end = this->virtual_core_from_logical_core(core_range.end_coord, core_type); uint32_t num_receivers = core_range.size(); - dst_noc_multicast_info.push_back(std::make_pair(CoreRange(physical_start, physical_end), num_receivers)); + dst_noc_multicast_info.push_back(std::make_pair(CoreRange(virtual_start, virtual_end), num_receivers)); } return dst_noc_multicast_info; } diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index e0ef8b96cfc4..d677a362cd72 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -100,10 +100,10 @@ void EnqueueReadInterleavedBufferCommand::add_prefetch_relay(HugepageDeviceComma void 
EnqueueReadShardedBufferCommand::add_prefetch_relay(HugepageDeviceCommand& command) { uint32_t padded_page_size = this->buffer.aligned_page_size(); - const CoreCoord physical_core = - this->buffer.device()->physical_core_from_logical_core(this->core, this->buffer.core_type()); + const CoreCoord virtual_core = + this->buffer.device()->virtual_core_from_logical_core(this->core, this->buffer.core_type()); command.add_prefetch_relay_linear( - this->device->get_noc_unicast_encoding(this->noc_index, physical_core), + this->device->get_noc_unicast_encoding(this->noc_index, virtual_core), padded_page_size * this->pages_to_read, this->bank_base_address); } @@ -240,13 +240,13 @@ void EnqueueWriteInterleavedBufferCommand::add_buffer_data(HugepageDeviceCommand void EnqueueWriteShardedBufferCommand::add_dispatch_write(HugepageDeviceCommand& command_sequence) { uint32_t data_size_bytes = this->pages_to_write * this->padded_page_size; - const CoreCoord physical_core = - this->buffer.device()->physical_core_from_logical_core(this->core, this->buffer.core_type()); + const CoreCoord virtual_core = + this->buffer.device()->virtual_core_from_logical_core(this->core, this->buffer.core_type()); bool flush_prefetch = true; command_sequence.add_dispatch_write_linear( flush_prefetch, 0, - this->device->get_noc_unicast_encoding(this->noc_index, physical_core), + this->device->get_noc_unicast_encoding(this->noc_index, virtual_core), this->bank_base_address, data_size_bytes); } @@ -656,10 +656,9 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc } } } - - CoreCoord physical_core = device->physical_core_from_logical_core(core_coord, core_type); + CoreCoord virtual_core = device->virtual_core_from_logical_core(core_coord, core_type); unique_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, physical_core)}); + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, 
virtual_core)}); } } } @@ -720,9 +719,9 @@ void EnqueueProgramCommand::assemble_runtime_args_commands(ProgramCommandSequenc unicast_sub_cmd.reserve(kernel->logical_cores().size()); for (auto& core_coord : kernel->logical_cores()) { // can make a vector of unicast encodings here - CoreCoord physical_core = device->ethernet_core_from_logical_core(core_coord); + CoreCoord virtual_core_coords = device->virtual_core_from_logical_core(core_coord, CoreType::ETH); unicast_sub_cmd.emplace_back(CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, physical_core)}); + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, virtual_core_coords)}); } } else { std::vector> dst_noc_multicast_info = @@ -890,8 +889,8 @@ void EnqueueProgramCommand::assemble_device_commands( uint32_t max_overall_index = 0; uint32_t remote_offset_index = program.get_program_config(index).local_cb_size / sizeof(uint32_t); for (const CoreRange& core_range : circular_buffers_unique_coreranges) { - const CoreCoord physical_start = device->worker_core_from_logical_core(core_range.start_coord); - const CoreCoord physical_end = device->worker_core_from_logical_core(core_range.end_coord); + const CoreCoord virtual_start = device->virtual_core_from_logical_core(core_range.start_coord, CoreType::WORKER); + const CoreCoord virtual_end = device->virtual_core_from_logical_core(core_range.end_coord, CoreType::WORKER); const uint32_t num_receivers = core_range.size(); auto& cb_config_payload = cb_config_payloads[i]; @@ -924,7 +923,7 @@ void EnqueueProgramCommand::assemble_device_commands( } multicast_cb_config_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ .noc_xy_addr = this->device->get_noc_multicast_encoding( - this->noc_index, CoreRange(physical_start, physical_end)), + this->noc_index, CoreRange(virtual_start, virtual_end)), .num_mcast_dests = (uint32_t)core_range.size()}); 
multicast_cb_config_data.emplace_back(cb_config_payload.data(), max_index * sizeof(uint32_t)); max_overall_index = std::max(max_overall_index, max_index); @@ -1089,14 +1088,12 @@ void EnqueueProgramCommand::assemble_device_commands( kernel_group.launch_msg.kernel_config.host_assigned_id = program.get_runtime_id(); const void* launch_message_data = (const void*)(&kernel_group.launch_msg); for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { - CoreCoord physical_start = - device->physical_core_from_logical_core(core_range.start_coord, kernel_group.get_core_type()); - CoreCoord physical_end = - device->physical_core_from_logical_core(core_range.end_coord, kernel_group.get_core_type()); + CoreCoord virtual_start = device->virtual_core_from_logical_core(core_range.start_coord, kernel_group.get_core_type()); + CoreCoord virtual_end = device->virtual_core_from_logical_core(core_range.end_coord, kernel_group.get_core_type()); multicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedMulticastSubCmd{ .noc_xy_addr = this->device->get_noc_multicast_encoding( - this->noc_index, CoreRange(physical_start, physical_end)), + this->noc_index, CoreRange(virtual_start, virtual_end)), .num_mcast_dests = (uint32_t)core_range.size()}); multicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); } @@ -1123,11 +1120,11 @@ void EnqueueProgramCommand::assemble_device_commands( for (const CoreRange& core_range : kernel_group.core_ranges.ranges()) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord physical_coord = device->physical_core_from_logical_core( + CoreCoord virtual_coord = device->virtual_core_from_logical_core( CoreCoord({x, y}), kernel_group.get_core_type()); unicast_go_signal_sub_cmds.emplace_back(CQDispatchWritePackedUnicastSubCmd{ .noc_xy_addr = - this->device->get_noc_unicast_encoding(this->noc_index, physical_coord)}); + 
this->device->get_noc_unicast_encoding(this->noc_index, virtual_coord)}); unicast_go_signal_data.emplace_back(launch_message_data, go_signal_sizeB); } } @@ -1738,9 +1735,9 @@ void EnqueueRecordEventCommand::process() { dispatch_location = dispatch_core_manager::instance().dispatcher_d_core(this->device->id(), channel, cq_id); } - CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_location, core_type); + CoreCoord dispatch_virtual_core = this->device->virtual_core_from_logical_core(dispatch_location, core_type); unicast_sub_cmds[cq_id] = CQDispatchWritePackedUnicastSubCmd{ - .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, dispatch_physical_core)}; + .noc_xy_addr = this->device->get_noc_unicast_encoding(this->noc_index, dispatch_virtual_core)}; event_payloads[cq_id] = {event_payload.data(), event_payload.size() * sizeof(uint32_t)}; } @@ -1978,8 +1975,8 @@ HWCommandQueue::HWCommandQueue(Device* device, uint32_t id, NOC noc_index) : enqueue_program_dispatch_core = dispatch_core_manager::instance().dispatcher_d_core(device->id(), channel, id); } } - this->physical_enqueue_program_dispatch_core = - device->physical_core_from_logical_core(enqueue_program_dispatch_core, core_type); + this->virtual_enqueue_program_dispatch_core = + device->virtual_core_from_logical_core(enqueue_program_dispatch_core, core_type); tt_cxy_pair completion_q_writer_location = dispatch_core_manager::instance().completion_queue_writer_core(device->id(), channel, this->id); @@ -2057,8 +2054,8 @@ void HWCommandQueue::reset_worker_state(bool reset_launch_msg_state) { } go_msg_t reset_launch_message_read_ptr_go_signal; reset_launch_message_read_ptr_go_signal.signal = RUN_MSG_RESET_READ_PTR; - reset_launch_message_read_ptr_go_signal.master_x = (uint8_t)this->physical_enqueue_program_dispatch_core.x; - reset_launch_message_read_ptr_go_signal.master_y = (uint8_t)this->physical_enqueue_program_dispatch_core.y; + reset_launch_message_read_ptr_go_signal.master_x 
= (uint8_t)this->virtual_enqueue_program_dispatch_core.x; + reset_launch_message_read_ptr_go_signal.master_y = (uint8_t)this->virtual_enqueue_program_dispatch_core.y; for (uint8_t i = 0; i < num_sub_devices; ++i) { reset_launch_message_read_ptr_go_signal.dispatch_message_offset = (uint8_t)dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(i); uint32_t dispatch_message_addr = dispatch_message_base_addr + dispatch_constants::get(dispatch_core_type).get_dispatch_message_offset(i); @@ -2534,7 +2531,7 @@ void HWCommandQueue::enqueue_program(Program& program, bool blocking) { this->device, this->noc_index, program, - this->physical_enqueue_program_dispatch_core, + this->virtual_enqueue_program_dispatch_core, this->manager, this->get_config_buffer_mgr(sub_device_index), expected_workers_completed, @@ -2627,7 +2624,7 @@ void HWCommandQueue::enqueue_trace(const uint32_t trace_id, bool blocking) { auto trace_inst = this->device->get_trace(trace_id); auto command = EnqueueTraceCommand( - this->id, this->device, this->manager, trace_inst->desc, *trace_inst->buffer, this->expected_num_workers_completed, this->noc_index, this->physical_enqueue_program_dispatch_core); + this->id, this->device, this->manager, trace_inst->desc, *trace_inst->buffer, this->expected_num_workers_completed, this->noc_index, this->virtual_enqueue_program_dispatch_core); this->enqueue_command(command, false, {}); diff --git a/tt_metal/impl/dispatch/command_queue.hpp b/tt_metal/impl/dispatch/command_queue.hpp index 2ef1e886def1..661e3d6d4992 100644 --- a/tt_metal/impl/dispatch/command_queue.hpp +++ b/tt_metal/impl/dispatch/command_queue.hpp @@ -516,7 +516,7 @@ class HWCommandQueue { ~HWCommandQueue(); - CoreCoord physical_enqueue_program_dispatch_core; + CoreCoord virtual_enqueue_program_dispatch_core; CoreCoord completion_queue_writer_core; NOC noc_index; volatile bool is_dprint_server_hung(); diff --git a/tt_metal/impl/dispatch/command_queue_interface.hpp 
b/tt_metal/impl/dispatch/command_queue_interface.hpp index 4c1c3d231b7a..8204be59bb8f 100644 --- a/tt_metal/impl/dispatch/command_queue_interface.hpp +++ b/tt_metal/impl/dispatch/command_queue_interface.hpp @@ -479,19 +479,25 @@ class SystemMemoryManager { for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) { tt_cxy_pair prefetcher_core = tt::tt_metal::dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id); - tt_cxy_pair prefetcher_physical_core = - tt_cxy_pair(prefetcher_core.chip, tt::get_physical_core_coordinate(prefetcher_core, core_type)); - this->prefetcher_cores[cq_id] = prefetcher_physical_core; + auto prefetcher_virtual = tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates(prefetcher_core.chip, CoreCoord(prefetcher_core.x, prefetcher_core.y), core_type); + this->prefetcher_cores[cq_id] = tt_cxy_pair(prefetcher_core.chip, prefetcher_virtual.x, prefetcher_virtual.y); this->prefetch_q_writers.emplace_back( - tt::Cluster::instance().get_static_tlb_writer(prefetcher_physical_core)); + tt::Cluster::instance().get_static_tlb_writer(this->prefetcher_cores[cq_id])); tt_cxy_pair completion_queue_writer_core = tt::tt_metal::dispatch_core_manager::instance().completion_queue_writer_core(device_id, channel, cq_id); + auto completion_queue_writer_virtual = + tt::Cluster::instance().get_virtual_coordinate_from_logical_coordinates( + completion_queue_writer_core.chip, + CoreCoord(completion_queue_writer_core.x, completion_queue_writer_core.y), + core_type); + const std::tuple completion_interface_tlb_data = tt::Cluster::instance() .get_tlb_data(tt_cxy_pair( completion_queue_writer_core.chip, - tt::get_physical_core_coordinate(completion_queue_writer_core, core_type))) + completion_queue_writer_virtual.x, + completion_queue_writer_virtual.y)) .value(); auto [completion_tlb_offset, completion_tlb_size] = completion_interface_tlb_data; this->completion_byte_addrs[cq_id] = completion_tlb_offset + completion_q_rd_ptr % 
completion_tlb_size; diff --git a/tt_metal/impl/dispatch/dispatch_core_manager.hpp b/tt_metal/impl/dispatch/dispatch_core_manager.hpp index db6118dffec7..3425388d9fb3 100644 --- a/tt_metal/impl/dispatch/dispatch_core_manager.hpp +++ b/tt_metal/impl/dispatch/dispatch_core_manager.hpp @@ -28,8 +28,8 @@ struct dispatch_worker_build_settings_t{ std::vector compile_args; std::vector upstream_cores; std::vector downstream_cores; - tt_cxy_pair worker_physical_core; - tt_cxy_pair eth_partner_physical_core; + tt_cxy_pair worker_virtual_core; + tt_cxy_pair eth_partner_virtual_core; CoreType dispatch_core_type; uint32_t command_queue_start_addr; uint32_t issue_queue_start_addr; diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 49877dcf9ae1..647a6b484027 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -60,7 +60,8 @@ constexpr uint32_t downstream_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_NOC_X constexpr uint32_t dispatch_s_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_SLAVE_NOC_X, DOWNSTREAM_SLAVE_NOC_Y)); constexpr uint8_t my_noc_index = NOC_INDEX; constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y)); -constexpr uint64_t pcie_noc_xy = uint64_t(NOC_XY_PCIE_ENCODING(NOC_X(PCIE_NOC_X), NOC_Y(PCIE_NOC_Y))); +constexpr uint64_t pcie_noc_xy = + uint64_t(NOC_XY_PCIE_ENCODING(NOC_X_PHYS_COORD(PCIE_NOC_X), NOC_Y_PHYS_COORD(PCIE_NOC_Y))); constexpr uint32_t dispatch_cb_page_size = 1 << dispatch_cb_log_page_size; constexpr uint32_t completion_queue_end_addr = completion_queue_base_addr + completion_queue_size; @@ -947,7 +948,7 @@ static inline bool process_cmd_d( switch (cmd->base.cmd_id) { case CQ_DISPATCH_CMD_WRITE_LINEAR: WAYPOINT("DWB"); - DPRINT << "cmd_write\n"; + DPRINT << "cmd_write_linear\n"; process_write(block_noc_writes_to_clear, block_next_start_addr); WAYPOINT("DWD"); break; diff --git 
a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp index 4e4d7ce297cc..02cbbc964401 100644 --- a/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_prefetch.cpp @@ -69,7 +69,8 @@ constexpr uint32_t my_noc_xy = uint32_t(NOC_XY_ENCODING(MY_NOC_X, MY_NOC_Y)); constexpr uint32_t upstream_noc_xy = uint32_t(NOC_XY_ENCODING(UPSTREAM_NOC_X, UPSTREAM_NOC_Y)); constexpr uint32_t downstream_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_NOC_X, DOWNSTREAM_NOC_Y)); constexpr uint32_t dispatch_s_noc_xy = uint32_t(NOC_XY_ENCODING(DOWNSTREAM_SLAVE_NOC_X, DOWNSTREAM_SLAVE_NOC_Y)); -constexpr uint64_t pcie_noc_xy = uint64_t(NOC_XY_PCIE_ENCODING(NOC_X(PCIE_NOC_X), NOC_Y(PCIE_NOC_Y))); +constexpr uint64_t pcie_noc_xy = + uint64_t(NOC_XY_PCIE_ENCODING(NOC_X_PHYS_COORD(PCIE_NOC_X), NOC_Y_PHYS_COORD(PCIE_NOC_Y))); constexpr uint32_t downstream_cb_page_size = 1 << downstream_cb_log_page_size; constexpr uint32_t dispatch_s_cb_page_size = 1 << dispatch_s_cb_log_page_size; constexpr uint32_t downstream_cb_end = downstream_cb_base + (1 << downstream_cb_log_page_size) * downstream_cb_pages; diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 216ffcae5b32..e76abaf56513 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -845,7 +845,7 @@ void detail::Program_::init_semaphores(const Device &device, const CoreCoord &lo for (auto semaphore : semaphores_on_core) { llrt::write_hex_vec_to_core( device.id(), - device.physical_core_from_logical_core(logical_core, core_type), + device.virtual_core_from_logical_core(logical_core, core_type), std::vector{semaphore.get().initial_value()}, addr + semaphore.get().offset()); } @@ -991,8 +991,8 @@ void detail::Program_::populate_dispatch_data(Device *device) { for (const CoreRange &core_range : ranges) { for (auto x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { for (auto y = 
core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord physical_coord = device->physical_core_from_logical_core(CoreCoord({x, y}), core_type); - dst_noc_unicast_info.push_back(std::make_pair(physical_coord, /*num_mcast_dests=*/0)); + CoreCoord virtual_coord = device->virtual_core_from_logical_core(CoreCoord({x, y}), core_type); + dst_noc_unicast_info.push_back(std::make_pair(virtual_coord, /*num_mcast_dests=*/0)); } } } @@ -1559,7 +1559,6 @@ void detail::Program_::compile(Device *device, bool fd_bootloader_mode) { TT_FATAL(not on_dispatch_core, "Illegal kernel placement for {}, Kernels cannot be placed on dispatch cores!", kernel->name()); } }; - for (auto & kernels : kernels_) { for (auto &[id, kernel] : kernels) { validate_kernel_placement(kernel); @@ -1629,8 +1628,8 @@ void Program::set_runtime_id(uint64_t id) { pimpl_->set_runtime_id(id); } uint32_t detail::Program_::get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); - HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); + CoreCoord virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(virtual_core); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); const auto &sub_device_ids = this->determine_sub_device_ids(device); // TODO: This restriction can be lifted once we have support for programs spanning multiple sub-devices @@ -1651,8 +1650,8 @@ uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, Core uint32_t detail::Program_::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); - HalProgrammableCoreType programmable_core_type = 
device->get_programmable_core_type(phys_core); + CoreCoord virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(virtual_core); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); const auto &sub_device_ids = this->determine_sub_device_ids(device); // TODO: This restriction can be lifted once this function is changed to return a vector of addresses @@ -1681,8 +1680,8 @@ void Program::set_last_used_command_queue_for_testing(HWCommandQueue *queue) { uint32_t detail::Program_::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); - HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); + CoreCoord virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(virtual_core); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); return this->program_configs_[index].sem_size; @@ -1694,8 +1693,8 @@ uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType uint32_t detail::Program_::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { - CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); - HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); + CoreCoord virtual_core = device->virtual_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(virtual_core); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); return this->program_configs_[index].cb_size; diff --git a/tt_metal/impl/sub_device/sub_device_manager.cpp 
b/tt_metal/impl/sub_device/sub_device_manager.cpp index d2297c894a46..c1500f850640 100644 --- a/tt_metal/impl/sub_device/sub_device_manager.cpp +++ b/tt_metal/impl/sub_device/sub_device_manager.cpp @@ -257,8 +257,8 @@ void SubDeviceManager::populate_sub_allocators() { .l1_small_size = 0, .trace_region_size = 0, .core_type_from_noc_coord_table = {}, // Populated later - .worker_log_to_physical_routing_x = global_allocator_config.worker_log_to_physical_routing_x, - .worker_log_to_physical_routing_y = global_allocator_config.worker_log_to_physical_routing_y, + .worker_log_to_virtual_routing_x = global_allocator_config.worker_log_to_virtual_routing_x, + .worker_log_to_virtual_routing_y = global_allocator_config.worker_log_to_virtual_routing_y, .l1_bank_remap = std::move(l1_bank_remap), .compute_grid = compute_cores, .alignment = global_allocator_config.alignment, @@ -303,12 +303,12 @@ void SubDeviceManager::populate_noc_data() { this->num_noc_mcast_txns_[i] = tensix_cores.size(); this->noc_mcast_unicast_data_.resize(idx + this->num_noc_mcast_txns_[i] * 2); for (const auto& core_range : tensix_cores.ranges()) { - auto physical_start = - this->device_->physical_core_from_logical_core(core_range.start_coord, CoreType::WORKER); - auto physical_end = this->device_->physical_core_from_logical_core(core_range.end_coord, CoreType::WORKER); - auto physical_core_range = CoreRange(physical_start, physical_end); + auto virtual_start = + this->device_->virtual_core_from_logical_core(core_range.start_coord, CoreType::WORKER); + auto virtual_end = this->device_->virtual_core_from_logical_core(core_range.end_coord, CoreType::WORKER); + auto virtual_core_range = CoreRange(virtual_start, virtual_end); this->noc_mcast_unicast_data_[idx++] = - this->device_->get_noc_multicast_encoding(noc_index, physical_core_range); + this->device_->get_noc_multicast_encoding(noc_index, virtual_core_range); this->noc_mcast_unicast_data_[idx++] = core_range.size(); } this->noc_unicast_data_start_index_[i] 
= idx; @@ -317,9 +317,8 @@ void SubDeviceManager::populate_noc_data() { for (const auto& core_range : eth_cores.ranges()) { this->noc_mcast_unicast_data_.resize(idx + core_range.size()); for (const auto& core : core_range) { - auto physical_core = this->device_->physical_core_from_logical_core(core, CoreType::ETH); - this->noc_mcast_unicast_data_[idx++] = - this->device_->get_noc_unicast_encoding(noc_index, physical_core); + auto virtual_core = this->device_->virtual_core_from_logical_core(core, CoreType::ETH); + this->noc_mcast_unicast_data_[idx++] = this->device_->get_noc_unicast_encoding(noc_index, virtual_core); } } this->num_noc_unicast_txns_[i] = idx - this->noc_unicast_data_start_index_[i]; diff --git a/tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp b/tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp index 54162f862319..5aed4b52ac0c 100644 --- a/tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp +++ b/tt_metal/kernels/dataflow/reader_binary_diff_lengths.cpp @@ -6,14 +6,12 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src0_addr = get_arg_val(0); - uint32_t src0_noc_x = get_arg_val(1); - uint32_t src0_noc_y = get_arg_val(2); - uint32_t src0_num_tiles = get_arg_val(3); - uint32_t src1_addr = get_arg_val(4); - uint32_t src1_noc_x = get_arg_val(5); - uint32_t src1_noc_y = get_arg_val(6); - uint32_t src1_num_tiles = get_arg_val(7); + uint32_t src0_addr = get_arg_val(0); + uint32_t src0_bank_id = get_arg_val(1); + uint32_t src0_num_tiles = get_arg_val(2); + uint32_t src1_addr = get_arg_val(3); + uint32_t src1_bank_id = get_arg_val(4); + uint32_t src1_num_tiles = get_arg_val(5); constexpr uint32_t cb_id_in0 = 0; constexpr uint32_t cb_id_in1 = 1; @@ -31,7 +29,7 @@ void kernel_main() { // read ublocks from src0/src1 to CB0/CB1, then push ublocks to compute (unpacker) for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { if (i < src0_num_tiles) { - uint64_t src0_noc_addr = get_noc_addr(src0_noc_x, src0_noc_y, src0_addr); + 
uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); l1_write_addr_in0 = get_write_ptr(cb_id_in0); @@ -46,7 +44,7 @@ void kernel_main() { } if (i < src1_num_tiles) { - uint64_t src1_noc_addr = get_noc_addr(src1_noc_x, src1_noc_y, src1_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); cb_reserve_back(cb_id_in1, ublock_size_tiles); l1_write_addr_in1 = get_write_ptr(cb_id_in1); diff --git a/tt_metal/kernels/dataflow/reader_unary.cpp b/tt_metal/kernels/dataflow/reader_unary.cpp index 37ed368b8bb9..a79c750cb977 100644 --- a/tt_metal/kernels/dataflow/reader_unary.cpp +++ b/tt_metal/kernels/dataflow/reader_unary.cpp @@ -7,10 +7,9 @@ #include "dataflow_api.h" void kernel_main() { - uint32_t src_addr = get_arg_val(0); - uint32_t src_noc_x = get_arg_val(1); - uint32_t src_noc_y = get_arg_val(2); - uint32_t num_tiles = get_arg_val(3); + uint32_t src_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); constexpr uint32_t cb_id_in0 = 0; @@ -20,7 +19,7 @@ void kernel_main() { // read a ublock of tiles from src to CB, and then push the ublock to unpacker for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t src_noc_addr = get_noc_addr(src_noc_x, src_noc_y, src_addr); + uint64_t src_noc_addr = get_noc_addr_from_bank_id(bank_id, src_addr); cb_reserve_back(cb_id_in0, ublock_size_tiles); uint32_t l1_write_addr = get_write_ptr(cb_id_in0); diff --git a/tt_metal/kernels/dataflow/writer_unary.cpp b/tt_metal/kernels/dataflow/writer_unary.cpp index 61c3cdc08d06..adddd7b20b3a 100644 --- a/tt_metal/kernels/dataflow/writer_unary.cpp +++ b/tt_metal/kernels/dataflow/writer_unary.cpp @@ -3,12 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 #include "dataflow_api.h" +#include "debug/dprint.h" void kernel_main() { - uint32_t dst_addr = get_arg_val(0); - uint32_t dst_noc_x = get_arg_val(1); - uint32_t dst_noc_y = get_arg_val(2); 
- uint32_t num_tiles = get_arg_val(3); + uint32_t dst_addr = get_arg_val(0); + uint32_t bank_id = get_arg_val(1); + uint32_t num_tiles = get_arg_val(2); constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; @@ -17,7 +17,7 @@ void kernel_main() { uint32_t ublock_size_tiles = 1; for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { - uint64_t dst_noc_addr = get_noc_addr(dst_noc_x, dst_noc_y, dst_addr); + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(bank_id, dst_addr); cb_wait_front(cb_id_out0, ublock_size_tiles); uint32_t l1_read_addr = get_read_ptr(cb_id_out0); diff --git a/tt_metal/kernels/dataflow/writer_unary_1.cpp b/tt_metal/kernels/dataflow/writer_unary_1.cpp new file mode 100644 index 000000000000..2ee5486e851c --- /dev/null +++ b/tt_metal/kernels/dataflow/writer_unary_1.cpp @@ -0,0 +1,30 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "dataflow_api.h" + +void kernel_main() { + uint32_t dst_addr = get_arg_val(0); + uint32_t dst_x = get_arg_val(1); + uint32_t dst_y = get_arg_val(2); + uint32_t num_tiles = get_arg_val(3); + + constexpr uint32_t cb_id_out0 = 16; + + // single-tile ublocks + uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); + uint32_t ublock_size_tiles = 1; + + for (uint32_t i = 0; i < num_tiles; i += ublock_size_tiles) { + uint64_t dst_noc_addr = get_noc_addr(dst_x, dst_y, dst_addr); + cb_wait_front(cb_id_out0, ublock_size_tiles); + uint32_t l1_read_addr = get_read_ptr(cb_id_out0); + noc_async_write(l1_read_addr, dst_noc_addr, ublock_size_bytes); + + noc_async_write_barrier(); + + cb_pop_front(cb_id_out0, ublock_size_tiles); + dst_addr += ublock_size_bytes; + } +} diff --git a/tt_metal/llrt/blackhole/bh_hal.cpp b/tt_metal/llrt/blackhole/bh_hal.cpp index 1096da8ec651..5c6513a264ad 100644 --- a/tt_metal/llrt/blackhole/bh_hal.cpp +++ b/tt_metal/llrt/blackhole/bh_hal.cpp @@ -77,7 +77,10 @@ void Hal::initialize_bh() { return NOC_MULTICAST_ENCODING(x_start, y_start, x_end, 
y_end); }; - num_nocs_ = NUM_NOCS; + this->num_nocs_ = NUM_NOCS; + this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; + this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; + this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; } } // namespace tt_metal diff --git a/tt_metal/llrt/grayskull/gs_hal.cpp b/tt_metal/llrt/grayskull/gs_hal.cpp index 71a889179b8d..03bb4e0c84e0 100644 --- a/tt_metal/llrt/grayskull/gs_hal.cpp +++ b/tt_metal/llrt/grayskull/gs_hal.cpp @@ -161,7 +161,10 @@ void Hal::initialize_gs() { return NOC_MULTICAST_ENCODING(x_start, y_start, x_end, y_end); }; - num_nocs_ = NUM_NOCS; + this->num_nocs_ = NUM_NOCS; + this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; + this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; + this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; } } // namespace tt_metal diff --git a/tt_metal/llrt/hal.hpp b/tt_metal/llrt/hal.hpp index 80e880026961..9344b6bd4ac5 100644 --- a/tt_metal/llrt/hal.hpp +++ b/tt_metal/llrt/hal.hpp @@ -149,6 +149,9 @@ class Hal { std::vector dram_sizes_; std::vector mem_alignments_; uint32_t num_nocs_; + bool coordinate_virtualization_enabled_; + uint32_t virtual_worker_start_x_; + uint32_t virtual_worker_start_y_; void initialize_gs(); void initialize_wh(); @@ -178,6 +181,9 @@ class Hal { return noc_multicast_encoding_func_(x_start, y_start, x_end, y_end); } + bool is_coordinate_virtualization_enabled() const { return this->coordinate_virtualization_enabled_; }; + std::uint32_t get_virtual_worker_start_x() const { return this->virtual_worker_start_x_; } + std::uint32_t get_virtual_worker_start_y() const { return this->virtual_worker_start_y_; } uint32_t get_programmable_core_type_count() const; HalProgrammableCoreType get_programmable_core_type(uint32_t core_type_index) const; uint32_t get_programmable_core_type_index(HalProgrammableCoreType programmable_core_type_index) const; diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp 
index 888b40fe7442..ccea2588f93f 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -105,9 +105,8 @@ std::vector read_hex_vec_from_core(chip_id_t chip, const CoreCoord &co return read_hex_vec; } -CoreCoord logical_core_from_ethernet_core(chip_id_t chip_id, const CoreCoord &physical_core) { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip_id); - return soc_desc.get_logical_ethernet_core_from_physical(physical_core); +CoreCoord logical_core_from_ethernet_core(chip_id_t chip_id, const CoreCoord ðernet_core) { + return tt::Cluster::instance().get_logical_ethernet_core_from_virtual(chip_id, ethernet_core); } void write_launch_msg_to_core(chip_id_t chip, const CoreCoord core, launch_msg_t *msg, go_msg_t *go_msg, uint64_t base_addr, bool send_go) { @@ -177,7 +176,7 @@ uint32_t generate_risc_startup_addr(bool is_eth_core) { void program_risc_startup_addr(chip_id_t chip_id, const CoreCoord &core) { std::vector jump_to_fw; - jump_to_fw.push_back(generate_risc_startup_addr(is_ethernet_core(core, chip_id))); + jump_to_fw.push_back(generate_risc_startup_addr(tt::Cluster::instance().is_ethernet_core(core, chip_id))); write_hex_vec_to_core(chip_id, core, tt::stl::Span(jump_to_fw.data(), jump_to_fw.size()), 0); } @@ -188,7 +187,7 @@ bool test_load_write_read_risc_binary( uint32_t core_type_idx, uint32_t processor_class_idx, uint32_t processor_type_idx) { - assert(is_worker_core(core, chip_id) or is_ethernet_core(core, chip_id)); + assert(tt::Cluster::instance().is_worker_core(core, chip_id) or tt::Cluster::instance().is_ethernet_core(core, chip_id)); uint64_t local_init_addr = tt::tt_metal::hal.get_binary_local_init_addr(core_type_idx, processor_class_idx, processor_type_idx); @@ -225,7 +224,7 @@ CoreCoord get_core_for_dram_channel(int dram_channel_id, chip_id_t chip_id) { namespace internal_ { static bool check_if_riscs_on_specified_core_done(chip_id_t chip_id, const CoreCoord &core, int run_state) { - bool is_eth_core = 
is_ethernet_core(core, chip_id); + bool is_eth_core = tt::Cluster::instance().is_ethernet_core(core, chip_id); bool is_active_eth_core = false; bool is_inactive_eth_core = false; diff --git a/tt_metal/llrt/llrt.hpp b/tt_metal/llrt/llrt.hpp index 260d9793baf6..0bf814e58696 100644 --- a/tt_metal/llrt/llrt.hpp +++ b/tt_metal/llrt/llrt.hpp @@ -87,7 +87,7 @@ void write_hex_vec_to_core( std::vector read_hex_vec_from_core(chip_id_t chip, const CoreCoord &core, uint64_t addr, uint32_t size); -CoreCoord logical_core_from_ethernet_core(chip_id_t chip_id, CoreCoord &physical_core); +CoreCoord logical_core_from_ethernet_core(chip_id_t chip_id, CoreCoord &ethernet_core); void write_launch_msg_to_core(chip_id_t chip, CoreCoord core, launch_msg_t *msg, go_msg_t * go_msg, uint64_t addr, bool send_go = true); @@ -95,18 +95,6 @@ void launch_erisc_app_fw_on_core(chip_id_t chip, CoreCoord core); void print_worker_cores(chip_id_t chip_id = 0); -inline bool is_worker_core(const CoreCoord &core, chip_id_t chip_id) { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip_id); - return std::find(soc_desc.physical_workers.begin(), soc_desc.physical_workers.end(), core) != - soc_desc.physical_workers.end(); -} - -inline bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) { - const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip_id); - return std::find(soc_desc.physical_ethernet_cores.begin(), soc_desc.physical_ethernet_cores.end(), core) != - soc_desc.physical_ethernet_cores.end(); -} - uint32_t generate_risc_startup_addr(bool is_eth_core); void program_risc_startup_addr(chip_id_t chip_id, const CoreCoord &core); diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index d49d2ae4d5e4..e26404d9457c 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -179,6 +179,9 @@ void Cluster::initialize_device_drivers() { tt_device_params default_params; this->start_driver(default_params); +
this->generate_virtual_to_umd_coord_mapping(); + this->generate_logical_to_virtual_coord_mapping(); + this->generate_virtual_to_profiler_flat_id_mapping(); } void Cluster::assert_risc_reset() { @@ -211,6 +214,10 @@ void Cluster::get_metal_desc_from_tt_desc( } } +const std::unordered_map& Cluster::get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const { + return this->virtual_routing_to_profiler_flat_id_.at(this->get_board_type(chip_id)); +} + void Cluster::open_driver(const bool &skip_driver_allocs) { const std::string sdesc_path = get_soc_description_file(this->arch_, this->target_type_); @@ -308,6 +315,132 @@ const metal_SocDescriptor &Cluster::get_soc_desc(chip_id_t chip) const { return this->sdesc_per_chip_.at(chip); } +void Cluster::generate_virtual_to_umd_coord_mapping() { + // UMD APIs currently use a coordinate system that is not Physical, Virtual or Logical. + // TT-Metal uses Virtual Coordinates when programming txns on device. + // This mapping allows Cluster APIs to be consistent with the rest of TT-Metal, while correctly + // using UMD under the hood. + // This will be kept around until UMD supports generic coordinates in its APIs, at which point TT-Metal + // virtual coordinates can be passed to UMD directly. 
+ for (auto chip_id : this->cluster_desc_->get_all_chips()) { + this->virtual_worker_cores_[chip_id] = {}; + this->virtual_eth_cores_[chip_id] = {}; + for (auto& core_desc : this->get_soc_desc(chip_id).physical_cores) { + if (core_desc.second.type != CoreType::HARVESTED) { + CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, core_desc.first, core_desc.second.type); + tt_cxy_pair virtual_core = tt_cxy_pair(chip_id, virtual_coords.x, virtual_coords.y); + tt_cxy_pair umd_core = this->get_soc_desc(chip_id).convert_to_umd_coordinates(tt_cxy_pair(chip_id, core_desc.first.x, core_desc.first.y)); + this->virtual_to_umd_coord_mapping_[virtual_core] = umd_core; + if (core_desc.second.type == CoreType::WORKER) { + this->virtual_worker_cores_[chip_id].insert(virtual_coords); + } else if (core_desc.second.type == CoreType::ETH) { + this->virtual_eth_cores_[chip_id].insert(virtual_coords); + } + } + } + } +} + +void Cluster::generate_logical_to_virtual_coord_mapping() { + for (auto chip_id : this->cluster_desc_->get_all_chips()) { + auto board_type = this->get_board_type(chip_id); + if (this->worker_logical_to_virtual_x_.find(board_type) != this->worker_logical_to_virtual_x_.end()) { + continue; + } + auto& soc_desc = this->get_soc_desc(chip_id); + this->worker_logical_to_virtual_x_.insert({board_type, {}}); + this->worker_logical_to_virtual_y_.insert({board_type, {}}); + this->eth_logical_to_virtual_.insert({board_type, {}}); + for (auto x_coords : soc_desc.worker_log_to_routing_x) { + CoreCoord phys_core = soc_desc.get_physical_core_from_logical_core(CoreCoord(x_coords.first, 0), CoreType::WORKER); + CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, phys_core, CoreType::WORKER); + this->worker_logical_to_virtual_x_.at(board_type).insert({x_coords.first, virtual_coords.x}); + } + for (auto y_coords : soc_desc.worker_log_to_routing_y) { + CoreCoord phys_core = 
soc_desc.get_physical_core_from_logical_core(CoreCoord(0, y_coords.first), CoreType::WORKER); + CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, phys_core, CoreType::WORKER); + this->worker_logical_to_virtual_y_.at(board_type).insert({y_coords.first, virtual_coords.y}); + } + for (std::size_t log_eth_core_y = 0; log_eth_core_y < soc_desc.physical_ethernet_cores.size(); log_eth_core_y++) { + CoreCoord logical_eth_core = {0, log_eth_core_y}; + CoreCoord virtual_coords = this->get_virtual_coordinate_from_physical_coordinates(chip_id, soc_desc.physical_ethernet_cores.at(log_eth_core_y), CoreType::ETH); + this->eth_logical_to_virtual_.at(board_type).insert({logical_eth_core, virtual_coords}); + } + } + +} + +void Cluster::generate_virtual_to_profiler_flat_id_mapping() { +#if defined(TRACY_ENABLE) + for (auto chip_id : this->cluster_desc_->get_all_chips()) { + auto board_type = this->get_board_type(chip_id); + if (this->virtual_routing_to_profiler_flat_id_.find(board_type) != this->virtual_routing_to_profiler_flat_id_.end()) { + continue; + } + this->virtual_routing_to_profiler_flat_id_.insert({board_type, {}}); + auto& soc_desc = this->get_soc_desc(chip_id); + for (const auto& core_to_profiler_id : soc_desc.physical_routing_to_profiler_flat_id) { + if (std::find(soc_desc.physical_workers.begin(), soc_desc.physical_workers.end(), core_to_profiler_id.first) != soc_desc.physical_workers.end()) { + this->virtual_routing_to_profiler_flat_id_.at(board_type).insert({this->get_virtual_coordinate_from_physical_coordinates(chip_id, core_to_profiler_id.first, CoreType::WORKER), core_to_profiler_id.second}); + } else { + this->virtual_routing_to_profiler_flat_id_.at(board_type).insert({this->get_virtual_coordinate_from_physical_coordinates(chip_id, core_to_profiler_id.first, CoreType::ETH), core_to_profiler_id.second}); + } + } + } +#endif +} + +bool Cluster::is_worker_core(const CoreCoord &core, chip_id_t chip_id) const { + return 
this->virtual_worker_cores_.at(chip_id).find(core) != this->virtual_worker_cores_.at(chip_id).end(); +} + +bool Cluster::is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) const { + + return this->virtual_eth_cores_.find(chip_id) != this->virtual_eth_cores_.end() and + this->virtual_eth_cores_.at(chip_id).find(core) != this->virtual_eth_cores_.at(chip_id).end(); +} + +const std::unordered_set& Cluster::get_virtual_worker_cores(chip_id_t chip_id) const { + return this->virtual_worker_cores_.at(chip_id); +} + +const std::unordered_set& Cluster::get_virtual_eth_cores(chip_id_t chip_id) const { + return this->virtual_eth_cores_.at(chip_id); +} + +CoreCoord Cluster::get_virtual_coordinate_from_logical_coordinates(chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const { + auto board_type = this->get_board_type(chip_id); + if (core_type == CoreType::WORKER) { + return CoreCoord(this->worker_logical_to_virtual_x_.at(board_type).at(logical_coord.x), this->worker_logical_to_virtual_y_.at(board_type).at(logical_coord.y)); + } else if (core_type == CoreType::ETH) { + return this->eth_logical_to_virtual_.at(board_type).at(logical_coord); + } + auto& soc_desc = this->get_soc_desc(chip_id); + return soc_desc.get_physical_core_from_logical_core(logical_coord, core_type); +} + +tt_cxy_pair Cluster::get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair logical_coordinate, const CoreType& core_type) const { + auto xy_virtual_coord = this->get_virtual_coordinate_from_logical_coordinates(logical_coordinate.chip, CoreCoord(logical_coordinate.x, logical_coordinate.y), core_type); + return tt_cxy_pair(logical_coordinate.chip, xy_virtual_coord); +} +CoreCoord Cluster::get_virtual_coordinate_from_physical_coordinates(chip_id_t chip_id, CoreCoord physical_coord, const CoreType& core_type) const { + auto& soc_desc = this->get_soc_desc(chip_id); + if (not (core_type == CoreType::WORKER or core_type == CoreType::ETH)) { + return physical_coord; + } + 
tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(tt_cxy_pair(chip_id, physical_coord.x, physical_coord.y)); + std::size_t c = virtual_chip_coord.x; + std::size_t r = virtual_chip_coord.y; + this->driver_->translate_to_noc_table_coords(chip_id, r, c); + return CoreCoord{c, r}; +} + +CoreCoord Cluster::get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const { + const metal_SocDescriptor &soc_desc = tt::Cluster::instance().get_soc_desc(chip); + auto phys_eth_core = this->virtual_to_umd_coord_mapping_.at(tt_cxy_pair(chip, core.x, core.y)); + return soc_desc.get_logical_ethernet_core_from_physical(phys_eth_core); +} + uint32_t Cluster::get_harvested_rows(chip_id_t chip) const { if (this->target_type_ == TargetDevice::Simulator) { return 0; @@ -332,16 +465,16 @@ int Cluster::get_device_aiclk(const chip_id_t &chip_id) const { return 0; } -void Cluster::deassert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const { - const metal_SocDescriptor &soc_desc = this->get_soc_desc(physical_chip_coord.chip); - tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(physical_chip_coord); - this->driver_->deassert_risc_reset_at_core(virtual_chip_coord); +void Cluster::deassert_risc_reset_at_core(const tt_cxy_pair &core) const { + const metal_SocDescriptor &soc_desc = this->get_soc_desc(core.chip); + tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); + this->driver_->deassert_risc_reset_at_core(umd_core); } -void Cluster::assert_risc_reset_at_core(const tt_cxy_pair &physical_chip_coord) const { - const metal_SocDescriptor &soc_desc = this->get_soc_desc(physical_chip_coord.chip); - tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(physical_chip_coord); - this->driver_->assert_risc_reset_at_core(virtual_chip_coord); +void Cluster::assert_risc_reset_at_core(const tt_cxy_pair &core) const { + const metal_SocDescriptor &soc_desc = this->get_soc_desc(core.chip); + tt_cxy_pair umd_core = 
this->virtual_to_umd_coord_mapping_.at(core); + this->driver_->assert_risc_reset_at_core(umd_core); } void Cluster::write_dram_vec(std::vector &vec, tt_target_dram dram, uint64_t addr, bool small_access) const { @@ -384,10 +517,11 @@ void Cluster::write_core( chip_id_t chip_id = core.chip; const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); if (tt::llrt::OptionsG.get_watcher_enabled()) { - tt::watcher_sanitize_host_noc_write(soc_desc, {core.x, core.y}, addr, sz_in_bytes); + tt::watcher_sanitize_host_noc_write(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {core.x, core.y}, addr, sz_in_bytes); } - tt_cxy_pair virtual_core = soc_desc.convert_to_umd_coordinates(core); - this->driver_->write_to_device(mem_ptr, sz_in_bytes, virtual_core, addr, "LARGE_WRITE_TLB"); + + tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); + this->driver_->write_to_device(mem_ptr, sz_in_bytes, umd_core, addr, "LARGE_WRITE_TLB"); if (this->cluster_desc_->is_chip_remote(chip_id)) { this->driver_->wait_for_non_mmio_flush(chip_id); } @@ -399,11 +533,11 @@ void Cluster::read_core( const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); if (tt::llrt::OptionsG.get_watcher_enabled()) { - tt::watcher_sanitize_host_noc_read(soc_desc, {core.x, core.y}, addr, size_in_bytes); + tt::watcher_sanitize_host_noc_read(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {core.x, core.y}, addr, size_in_bytes); } - tt_cxy_pair virtual_core = soc_desc.convert_to_umd_coordinates(core); - this->driver_->read_from_device(mem_ptr, virtual_core, addr, size_in_bytes, "LARGE_READ_TLB"); + tt_cxy_pair umd_core = this->virtual_to_umd_coord_mapping_.at(core); + this->driver_->read_from_device(mem_ptr, umd_core, addr, size_in_bytes, "LARGE_READ_TLB"); } void Cluster::read_core( @@ -418,10 +552,10 @@ void Cluster::write_reg(const std::uint32_t *mem_ptr, tt_cxy_pair target, uint64 const metal_SocDescriptor 
&soc_desc = this->get_soc_desc(chip_id); if (tt::llrt::OptionsG.get_watcher_enabled()) { - tt::watcher_sanitize_host_noc_write(soc_desc, {target.x, target.y}, addr, size_in_bytes); + tt::watcher_sanitize_host_noc_write(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {target.x, target.y}, addr, size_in_bytes); } - tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target); - this->driver_->write_to_device(mem_ptr, size_in_bytes, virtual_target, addr, "REG_TLB"); + tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); + this->driver_->write_to_device(mem_ptr, size_in_bytes, umd_target, addr, "REG_TLB"); if (this->cluster_desc_->is_chip_remote(chip_id)) { this->driver_->wait_for_non_mmio_flush(chip_id); } @@ -433,10 +567,10 @@ void Cluster::read_reg(std::uint32_t *mem_ptr, tt_cxy_pair target, uint64_t addr const metal_SocDescriptor &soc_desc = this->get_soc_desc(chip_id); if (tt::llrt::OptionsG.get_watcher_enabled()) { - tt::watcher_sanitize_host_noc_read(soc_desc, {target.x, target.y}, addr, size_in_bytes); + tt::watcher_sanitize_host_noc_read(soc_desc, this->virtual_worker_cores_.at(chip_id), this->virtual_eth_cores_.at(chip_id), {target.x, target.y}, addr, size_in_bytes); } - tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target); - this->driver_->read_from_device(mem_ptr, virtual_target, addr, size_in_bytes, "REG_TLB"); + tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); + this->driver_->read_from_device(mem_ptr, umd_target, addr, size_in_bytes, "REG_TLB"); } void Cluster::write_sysmem( @@ -689,7 +823,6 @@ void Cluster::reserve_ethernet_cores_for_tunneling() { // only setup fd tunneling for devices grouped with same mmio device and if no bi dir // tunnel found between the two chips and if link distance between both chips to mmio // chip is not the same - tt_cxy_pair(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); log_debug( LogDevice, 
"Reserving {} for tunneling", @@ -863,18 +996,18 @@ void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_ }; for (const auto &chip_id : non_mmio_devices) { for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) { - tt_cxy_pair eth_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH)); // Enable internal ethernet routing for non-mmio devices write_core( - (void *)&routing_info_enabled, sizeof(routing_info_t), eth_phys_core, routing_info_addr, false); + (void *)&routing_info_enabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); } } for (const auto &chip_id : mmio_devices) { for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) { - tt_cxy_pair eth_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH)); // Enable internal ethernet routing for mmio devices write_core( - (void *)&routing_info_enabled, sizeof(routing_info_t), eth_phys_core, routing_info_addr, false); + (void *)&routing_info_enabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); } } } else { @@ -885,18 +1018,18 @@ void Cluster::set_internal_routing_info_for_ethernet_cores(bool enable_internal_ }; for (const auto &chip_id : mmio_devices) { for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) { - tt_cxy_pair eth_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH)); // Disable internal ethernet routing for mmio devices write_core( - (void *)&routing_info_disabled, sizeof(routing_info_t), eth_phys_core, routing_info_addr, 
false); + (void *)&routing_info_disabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); } } for (const auto &chip_id : non_mmio_devices) { for (const auto &[eth_core, routing_info] : this->device_eth_routing_info_.at(chip_id)) { - tt_cxy_pair eth_phys_core(chip_id, ethernet_core_from_logical_core(chip_id, eth_core)); + tt_cxy_pair virtual_eth_core(chip_id, get_virtual_coordinate_from_logical_coordinates(chip_id, eth_core, CoreType::ETH)); // Disable internal ethernet routing for non-mmio devices write_core( - (void *)&routing_info_disabled, sizeof(routing_info_t), eth_phys_core, routing_info_addr, false); + (void *)&routing_info_disabled, sizeof(routing_info_t), virtual_eth_core, routing_info_addr, false); } } } diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 4d45aeba0d26..3b59f4cc7d06 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -61,6 +61,12 @@ class Cluster { ARCH arch() const { return this->arch_; } const metal_SocDescriptor &get_soc_desc(chip_id_t chip) const; + CoreCoord get_virtual_coordinate_from_logical_coordinates(chip_id_t chip_id, CoreCoord logical_coord, const CoreType& core_type) const; + CoreCoord get_virtual_coordinate_from_physical_coordinates(chip_id_t chip_id, CoreCoord physical_coord, const CoreType& core_type) const; + tt_cxy_pair get_virtual_coordinate_from_logical_coordinates(tt_cxy_pair logical_coordinate, const CoreType& core_type) const; + const std::unordered_set& get_virtual_worker_cores(chip_id_t chip_id) const; + const std::unordered_set& get_virtual_eth_cores(chip_id_t chip_id) const; + uint32_t get_harvested_rows(chip_id_t chip) const; uint32_t get_harvesting_mask(chip_id_t chip) const { return this->driver_->get_harvesting_masks_for_soc_descriptors().at(chip); @@ -90,9 +96,8 @@ class Cluster { std::optional> get_tlb_data(const tt_cxy_pair &target) const { tt::umd::Cluster *device = dynamic_cast(driver_.get()); - const metal_SocDescriptor 
&soc_desc = this->get_soc_desc(target.chip); - tt_cxy_pair virtual_chip_coord = soc_desc.convert_to_umd_coordinates(target); - return device->get_tlb_data_from_target(virtual_chip_coord); + tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); + return device->get_tlb_data_from_target(umd_target); } std::function get_fast_pcie_static_tlb_write_callable( @@ -106,9 +111,8 @@ class Cluster { // Allows for fast writes when targeting same device core by only doing the lookup once and avoiding repeated stack traversals tt::Writer get_static_tlb_writer(tt_cxy_pair target) const { tt::umd::Cluster *device = dynamic_cast(driver_.get()); - const metal_SocDescriptor &soc_desc = this->get_soc_desc(target.chip); - tt_cxy_pair virtual_target = soc_desc.convert_to_umd_coordinates(target); - return device->get_static_tlb_writer(virtual_target); + tt_cxy_pair umd_target = this->virtual_to_umd_coord_mapping_.at(target); + return device->get_static_tlb_writer(umd_target); } std::uint32_t get_numa_node_for_device(uint32_t device_id) const { @@ -210,6 +214,12 @@ class Cluster { // Returns Wormhole chip board type. 
BoardType get_board_type(chip_id_t chip_id) const; + bool is_worker_core(const CoreCoord &core, chip_id_t chip_id) const; + bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) const; + CoreCoord get_logical_ethernet_core_from_virtual(chip_id_t chip, CoreCoord core) const; + const std::unordered_map& get_worker_logical_to_virtual_x(chip_id_t chip_id) const { return this->worker_logical_to_virtual_x_.at(this->get_board_type(chip_id)); }; + const std::unordered_map& get_worker_logical_to_virtual_y(chip_id_t chip_id) const { return this->worker_logical_to_virtual_y_.at(this->get_board_type(chip_id)); }; + const std::unordered_map& get_virtual_routing_to_profiler_flat_id(chip_id_t chip_id) const; private: Cluster(); ~Cluster(); @@ -226,7 +236,9 @@ class Cluster { void get_metal_desc_from_tt_desc( const std::unordered_map &input, const std::unordered_map &per_chip_id_harvesting_masks); - tt_cxy_pair convert_physical_cxy_to_virtual(const tt_cxy_pair &physical_cxy) const; + void generate_virtual_to_umd_coord_mapping(); + void generate_logical_to_virtual_coord_mapping(); + void generate_virtual_to_profiler_flat_id_mapping(); // Reserves ethernet cores in cluster for tunneling void reserve_ethernet_cores_for_tunneling(); @@ -256,7 +268,14 @@ class Cluster { std::unordered_map> devices_grouped_by_assoc_mmio_device_; // Save mapping of device id to associated MMIO device id for fast lookup std::unordered_map device_to_mmio_device_; - + // Data Structures Tracking Virtual Coordinates + std::unordered_map virtual_to_umd_coord_mapping_; + std::unordered_map> virtual_worker_cores_; + std::unordered_map> virtual_eth_cores_; + std::unordered_map> worker_logical_to_virtual_x_; + std::unordered_map> worker_logical_to_virtual_y_; + std::unordered_map> eth_logical_to_virtual_; + std::unordered_map> virtual_routing_to_profiler_flat_id_; // Flag to tell whether we are on a TG type of system. // If any device has to board type of GALAXY, we are on a TG cluster. 
bool is_tg_cluster_; diff --git a/tt_metal/llrt/wormhole/wh_hal.cpp b/tt_metal/llrt/wormhole/wh_hal.cpp index 4f20cfb9993e..a82bad6c6b3e 100644 --- a/tt_metal/llrt/wormhole/wh_hal.cpp +++ b/tt_metal/llrt/wormhole/wh_hal.cpp @@ -78,7 +78,10 @@ void Hal::initialize_wh() { return NOC_MULTICAST_ENCODING(x_start, y_start, x_end, y_end); }; - num_nocs_ = NUM_NOCS; + this->num_nocs_ = NUM_NOCS; + this->coordinate_virtualization_enabled_ = COORDINATE_VIRTUALIZATION_ENABLED; + this->virtual_worker_start_x_ = VIRTUAL_TENSIX_START_X; + this->virtual_worker_start_y_ = VIRTUAL_TENSIX_START_Y; } } // namespace tt_metal diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp index 81e19e45f60b..66a83b7b9b87 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/add_2_integers_in_compute.cpp @@ -29,15 +29,10 @@ int main(int argc, char** argv) { std::shared_ptr src1_dram_buffer = CreateBuffer(dram_config); std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); - auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates(); - auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates(); - auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates(); - uint32_t src0_dram_noc_x = src0_dram_noc_coord.x; - uint32_t src0_dram_noc_y = src0_dram_noc_coord.y; - uint32_t src1_dram_noc_x = src1_dram_noc_coord.x; - uint32_t src1_dram_noc_y = src1_dram_noc_coord.y; - uint32_t dst_dram_noc_x = dst_dram_noc_coord.x; - uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + uint32_t src0_bank_id = 0; + uint32_t src1_bank_id = 0; + uint32_t dst_bank_id = 0; /* Use L1 circular buffers to set input and output buffers that the compute engine will use */ 
constexpr uint32_t src0_cb_index = CBIndex::c_0; @@ -102,14 +97,9 @@ int main(int argc, char** argv) { program, binary_reader_kernel_id, core, - {src0_dram_buffer->address(), - src1_dram_buffer->address(), - src0_dram_noc_x, - src0_dram_noc_y, - src1_dram_noc_x, - src1_dram_noc_y}); + {src0_dram_buffer->address(), src1_dram_buffer->address(), src0_bank_id, src1_bank_id}); SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {}); - SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_dram_noc_x, dst_dram_noc_y}); + SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id}); EnqueueProgram(cq, program, false); Finish(cq); diff --git a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp index 3b9a10aba37c..4f4fd8abe952 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/reader_binary_1_tile.cpp @@ -8,13 +8,11 @@ void kernel_main() { uint32_t src0_addr = get_arg_val(0); uint32_t src1_addr = get_arg_val(1); - uint32_t src0_dram_noc_x = get_arg_val(2); - uint32_t src0_dram_noc_y = get_arg_val(3); - uint32_t src1_dram_noc_x = get_arg_val(4); - uint32_t src1_dram_noc_y = get_arg_val(5); + uint32_t src0_bank_id = get_arg_val(2); + uint32_t src1_bank_id = get_arg_val(3); - uint64_t src0_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src0_addr); - uint64_t src1_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_addr); + uint64_t src0_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_addr); + uint64_t src1_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_addr); constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; diff --git 
a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp index 89aea6f18584..9bd9b460661d 100644 --- a/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_compute/kernels/dataflow/writer_1_tile.cpp @@ -6,10 +6,9 @@ void kernel_main() { uint32_t dst_addr = get_arg_val(0); - uint32_t dst_dram_noc_x = get_arg_val(1); - uint32_t dst_dram_noc_y = get_arg_val(2); + uint32_t dst_bank_id = get_arg_val(1); - uint64_t dst_noc_addr = get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_addr); + uint64_t dst_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_addr); constexpr uint32_t cb_id_out0 = tt::CBIndex::c_16; uint32_t ublock_size_bytes = get_tile_size(cb_id_out0); diff --git a/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp b/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp index a91e89e83d37..6716eed1d123 100644 --- a/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_riscv/add_2_integers_in_riscv.cpp @@ -25,15 +25,10 @@ int main(int argc, char** argv) { std::shared_ptr src1_dram_buffer = CreateBuffer(dram_config); std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); - auto src0_dram_noc_coord = src0_dram_buffer->noc_coordinates(); - auto src1_dram_noc_coord = src1_dram_buffer->noc_coordinates(); - auto dst_dram_noc_coord = dst_dram_buffer->noc_coordinates(); - uint32_t src0_dram_noc_x = src0_dram_noc_coord.x; - uint32_t src0_dram_noc_y = src0_dram_noc_coord.y; - uint32_t src1_dram_noc_x = src1_dram_noc_coord.x; - uint32_t src1_dram_noc_y = src1_dram_noc_coord.y; - uint32_t dst_dram_noc_x = dst_dram_noc_coord.x; - uint32_t dst_dram_noc_y = dst_dram_noc_coord.y; + // Since all interleaved buffers 
have size == page_size, they are entirely contained in the first DRAM bank + uint32_t src0_bank_id = 0; + uint32_t src1_bank_id = 0; + uint32_t dst_bank_id = 0; /* Create source data and write to DRAM */ std::vector src0_vec(1, 14); @@ -67,17 +62,12 @@ int main(int argc, char** argv) { program, binary_reader_kernel_id, core, - { - src0_dram_buffer->address(), - src1_dram_buffer->address(), - dst_dram_buffer->address(), - src0_dram_noc_x, - src0_dram_noc_y, - src1_dram_noc_x, - src1_dram_noc_y, - dst_dram_noc_x, - dst_dram_noc_y, - }); + {src0_dram_buffer->address(), + src1_dram_buffer->address(), + dst_dram_buffer->address(), + src0_bank_id, + src1_bank_id, + dst_bank_id}); EnqueueProgram(cq, program, false); Finish(cq); diff --git a/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp b/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp index 61bf0ce554d4..8b7947af9052 100644 --- a/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp +++ b/tt_metal/programming_examples/add_2_integers_in_riscv/kernels/reader_writer_add_in_riscv.cpp @@ -6,17 +6,14 @@ void kernel_main() { uint32_t src0_dram = get_arg_val(0); uint32_t src1_dram = get_arg_val(1); uint32_t dst_dram = get_arg_val(2); - uint32_t src0_dram_noc_x = get_arg_val(3); - uint32_t src0_dram_noc_y = get_arg_val(4); - uint32_t src1_dram_noc_x = get_arg_val(5); - uint32_t src1_dram_noc_y = get_arg_val(6); - uint32_t dst_dram_noc_x = get_arg_val(7); - uint32_t dst_dram_noc_y = get_arg_val(8); + uint32_t src0_bank_id = get_arg_val(3); + uint32_t src1_bank_id = get_arg_val(4); + uint32_t dst_bank_id = get_arg_val(5); // NoC coords (x,y) depending on DRAM location on-chip - uint64_t src0_dram_noc_addr = get_noc_addr(src0_dram_noc_x, src0_dram_noc_y, src0_dram); - uint64_t src1_dram_noc_addr = get_noc_addr(src1_dram_noc_x, src1_dram_noc_y, src1_dram); - uint64_t dst_dram_noc_addr = 
get_noc_addr(dst_dram_noc_x, dst_dram_noc_y, dst_dram); + uint64_t src0_dram_noc_addr = get_noc_addr_from_bank_id(src0_bank_id, src0_dram); + uint64_t src1_dram_noc_addr = get_noc_addr_from_bank_id(src1_bank_id, src1_dram); + uint64_t dst_dram_noc_addr = get_noc_addr_from_bank_id(dst_bank_id, dst_dram); constexpr uint32_t cb_id_in0 = tt::CBIndex::c_0; // index=0 constexpr uint32_t cb_id_in1 = tt::CBIndex::c_1; // index=1 diff --git a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp index 358c816abe90..98154a7c16ac 100644 --- a/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp +++ b/tt_metal/programming_examples/eltwise_binary/eltwise_binary.cpp @@ -96,7 +96,10 @@ int main(int argc, char** argv) { std::shared_ptr src0_dram_buffer = CreateBuffer(dram_config); std::shared_ptr src1_dram_buffer = CreateBuffer(dram_config); std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); - + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + uint32_t src0_bank_id = 0; + uint32_t src1_bank_id = 0; + uint32_t dst_bank_id = 0; /* * Use circular buffers to set input and output buffers that the * compute engine will use. 
@@ -182,25 +185,16 @@ int main(int argc, char** argv) { binary_reader_kernel_id, core, {src0_dram_buffer->address(), - static_cast(src0_dram_buffer->noc_coordinates().x), - static_cast(src0_dram_buffer->noc_coordinates().y), + src0_bank_id, num_tiles, src1_dram_buffer->address(), - static_cast(src1_dram_buffer->noc_coordinates().x), - static_cast(src1_dram_buffer->noc_coordinates().y), + src1_bank_id, num_tiles, 0}); SetRuntimeArgs(program, eltwise_binary_kernel_id, core, {num_tiles, 1}); - SetRuntimeArgs( - program, - unary_writer_kernel_id, - core, - {dst_dram_buffer->address(), - static_cast(dst_dram_buffer->noc_coordinates().x), - static_cast(dst_dram_buffer->noc_coordinates().y), - num_tiles}); + SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id, num_tiles}); EnqueueProgram(cq, program, false); Finish(cq); @@ -268,25 +262,16 @@ int main(int argc, char** argv) { binary_reader_kernel_id, core, {src0_dram_buffer->address(), - static_cast(src0_dram_buffer->noc_coordinates().x), - static_cast(src0_dram_buffer->noc_coordinates().y), + src0_bank_id, num_tiles, src1_dram_buffer->address(), - static_cast(src1_dram_buffer->noc_coordinates().x), - static_cast(src1_dram_buffer->noc_coordinates().y), + src1_bank_id, num_tiles, 0}); SetRuntimeArgs(program_mul, eltwise_binary_kernel_id, core, {num_tiles, 1}); - SetRuntimeArgs( - program_mul, - unary_writer_kernel_id, - core, - {dst_dram_buffer->address(), - static_cast(dst_dram_buffer->noc_coordinates().x), - static_cast(dst_dram_buffer->noc_coordinates().y), - num_tiles}); + SetRuntimeArgs(program_mul, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id, num_tiles}); /* * Execute. 
diff --git a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp index 70af2275a2ae..d1a4752bcb0b 100644 --- a/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp +++ b/tt_metal/programming_examples/eltwise_sfpu/eltwise_sfpu.cpp @@ -53,6 +53,9 @@ int main(int argc, char** argv) { std::shared_ptr dst_dram_buffer = CreateBuffer(dram_config); const uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + uint32_t src0_bank_id = 0; + uint32_t dst_bank_id = 0; /* * Use circular buffers to set input and output buffers that the @@ -129,19 +132,11 @@ int main(int argc, char** argv) { core, { src0_dram_buffer->address(), - static_cast(src0_dram_buffer->noc_coordinates().x), - static_cast(src0_dram_buffer->noc_coordinates().y), + src0_bank_id, num_tiles, }); - SetRuntimeArgs( - program, - unary_writer_kernel_id, - core, - {dst_dram_buffer->address(), - static_cast(dst_dram_buffer->noc_coordinates().x), - static_cast(dst_dram_buffer->noc_coordinates().y), - num_tiles}); + SetRuntimeArgs(program, unary_writer_kernel_id, core, {dst_dram_buffer->address(), dst_bank_id, num_tiles}); EnqueueProgram(cq, program, false); Finish(cq); diff --git a/tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp b/tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp index a84d23fb46e5..0e364122eb28 100644 --- a/tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp +++ b/tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp @@ -7,21 +7,19 @@ void kernel_main() { std::uint32_t l1_buffer_addr = get_arg_val(0); - std::uint32_t dram_buffer_src_addr = get_arg_val(1); - std::uint32_t dram_src_noc_x = get_arg_val(2); - std::uint32_t dram_src_noc_y = get_arg_val(3); + std::uint32_t dram_buffer_src_addr = get_arg_val(1); + std::uint32_t 
dram_buffer_src_bank = get_arg_val(2); - std::uint32_t dram_buffer_dst_addr = get_arg_val(4); - std::uint32_t dram_dst_noc_x = get_arg_val(5); - std::uint32_t dram_dst_noc_y = get_arg_val(6); + std::uint32_t dram_buffer_dst_addr = get_arg_val(3); + std::uint32_t dram_buffer_dst_bank = get_arg_val(4); - std::uint32_t dram_buffer_size = get_arg_val(7); + std::uint32_t dram_buffer_size = get_arg_val(5); - std::uint64_t dram_buffer_src_noc_addr = get_noc_addr(dram_src_noc_x, dram_src_noc_y, dram_buffer_src_addr); + std::uint64_t dram_buffer_src_noc_addr = get_noc_addr_from_bank_id(dram_buffer_src_bank, dram_buffer_src_addr); noc_async_read(dram_buffer_src_noc_addr, l1_buffer_addr, dram_buffer_size); noc_async_read_barrier(); - std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr(dram_dst_noc_x, dram_dst_noc_y, dram_buffer_dst_addr); + std::uint64_t dram_buffer_dst_noc_addr = get_noc_addr_from_bank_id(dram_buffer_dst_bank, dram_buffer_dst_addr); noc_async_write(l1_buffer_addr, dram_buffer_dst_noc_addr, dram_buffer_size); noc_async_write_barrier(); } diff --git a/tt_metal/programming_examples/loopback/loopback.cpp b/tt_metal/programming_examples/loopback/loopback.cpp index 7ad21566b44a..77f20b22be95 100644 --- a/tt_metal/programming_examples/loopback/loopback.cpp +++ b/tt_metal/programming_examples/loopback/loopback.cpp @@ -66,6 +66,10 @@ int main(int argc, char** argv) { auto output_dram_buffer = CreateBuffer(dram_config); const uint32_t output_dram_buffer_addr = output_dram_buffer->address(); + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + const uint32_t input_bank_id = 0; + const uint32_t output_bank_id = 0; + /* * Create input data and runtime arguments, then execute */ @@ -76,11 +80,9 @@ int main(int argc, char** argv) { const std::vector runtime_args = { l1_buffer->address(), input_dram_buffer->address(), - static_cast(input_dram_buffer->noc_coordinates().x), - 
static_cast(input_dram_buffer->noc_coordinates().y), + input_bank_id, output_dram_buffer->address(), - static_cast(output_dram_buffer->noc_coordinates().x), - static_cast(output_dram_buffer->noc_coordinates().y), + output_bank_id, l1_buffer->size()}; SetRuntimeArgs(program, dram_copy_kernel_id, core, runtime_args); diff --git a/tt_metal/tools/profiler/profiler.cpp b/tt_metal/tools/profiler/profiler.cpp index e6b13e0a798a..6c8b9ea18470 100644 --- a/tt_metal/tools/profiler/profiler.cpp +++ b/tt_metal/tools/profiler/profiler.cpp @@ -34,9 +34,7 @@ void DeviceProfiler::readRiscProfilerResults( int riscCount; profiler_msg_t* profiler_msg; - const metal_SocDescriptor& soc_d = tt::Cluster::instance().get_soc_desc(device_id); - auto ethCores = soc_d.get_physical_ethernet_cores(); - if (std::find(ethCores.begin(), ethCores.end(), worker_core) == ethCores.end()) { + if (tt::Cluster::instance().is_worker_core(worker_core, device_id)) { profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::PROFILER); CoreType = HalProgrammableCoreType::TENSIX; riscCount = 5; @@ -47,7 +45,7 @@ void DeviceProfiler::readRiscProfilerResults( riscCount = 1; } - uint32_t coreFlatID = soc_d.physical_routing_to_profiler_flat_id.at(worker_core); + uint32_t coreFlatID = tt::Cluster::instance().get_virtual_routing_to_profiler_flat_id(device_id).at(worker_core); uint32_t startIndex = coreFlatID * MAX_RISCV_PER_CORE * PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC; std::vector control_buffer = tt::llrt::read_hex_vec_from_core( diff --git a/tt_metal/tools/profiler/tt_metal_profiler.cpp b/tt_metal/tools/profiler/tt_metal_profiler.cpp index e90e9caa236d..5c9df9b95265 100644 --- a/tt_metal/tools/profiler/tt_metal_profiler.cpp +++ b/tt_metal/tools/profiler/tt_metal_profiler.cpp @@ -65,12 +65,9 @@ void setControlBuffer(uint32_t device_id, std::vector& control_buffer) control_buffer[kernel_profiler::CORE_COUNT_PER_DRAM] = soc_d.profiler_ceiled_core_count_perf_dram_bank; - auto ethCores = 
soc_d.get_physical_ethernet_cores(); - for (auto& core : soc_d.physical_routing_to_profiler_flat_id) { + for (auto& core : tt::Cluster::instance().get_virtual_routing_to_profiler_flat_id(device_id)) { profiler_msg_t* profiler_msg; - // TODO: clean this up when HAL is more complete (one lookup w/ type) - if (std::find(ethCores.begin(), ethCores.end(), core.first) == ethCores.end()) { - // Tensix + if (tt::Cluster::instance().is_worker_core(core.first, device_id)) { profiler_msg = hal.get_dev_addr(HalProgrammableCoreType::TENSIX, HalL1MemAddrType::PROFILER); } else { @@ -334,8 +331,8 @@ void DumpDeviceProfileResults(Device* device, bool lastDump) { workerCores.push_back(curr_core); } for (const CoreCoord& core : device->get_active_ethernet_cores(true)) { - auto physicalCore = device->physical_core_from_logical_core(core, CoreType::ETH); - workerCores.push_back(physicalCore); + auto virtualCore = device->virtual_core_from_logical_core(core, CoreType::ETH); + workerCores.push_back(virtualCore); } device->push_work( [device, workerCores, lastDump]() mutable { DumpDeviceProfileResults(device, workerCores, lastDump); }); @@ -355,10 +352,10 @@ void DumpDeviceProfileResults(Device* device, std::vector& worker_cor auto device_num_hw_cqs = device->num_hw_cqs(); for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, device_num_hw_cqs, dispatch_core_config)) { - const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); + const auto curr_core = device->virtual_core_from_logical_core(core, dispatch_core_type); worker_cores.push_back(curr_core); } - for (const CoreCoord& core : tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores) { + for (const CoreCoord& core : tt::Cluster::instance().get_virtual_eth_cores(device_id)) { worker_cores.push_back(core); } } @@ -393,7 +390,7 @@ void DumpDeviceProfileResults(Device* device, std::vector& worker_cor } for (const CoreCoord& core : tt::get_logical_dispatch_cores(device_id, 
device_num_hw_cqs, dispatch_core_config)) { - const auto curr_core = device->physical_core_from_logical_core(core, dispatch_core_type); + const auto curr_core = device->virtual_core_from_logical_core(core, dispatch_core_type); profiler_msg_t* profiler_msg = device->get_dev_addr(curr_core, HalL1MemAddrType::PROFILER); std::vector control_buffer = tt::llrt::read_hex_vec_from_core( @@ -410,17 +407,16 @@ void DumpDeviceProfileResults(Device* device, std::vector& worker_cor if (waitForDispatch) { continue; } - for (const CoreCoord& core : - tt::Cluster::instance().get_soc_desc(device_id).physical_ethernet_cores) { + for (const CoreCoord& virtual_core : tt::Cluster::instance().get_virtual_eth_cores(device_id)) { profiler_msg_t* profiler_msg = - device->get_dev_addr(core, HalL1MemAddrType::PROFILER); + device->get_dev_addr(virtual_core, HalL1MemAddrType::PROFILER); std::vector control_buffer = tt::llrt::read_hex_vec_from_core( device_id, - core, + virtual_core, reinterpret_cast(profiler_msg->control_vector), kernel_profiler::PROFILER_L1_CONTROL_BUFFER_SIZE); if (control_buffer[kernel_profiler::PROFILER_DONE] == 0) { - unfinishedCore = core; + unfinishedCore = virtual_core; waitForDispatch = true; continue; } diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index e59f14430cd6..c3e1b8b71bc8 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -312,7 +312,7 @@ bool WriteToDeviceL1( std::vector& host_buffer, CoreType core_type) { ZoneScoped; - auto worker_core = device->physical_core_from_logical_core(logical_core, core_type); + auto worker_core = device->virtual_core_from_logical_core(logical_core, core_type); llrt::write_hex_vec_to_core(device->id(), worker_core, host_buffer, address); return true; } @@ -426,17 +426,22 @@ void WriteToDeviceSharded(Buffer& buffer, tt::stl::Span host_buff const auto& buffer_page_mapping = *buffer.get_buffer_page_mapping(); auto total_pages = buffer.num_pages(); + std::vector page; + page.resize(page_size / 
sizeof(uint32_t)); for (int host_page_id = 0; host_page_id < total_pages; host_page_id++) { auto dev_page_id = buffer_page_mapping.host_page_to_dev_page_mapping_[host_page_id]; auto core = buffer_page_mapping.all_cores_[buffer_page_mapping.dev_page_to_core_mapping_[dev_page_id]]; auto bank_id = device->bank_ids_from_logical_core(buffer.buffer_type(), core)[0]; auto absolute_address = buffer.sharded_page_address(bank_id, dev_page_id); + auto bank_local_address = buffer.bank_local_page_address(bank_id, dev_page_id); auto data_index = host_page_id * page_size; - std::vector page; - page.insert(page.end(), host_buffer.begin() + data_index, host_buffer.begin() + data_index + page_size); - - auto noc_coordinates = buffer.noc_coordinates(bank_id); - llrt::write_hex_vec_to_core(device->id(), noc_coordinates, page, absolute_address); + std::memcpy(page.data(), host_buffer.data() + data_index, page_size); + if (buffer.is_l1()) { + auto core_coordinates = device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_id)); + llrt::write_hex_vec_to_core(device->id(), core_coordinates, page, absolute_address); + } else { + WriteToDeviceDRAMChannel(device, bank_id, bank_local_address, page); + } } } @@ -455,16 +460,22 @@ void WriteToDeviceInterleavedContiguous(const Buffer& buffer, tt::stl::Spannum_banks(buffer.buffer_type()); uint32_t bank_index = 0; int data_index = 0; + std::vector page; + page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { auto absolute_address = buffer.page_address(bank_index, page_index); - std::vector page; - page.insert(page.end(), host_buffer.begin() + data_index, host_buffer.begin() + data_index + page_size); + // Get address offset of buffer in bank. Required when writing to DRAM. 
+ auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); + std::memcpy(page.data(), host_buffer.data() + data_index, page_size); switch (buffer.buffer_type()) { case BufferType::DRAM: + WriteToDeviceDRAMChannel(device, bank_index, bank_local_address, page); + break; case BufferType::L1: case BufferType::L1_SMALL: { - auto noc_coordinates = buffer.noc_coordinates(bank_index); - llrt::write_hex_vec_to_core(device->id(), noc_coordinates, page, absolute_address); + auto core_coordinates = + device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); + llrt::write_hex_vec_to_core(device->id(), core_coordinates, page, absolute_address); } break; default: TT_THROW("Unsupported buffer type to write to device!"); } @@ -509,26 +520,30 @@ void ReadFromDeviceInterleavedContiguous(const Buffer& buffer, uint8_t* host_buf size_t host_idx = 0; uint32_t bank_index = 0; - std::vector page; + std::vector page; + page.resize(page_size / sizeof(uint32_t)); for (int page_index = 0; page_index < num_pages; page_index++) { auto absolute_address = buffer.page_address(bank_index, page_index); + // Get address offset of buffer in bank. Required when reading from DRAM. 
+ auto bank_local_address = buffer.bank_local_page_address(bank_index, page_index); page.clear(); switch (buffer.buffer_type()) { case BufferType::DRAM: case BufferType::TRACE: + ReadFromDeviceDRAMChannel(device, bank_index, bank_local_address, page_size, page); + break; case BufferType::L1: case BufferType::L1_SMALL: { - auto noc_coordinates = buffer.noc_coordinates(bank_index); - page.resize(page_size); - tt::Cluster::instance().read_core( - page.data(), page_size, tt_cxy_pair(device->id(), noc_coordinates), absolute_address); + auto core_coordinates = + device->worker_core_from_logical_core(buffer.logical_core_from_bank_id(bank_index)); + tt::Cluster::instance().read_core(page.data(), page_size, tt_cxy_pair(device->id(), core_coordinates), absolute_address); } break; default: TT_THROW("Unsupported buffer type to read from device!"); } // Copy page into host buffer - std::memcpy(host_buffer + host_idx, page.data(), page.size()); - host_idx += page.size(); + std::memcpy(host_buffer + host_idx, page.data(), page_size); + host_idx += page_size; bank_index = (bank_index + 1) % num_banks; } @@ -543,10 +558,18 @@ void read_pages_to_host_helper( const uint32_t& dev_page_id, const uint32_t& bank_id) { auto absolute_address = dev_buffer.sharded_page_address(bank_id, dev_page_id); - auto noc_coordinates = dev_buffer.noc_coordinates(bank_id); uint32_t host_buffer_start = host_page_id * page_size; - tt::Cluster::instance().read_core( - host_buffer + host_buffer_start, page_size, tt_cxy_pair(device->id(), noc_coordinates), absolute_address); + if (dev_buffer.is_l1()) { + auto core_coordinates = device->worker_core_from_logical_core(dev_buffer.logical_core_from_bank_id(bank_id)); + tt::Cluster::instance().read_core( + host_buffer + host_buffer_start, page_size, tt_cxy_pair(device->id(), core_coordinates), absolute_address); + } else { + std::vector page; + page.resize(page_size / sizeof(uint32_t)); + auto bank_local_address = dev_buffer.bank_local_page_address(bank_id, 
dev_page_id); + ReadFromDeviceDRAMChannel(device, bank_id, bank_local_address, page_size, page); + std::memcpy(host_buffer + host_buffer_start, page.data(), page_size); + } } void ReadFromDeviceSharded(Buffer& buffer, uint8_t* host_buffer, bool shard_order) { @@ -669,7 +692,7 @@ void LaunchProgram(Device* device, Program& program, bool wait_until_cores_done) go_msg_t* go_msg = &program.kernels_on_core(logical_core, programmable_core_type_index)->go_msg; msg->kernel_config.host_assigned_id = program.get_runtime_id(); - auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); + auto physical_core = device->virtual_core_from_logical_core(logical_core, core_type); not_done_cores.insert(physical_core); tt::llrt::write_launch_msg_to_core( device->id(), @@ -697,7 +720,7 @@ void WaitProgramDone(Device* device, Program& program) { const auto& logical_cores = logical_cores_used_in_program[index]; CoreType core_type = hal.get_core_type(index); for (const auto& logical_core : logical_cores) { - auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); + auto physical_core = device->virtual_core_from_logical_core(logical_core, core_type); not_done_cores.insert(physical_core); } } @@ -726,8 +749,7 @@ bool ConfigureDeviceWithProgram(Device* device, Program& program, bool fd_bootlo CoreType core_type = hal.get_core_type(index); for (const auto& logical_core : logical_cores) { KernelGroup* kernel_group = program.kernels_on_core(logical_core, index); - CoreCoord physical_core = device->physical_core_from_logical_core(logical_core, core_type); - + CoreCoord physical_core = device->virtual_core_from_logical_core(logical_core, core_type); ConfigureKernelGroup(program, index, kernel_group, device, logical_core); // TODO: add support for CB for ethernet cores if (core_type == CoreType::WORKER) { @@ -785,7 +807,7 @@ void WriteRuntimeArgsToDevice(Device* device, Program& program) { for (auto x = core_range.start_coord.x; x <= 
core_range.end_coord.x; x++) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord logical_core(x, y); - auto physical_core = device->physical_core_from_logical_core(logical_core, core_type); + auto physical_core = device->virtual_core_from_logical_core(logical_core, core_type); for (int dispatch_class = 0; dispatch_class < processor_classes; dispatch_class++) { auto& optional_id = kg.kernel_ids[dispatch_class]; if (optional_id) { diff --git a/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp b/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp index d9fabe0fe58c..92edce9eff77 100644 --- a/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/barrier/device/host/barrier_full_worker_grid.cpp @@ -66,16 +66,16 @@ static std::tuple, std::array, std::array< erisc_semaphore_address, start_semaphore_address, erisc_buffer_address, - static_cast(device->physical_core_from_logical_core(sem_init_core, CoreType::WORKER).x), - static_cast(device->physical_core_from_logical_core(sem_init_core, CoreType::WORKER).y), + static_cast(device->virtual_core_from_logical_core(sem_init_core, CoreType::WORKER).x), + static_cast(device->virtual_core_from_logical_core(sem_init_core, CoreType::WORKER).y), worker_sem0}; const std::array sender_rt_args = { static_cast(is_starting_core ? 
1 : 0), // is_ring_start eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE, // handshake_addr erisc_buffer_address, erisc_semaphore_address, - static_cast(device->physical_core_from_logical_core(sem_init_core, CoreType::WORKER).x), - static_cast(device->physical_core_from_logical_core(sem_init_core, CoreType::WORKER).y), + static_cast(device->virtual_core_from_logical_core(sem_init_core, CoreType::WORKER).x), + static_cast(device->virtual_core_from_logical_core(sem_init_core, CoreType::WORKER).y), worker_sem1}; // sample size const std::array sem_id_args = { worker_sem0, diff --git a/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp b/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp index c2b142e9b1c9..ed5ba80cb770 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/types/ccl_types_args_emitters.cpp @@ -95,11 +95,11 @@ static std::pair, std::vector> shard_noc_cores_f std::vector logical_to_noc_row_map; std::vector logical_to_noc_col_map; for (uint32_t y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { - CoreCoord noc_core = d->physical_core_from_logical_core(CoreCoord(0, y), CoreType::WORKER); + CoreCoord noc_core = d->virtual_core_from_logical_core(CoreCoord(0, y), CoreType::WORKER); logical_to_noc_row_map.push_back(noc_core.y); } for (uint32_t x = core_range.start_coord.x; x <= core_range.end_coord.x; x++) { - CoreCoord noc_core = d->physical_core_from_logical_core(CoreCoord(x, 0), CoreType::WORKER); + CoreCoord noc_core = d->virtual_core_from_logical_core(CoreCoord(x, 0), CoreType::WORKER); logical_to_noc_col_map.push_back(noc_core.x); } diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index 7c0544a8c692..83a9c06ab0ad 100644 --- 
a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -720,9 +720,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl( // act uint32_t act_dram_addr = src0_dram_buffer->address(); - auto act_dram_noc_xy = src0_dram_buffer->noc_coordinates(); - uint32_t act_noc_x = act_dram_noc_xy.x; - uint32_t act_noc_y = act_dram_noc_xy.y; assert(act_matrix_width_ntiles % act_block_w_ntiles == 0); assert(act_block_h_ntiles % out_subblock_h_ntiles == 0); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp index 41f00c99ff08..9e6b7d9130e1 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_width_sharded_program_factory.cpp @@ -407,9 +407,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl( // act uint32_t act_dram_addr = src0_dram_buffer->address(); - auto act_dram_noc_xy = src0_dram_buffer->noc_coordinates(); - uint32_t act_noc_x = act_dram_noc_xy.x; - uint32_t act_noc_y = act_dram_noc_xy.y; TT_FATAL( act_block_h_ntiles % out_subblock_h_ntiles == 0, diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp index 174f71e22b77..3b4c6c306166 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp @@ -7,6 +7,7 @@ void kernel_main() { constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + constexpr bool read_from_dram = 
get_compile_time_arg_val(1); const uint32_t total_num_sticks = get_arg_val(0); const uint32_t local_stride_bytes = get_arg_val(1); @@ -25,10 +26,9 @@ void kernel_main() { uint32_t write_offset = args[args_idx++]; uint32_t l1_write_addr = base_write_addr + write_offset; - uint32_t x_coord = args[args_idx++]; - uint32_t y_coord = args[args_idx++]; + uint32_t bank_id = args[args_idx++]; uint32_t read_offset = base_read_addr + args[args_idx++]; - uint64_t noc_read_addr = get_noc_addr(x_coord, y_coord, read_offset); + uint64_t noc_read_addr = get_noc_addr_from_bank_id(bank_id, read_offset); for (uint32_t j = 0; j < total_num_sticks; ++j) { noc_async_read(noc_read_addr, l1_write_addr, read_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp index a52753903a59..ee573927dba8 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp @@ -7,6 +7,7 @@ void kernel_main() { constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + constexpr bool write_to_dram = get_compile_time_arg_val(1); const uint32_t total_num_sticks = get_arg_val(0); const uint32_t local_stride_bytes = get_arg_val(1); @@ -25,10 +26,9 @@ void kernel_main() { uint32_t read_offset = args[args_idx++]; uint32_t l1_read_addr = base_l1_read_addr + read_offset; - uint32_t x_coord = args[args_idx++]; - uint32_t y_coord = args[args_idx++]; + uint32_t bank_id = args[args_idx++]; uint32_t write_offset = base_write_addr + args[args_idx++]; - uint64_t noc_write_addr = get_noc_addr(x_coord, y_coord, write_offset); + uint64_t noc_write_addr = get_noc_addr_from_bank_id(bank_id, write_offset); for (uint32_t j = 0; j < total_num_sticks; ++j) { noc_async_write(l1_read_addr, 
noc_write_addr, write_size); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp index 64871ad90e85..835773e8a0a3 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp @@ -6,7 +6,9 @@ #include "dataflow_api.h" void kernel_main() { - constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + + constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + constexpr bool read_from_dram = get_compile_time_arg_val(1); uint32_t src_addr = get_arg_val(0); uint32_t write_offset = get_arg_val(1); @@ -16,12 +18,10 @@ void kernel_main() { uint32_t l1_write_addr = get_write_ptr(shard_cb_id) + write_offset; for (uint32_t i = 0; i < num_reads; ++i) { - uint32_t x_coord = args[args_idx++]; - uint32_t y_coord = args[args_idx++]; + uint32_t bank_id = args[args_idx++]; uint32_t addr = src_addr + args[args_idx++]; - uint64_t src_noc_addr = get_noc_addr(x_coord, y_coord, addr); uint32_t read_size = args[args_idx++]; - noc_async_read(src_noc_addr, l1_write_addr, read_size); + noc_async_read(get_noc_addr_from_bank_id(bank_id, addr), l1_write_addr, read_size); l1_write_addr += read_size; } noc_async_read_barrier(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp index 396173cac789..4c1f4f9d59f7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp @@ -6,7 +6,9 @@ #include "dataflow_api.h" void kernel_main() { - constexpr 
uint32_t shard_cb_id = get_compile_time_arg_val(0); + + constexpr uint32_t shard_cb_id = get_compile_time_arg_val(0); + constexpr bool write_to_dram = get_compile_time_arg_val(1); uint32_t dst_addr = get_arg_val(0); uint32_t read_offset = get_arg_val(1); @@ -16,12 +18,10 @@ void kernel_main() { uint32_t l1_read_addr = get_read_ptr(shard_cb_id) + read_offset; for (uint32_t i = 0; i < num_writes; ++i) { - uint32_t x_coord = args[args_idx++]; - uint32_t y_coord = args[args_idx++]; + uint32_t bank_id = args[args_idx++]; uint32_t addr = dst_addr + args[args_idx++]; - uint64_t dst_noc_addr = get_noc_addr(x_coord, y_coord, addr); uint32_t write_size = args[args_idx++]; - noc_async_write(l1_read_addr, dst_noc_addr, write_size); + noc_async_write(l1_read_addr, get_noc_addr_from_bank_id(bank_id, addr), write_size); l1_read_addr += write_size; } noc_async_write_barrier(); diff --git a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp index 713e8aebb4e6..8ba24ad224da 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/sharded/reshard/device/reshard_program_factory.cpp @@ -341,11 +341,18 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu ? 
"ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_reader.cpp" : "ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_width_writer.cpp"; - tt::tt_metal::KernelHandle kernel_id_0 = - tt::tt_metal::CreateKernel(program, kernel_name, all_cores, tt::tt_metal::ReaderDataMovementConfig({cb_index})); + bool interface_with_dram = (remote_core_type == CoreType::DRAM); + tt::tt_metal::KernelHandle kernel_id_0 = tt::tt_metal::CreateKernel( + program, + kernel_name, + all_cores, + tt::tt_metal::ReaderDataMovementConfig({cb_index, interface_with_dram})); - tt::tt_metal::KernelHandle kernel_id_1 = - tt::tt_metal::CreateKernel(program, kernel_name, all_cores, tt::tt_metal::WriterDataMovementConfig({cb_index})); + tt::tt_metal::KernelHandle kernel_id_1 = tt::tt_metal::CreateKernel( + program, + kernel_name, + all_cores, + tt::tt_metal::WriterDataMovementConfig({cb_index, interface_with_dram})); tt::tt_metal::CircularBufferConfig cb_config = tt::tt_metal::CircularBufferConfig(total_size, {{cb_index, data_format}}) @@ -359,7 +366,6 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu auto remote_buffer_type = remote_tensor.buffer()->buffer_type(); auto bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[remote_core_idx])[0]; uint32_t bank_offset = device->bank_offset(remote_buffer_type, bank_id); - auto remote_core = device->physical_core_from_logical_core(remote_cores[remote_core_idx], remote_core_type); std::array kernels = {kernel_id_0, kernel_id_1}; uint32_t local_units_left = num_units; @@ -382,17 +388,13 @@ operation::ProgramWithCallbacks reshard_multi_core_same_width(const Tensor& inpu bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[remote_core_idx])[0]; bank_offset = device->bank_offset(remote_buffer_type, bank_id); - remote_core = - device->physical_core_from_logical_core(remote_cores[remote_core_idx], 
remote_core_type); } uint32_t units_to_transfer = std::min(remote_core_units_rem, local_units_to_transfer); - auto remote_core = - device->physical_core_from_logical_core(remote_cores[remote_core_idx], remote_core_type); + bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[remote_core_idx])[0]; kernel_args.insert( kernel_args.end(), - {static_cast(remote_core.x), - static_cast(remote_core.y), - (remote_units_per_shard - remote_core_units_rem) * unit_size + bank_offset, + {bank_id, + (remote_units_per_shard - remote_core_units_rem) * unit_size, units_to_transfer * unit_size}); local_units_per_core -= units_to_transfer; local_units_to_transfer -= units_to_transfer; @@ -481,18 +483,17 @@ operation::ProgramWithCallbacks reshard_multi_core_generic(const Tensor& input, std::vector physical_core_coords; physical_core_coords.reserve(grid.x * grid.y); for (uint32_t i = 0; i < grid.x; i++) { - auto physical_input_core = device->physical_core_from_logical_core(CoreCoord(i, 0), input_core_type); + auto physical_input_core = device->virtual_core_from_logical_core(CoreCoord(i, 0), input_core_type); physical_core_coords.push_back(physical_input_core.x); } for (uint32_t i = 0; i < grid.y; i++) { - auto physical_input_core = device->physical_core_from_logical_core(CoreCoord(0, i), input_core_type); + auto physical_input_core = device->virtual_core_from_logical_core(CoreCoord(0, i), input_core_type); physical_core_coords.push_back(physical_input_core.y); } for (const auto& core : cores) { auto page_stride_vector = output_core_to_page_range_pair.at(core); uint32_t num_ranges = page_stride_vector.size(); - std::vector runtime_args = physical_core_coords; auto runtime_args_0 = get_runtime_args_for_given_ranges( physical_core_coords, page_stride_vector, @@ -540,8 +541,7 @@ operation::ProgramWithCallbacks reshard_multi_core_generic(const Tensor& input, struct WidthShardedRuntimeArgs { uint32_t write_size; uint32_t read_offset; - uint32_t x_coord; - uint32_t 
y_coord; + uint32_t bank_id; uint32_t write_offset; }; @@ -591,14 +591,10 @@ compute_width_sharded_reshard_runtime_args( auto bank_id = device->bank_ids_from_logical_core(remote_buffer_type, remote_cores[current_remote_core_idx])[0]; auto bank_offset = device->bank_offset(remote_buffer_type, bank_id); - const auto& remote_core = - device->physical_core_from_logical_core(remote_cores[current_remote_core_idx], remote_core_type); - core_args.emplace_back( element_size * transfer_size, element_size * local_shard_offset, - remote_core.x, - remote_core.y, + bank_id, element_size * remote_shard_offset + bank_offset); local_shard_offset += transfer_size; @@ -640,7 +636,7 @@ operation::ProgramWithCallbacks reshard_multi_core_same_height(const Tensor& inp const auto local_core_type = local_tensor.buffer()->core_type(); const auto remote_core_type = remote_tensor.buffer()->core_type(); - + bool interface_with_dram = (remote_core_type == CoreType::DRAM); const auto local_cores = corerange_to_cores( local_shard_spec.grid, std::nullopt, local_shard_spec.orientation == ShardOrientation::ROW_MAJOR); const auto remote_cores = corerange_to_cores( @@ -667,11 +663,11 @@ operation::ProgramWithCallbacks reshard_multi_core_same_height(const Tensor& inp ? 
"ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_reader.cpp" : "ttnn/cpp/ttnn/operations/data_movement/sharded/device/kernels/dataflow/reshard_same_height_writer.cpp"; - tt::tt_metal::KernelHandle kernel_id_0 = - tt::tt_metal::CreateKernel(program, kernel_name, all_cores, tt::tt_metal::ReaderDataMovementConfig({cb_index})); + tt::tt_metal::KernelHandle kernel_id_0 = tt::tt_metal::CreateKernel( + program, kernel_name, all_cores, tt::tt_metal::ReaderDataMovementConfig({cb_index, interface_with_dram})); - tt::tt_metal::KernelHandle kernel_id_1 = - tt::tt_metal::CreateKernel(program, kernel_name, all_cores, tt::tt_metal::WriterDataMovementConfig({cb_index})); + tt::tt_metal::KernelHandle kernel_id_1 = tt::tt_metal::CreateKernel( + program, kernel_name, all_cores, tt::tt_metal::WriterDataMovementConfig({cb_index, interface_with_dram})); uint32_t remote_address = remote_tensor.buffer()->address(); auto remote_buffer_type = remote_tensor.buffer()->buffer_type(); @@ -709,7 +705,7 @@ operation::ProgramWithCallbacks reshard_multi_core_same_height(const Tensor& inp args_for_all_segments.size()}; for (const auto& args : args_for_all_segments) { const std::vector segment_kernel_0 = { - args.write_size, args.read_offset, args.x_coord, args.y_coord, args.write_offset}; + args.write_size, args.read_offset, args.bank_id, args.write_offset}; runtime_args_0.insert(runtime_args_0.end(), segment_kernel_0.begin(), segment_kernel_0.end()); // Adjust read and write offsets to the correct stick address because we are splitting work across 2 kernels @@ -717,7 +713,7 @@ operation::ProgramWithCallbacks reshard_multi_core_same_height(const Tensor& inp const uint32_t adjusted_write_offset = args.write_offset + total_num_sticks_kernel_0 * remote_stride_bytes; const std::vector segment_kernel_1 = { - args.write_size, adjusted_read_offset, args.x_coord, args.y_coord, adjusted_write_offset}; + args.write_size, adjusted_read_offset, args.bank_id, 
adjusted_write_offset}; runtime_args_1.insert(runtime_args_1.end(), segment_kernel_1.begin(), segment_kernel_1.end()); } SetRuntimeArgs(program, kernel_id_0, local_cores[core_idx], runtime_args_0); diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp index db90fe852634..f4bea017571c 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op_multi_core_reuse_mcast_dram_sharded_program_factory.cpp @@ -22,381 +22,6 @@ namespace reuse_dram_sharded_optimized_helpers { using ttnn::operations::unary::UnaryOpType; using ttnn::operations::unary::UnaryWithParam; -void get_dram_reader_core_coords_grayskull( - tt::tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for grayskull - uint32_t full_grid_size_y = 12; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - uint32_t max_worker_y_physical = 0; - uint32_t min_worker_y_physical = 10000; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = 
device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - if (core_phy.y > max_worker_y_physical) { - max_worker_y_physical = core_phy.y; - } - if (core_phy.y < min_worker_y_physical) { - min_worker_y_physical = core_phy.y; - } - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x; - uint32_t adj_core_y = dram_core.y + 1; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested rows - for (auto& coord : adj_core_physical) { - auto y = coord.y; - - // if row is harvested, move core down by 1 - while (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() and - y < (full_grid_size_y - 1)) { - y += 1; - } - - coord.y = y; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - -void get_dram_reader_core_coords_wormhole_b0( - tt::tt_metal::Device* device, 
CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for wh_b0 - uint32_t full_grid_size_y = 12; - uint32_t x_step = 3; - - // get all the logical coord - auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - dram_coord_phy.reserve(num_banks); - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - all_worker_cores_logical.reserve(num_cores_x * num_cores_y); - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get y coords of the workers - std::vector all_worker_cores_y_physical; - all_worker_cores_y_physical.reserve(num_cores_y); - uint32_t max_worker_y_physical = 0; - uint32_t min_worker_y_physical = 10000; - for (int i = 0; i < num_cores_y; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(0, i)); - all_worker_cores_y_physical.push_back(core_phy.y); - if (core_phy.y > max_worker_y_physical) { - max_worker_y_physical = core_phy.y; - } - if (core_phy.y < min_worker_y_physical) { - min_worker_y_physical = core_phy.y; - } - } - - // get the harvested rows, we treat dram and eth cores as harvested as well - std::vector harvested_rows; - for (int i = 0; i < full_grid_size_y; ++i) { - auto y = i; - - if (std::find(all_worker_cores_y_physical.begin(), all_worker_cores_y_physical.end(), y) == - all_worker_cores_y_physical.end()) { - harvested_rows.push_back(y); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - adj_core_physical.reserve(num_banks); - for (int i = 0; i < 
num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // split the adjacent coords into two groups, because DRAM banks has two cols - std::vector adj_core_physical_g1; - adj_core_physical_g1.reserve(num_banks); - std::vector adj_core_physical_y_g1; - adj_core_physical_y_g1.reserve(num_banks); - std::vector adj_core_physical_g2; - adj_core_physical_g2.reserve(num_banks); - std::vector adj_core_physical_y_g2; - adj_core_physical_y_g2.reserve(num_banks); - for (auto core : adj_core_physical) { - if (core.x == adj_core_physical.front().x) { - adj_core_physical_g1.push_back(core); - } else { - adj_core_physical_g2.push_back(core); - } - } - std::vector indices_g1(adj_core_physical_g1.size()); - std::vector indices_g2(adj_core_physical_g2.size()); - std::iota(indices_g1.begin(), indices_g1.end(), 0); - std::iota(indices_g2.begin(), indices_g2.end(), 0); - std::sort(indices_g1.begin(), indices_g1.end(), [&adj_core_physical_g1](int i1, int i2) { - return adj_core_physical_g1[i1].y < adj_core_physical_g1[i2].y; - }); - std::sort(indices_g2.begin(), indices_g2.end(), [&adj_core_physical_g2](int i1, int i2) { - return adj_core_physical_g2[i1].y < adj_core_physical_g2[i2].y; - }); - std::rotate(indices_g1.begin(), indices_g1.end() - 1, indices_g1.end()); - std::rotate(indices_g2.begin(), indices_g2.end() - 1, indices_g2.end()); - - std::vector indices_g1_realloc(adj_core_physical_g1.size()); - std::vector indices_g2_realloc(adj_core_physical_g2.size()); - for (int new_index = 0; new_index < indices_g1.size(); ++new_index) { - indices_g1_realloc[indices_g1[new_index]] = new_index; - } - for (int new_index = 0; new_index < indices_g2.size(); ++new_index) { - indices_g2_realloc[indices_g2[new_index]] = new_index; - } - - std::sort(adj_core_physical_g1.begin(), adj_core_physical_g1.end(), [](const CoreCoord& a, const CoreCoord& 
b) { - return a.y < b.y; - }); - std::sort(adj_core_physical_g2.begin(), adj_core_physical_g2.end(), [](const CoreCoord& a, const CoreCoord& b) { - return a.y < b.y; - }); - std::rotate(adj_core_physical_g1.begin(), adj_core_physical_g1.end() - 1, adj_core_physical_g1.end()); - std::rotate(adj_core_physical_g2.begin(), adj_core_physical_g2.end() - 1, adj_core_physical_g2.end()); - - for (auto core : adj_core_physical_g1) { - adj_core_physical_y_g1.push_back(core.y); - } - for (auto core : adj_core_physical_g2) { - adj_core_physical_y_g2.push_back(core.y); - } - - // move the workers, if they are on harvested rows - auto process_group = [&](std::vector& group, std::vector& group_y, uint32_t x_step) { - for (auto& coord : group) { - auto y = coord.y; - - if (std::find(harvested_rows.begin(), harvested_rows.end(), y) != harvested_rows.end() || - std::count(group_y.begin(), group_y.end(), y) >= 2) { - auto adjust_coord = [&](int start, int end, int step) { - bool found_new_row = false; - for (int j = start; step > 0 ? j <= end : j >= end; j += step) { - if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end() && - std::count(group_y.begin(), group_y.end(), j) == 0) { - coord.y = j; - coord.x += x_step; - x_step--; - found_new_row = true; - break; - } - } - if (not found_new_row) { - for (int j = start; step > 0 ? 
j <= end : j >= end; j += step) { - if (std::find(harvested_rows.begin(), harvested_rows.end(), j) == harvested_rows.end()) { - coord.y = j; - coord.x += x_step; - x_step--; - found_new_row = true; - break; - } - } - } - }; - - if (y >= max_bank_id) { - adjust_coord(max_worker_y_physical, min_worker_y_physical, -1); - } else { - adjust_coord(min_worker_y_physical, max_worker_y_physical, 1); - } - } - } - }; - // move the workers, if they are on harvested rows - process_group(adj_core_physical_g1, adj_core_physical_y_g1, x_step); - process_group(adj_core_physical_g2, adj_core_physical_y_g2, x_step); - - // merge two group into one - std::vector adj_core_physical_realloc; - adj_core_physical_realloc.reserve(num_banks); - for (int i = 0; i < indices_g1_realloc.size(); ++i) { - adj_core_physical_realloc.push_back(adj_core_physical_g1[indices_g1_realloc[i]]); - } - for (int i = 0; i < indices_g2_realloc.size(); ++i) { - adj_core_physical_realloc.push_back(adj_core_physical_g2[indices_g2_realloc[i]]); - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - adj_core_logical_realloc.reserve(num_banks); - for (int i = 0; i < adj_core_physical_realloc.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical_realloc[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - -void get_dram_reader_core_coords_blackhole( - tt_metal::Device* device, CoreRangeSet& all_cores, std::vector& all_cores_ordered) { - // hardcoded for blackhole - uint32_t full_grid_size_x = 17; - - // get all the logical coord - auto compute_with_storage_grid_size 
= device->compute_with_storage_grid_size(); - uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - - // get dram banks and coords - uint32_t num_banks = device->num_dram_channels(); - uint32_t max_bank_id = num_banks - 1; - std::vector dram_coord_phy; - for (int i = 0; i < num_banks; ++i) { - dram_coord_phy.push_back(device->dram_core_from_dram_channel(i)); - } - - // get worker logical coords - std::vector all_worker_cores_logical; - for (int i = 0; i < num_cores_x; ++i) { - for (int j = 0; j < num_cores_y; ++j) { - all_worker_cores_logical.push_back(CoreCoord(i, j)); - } - } - - // get x coords of the workers - std::vector all_worker_cores_x_physical; - for (int i = 0; i < num_cores_x; ++i) { - auto core_phy = device->worker_core_from_logical_core(CoreCoord(i, 0)); - all_worker_cores_x_physical.push_back(core_phy.x); - } - - // get the harvested cols, we treat dram and eth cores as harvested as well - std::vector harvested_cols; - for (int i = 0; i < full_grid_size_x; ++i) { - auto x = i; - - if (std::find(all_worker_cores_x_physical.begin(), all_worker_cores_x_physical.end(), x) == - all_worker_cores_x_physical.end()) { - harvested_cols.push_back(x); - } - } - - // get the ajacent cores of DRAM banks - std::vector adj_core_physical; - for (int i = 0; i < num_banks; ++i) { - auto dram_core = dram_coord_phy[i]; - uint32_t adj_core_x = dram_core.x + 1; - uint32_t adj_core_y = dram_core.y; - adj_core_physical.push_back(CoreCoord(adj_core_x, adj_core_y)); - } - - // move worker if they are in the harvested cols - for (auto& coord : adj_core_physical) { - auto x = coord.x; - - // if col is harvested, move core right by 1 - while (std::find(harvested_cols.begin(), harvested_cols.end(), x) != harvested_cols.end() and - x < (full_grid_size_x - 1)) { - x += 1; - } - - coord.x = x; - } - - // find the logical coord from physical coord - std::vector adj_core_logical_realloc; - for (int i = 0; i < 
adj_core_physical.size(); ++i) { - for (int j = 0; j < all_worker_cores_logical.size(); ++j) { - auto core = device->worker_core_from_logical_core(all_worker_cores_logical[j]); - if (adj_core_physical[i] == core) { - adj_core_logical_realloc.push_back(all_worker_cores_logical[j]); - } - } - } - - // create sets - std::set all_cores_set; - for (int i = 0; i < num_banks; ++i) { - all_cores_set.insert(CoreRange(adj_core_logical_realloc[i])); - } - all_cores = CoreRangeSet(all_cores_set); - all_cores_ordered = adj_core_logical_realloc; -} - void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t tile_size, uint32_t& page_size, uint32_t& num_pages) { uint64_t total_size = static_cast(num_tiles) * tile_size; @@ -419,6 +44,15 @@ void move_common_entries(std::vector& v1, std::vector& v2, } } +void get_optimal_dram_bank_to_reader_assignment(Device* device, std::vector& all_worker_cores_ordered, CoreRangeSet& all_worker_cores) { + all_worker_cores_ordered = device->get_optimal_dram_bank_to_logical_worker_assignment(); + std::set all_cores_set; + for (const auto& worker_core : all_worker_cores_ordered) { + all_cores_set.insert(CoreRange(worker_core)); + } + all_worker_cores = CoreRangeSet(all_cores_set); +} + operation::ProgramWithCallbacks create_program_dram_sharded( tt::tt_metal::Device* device, const CoreRangeSet& all_storage_cores, @@ -463,18 +97,9 @@ operation::ProgramWithCallbacks create_program_dram_sharded( tt_metal::Program program{}; // get the dram readers - CoreRangeSet all_worker_cores; std::vector all_worker_cores_ordered; - - if (device->arch() == tt::ARCH::WORMHOLE_B0) { - get_dram_reader_core_coords_wormhole_b0(device, all_worker_cores, all_worker_cores_ordered); - } else if (device->arch() == tt::ARCH::GRAYSKULL) { - get_dram_reader_core_coords_grayskull(device, all_worker_cores, all_worker_cores_ordered); - } else if (device->arch() == tt::ARCH::BLACKHOLE) { - get_dram_reader_core_coords_blackhole(device, all_worker_cores, 
all_worker_cores_ordered); - } else { - TT_THROW("Device not supported"); - } + CoreRangeSet all_worker_cores; + get_optimal_dram_bank_to_reader_assignment(device, all_worker_cores_ordered, all_worker_cores); // dram banks uint32_t num_dram_banks = all_worker_cores_ordered.size();