diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/pgm_dispatch_perf.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/pgm_dispatch_perf.cpp index ea519aaf097..a74842b0076 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/pgm_dispatch_perf.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/pgm_dispatch_perf.cpp @@ -3,8 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 // NULL kernel is not 0, subtract off overhead -#if KERNEL_BYTES > 30 -uint8_t data1[KERNEL_BYTES-30] __attribute__ ((section ("l1_data"))) __attribute__((used)); +#if KERNEL_BYTES > 16 +constexpr uint32_t empty_kernel_bytes = 16; +uint8_t data1[KERNEL_BYTES - empty_kernel_bytes] __attribute__ ((section ("l1_data_test_only"))) __attribute__((used)); #endif #ifdef KERNEL_GLOBAL diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp index 1f924adc26b..0839d8605e3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp @@ -64,7 +64,6 @@ void kernel_main() { zero_l1_buf(reinterpret_cast(queue_start_addr_words*PACKET_WORD_SIZE_BYTES), queue_size_words); - noc_init(); for (uint32_t i = 0; i < num_src_endpoints; i++) { src_rnd_state[i].init(prng_seed, src_endpoint_start_id+i); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp index a698fba95cd..4496deb84e0 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_tx.cpp @@ -121,7 +121,6 @@ void kernel_main() { test_results[PQ_TEST_MISC_INDEX] = 0xff000000; test_results[PQ_TEST_MISC_INDEX + 1] = 0xcc000000 | 
src_endpoint_id; - noc_init(); zero_l1_buf( reinterpret_cast(queue_start_addr_words * PACKET_WORD_SIZE_BYTES), queue_size_words); diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 686f653e531..03e0aa8c9ac 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -59,7 +59,6 @@ uint32_t noc_nonposted_writes_num_issued[NUM_NOCS] __attribute__((used)); uint32_t noc_nonposted_writes_acked[NUM_NOCS] __attribute__((used)); uint32_t noc_nonposted_atomics_acked[NUM_NOCS] __attribute__((used)); uint32_t noc_posted_writes_num_issued[NUM_NOCS] __attribute__((used)); -uint32_t atomic_ret_val __attribute__((section("l1_data"))) __attribute__((used)); CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used)); @@ -342,7 +341,7 @@ int main() { noc_index = 0; risc_init(); device_setup(); - noc_init(); + noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR); // Set ncrisc's resume address to 0 so we know when ncrisc has overwritten it mailboxes->ncrisc_halt.resume_addr = 0; diff --git a/tt_metal/hw/firmware/src/erisc.cc b/tt_metal/hw/firmware/src/erisc.cc index a5fa7917503..a092726c688 100644 --- a/tt_metal/hw/firmware/src/erisc.cc +++ b/tt_metal/hw/firmware/src/erisc.cc @@ -32,7 +32,6 @@ uint32_t noc_nonposted_writes_num_issued[NUM_NOCS] __attribute__((used)); uint32_t noc_nonposted_writes_acked[NUM_NOCS] __attribute__((used)); uint32_t noc_nonposted_atomics_acked[NUM_NOCS] __attribute__((used)); uint32_t noc_posted_writes_num_issued[NUM_NOCS] __attribute__((used)); -uint32_t atomic_ret_val __attribute__ ((section ("l1_data"))) __attribute__((used)); uint32_t tt_l1_ptr *rta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *crta_l1_base __attribute__((used)); @@ -47,7 +46,7 @@ void __attribute__((section("erisc_l1_code.1"), noinline)) Application(void) { wzerorange(__ldm_bss_start, __ldm_bss_end); risc_init(); - noc_init(); + noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR); wzerorange(__ldm_bss_start, __ldm_bss_end); for 
(uint32_t n = 0; n < NUM_NOCS; n++) { diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index 0ab226d94d1..518b33f544c 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -35,7 +35,6 @@ uint32_t noc_nonposted_writes_num_issued[NUM_NOCS] __attribute__((used)); uint32_t noc_nonposted_writes_acked[NUM_NOCS] __attribute__((used)); uint32_t noc_nonposted_atomics_acked[NUM_NOCS] __attribute__((used)); uint32_t noc_posted_writes_num_issued[NUM_NOCS] __attribute__((used)); -uint32_t atomic_ret_val __attribute__ ((section ("l1_data"))) __attribute__((used)); uint32_t tt_l1_ptr *rta_l1_base __attribute__((used)); uint32_t tt_l1_ptr *crta_l1_base __attribute__((used)); @@ -102,7 +101,7 @@ int main() { risc_init(); //device_setup(); - noc_init(); + noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR); mailboxes->go_message.signal = RUN_MSG_DONE; mailboxes->launch_msg_rd_ptr = 0; // Initialize the rdptr to 0 diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h index 2c54c89bb80..11dc10950a9 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -15,12 +15,16 @@ // Before adding a define here, read the following: // 1) Any "truly global" address must be specified explicitly here. Truly // global addresses are addresses that are referenced on both the host and -// device +// device or between processors // 2) Memory section sizes must be specified here, these are used in the // linker scripts -// 3) Device static/global variables generally should NOT be listed here. If -// they are global to a core, declare them in the that core's source code and -// tag them if needed with a section (e.g., "l1_data") +// 3) static/global variables generally should NOT be listed here. 
If +// they are global to a processor, declare them in that processor's source +// code, they will get placed in local memory +// 4) L1 data sections are no longer supported as addressing them with XIP +// binaries requires runtime address patching. Instead of using named +// variables in the L1 data section use a mailbox (or address in the mailbox +// range and initialize explicitly) // ///////////// @@ -39,7 +43,6 @@ ///////////// // Firmware/kernel code holes -#define MEM_BOOT_CODE_SIZE 4 #define MEM_BRISC_FIRMWARE_SIZE (10 * 1024 + MEM_BRISC_LOCAL_SIZE) #define MEM_NCRISC_FIRMWARE_SIZE (16 * 1024 + MEM_NCRISC_LOCAL_SIZE) #define MEM_TRISC0_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) @@ -48,6 +51,7 @@ #define MEM_ZEROS_SIZE 512 #define MEM_BOOT_CODE_BASE 0 +#define MEM_NOC_ATOMIC_RET_VAL_ADDR 4 #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small diff --git a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h index 06a1b5b11a2..ccb4c7fa167 100644 --- a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h @@ -28,7 +28,6 @@ extern uint32_t noc_nonposted_writes_num_issued[NUM_NOCS]; extern uint32_t noc_nonposted_writes_acked[NUM_NOCS]; extern uint32_t noc_nonposted_atomics_acked[NUM_NOCS]; extern uint32_t noc_posted_writes_num_issued[NUM_NOCS]; -extern uint32_t atomic_ret_val; inline __attribute__((always_inline)) void NOC_CMD_BUF_WRITE_REG( uint32_t noc, uint32_t buf, uint32_t addr, uint32_t val) { @@ -166,7 +165,7 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed( return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]); } -inline __attribute__((always_inline)) void noc_init() { +inline __attribute__((always_inline)) void noc_init(uint32_t atomic_ret_val) { #pragma GCC unroll 0 for (int 
noc = 0; noc < NUM_NOCS; noc++) { uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(noc, 0, NOC_NODE_ID); @@ -179,7 +178,7 @@ inline __attribute__((always_inline)) void noc_init() { NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_REG_CMD_BUF, NOC_TARG_ADDR_MID, 0x0); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_REG_CMD_BUF, NOC_TARG_ADDR_COORDINATE, (uint32_t)(xy_local_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); - uint64_t atomic_ret_addr = NOC_XY_ADDR(my_x, my_y, (uint32_t)(&atomic_ret_val)); + uint64_t atomic_ret_addr = NOC_XY_ADDR(my_x, my_y, atomic_ret_val); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_AT_CMD_BUF, NOC_RET_ADDR_LO, (uint32_t)(atomic_ret_addr & 0xFFFFFFFF)); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_AT_CMD_BUF, NOC_RET_ADDR_MID, 0x0); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_AT_CMD_BUF, NOC_RET_ADDR_COORDINATE, (uint32_t)(atomic_ret_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); diff --git a/tt_metal/hw/inc/grayskull/dev_mem_map.h b/tt_metal/hw/inc/grayskull/dev_mem_map.h index ff5bc488bad..d58e0076682 100644 --- a/tt_metal/hw/inc/grayskull/dev_mem_map.h +++ b/tt_metal/hw/inc/grayskull/dev_mem_map.h @@ -15,12 +15,16 @@ // Before adding a define here, read the following: // 1) Any "truly global" address must be specified explicitly here. Truly // global addresses are addresses that are referenced on both the host and -// device +// device or between processors // 2) Memory section sizes must be specified here, these are used in the // linker scripts -// 3) Device static/global variables generally should NOT be listed here. If -// they are global to a core, declare them in the that core's source code and -// tag them if needed with a section (e.g., "l1_data") +// 3) static/global variables generally should NOT be listed here. If +// they are global to a processor, declare them in that processor's source +// code, they will get placed in local memory +// 4) L1 data sections are no longer supported as addressing them with XIP +// binaries requires runtime address patching. 
Instead of using named +// variables in the L1 data section use a mailbox (or address in the mailbox +// range and initialize explicitly) // ///////////// @@ -42,8 +46,6 @@ ///////////// // Firmware/kernel code holes -#define MEM_BOOT_CODE_SIZE 4 - #define MEM_BRISC_FIRMWARE_SIZE (10 * 1024 + MEM_BRISC_LOCAL_SIZE) #define MEM_NCRISC_FIRMWARE_SIZE (16 * 1024) #define MEM_TRISC0_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) @@ -53,6 +55,7 @@ #define MEM_ZEROS_SIZE 512 #define MEM_BOOT_CODE_BASE 0 +#define MEM_NOC_ATOMIC_RET_VAL_ADDR 4 #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small diff --git a/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h b/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h index 36fed80d2c3..0298243b385 100644 --- a/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/grayskull/noc_nonblocking_api.h @@ -26,7 +26,6 @@ extern uint32_t noc_nonposted_writes_num_issued[NUM_NOCS]; extern uint32_t noc_nonposted_writes_acked[NUM_NOCS]; extern uint32_t noc_nonposted_atomics_acked[NUM_NOCS]; extern uint32_t noc_posted_writes_num_issued[NUM_NOCS]; -extern uint32_t atomic_ret_val; inline __attribute__((always_inline)) void NOC_CMD_BUF_WRITE_REG(uint32_t noc, uint32_t buf, uint32_t addr, uint32_t val) { uint32_t offset = (buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT) + addr; @@ -139,7 +138,7 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed( return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]); } -inline __attribute__((always_inline)) void noc_init() { +inline __attribute__((always_inline)) void noc_init(uint32_t atomic_ret_val) { #pragma GCC unroll 0 for (int noc = 0; noc < NUM_NOCS; noc++) { uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(noc, 0, NOC_NODE_ID); @@ -150,7 +149,7 @@ inline __attribute__((always_inline)) void noc_init() { 
NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_CMD_BUF, NOC_TARG_ADDR_COORDINATE, (uint32_t)(xy_local_addr >> NOC_ADDR_COORD_SHIFT)); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_REG_CMD_BUF, NOC_TARG_ADDR_COORDINATE, (uint32_t)(xy_local_addr >> NOC_ADDR_COORD_SHIFT)); - uint64_t atomic_ret_addr = NOC_XY_ADDR(my_x, my_y, (uint32_t)(&atomic_ret_val)); + uint64_t atomic_ret_addr = NOC_XY_ADDR(my_x, my_y, atomic_ret_val); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_AT_CMD_BUF, NOC_RET_ADDR_LO, (uint32_t)(atomic_ret_addr & 0xFFFFFFFF)); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_AT_CMD_BUF, NOC_RET_ADDR_COORDINATE, (uint32_t)(atomic_ret_addr >> NOC_ADDR_COORD_SHIFT)); diff --git a/tt_metal/hw/inc/wormhole/dev_mem_map.h b/tt_metal/hw/inc/wormhole/dev_mem_map.h index 001e7ea1736..48388754659 100644 --- a/tt_metal/hw/inc/wormhole/dev_mem_map.h +++ b/tt_metal/hw/inc/wormhole/dev_mem_map.h @@ -15,12 +15,16 @@ // Before adding a define here, read the following: // 1) Any "truly global" address must be specified explicitly here. Truly // global addresses are addresses that are referenced on both the host and -// device +// device or between processors // 2) Memory section sizes must be specified here, these are used in the // linker scripts -// 3) Device static/global variables generally should NOT be listed here. If -// they are global to a core, declare them in the that core's source code and -// tag them if needed with a section (e.g., "l1_data") +// 3) static/global variables generally should NOT be listed here. If +// they are global to a processor, declare them in that processor's source +// code, they will get placed in local memory +// 4) L1 data sections are no longer supported as addressing them with XIP +// binaries requires runtime address patching. 
Instead of using named +// variables in the L1 data section use a mailbox (or address in the mailbox +// range and initialize explicitly) // ///////////// @@ -43,7 +47,6 @@ ///////////// // Firmware/kernel code holes -#define MEM_BOOT_CODE_SIZE 4 #define MEM_BRISC_FIRMWARE_SIZE (10 * 1024 + MEM_BRISC_LOCAL_SIZE) #define MEM_NCRISC_FIRMWARE_SIZE (16 * 1024) #define MEM_TRISC0_FIRMWARE_SIZE (16 * 1024 + MEM_TRISC_LOCAL_SIZE) @@ -53,6 +56,7 @@ #define MEM_ZEROS_SIZE 512 #define MEM_BOOT_CODE_BASE 0 +#define MEM_NOC_ATOMIC_RET_VAL_ADDR 4 #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 // Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small diff --git a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h index 1d778dc94f1..40d4a6ec39f 100644 --- a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h @@ -26,7 +26,6 @@ extern uint32_t noc_nonposted_writes_num_issued[NUM_NOCS]; extern uint32_t noc_nonposted_writes_acked[NUM_NOCS]; extern uint32_t noc_nonposted_atomics_acked[NUM_NOCS]; extern uint32_t noc_posted_writes_num_issued[NUM_NOCS]; -extern uint32_t atomic_ret_val; inline __attribute__((always_inline)) void NOC_CMD_BUF_WRITE_REG(uint32_t noc, uint32_t buf, uint32_t addr, uint32_t val) { uint32_t offset = (buf << NOC_CMD_BUF_OFFSET_BIT) + (noc << NOC_INSTANCE_OFFSET_BIT) + addr; @@ -149,7 +148,7 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed( return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]); } -inline __attribute__((always_inline)) void noc_init() { +inline __attribute__((always_inline)) void noc_init(uint32_t atomic_ret_val) { #pragma GCC unroll 0 for (int noc = 0; noc < NUM_NOCS; noc++) { uint32_t noc_id_reg = NOC_CMD_BUF_READ_REG(noc, 0, NOC_NODE_ID); @@ -160,7 +159,7 @@ inline __attribute__((always_inline)) void noc_init() { 
NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_CMD_BUF, NOC_TARG_ADDR_COORDINATE, (uint32_t)(xy_local_addr >> NOC_ADDR_COORD_SHIFT)); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_WR_REG_CMD_BUF, NOC_TARG_ADDR_COORDINATE, (uint32_t)(xy_local_addr >> NOC_ADDR_COORD_SHIFT)); - uint64_t atomic_ret_addr = NOC_XY_ADDR(my_x, my_y, (uint32_t)(&atomic_ret_val)); + uint64_t atomic_ret_addr = NOC_XY_ADDR(my_x, my_y, atomic_ret_val); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_AT_CMD_BUF, NOC_RET_ADDR_LO, (uint32_t)(atomic_ret_addr & 0xFFFFFFFF)); NOC_CMD_BUF_WRITE_REG(noc, NCRISC_AT_CMD_BUF, NOC_RET_ADDR_COORDINATE, (uint32_t)(atomic_ret_addr >> NOC_ADDR_COORD_SHIFT)); diff --git a/tt_metal/hw/toolchain/sections.ld b/tt_metal/hw/toolchain/sections.ld index 3033c6fedff..efcbe4efcf5 100644 --- a/tt_metal/hw/toolchain/sections.ld +++ b/tt_metal/hw/toolchain/sections.ld @@ -68,11 +68,7 @@ SECTIONS l1_data : { - *(l1_data) - } > REGION_CODE - l1_data_noinit (NOLOAD) : - { - *(l1_data_noinit) + *(l1_data_test_only) } > REGION_CODE . = ALIGN(. 
+ MEM_PAD, MEM_ALIGN); diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp index e57398eb5dd..3ba5a9454fd 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp @@ -66,7 +66,7 @@ void dispatch_s_wr_reg_cmd_buf_init() { FORCE_INLINE void dispatch_s_atomic_cmd_buf_init() { - uint64_t atomic_ret_addr = get_noc_addr_helper(my_noc_xy, (uint32_t)(&atomic_ret_val)); + uint64_t atomic_ret_addr = get_noc_addr_helper(my_noc_xy, MEM_NOC_ATOMIC_RET_VAL_ADDR); NOC_CMD_BUF_WRITE_REG(my_noc_index, DISPATCH_S_ATOMIC_CMD_BUF, NOC_RET_ADDR_LO, (uint32_t)(atomic_ret_addr & 0xFFFFFFFF)); NOC_CMD_BUF_WRITE_REG(my_noc_index, DISPATCH_S_ATOMIC_CMD_BUF, NOC_RET_ADDR_COORDINATE, (uint32_t)(atomic_ret_addr >> NOC_ADDR_COORD_SHIFT)); } diff --git a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp index 6d7251d4515..3e37adebc80 100644 --- a/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp +++ b/tt_metal/impl/dispatch/kernels/eth_tunneler.cpp @@ -70,7 +70,6 @@ constexpr uint32_t inner_stop_mux_d_bypass = get_compile_time_arg_val(15); void kernel_main() { rtos_context_switch_ptr = (void (*)())RtosTable[0]; - noc_init(); write_test_results(test_results, PQ_TEST_STATUS_INDEX, PACKET_QUEUE_TEST_STARTED); write_test_results(test_results, PQ_TEST_MISC_INDEX, 0xff000000);