Skip to content

Commit

Permalink
#11789: Pack kernel data span into text span
Browse files Browse the repository at this point in the history
NOTE: slow dispatch doesn't check whether kernel text+data overflows
the firmware hole size. This will be fixed with the ring buffer.

Split linker script builds into 2, fw and kernel
Bump firmware hole size up by the local sizes; kernel data now goes here
Create scratch area for firmware locals, used only during init
Check fw_size+kernel_size+kernel_local_size against the max at runtime, since we
could now overflow even after compilation succeeds
Use symbol for kernel data address, const for firmware data address
Pack data span into text span
  • Loading branch information
pgkeller committed Sep 24, 2024
1 parent f06bdbb commit 4c006aa
Show file tree
Hide file tree
Showing 25 changed files with 336 additions and 146 deletions.
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,15 +191,15 @@ int main(int argc, char **argv) {
JitBuildProcessorType::DATA_MOVEMENT,
0,
get_latest_kernel_binary_path(mask, riscv0_kernel));
ll_api::memory brisc_binary = llrt::get_risc_binary(brisc_hex_path);
ll_api::memory brisc_binary = llrt::get_risc_binary(brisc_hex_path, 0, llrt::PackSpans::PACK);
TT_FATAL(
brisc_binary == brisc_binaries.at(mask).at(0),
"Expected saved BRISC binary to be the same as binary in persistent cache");
std::string ncrisc_hex_path = device->build_kernel_target_path(
JitBuildProcessorType::DATA_MOVEMENT,
1,
get_latest_kernel_binary_path(mask, riscv1_kernel));
ll_api::memory ncrisc_binary = llrt::get_risc_binary(ncrisc_hex_path);
ll_api::memory ncrisc_binary = llrt::get_risc_binary(ncrisc_hex_path, 1, llrt::PackSpans::PACK);
TT_FATAL(
ncrisc_binary == ncrisc_binaries.at(mask).at(0),
"Expected saved NCRISC binary to be the same as binary in persistent cache");
Expand All @@ -209,7 +209,7 @@ int main(int argc, char **argv) {
JitBuildProcessorType::COMPUTE,
trisc_id,
get_latest_kernel_binary_path(mask, compute_kernel));
ll_api::memory trisc_binary = llrt::get_risc_binary(trisc_hex_path);
ll_api::memory trisc_binary = llrt::get_risc_binary(trisc_hex_path, 2, llrt::PackSpans::PACK);
TT_FATAL(
trisc_binary == compute_binaries.at(mask).at(trisc_id),
"Expected saved TRISC binary for {} to be the same as binary in persistent cache", trisc_id_str);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include "debug/dprint.h"
#include "dataflow_api.h"

void kernel_main() {
constexpr ProgrammableCoreType eth_core_type = static_cast<ProgrammableCoreType>(get_compile_time_arg_val(0));
Expand Down
37 changes: 22 additions & 15 deletions tt_metal/hw/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
# Temporary workaround for Issue #8767
set(HW_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/runtime/hw/toolchain)
set(CORES
set(PROCS
brisc
ncrisc
trisc0
trisc1
trisc2
ierisc
)
set(TYPES
firmware
kernel
)

if("$ENV{ARCH_NAME}" STREQUAL "wormhole_b0")
set(DEV_MEM_MAP "${PROJECT_SOURCE_DIR}/tt_metal/hw/inc/wormhole/dev_mem_map.h")
Expand All @@ -17,22 +21,25 @@ else()
set(HW_INCLUDES "${PROJECT_SOURCE_DIR}/tt_metal/hw/inc/$ENV{ARCH_NAME}")
endif()

foreach(CORE IN LISTS CORES)
set(HW_OUTPUT_FILE "${HW_OUTPUT_DIR}/${CORE}.ld")
string(TOUPPER ${CORE} CORE_DEFINE)
foreach(PROC IN LISTS PROCS)
foreach(TYPE IN LISTS TYPES)
set(HW_OUTPUT_FILE "${HW_OUTPUT_DIR}/${TYPE}_${PROC}.ld")
string(TOUPPER ${PROC} PROC_DEFINE)
string(TOUPPER ${TYPE} TYPE_DEFINE)

# custom command to preprocess/generate the output file
add_custom_command(
OUTPUT ${HW_OUTPUT_FILE}
COMMAND ${CMAKE_COMMAND} -E make_directory ${HW_OUTPUT_DIR}
COMMAND ${CMAKE_CXX_COMPILER} -DLD_TARGET=${CORE_DEFINE} -DLD_TYPE=FIRMWARE -DCOMPILE_FOR_${CORE_DEFINE} -I${HW_INCLUDES} -E -P -x c -o ${HW_OUTPUT_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/toolchain/main.ld
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/toolchain/main.ld ${CMAKE_CURRENT_SOURCE_DIR}/toolchain/memory.ld ${CMAKE_CURRENT_SOURCE_DIR}/toolchain/sections.ld ${DEV_MEM_MAP}
COMMENT "Preprocessing toolchain/${CORE}.ld"
VERBATIM
)
# custom command to preprocess/generate the output file
add_custom_command(
OUTPUT ${HW_OUTPUT_FILE}
COMMAND ${CMAKE_COMMAND} -E make_directory ${HW_OUTPUT_DIR}
COMMAND ${CMAKE_CXX_COMPILER} -DLD_TARGET=${PROC_DEFINE} -DLD_TYPE=${TYPE_DEFINE} -DTARGET_${PROC_DEFINE} -DTYPE_${TYPE_DEFINE} -DCOMPILE_FOR_${PROC_DEFINE} -I${HW_INCLUDES} -E -P -x c -o ${HW_OUTPUT_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/toolchain/main.ld
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/toolchain/main.ld ${CMAKE_CURRENT_SOURCE_DIR}/toolchain/memory.ld ${CMAKE_CURRENT_SOURCE_DIR}/toolchain/sections.ld ${DEV_MEM_MAP}
COMMENT "Preprocessing toolchain/${PROC}.ld"
VERBATIM
)

# add output file to the custom target
list(APPEND PREPROCESSED_LD_FILES ${HW_OUTPUT_FILE})
# add output file to the custom target
list(APPEND PREPROCESSED_LD_FILES ${HW_OUTPUT_FILE})
endforeach()
endforeach()

# Build hw lib objects
Expand Down
4 changes: 1 addition & 3 deletions tt_metal/hw/firmware/src/brisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

/** @file @brief Main firmware code */

#include <unistd.h>

#include <cstdint>

// clang-format off
Expand Down Expand Up @@ -339,7 +337,7 @@ int main() {
WAYPOINT("I");

int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint*)__ldm_data_start, (uint tt_l1_ptr*)MEM_BRISC_INIT_LOCAL_L1_BASE, num_words);
l1_to_local_mem_copy((uint*)__ldm_data_start, (uint tt_l1_ptr*)MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH, num_words);

risc_init();
device_setup();
Expand Down
5 changes: 2 additions & 3 deletions tt_metal/hw/firmware/src/brisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@
#include "noc_nonblocking_api.h"
#include "firmware_common.h"
#include "tools/profiler/kernel_profiler.hpp"
#include "dataflow_api.h"

#include <kernel_includes.hpp>

uint8_t noc_index = NOC_INDEX;
extern uint32_t __kernel_init_local_l1_base[];

void kernel_launch() {

Expand All @@ -31,7 +30,7 @@ void kernel_launch() {
while (c_tensix_core::read_wall_clock() < end_time);
#endif
#else
firmware_kernel_common_init((void tt_l1_ptr *)MEM_BRISC_INIT_LOCAL_L1_BASE);
firmware_kernel_common_init((void tt_l1_ptr *)(__kernel_init_local_l1_base));

noc_local_state_init(noc_index);

Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/firmware/src/idle_erisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ int main() {
WAYPOINT("I");
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
uint32_t *local_mem_ptr = (uint32_t *)__ldm_data_start;
uint32_t *l1_data_ptr = (uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE;
uint32_t *l1_data_ptr = (uint32_t *)MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH;
uint32_t heartbeat = 0;
for (int32_t i = 0; i < num_words; i++) {
local_mem_ptr[i] = l1_data_ptr[i];
Expand Down
8 changes: 3 additions & 5 deletions tt_metal/hw/firmware/src/idle_erisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,11 @@
#include <kernel_includes.hpp>

uint8_t noc_index = NOC_INDEX;
//inline void RISC_POST_STATUS(uint32_t status) {
// volatile uint32_t* ptr = (volatile uint32_t*)(NOC_CFG(ROUTER_CFG_2));
// ptr[0] = status;
//}
extern uint32_t __kernel_init_local_l1_base[];

void kernel_launch() {
DeviceZoneScopedMainChildN("ERISC-KERNEL");
firmware_kernel_common_init((void tt_l1_ptr *)MEM_IERISC_INIT_LOCAL_L1_BASE);
firmware_kernel_common_init((void tt_l1_ptr *)__kernel_init_local_l1_base);

noc_local_state_init(noc_index);

Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/firmware/src/ncrisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ int main(int argc, char *argv[]) {
WAYPOINT("I");

int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint *)__ldm_data_start, (uint tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE, num_words);
l1_to_local_mem_copy((uint *)__ldm_data_start, (uint tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH, num_words);

risc_init();

Expand Down
5 changes: 3 additions & 2 deletions tt_metal/hw/firmware/src/ncrisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#endif
#include "firmware_common.h"
#include "tools/profiler/kernel_profiler.hpp"
#include "dataflow_api.h"
#include "tensix_functions.h"
#include "c_tensix_core.h"

Expand All @@ -28,6 +27,8 @@ uint32_t noc_nonposted_writes_acked[NUM_NOCS];
uint32_t noc_nonposted_atomics_acked[NUM_NOCS];
uint32_t noc_posted_writes_num_issued[NUM_NOCS];

extern uint32_t __kernel_init_local_l1_base[];

void kernel_launch() {

DeviceZoneScopedMainChildN("NCRISC-KERNEL");
Expand All @@ -37,7 +38,7 @@ void kernel_launch() {
while (c_tensix_core::read_wall_clock() < KERNEL_RUN_TIME);
#endif
#else
firmware_kernel_common_init((void tt_l1_ptr *)MEM_NCRISC_INIT_LOCAL_L1_BASE);
firmware_kernel_common_init((void tt_l1_ptr *)(MEM_NCRISC_INIT_IRAM_L1_BASE + (uint32_t)__kernel_init_local_l1_base - MEM_NCRISC_IRAM_BASE));

noc_local_state_init(noc_index);

Expand Down
2 changes: 1 addition & 1 deletion tt_metal/hw/firmware/src/trisc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) {
WAYPOINT("I");

uint tt_l1_ptr *local_l1_start_addr =
(uint tt_l1_ptr *)PREPROCESSOR_EXPAND(MEM_TRISC, COMPILE_FOR_TRISC, _INIT_LOCAL_L1_BASE);
(uint tt_l1_ptr *)PREPROCESSOR_EXPAND(MEM_TRISC, COMPILE_FOR_TRISC, _INIT_LOCAL_L1_BASE_SCRATCH);
int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint *)__ldm_data_start, local_l1_start_addr, num_words);

Expand Down
6 changes: 4 additions & 2 deletions tt_metal/hw/firmware/src/trisck.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ volatile tt_reg_ptr uint * mailbox_base[4] = {
};
}

extern uint32_t __kernel_init_local_l1_base[];

void kernel_launch()
{
DeviceZoneScopedMainChildN("TRISC-KERNEL");
Expand All @@ -41,8 +43,8 @@ void kernel_launch()
ckernel::wait(KERNEL_RUN_TIME);
#endif
#else
tt_l1_ptr uint *local_l1_start_addr = (tt_l1_ptr uint *)PREPROCESSOR_EXPAND(MEM_TRISC, COMPILE_FOR_TRISC, _INIT_LOCAL_L1_BASE);
firmware_kernel_common_init(local_l1_start_addr);
firmware_kernel_common_init((void tt_l1_ptr *)(__kernel_init_local_l1_base));

#if defined(UCK_CHLKC_UNPACK)
// Make sure DBG_FEATURE_DISABLE register is cleared before every kernel is executed
memory_write(RISCV_DEBUG_REG_DBG_FEATURE_DISABLE, 0);
Expand Down
45 changes: 28 additions & 17 deletions tt_metal/hw/inc/blackhole/dev_mem_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,13 @@
#define MEM_LOCAL_BASE 0xFFB00000
#define MEM_BRISC_LOCAL_SIZE (8 * 1024)
#define MEM_NCRISC_LOCAL_SIZE (8 * 1024)
#define MEM_IERISC_LOCAL_SIZE (8 * 1024)
#define MEM_TRISC_LOCAL_SIZE (4 * 1024)

/////////////
// Firmware/kernel code holes
#define MEM_BOOT_CODE_SIZE 4
#define MEM_BRISC_FIRMWARE_SIZE (10 * 1024)
#define MEM_NCRISC_FIRMWARE_SIZE (16 * 1024)
#define MEM_IERISC_FIRMWARE_SIZE (24 * 1024)
#define MEM_TRISC0_FIRMWARE_SIZE (16 * 1024)
#define MEM_TRISC1_FIRMWARE_SIZE (16 * 1024)
#define MEM_TRISC2_FIRMWARE_SIZE (16 * 1024)
Expand All @@ -53,45 +51,58 @@
#define MEM_L1_BARRIER 12
#define MEM_MAILBOX_BASE 16
// Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small
#define MEM_MAILBOX_SIZE 5 * 4 * 512 + 4 * 32 + 1600
#define MEM_MAILBOX_SIZE (5 * 4 * 512 + 4 * 32 + 1600)
#define MEM_MAILBOX_END (MEM_MAILBOX_BASE + MEM_MAILBOX_SIZE)
#define MEM_IERISC_MAILBOX_BASE 1024
#define MEM_IERISC_MAILBOX_END (MEM_IERISC_MAILBOX_BASE + 128)
#define MEM_ZEROS_BASE ((MEM_MAILBOX_END + 31) & ~31)

#define MEM_BRISC_FIRMWARE_BASE (MEM_ZEROS_BASE + MEM_ZEROS_SIZE)
#define MEM_NCRISC_FIRMWARE_BASE (MEM_BRISC_FIRMWARE_BASE + MEM_BRISC_FIRMWARE_SIZE)
#define MEM_IERISC_FIRMWARE_BASE 8192
#define MEM_TRISC0_FIRMWARE_BASE (MEM_NCRISC_FIRMWARE_BASE + MEM_NCRISC_FIRMWARE_SIZE)
#define MEM_TRISC1_FIRMWARE_BASE (MEM_TRISC0_FIRMWARE_BASE + MEM_TRISC0_FIRMWARE_SIZE)
#define MEM_TRISC2_FIRMWARE_BASE (MEM_TRISC1_FIRMWARE_BASE + MEM_TRISC1_FIRMWARE_SIZE)

#define MEM_MAP_END (MEM_TRISC2_FIRMWARE_BASE + MEM_TRISC2_FIRMWARE_SIZE)

// Every address after MEM_MAP_END is a "scratch" address
// These can be used by FW during init, but aren't usable once FW reaches "ready"

/////////////
// Initialization relocation L1 memory
// Host downloads to these addresses, fw copies to destination
// Note: using xmov to copy ncrisc to addresses above 1M hangs the chip
#define MEM_BRISC_INIT_LOCAL_L1_BASE (MEM_TRISC2_FIRMWARE_BASE + MEM_TRISC2_FIRMWARE_SIZE)
#define MEM_NCRISC_INIT_LOCAL_L1_BASE (MEM_BRISC_INIT_LOCAL_L1_BASE + MEM_BRISC_LOCAL_SIZE)
#define MEM_TRISC0_INIT_LOCAL_L1_BASE (MEM_NCRISC_INIT_LOCAL_L1_BASE + MEM_NCRISC_LOCAL_SIZE)
#define MEM_TRISC1_INIT_LOCAL_L1_BASE (MEM_TRISC0_INIT_LOCAL_L1_BASE + MEM_TRISC_LOCAL_SIZE)
#define MEM_TRISC2_INIT_LOCAL_L1_BASE (MEM_TRISC1_INIT_LOCAL_L1_BASE + MEM_TRISC_LOCAL_SIZE)

#define MEM_IERISC_INIT_LOCAL_L1_BASE (MEM_IERISC_FIRMWARE_BASE + MEM_IERISC_FIRMWARE_SIZE)

#define MEM_MAP_END (MEM_TRISC2_INIT_LOCAL_L1_BASE + MEM_TRISC_LOCAL_SIZE)
#define MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH MEM_MAP_END
#define MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH (MEM_BRISC_INIT_LOCAL_L1_BASE_SCRATCH + MEM_BRISC_LOCAL_SIZE)
#define MEM_TRISC0_INIT_LOCAL_L1_BASE_SCRATCH (MEM_NCRISC_INIT_LOCAL_L1_BASE_SCRATCH + MEM_NCRISC_LOCAL_SIZE)
#define MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC0_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE)
#define MEM_TRISC2_INIT_LOCAL_L1_BASE_SCRATCH (MEM_TRISC1_INIT_LOCAL_L1_BASE_SCRATCH + MEM_TRISC_LOCAL_SIZE)

/////////////
// Stack info
// Increasing the stack size comes at the expense of less local memory for globals
#define MEM_BRISC_STACK_SIZE 768
#define MEM_NCRISC_STACK_SIZE 1040
#define MEM_IERISC_STACK_SIZE 768
#define MEM_TRISC0_STACK_SIZE 320
#define MEM_TRISC1_STACK_SIZE 256
#define MEM_TRISC2_STACK_SIZE 768

#define MEM_BRISC_STACK_BASE (MEM_LOCAL_BASE + MEM_BRISC_LOCAL_SIZE - MEM_BRISC_STACK_SIZE)
#define MEM_NCRISC_STACK_BASE (MEM_LOCAL_BASE + MEM_NCRISC_LOCAL_SIZE - MEM_NCRISC_STACK_SIZE)
#define MEM_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE)
#define MEM_TRISC0_STACK_BASE (MEM_LOCAL_BASE + MEM_TRISC_LOCAL_SIZE - MEM_TRISC0_STACK_SIZE)
#define MEM_TRISC1_STACK_BASE (MEM_LOCAL_BASE + MEM_TRISC_LOCAL_SIZE - MEM_TRISC1_STACK_SIZE)
#define MEM_TRISC2_STACK_BASE (MEM_LOCAL_BASE + MEM_TRISC_LOCAL_SIZE - MEM_TRISC2_STACK_SIZE)

/////////////
// Alignment restrictions needed in linker scripts
#define MEM_TENSIX_KERNEL_ALIGNMENT 16


/////////////
// IERISC memory map
#define MEM_IERISC_LOCAL_SIZE (8 * 1024)
#define MEM_IERISC_FIRMWARE_SIZE (24 * 1024)
#define MEM_IERISC_MAILBOX_BASE 1024
#define MEM_IERISC_MAILBOX_END (MEM_IERISC_MAILBOX_BASE + 128)
#define MEM_IERISC_FIRMWARE_BASE 8192
#define MEM_IERISC_INIT_LOCAL_L1_BASE_SCRATCH (MEM_IERISC_FIRMWARE_BASE + MEM_IERISC_FIRMWARE_SIZE)
#define MEM_IERISC_STACK_SIZE 768
#define MEM_IERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE)
3 changes: 1 addition & 2 deletions tt_metal/hw/inc/firmware_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ inline void firmware_kernel_common_init(void *init_local_l1_base) {
wzerorange(__ldm_bss_start, __ldm_bss_end);

int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
uint32_t offset = (uint32_t)__ldm_data_start - MEM_LOCAL_BASE;
l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base + offset), num_words);
l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base), num_words);

for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
(**fptr)();
Expand Down
Loading

0 comments on commit 4c006aa

Please sign in to comment.