Skip to content

Commit

Permalink
#0: reimplement l1 data copy
Browse files Browse the repository at this point in the history
  • Loading branch information
nathan-TT committed Nov 19, 2024
1 parent bfd4888 commit a9104eb
Showing 1 changed file with 30 additions and 26 deletions.
56 changes: 30 additions & 26 deletions tt_metal/hw/inc/firmware_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,46 +16,50 @@

extern uint32_t __ldm_bss_start[];
extern uint32_t __ldm_bss_end[];
extern uint32_t __ldm_data_start[];
extern uint32_t __ldm_data_end[];
extern void (* __init_array_start[])();
extern void (* __init_array_end[])();

extern void kernel_init(uint32_t kernel_init);
extern void kernel_launch(uint32_t kernel_base_addr);

// NOTE(review): two definitions of l1_to_local_mem_copy are interleaved below
// without +/- markers (this text is a rendered commit diff). The lines that use
// local_mem_addr / l1_addr / n presumably belong to the pre-change version and
// the lines that use dst / src to the post-change version -- confirm against
// the repository before treating this span as compilable code.
//
// Both versions copy `len` 32-bit words from the tt_l1_ptr source pointer to
// the destination pointer.
inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
// Cover L1 load latency of 6 cycles for the bulk of the copy
int32_t n = 0;
// Bulk loop: runs while at least six words remain; each pass issues six loads
// and only then the six matching stores.
while (n < len - 5) {
uint32_t v0 = l1_addr[n + 0];
uint32_t v1 = l1_addr[n + 1];
uint32_t v2 = l1_addr[n + 2];
uint32_t v3 = l1_addr[n + 3];
uint32_t v4 = l1_addr[n + 4];
uint32_t v5 = l1_addr[n + 5];
local_mem_addr[n + 0] = v0;
local_mem_addr[n + 1] = v1;
local_mem_addr[n + 2] = v2;
local_mem_addr[n + 3] = v3;
local_mem_addr[n + 4] = v4;
local_mem_addr[n + 5] = v5;
n += 6;
// Second signature of the same function: same parameter types and order, with
// the pointers renamed to dst / src.
inline void l1_to_local_mem_copy(uint32_t *dst, uint32_t tt_l1_ptr *src, int32_t len) {
// Ask GCC not to unroll the loop that follows.
#pragma GCC unroll 0
// Bulk loop: three words per pass, while at least three words remain.
while (len >= 3) {
// Issue all three loads before any store.
auto v0 = src[0], v1 = src[1], v2 = src[2];
// 1) Make sure the optimizer does not think this is memcpy by
// hiding the pointer bookkeeping in an asm.
// 2) The scheduler doesn't know the above loads have 6 cycle
// latency. We emit the 3 bookkeeping adds as a single block
// in the load shadow before the stores. The optimizer will
// not be able to move these.
// 3) We don't need early clobbers here because of the +r
// constraint -- early clobbers would pessimize.
// Operands: %0 = src, %1 = dst, %2 = len, %3 = sizeof(v0) as an immediate.
// Net effect: src and dst each advance by 3 * sizeof(v0) bytes (three
// elements) and len drops by 3.
asm inline(
"addi %0,%0,3*%3\n\t"
"addi %1,%1,3*%3\n\t"
"addi %2,%2,-3"
: "+r"(src), "+r"(dst), "+r"(len)
: "i"(sizeof(v0)));
// dst was already advanced by the asm above, so the three words loaded at
// the top of this pass are stored at negative offsets from it.
dst[-3] = v0, dst[-2] = v1, dst[-1] = v2;
}
// Could optimize this further (eg, loop of 2 or 4), probably not worth it
// Tail loop: copies whatever words remain, one per iteration.
while (n < len) {
local_mem_addr[n] = l1_addr[n];
n++;
// There are 0, 1 or 2 words of residue. This is smaller than a loop.
// We get smaller code layout by expecting the conditions to be true.
if (__builtin_expect(len >= 1, true)) {
dst[0] = src[0];
// The second residue word is only copied when the first one existed.
if (__builtin_expect(len >= 2, true))
dst[1] = src[1];
}
}

inline void do_crt1(void *init_local_l1_base) {
inline void do_crt1(uint32_t tt_l1_ptr *data_image) {

// Handle stuff typically done in crt0 in asm. Easier to do in C
wzerorange(__ldm_bss_start, __ldm_bss_end);

int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base), num_words);
// Copy initialized data.
extern uint32_t __ldm_data_start[];
extern uint32_t __ldm_data_end[];
l1_to_local_mem_copy(__ldm_data_start, data_image, __ldm_data_end - __ldm_data_start);

for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
(**fptr)();
Expand Down

0 comments on commit a9104eb

Please sign in to comment.