
#14826: reimplement l1 data copy (#15226)
1) Reduce insns in the loop. Original loop was 21 insns (3.5 per word),
new loop is 10 insns (3.3 per word).

2) Do not use a loop for residue. We only have to handle the 0-, 1- and
2-word cases. A loop is more overhead.
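For a rough accounting of those numbers: the new loop copies 3 words per iteration, which on RV32 plausibly costs 3 loads + 3 bookkeeping adds + 3 stores + 1 branch = 10 instructions, or about 3.3 per word. Below is a minimal host-buildable sketch of the same shape, for illustration only; the committed code (in the diff further down) uses a tt_l1_ptr-qualified source pointer and folds the three bookkeeping adds into RISC-V inline asm.

#include <cstdint>

// Sketch only: copy `len` 32-bit words, 3 per iteration, with the
// 0/1/2-word residue handled by straight-line code instead of a loop.
inline void copy_words_sketch(uint32_t *dst, const uint32_t *src, int32_t len) {
    while (len >= 3) {
        uint32_t v0 = src[0], v1 = src[1], v2 = src[2];
        src += 3;  // in the committed code these three updates are
        dst += 3;  // emitted by the inline asm, in the shadow of the loads
        len -= 3;
        dst[-3] = v0, dst[-2] = v1, dst[-1] = v2;
    }
    if (len >= 1) {
        dst[0] = src[0];
        if (len >= 2) {
            dst[1] = src[1];
        }
    }
}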
nathan-TT authored Nov 21, 2024
1 parent 8ffb0b3 commit 185ade6
Showing 1 changed file with 30 additions and 26 deletions.
tt_metal/hw/inc/firmware_common.h: 30 additions & 26 deletions (56 changes)
@@ -16,46 +16,50 @@

 extern uint32_t __ldm_bss_start[];
 extern uint32_t __ldm_bss_end[];
-extern uint32_t __ldm_data_start[];
-extern uint32_t __ldm_data_end[];
 extern void (* __init_array_start[])();
 extern void (* __init_array_end[])();

 extern void kernel_init(uint32_t kernel_init);
 extern void kernel_launch(uint32_t kernel_base_addr);

-inline void l1_to_local_mem_copy(uint32_t *local_mem_addr, uint32_t tt_l1_ptr *l1_addr, int32_t len) {
-    // Cover L1 load latency of 6 cycles for the bulk of the copy
-    int32_t n = 0;
-    while (n < len - 5) {
-        uint32_t v0 = l1_addr[n + 0];
-        uint32_t v1 = l1_addr[n + 1];
-        uint32_t v2 = l1_addr[n + 2];
-        uint32_t v3 = l1_addr[n + 3];
-        uint32_t v4 = l1_addr[n + 4];
-        uint32_t v5 = l1_addr[n + 5];
-        local_mem_addr[n + 0] = v0;
-        local_mem_addr[n + 1] = v1;
-        local_mem_addr[n + 2] = v2;
-        local_mem_addr[n + 3] = v3;
-        local_mem_addr[n + 4] = v4;
-        local_mem_addr[n + 5] = v5;
-        n += 6;
+inline void l1_to_local_mem_copy(uint32_t *dst, uint32_t tt_l1_ptr *src, int32_t len) {
+#pragma GCC unroll 0
+    while (len >= 3) {
+        auto v0 = src[0], v1 = src[1], v2 = src[2];
+        // 1) Make sure the optimizer does not think this is memcpy by
+        // hiding the pointer bookkeeping in an asm.
+        // 2) The scheduler doesn't know the above loads have 6 cycle
+        // latency. We emit the 3 bookkeeping adds as a single block
+        // in the load shadow before the stores. The optimizer will
+        // not be able to move these.
+        // 3) We don't need early clobbers here because of the +r
+        // constraint -- early clobbers would pessimize.
+        asm inline(
+            "addi %0,%0,3*%3\n\t"
+            "addi %1,%1,3*%3\n\t"
+            "addi %2,%2,-3"
+            : "+r"(src), "+r"(dst), "+r"(len)
+            : "i"(sizeof(v0)));
+        dst[-3] = v0, dst[-2] = v1, dst[-1] = v2;
     }
-    // Could optimize this further (eg, loop of 2 or 4), probably not worth it
-    while (n < len) {
-        local_mem_addr[n] = l1_addr[n];
-        n++;
+    // There are 0, 1 or 2 words of residue. This is smaller than a loop.
+    // We get smaller code layout by expecting the conditions to be true.
+    if (__builtin_expect(len >= 1, true)) {
+        dst[0] = src[0];
+        if (__builtin_expect(len >= 2, true))
+            dst[1] = src[1];
     }
 }

-inline void do_crt1(void *init_local_l1_base) {
+inline void do_crt1(uint32_t tt_l1_ptr *data_image) {

     // Handle stuff typically done in crt0 in asm. Easier to do in C
     wzerorange(__ldm_bss_start, __ldm_bss_end);

-    int32_t num_words = ((uint)__ldm_data_end - (uint)__ldm_data_start) >> 2;
-    l1_to_local_mem_copy((uint32_t *)__ldm_data_start, (uint32_t *)((uint8_t *)init_local_l1_base), num_words);
+    // Copy initialized data.
+    extern uint32_t __ldm_data_start[];
+    extern uint32_t __ldm_data_end[];
+    l1_to_local_mem_copy(__ldm_data_start, data_image, __ldm_data_end - __ldm_data_start);

     for (void (** fptr)() = __init_array_start; fptr < __init_array_end; fptr++) {
         (**fptr)();
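A note on comment 1) in the new loop ("Make sure the optimizer does not think this is memcpy"): at higher optimization levels GCC's loop-distribution pass (-ftree-loop-distribute-patterns) can replace a plain word-copy loop with a call to memcpy, which is not usable in startup code that is itself performing the data copy. The following is a generic, target-independent sketch of the problem and one countermeasure; it is illustrative only, since the committed code goes further and performs the pointer/length bookkeeping inside the asm itself, which also places those adds in the shadow of the 6-cycle L1 loads.

#include <cstdint>

// A loop like this can be recognized as a copy idiom and rewritten by the
// optimizer into a single call to memcpy.
void copy_words_naive(uint32_t *dst, const uint32_t *src, int32_t len) {
    for (int32_t i = 0; i < len; i++) {
        dst[i] = src[i];
    }
}

// Making the loop body opaque defeats that recognition: the empty asm
// claims to read and modify both pointers, so the loop can no longer be
// summarized as a memcpy call.
void copy_words_opaque(uint32_t *dst, const uint32_t *src, int32_t len) {
    while (len-- > 0) {
        asm volatile("" : "+r"(dst), "+r"(src));
        *dst++ = *src++;
    }
}

A coarser alternative is to build with -fno-tree-loop-distribute-patterns; the asm keeps the guarantee local to this one routine.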
