Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pack untilize support for block_ct_dim > 8 #6

Merged
merged 1 commit into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/inc/ckernel_gpr_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ struct p_gpr_pack
constexpr static uint TMP_LO = 30; // Temp data, upper 16-bit always 0
constexpr static uint TMP_HI = 31; // Temp data, lower 16-bit always 0
constexpr static uint PACK_STREAM_SYNC = 32; // sync between pack and output stream [32:63]
constexpr static uint OUTPUT_ADDR_OFFSET = 50; // output offset address that's added to OUTPUT_ADDR
constexpr static uint PERF_PACK_NUM_TILES = 51; // output operand num tiles
constexpr static uint EXP0_SEC_SIZE_BFP = 52; // pack0,1,2,3 exp section size for bfp8,4,2
constexpr static uint EXP1_SEC_SIZE_BFP8 = 53; // pack1 exp section size for bfp8
Expand Down
9 changes: 6 additions & 3 deletions common/inc/cpack_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ namespace ckernel::packer
(pack_count == 4) ? 0xF : 0x0;
}

constexpr uint replay_buf_offset = 16; // split replay buffer usage between fpu/sfpu
// first 16 for sfpu, next 16 for fpu

// Pack config
typedef struct {
//word 0
Expand Down Expand Up @@ -524,11 +527,11 @@ namespace ckernel::packer
TT_SETDMAREG(0, UPPER_HALFWORD(addr), 0, HI_16(p_gpr_pack::OUTPUT_ADDR));
}

template <uint32_t block_ct_dim>
template <uint32_t block_ct_dim, uint32_t full_ct_dim>
inline void program_packer_untilized_destination(const uint32_t addr, const uint32_t pack_dst_format)
{
// Each packer packs 8 rows of block_ct_dim*TILE_C_DIM datums
const uint32_t block_size = SCALE_DATUM_SIZE(pack_dst_format, block_ct_dim * TILE_C_DIM * (TILE_R_DIM/4));
// Each packer packs 8 rows of full_ct_dim*TILE_C_DIM datums
const uint32_t block_size = SCALE_DATUM_SIZE(pack_dst_format, full_ct_dim * TILE_C_DIM * (TILE_R_DIM/4));
constexpr uint32_t offset0 = 0;
const uint32_t offset1 = (1*block_size)/16;
const uint32_t offset2 = (2*block_size)/16;
Expand Down
41 changes: 34 additions & 7 deletions llk_lib/llk_pack_untilize.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ inline void _llk_pack_untilize_configure_addrmod_() {

}

template <std::uint32_t block_ct_dim>
template <std::uint32_t block_ct_dim, std::uint32_t full_ct_dim = block_ct_dim>
inline void _llk_pack_untilize_mop_config_(const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) {
const uint PACKCNT = (face_r_dim < FACE_R_DIM) ? 1 : num_faces;
constexpr uint MEGAROW = 1;
Expand All @@ -52,28 +52,55 @@ inline void _llk_pack_untilize_mop_config_(const std::uint32_t face_r_dim = FACE
tmp.set_end_op(TT_OP_INCADCZW(p_setadc::PAC, 0, 0, 1, 0)); // w cnt points to the next tile
tmp.program(instrn_buffer);
}

if (block_ct_dim != full_ct_dim) {
const std::uint32_t replay_buf_len = 10;
TT_REPLAY(replay_buf_offset, replay_buf_len, 0, 1);
TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 1); // close block
// update l1 address
TTI_ADDDMAREG(0, p_gpr_pack::OUTPUT_ADDR, p_gpr_pack::OUTPUT_ADDR, p_gpr_pack::OUTPUT_ADDR_OFFSET);
TTI_ADDDMAREG(0, p_gpr_pack::OUTPUT_ADDR+1, p_gpr_pack::OUTPUT_ADDR+1, p_gpr_pack::OUTPUT_ADDR_OFFSET);
TTI_ADDDMAREG(0, p_gpr_pack::OUTPUT_ADDR+2, p_gpr_pack::OUTPUT_ADDR+2, p_gpr_pack::OUTPUT_ADDR_OFFSET);
TTI_ADDDMAREG(0, p_gpr_pack::OUTPUT_ADDR+3, p_gpr_pack::OUTPUT_ADDR+3, p_gpr_pack::OUTPUT_ADDR_OFFSET);
TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG1_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR);
TTI_REG2FLOP(1,0,0,0,THCON_SEC0_REG8_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+1);
TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG1_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+2);
TTI_REG2FLOP(1,0,0,0,THCON_SEC1_REG8_L1_Dest_addr_ADDR32-THCON_CFGREG_BASE_ADDR32, p_gpr_pack::OUTPUT_ADDR+3);
TTI_NOP;
}
}

template <std::uint32_t block_ct_dim>
inline void _llk_pack_untilize_init_(const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) {
template <std::uint32_t block_ct_dim, std::uint32_t full_ct_dim = block_ct_dim>
inline void _llk_pack_untilize_init_(const std::uint32_t pack_dst_format, const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4) {

_llk_pack_untilize_configure_addrmod_();

_llk_pack_untilize_mop_config_<block_ct_dim>(face_r_dim, num_faces);
_llk_pack_untilize_mop_config_<block_ct_dim, full_ct_dim>(face_r_dim, num_faces);

if (block_ct_dim != full_ct_dim) {
const std::uint32_t output_addr_offset = SCALE_DATUM_SIZE(pack_dst_format, full_ct_dim * ((num_faces>1) ? num_faces/2 : 1) * FACE_C_DIM);
TT_SETDMAREG(0, LOWER_HALFWORD(output_addr_offset/16), 0, LO_16(p_gpr_pack::OUTPUT_ADDR_OFFSET)); // store 16B aligned row offset address
}
}

template <std::uint32_t block_ct_dim>
template <std::uint32_t block_ct_dim, std::uint32_t full_ct_dim = block_ct_dim>
inline void _llk_pack_untilize_(const std::uint32_t address, const std::uint32_t pack_dst_format,const std::uint32_t face_r_dim = FACE_R_DIM, const std::uint32_t num_faces = 4 /*not used*/) {

program_packer_untilized_destination<block_ct_dim>(address, pack_dst_format);
program_packer_untilized_destination<block_ct_dim, full_ct_dim>(address, pack_dst_format);

const std::uint32_t num_rows = (face_r_dim < FACE_R_DIM) ? face_r_dim : TILE_R_DIM/4;

for (std::uint32_t row=0; row<num_rows; row++) {
TTI_SETADC(p_setadc::PAC, p_setadc::CH_0, p_setadc::SET_W, 0); // Clear tile counter
ckernel::ckernel_template::run(instrn_buffer);
TTI_ADDRCRXY(p_setadc::PAC, 0, 0, 1, 0, 0b0010); // Read new row in the tile
if constexpr (block_ct_dim != full_ct_dim) {
TTI_REPLAY(replay_buf_offset, 10, 0, 0); // update row address
}
}

if constexpr (block_ct_dim == full_ct_dim) {
TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 1); // close block
}

TTI_PACR(ADDR_MOD_2, 0, 0xf, 0, 0, 1, 1); // close block
}
Loading