diff --git a/docs/source/tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst b/docs/source/tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst index d4f5b006a3d9..edb1279324c8 100644 --- a/docs/source/tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst +++ b/docs/source/tt_metal/apis/kernel_apis/compute/add_tiles_bcast.rst @@ -3,5 +3,4 @@ add_tiles_bcast .. doxygenfunction:: add_bcast_cols_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) .. doxygenfunction:: add_bcast_rows_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) -.. doxygenfunction:: add_bcast_rows_init_short_post_matmul(uint32_t icb0 = 0, uint32_t icb1 = 1) .. doxygenfunction:: add_tiles_bcast(uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_t itile1, uint32_t idst) diff --git a/docs/source/tt_metal/apis/kernel_apis/compute/copy_tile.rst b/docs/source/tt_metal/apis/kernel_apis/compute/copy_tile.rst index 857ca8f2a07e..f170bd41db5e 100644 --- a/docs/source/tt_metal/apis/kernel_apis/compute/copy_tile.rst +++ b/docs/source/tt_metal/apis/kernel_apis/compute/copy_tile.rst @@ -3,4 +3,4 @@ copy_tile ========= -.. doxygenfunction:: copy_tile(uint32_t icb, uint32_t itile, uint32_t idst) +.. 
doxygenfunction:: copy_tile(uint32_t in_cb_id, uint32_t in_tile_index, uint32_t dst_tile_index) diff --git a/tt_eager/tt_dnn/op_library/bmm/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp b/tt_eager/tt_dnn/op_library/bmm/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp index 20ab648225cd..15e76f8bc62e 100644 --- a/tt_eager/tt_dnn/op_library/bmm/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp +++ b/tt_eager/tt_dnn/op_library/bmm/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp @@ -18,7 +18,7 @@ namespace NAMESPACE { FORCE_INLINE void reload_from_cb_to_dst(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t mm_partials_cb_id, uint32_t out_subblock_num_tiles, uint32_t out_subblock_w, uint32_t out_subblock_h, uint32_t in0_block_w) { // Reconfigure input - copy_tile_matmul_partials_init_short_with_dt(mm_partials_cb_id); + copy_tile_to_dst_init_short_with_dt(in1_cb_id, mm_partials_cb_id); cb_wait_front(mm_partials_cb_id, out_subblock_num_tiles); tile_regs_acquire(); @@ -216,11 +216,8 @@ void MAIN { #if defined FP32_DEST_ACC_EN or defined PACKER_L1_ACC PACK(( pack_reconfig_data_format(out_cb_id) )); #endif - #ifdef ARCH_GRAYSKULL - add_bcast_rows_init_short_post_matmul(); - #else - add_bcast_rows_init_short(); - #endif + + add_bcast_rows_init_short(); // reconfigure unpacker df for src B unpack_reconfig_data_format(in1_cb_id, mm_partials_cb_id, in0_cb_id, bias_cb_id); cb_wait_front(bias_cb_id, in1_per_core_w); @@ -268,11 +265,6 @@ void MAIN { if constexpr(batch > 1) { // reconfigure init for matmul mm_block_init_short(in0_cb_id, in1_cb_id, 0, out_subblock_w, out_subblock_h, in0_block_w); - #ifdef ARCH_GRAYSKULL - // reconfigure packer's dest registers to Col Major - PACK(( llk_pack_init() )); - PACK(( llk_init_packer_dest_offset_registers() )); - #endif // reconfigure unpacker df for src B unpack_reconfig_data_format(in1_cb_id, in0_cb_id); } diff --git 
a/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp b/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp index 4a55d82920ab..daec0e496c24 100644 --- a/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp +++ b/tt_eager/tt_dnn/op_library/conv/kernels/conv_bmm_tilize_col_major_out_blocks.cpp @@ -25,24 +25,6 @@ // SliceRange srr1 = SliceRange{.h0 = 1, .h1 = 2, .hs = 8, .w0 = 0, .w1 = 32, .ws = 1}; // SliceRange src = SliceRange{.h0 = 0, .h1 = 32, .hs = 1, .w0 = 0, .w1 = 1, .ws = 1}; - -// TODO: Uplift these APIs for compute_api.h? -inline void col_major_to_row_major_init() { - #ifdef ARCH_GRAYSKULL - // Configure to RowMajor for tilize (similar to add bcast for bias) - MATH(( llk_math_pack_sync_init() )); - PACK(( llk_pack_dest_init() )); - #endif -} - -inline void row_major_to_col_major_init() { - #ifdef ARCH_GRAYSKULL - // Configure back to ColMajor for matmul - MATH(( llk_math_pack_sync_init() )); - PACK(( llk_pack_dest_init() )); - #endif -} - inline void tilize_in( uint32_t in_cb_id, uint32_t in_subblock_h, @@ -185,9 +167,7 @@ void MAIN { #ifdef PRE_TILIZE unpack_reconfig_data_format_srca(in1_cb_id, in0_pretilize_cb_id); - col_major_to_row_major_init(); tilize_in(in0_pretilize_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks, tilized_in0_cb_id); - row_major_to_col_major_init(); // TODO: unpack_reconfig_data_format_srca(in0_pretilize_cb_id, in1_cb_id) doesn't work if in0 is BFLOATB_B and in1 is BFLOAT16 mm_block_init_short(in0_cb_id, in1_cb_id, false, out_subblock_w, out_subblock_h, in0_block_w); @@ -217,12 +197,10 @@ void MAIN { #endif unpack_reconfig_data_format_srca(in1_cb_id, in0_cb_id); - col_major_to_row_major_init(); tilize_in(in0_cb_id, in0_subblock_h, in0_block_w, in0_num_subblocks_read, tilized_in0_cb_id); #ifdef SPLIT_READER tilize_in(in0_cb_second_reader_id, in0_subblock_h, in0_block_w, in0_num_subblocks_read, tilized_in0_cb_id); #endif - 
row_major_to_col_major_init(); mm_block_init_short_with_dt(mm_in0_cb_id, in1_cb_id, /*srca_old_operand=*/in0_cb_id, out_subblock_w, out_subblock_h, in0_block_w); } @@ -242,7 +220,7 @@ void MAIN { for (uint32_t in1_subblock_i = 0; in1_subblock_i < in1_num_subblocks; ++in1_subblock_i) { if (enable_reload) { // Reconfigure input - copy_tile_matmul_partials_init_short_with_dt(in1_cb_id, matmul_partials_cb); + copy_tile_to_dst_init_short_with_dt(in1_cb_id, matmul_partials_cb); cb_wait_front(matmul_partials_cb, out_subblock_num_tiles); tile_regs_acquire(); @@ -313,7 +291,7 @@ void MAIN { // if last block we pack the final result with relu enabled PACK(( llk_pack_relu_config(ReluType::ZERO_RELU) )); #endif - add_bcast_rows_init_short_post_matmul(); + add_bcast_rows_init_short(); unpack_reconfig_data_format(in1_cb_id, matmul_partials_cb, mm_in0_cb_id, bias_cb_id); cb_wait_front(bias_cb_id, bias_ntiles_w); cb_wait_front(matmul_partials_cb, out_block_num_tiles); diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h index 720ba11f4b0b..55d24af14ea1 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_math_matmul_api.h @@ -10,7 +10,7 @@ * LLK MATMUL *************************************************************************/ -template +template inline void llk_math_matmul_init( const std::uint32_t operandA /*not used*/, const std::uint32_t operandB /*not used*/, @@ -19,7 +19,7 @@ inline void llk_math_matmul_init( const std::uint32_t rt_dim = 1, const std::uint32_t kt_dim = 1) { - _llk_math_matmul_init_( + _llk_math_matmul_init_( transpose, ct_dim, rt_dim, @@ -27,16 +27,16 @@ inline void llk_math_matmul_init( } -template +template inline void llk_math_matmul( - uint dst_index, + const uint dst_index, const bool transpose = false, const std::uint32_t ct_dim = 1, const std::uint32_t rt_dim = 1, const 
std::uint32_t kt_dim = 1) { for (std::uint32_t rt=0; rt(dst_index+rt*ct_dim+ct, transpose); + _llk_math_matmul_(dst_index+rt*ct_dim+ct, transpose); } } } diff --git a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h index e9bd5411ae11..3fdcd616be10 100644 --- a/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/grayskull/metal/llk_api/llk_pack_api.h @@ -22,10 +22,10 @@ * LLK PACK *************************************************************************/ -template +template inline void llk_pack_mop_config(const uint32_t output) { constexpr bool write_tile_header = false; - _llk_pack_mop_config_(); + _llk_pack_mop_config_(); } template @@ -69,13 +69,13 @@ inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output llk_pack_reduce_hw_configure(&llk_pack_params); } -template +template inline void llk_pack_init(const std::uint32_t pack_output = 16) { const std::uint32_t output_id = get_output_id(pack_output); constexpr bool write_tile_header = false; - _llk_pack_init_(); + _llk_pack_init_(); } template @@ -155,14 +155,14 @@ inline void llk_pack_dest_section_done() { _llk_pack_dest_section_done_(); } -template +template inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) { - _llk_init_packer_dest_offset_registers_(); + _llk_init_packer_dest_offset_registers_(); } -template +template inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) { - _llk_pack_dest_init_(); + _llk_pack_dest_init_(); } template @@ -183,17 +183,17 @@ inline void llk_pack_debug_dump_seek(std::uint8_t offset) { _llk_pack_debug_dump_seek_(offset); } -template +template inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { std::uint32_t output_id = get_output_id(new_output); - _llk_pack_reconfig_data_format_( + _llk_pack_reconfig_data_format_( pack_dst_format[output_id], 
cb_interface[output_id].fifo_page_size ); } -template +template inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) { std::uint32_t old_output_id = get_output_id(old_output); std::uint32_t new_output_id = get_output_id(new_output); @@ -201,7 +201,7 @@ inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id]) && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid) && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) { - llk_pack_reconfig_data_format(new_output); + llk_pack_reconfig_data_format(new_output); } } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h index 69823d2a91ac..88fe49305850 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_math_matmul_api.h @@ -10,7 +10,7 @@ * LLK MATMUL *************************************************************************/ -template +template inline void llk_math_matmul_init( const std::uint32_t operandA, const std::uint32_t operandB, @@ -40,12 +40,13 @@ inline void llk_math_matmul_init( kt_dim); } -template +template inline void llk_math_matmul( - uint dst_index, + const uint dst_index, const bool transpose = false, const std::uint32_t ct_dim = 1, const std::uint32_t rt_dim = 1, const std::uint32_t kt_dim = 1) { + _llk_math_matmul_(dst_index, transpose, ct_dim, rt_dim, kt_dim); } diff --git a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h index bdd01ff1b027..51ec26fa0abd 100644 --- a/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h +++ b/tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_pack_api.h @@ -22,7 +22,7 @@ * LLK PACK 
*************************************************************************/ -template +template inline void llk_pack_mop_config(const uint32_t output) { const std::uint32_t output_id = get_output_id(output); @@ -31,7 +31,7 @@ inline void llk_pack_mop_config(const uint32_t output) { const bool partial_face = get_output_partial_face(output_id) && IS_BFP_FORMAT((uint)pack_dst_format[output_id]); const bool narrow_tile = get_output_narrow_tile(output_id); - _llk_pack_mop_config_( + _llk_pack_mop_config_( pack_dst_format[output_id], face_r_dim, num_faces, @@ -99,7 +99,7 @@ inline void llk_pack_reduce_hw_configure_disaggregated(std::uint32_t pack_output llk_pack_reduce_hw_configure(&llk_pack_params); } -template +template inline void llk_pack_init(const std::uint32_t pack_output = 16) { const std::uint32_t output_id = get_output_id(pack_output); @@ -108,7 +108,7 @@ inline void llk_pack_init(const std::uint32_t pack_output = 16) { const bool partial_face = get_output_partial_face(output_id); const bool narrow_tile = get_output_narrow_tile(output_id); - _llk_pack_init_( + _llk_pack_init_( pack_dst_format[output_id], face_r_dim, num_faces, @@ -233,26 +233,26 @@ inline void llk_pack_dest_section_done() { _llk_pack_dest_section_done_(); } -template +template inline void llk_init_packer_dest_offset_registers(const std::uint32_t pack_output = 16) { const std::uint32_t output_id = get_output_id(pack_output); const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); const bool narrow_tile = get_output_narrow_tile(output_id); - _llk_init_packer_dest_offset_registers_( + _llk_init_packer_dest_offset_registers_( face_r_dim, narrow_tile ); } -template +template inline void llk_pack_dest_init(const std::uint32_t pack_output = 16) { const std::uint32_t output_id = get_output_id(pack_output); const std::uint32_t face_r_dim = get_output_face_r_dim(output_id); const bool narrow_tile = get_output_narrow_tile(output_id); - _llk_pack_dest_init_( + _llk_pack_dest_init_( face_r_dim, 
narrow_tile ); @@ -276,7 +276,7 @@ inline void llk_pack_debug_dump_seek(std::uint8_t offset) { _llk_pack_debug_dump_seek_(offset); } -template +template inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { const std::uint32_t output_id = get_output_id(new_output); @@ -285,7 +285,7 @@ inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { const bool partial_face = get_output_partial_face(output_id); const bool narrow_tile = get_output_narrow_tile(output_id); - _llk_pack_reconfig_data_format_( + _llk_pack_reconfig_data_format_( pack_src_format[output_id], pack_dst_format[output_id], cb_interface[output_id].fifo_page_size, @@ -296,7 +296,7 @@ inline void llk_pack_reconfig_data_format(const std::uint32_t new_output) { ); } -template +template inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const std::uint32_t new_output) { std::uint32_t old_output_id = get_output_id(old_output); std::uint32_t new_output_id = get_output_id(new_output); @@ -304,10 +304,10 @@ inline void llk_pack_reconfig_data_format(const std::uint32_t old_output, const if((pack_dst_format[old_output_id] != pack_dst_format[new_output_id]) && (pack_dst_format[old_output_id] != (uint)DataFormat::Invalid) && (pack_dst_format[new_output_id] != (uint)DataFormat::Invalid)) { - llk_pack_reconfig_data_format(new_output); + llk_pack_reconfig_data_format(new_output); } else if constexpr (is_tile_dim_reconfig_en) { // Same format but different tile dims - llk_pack_mop_config(new_output); + llk_pack_mop_config(new_output); } } diff --git a/tt_metal/include/compute_kernel_api/bcast.h b/tt_metal/include/compute_kernel_api/bcast.h index 36e8ce73d05f..81c809e186bc 100644 --- a/tt_metal/include/compute_kernel_api/bcast.h +++ b/tt_metal/include/compute_kernel_api/bcast.h @@ -120,7 +120,7 @@ void init_bcast(uint32_t icb0, uint32_t icb1, uint32_t ocb = 16) PACK(( llk_pack_hw_configure_disaggregated(ocb) )); PACK(( llk_pack_init(ocb) )); PACK(( 
llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); MATH(( llk_math_pack_sync_init() )); } @@ -199,35 +199,8 @@ ALWI void mul_tiles_bcast(uint32_t icb0, uint32_t icb1, uint32_t itile0, uint32_ */ ALWI void add_bcast_rows_init_short(uint32_t icb0 = 0, uint32_t icb1 = 1) { - MATH(( llk_math_eltwise_binary_init() )); - // FIXME: API Update needed in compute kernel? - UNPACK(( llk_unpack_AB_init(icb0, icb1) )); -} - -/** - * This function reconfigures the compute engine from ColMajor mode to RowMajor mode. It performs - * a switch-from-matmul_block tile hw reconfiguration step needed for add_bcast_rows to be executed correctly. - * Required to be called before add_tiles_bcast if using row as broadcast type - */ -ALWI void add_bcast_rows_init_short_post_matmul(uint32_t icb0 = 0, uint32_t icb1 = 1) -{ - #ifdef ARCH_GRAYSKULL - // math MATH(( llk_math_eltwise_binary_init() )); - MATH(( llk_math_pack_sync_init() )); - - // unpacker - UNPACK(( llk_unpack_AB_init(icb0, icb1) )); - - // packer - PACK(( llk_pack_init() )); - PACK(( llk_pack_dest_init() )); - PACK(( llk_init_packer_dest_offset_registers() )); - #else - MATH(( llk_math_eltwise_binary_init() )); - // FIXME: API Update needed in compute kernel? 
UNPACK(( llk_unpack_AB_init(icb0, icb1) )); - #endif } /** diff --git a/tt_metal/include/compute_kernel_api/eltwise_binary.h b/tt_metal/include/compute_kernel_api/eltwise_binary.h index 70500217f91d..139527b12a6b 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_binary.h +++ b/tt_metal/include/compute_kernel_api/eltwise_binary.h @@ -37,7 +37,7 @@ ALWI void binary_op_init_common(uint32_t icb0, uint32_t icb1, uint32_t ocb=16) PACK(( llk_pack_hw_configure_disaggregated(ocb) )); PACK(( llk_pack_init(ocb) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); } @@ -53,7 +53,7 @@ ALWI void mul_tiles_init_f() { MATH(( llk_math_eltwise_binary_init() )); - PACK(( llk_init_packer_dest_offset_registers() )); + PACK(( llk_init_packer_dest_offset_registers() )); UNPACK(( llk_unpack_AB_init(icb0, icb1) )); } @@ -70,7 +70,7 @@ ALWI void add_tiles_init_nof() { MATH(( llk_math_eltwise_binary_init() )); - PACK(( llk_init_packer_dest_offset_registers() )); + PACK(( llk_init_packer_dest_offset_registers() )); UNPACK(( llk_unpack_AB_init(icb0, icb1) )); } @@ -87,7 +87,7 @@ ALWI void sub_tiles_init_nof() { MATH(( llk_math_eltwise_binary_init() )); - PACK(( llk_init_packer_dest_offset_registers() )); + PACK(( llk_init_packer_dest_offset_registers() )); UNPACK(( llk_unpack_AB_init(icb0, icb1) )); } diff --git a/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h b/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h index 201d4f4d5b50..d783481f7b12 100644 --- a/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h +++ b/tt_metal/include/compute_kernel_api/eltwise_unary/eltwise_unary.h @@ -26,7 +26,7 @@ ALWI void unary_op_init_common(uint32_t icb, uint32_t ocb = 16) PACK(( llk_pack_hw_configure_disaggregated(ocb) )); PACK(( llk_pack_init(ocb) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); MATH(( llk_math_eltwise_unary_datacopy_init(false 
/*transpose of faces*/, false /*transpose within 16x16 face*/, icb) )); MATH(( llk_math_pack_sync_init() )); diff --git a/tt_metal/include/compute_kernel_api/matmul.h b/tt_metal/include/compute_kernel_api/matmul.h index 0cc1f96efd7c..38d6bdc90fdb 100644 --- a/tt_metal/include/compute_kernel_api/matmul.h +++ b/tt_metal/include/compute_kernel_api/matmul.h @@ -30,23 +30,18 @@ namespace ckernel { ALWI void mm_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t out_cb_id = 16, const uint32_t transpose=0) { UNPACK(( llk_setup_operands() )); UNPACK(( llk_unpack_AB_matmul_hw_configure_disaggregated(in0_cb_id, in1_cb_id) )); - - #ifdef ARCH_GRAYSKULL UNPACK(( llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, transpose) )); + MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, transpose) )); - #else - UNPACK(( llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, false) )); - MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, false) )); - #endif MATH(( llk_math_pack_sync_init() )); PACK(( llk_pack_hw_configure_disaggregated(out_cb_id) )); PACK(( llk_pack_init(out_cb_id) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); // TODO(AP): ZM-only kernel - PACK(( llk_init_packer_dest_offset_registers() )); + PACK(( llk_init_packer_dest_offset_registers() )); } ALWI void mm_init_once() { @@ -87,13 +82,8 @@ ALWI void matmul_tiles(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t in0_tile * | transpose | The transpose flag for performing transpose operation on B | uint32_t | Any positive value will indicate tranpose is set | False | */ ALWI void mm_init_short(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, const uint32_t transpose=0) { - #ifdef ARCH_GRAYSKULL MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, transpose) )); UNPACK(( llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, transpose) )); - #else - MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, false) )); - UNPACK(( llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, false) )); - 
#endif } /** @@ -130,31 +120,18 @@ ALWI void mm_init_short_with_dt(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, */ ALWI void mm_block_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t out_cb_id = 16, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) { UNPACK(( llk_setup_operands() )); - UNPACK(( llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, 0, ct_dim, rt_dim, kt_dim) )); UNPACK(( llk_unpack_AB_matmul_hw_configure_disaggregated(in0_cb_id, in1_cb_id) )); + UNPACK(( llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, false, ct_dim, rt_dim, kt_dim) )); - #ifdef ARCH_GRAYSKULL - MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id) )); - #else - MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, 0, ct_dim, rt_dim, kt_dim) )); - #endif + MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, false, ct_dim, rt_dim, kt_dim) )); MATH(( llk_math_pack_sync_init() )); PACK(( llk_pack_hw_configure_disaggregated(out_cb_id) )); - - #ifdef ARCH_GRAYSKULL - PACK(( llk_pack_init(out_cb_id) )); + PACK(( llk_pack_init(out_cb_id) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); // TODO(AP): ZM-only kernel - PACK(( llk_init_packer_dest_offset_registers() )); - #else - PACK(( llk_pack_init(out_cb_id) )); - PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); - // TODO(AP): ZM-only kernel - PACK(( llk_init_packer_dest_offset_registers() )); - #endif + PACK(( llk_init_packer_dest_offset_registers() )); } /** @@ -179,12 +156,7 @@ ALWI void mm_block_init(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t */ ALWI void matmul_block(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t in0_tile_index, uint32_t in1_tile_index, uint32_t idst, const uint32_t transpose, uint32_t ct_dim, uint32_t rt_dim, uint32_t kt_dim) { UNPACK(( llk_unpack_AB_matmul(in0_cb_id, in1_cb_id, in0_tile_index, in1_tile_index, ct_dim, rt_dim, kt_dim) )); - - #ifdef ARCH_GRAYSKULL - MATH(( llk_math_matmul(idst, transpose, ct_dim, rt_dim, 
kt_dim) )); - #else - MATH(( llk_math_matmul(idst, transpose, ct_dim, rt_dim, kt_dim) )); - #endif + MATH(( llk_math_matmul(idst, transpose, ct_dim, rt_dim, kt_dim) )); } /** @@ -204,12 +176,7 @@ ALWI void matmul_block(uint32_t in0_cb_id, uint32_t in1_cb_id, uint32_t in0_tile */ ALWI void mm_block_init_short(uint32_t in0_cb_id = 0, uint32_t in1_cb_id = 1, uint32_t transpose=0, uint32_t ct_dim = 1, uint32_t rt_dim = 1, uint32_t kt_dim = 1) { UNPACK(( llk_unpack_AB_matmul_init(in0_cb_id, in1_cb_id, transpose, ct_dim, rt_dim, kt_dim) )); - - #ifdef ARCH_GRAYSKULL - MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, transpose, ct_dim, rt_dim, kt_dim) )); - #else - MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, transpose, ct_dim, rt_dim, kt_dim) )); - #endif + MATH(( llk_math_matmul_init(in0_cb_id, in1_cb_id, transpose, ct_dim, rt_dim, kt_dim) )); } /** diff --git a/tt_metal/include/compute_kernel_api/pack_untilize.h b/tt_metal/include/compute_kernel_api/pack_untilize.h index 561d5528d284..dc463555029f 100644 --- a/tt_metal/include/compute_kernel_api/pack_untilize.h +++ b/tt_metal/include/compute_kernel_api/pack_untilize.h @@ -27,11 +27,11 @@ ALWI void pack_untilize_init(uint32_t icb, uint32_t ocb) PACK(( llk_pack_hw_configure_disaggregated(ocb) )); PACK(( llk_pack_untilize_init() )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); UNPACK(( llk_setup_operands() )); UNPACK(( llk_unpack_A_hw_configure_disaggregated(icb) )); - UNPACK(( llk_unpack_A_init(0, 0, icb) )); // init must be after configure + UNPACK(( llk_unpack_A_init(false, false, icb) )); // init must be after configure } /** @@ -58,14 +58,14 @@ ALWI void pack_untilize_block(uint32_t icb, uint32_t block_rt_dim, uint32_t ocb) */ ALWI void pack_untilize_uninit(uint32_t ocb = 16) { PACK(( llk_pack_init(ocb) )); - PACK(( llk_init_packer_dest_offset_registers() )); + PACK(( llk_init_packer_dest_offset_registers() )); } template ALWI void 
pack_untilize_dst_init_short() { PACK(( llk_pack_untilize_init() )); - PACK(( llk_init_packer_dest_offset_registers() )); + PACK(( llk_init_packer_dest_offset_registers() )); } template diff --git a/tt_metal/include/compute_kernel_api/reduce.h b/tt_metal/include/compute_kernel_api/reduce.h index ab468456f9e0..b05156f42475 100644 --- a/tt_metal/include/compute_kernel_api/reduce.h +++ b/tt_metal/include/compute_kernel_api/reduce.h @@ -33,7 +33,7 @@ ALWI void reduce_init(PoolType reduce_op, ReduceDim dim, uint32_t icb, uint32_t PACK(( llk_pack_init() )); PACK(( llk_pack_reduce_config_v2(ocb) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); } ALWI void reduce_init_short(PoolType reduce_op, ReduceDim dim, uint32_t icb, uint32_t icb_scaler, uint32_t ocb = 16) { diff --git a/tt_metal/include/compute_kernel_api/tile_move_copy.h b/tt_metal/include/compute_kernel_api/tile_move_copy.h index 2392a62a939d..cbd66f2f66fc 100644 --- a/tt_metal/include/compute_kernel_api/tile_move_copy.h +++ b/tt_metal/include/compute_kernel_api/tile_move_copy.h @@ -16,91 +16,43 @@ namespace ckernel { /** - * Copies a single tile from the DST register buffer at a specified index to a - * specified CB at a given index. For the out_tile_index to be valid for this - * call, cb_reserve_back(n) had to be called first to reserve at least some - * number n>0 of tiles in the output CB. The out_tile_index = 0 then references - * the first tile in the reserved section of the CB, up to index n-1 that will - * then be visible to the consumer in the same order after a cb_push_back call. - * The DST register buffer must be in acquired state via *acquire_dst* call. - * This call is blocking and is only available on the compute engine. - * - * Each subsequent pack call will increment the write pointer in the cb by single - * tile size. The pointer is then again set to a valid position with space for n - * reserved tiles by another cb_reserve_back call. 
- * - * Operates in tandem with functions cb_reserve_back and cb_push_back. - * - * A typical use case is first the producer ensures that there is a number of - * tiles available in the buffer via cb_reserve_back, then the producer uses - * the pack_tile call to copy a tile from one of DST slots to a slot in - * reserved space and finally cb_push_back is called to announce visibility of - * the reserved section of the circular buffer to the consumer. - * + * Perform the init short for copy tile. This does not reconfigure the unpacker data types. * Return value: None * * | Argument | Description | Type | Valid Range | Required | * |----------------|---------------------------------------------------|----------|-----------------------------------------------------|----------| - * | ifrom_dst | The index of the tile in the DST register | uint32_t | Must be less than the size of the DST register (16) | True | - * | icb | The identifier of the output circular buffer (CB) | uint32_t | 0 to 31 | True | - * | icb_tile | The index of the tile in the output CB to copy to | uint32_t | Must be less than the size of the CB | True | - */ -ALWI void copy_tile_to_dst_init_short_with_dt(uint32_t cbid) { - UNPACK(( llk_unpack_A_init() )); - // This reconfig call does a reconfig for unpack even if previous data format - // is same as new operand data format, which might cause less perf - UNPACK(( llk_unpack_reconfig_data_format_srca(cbid) )); - MATH(( llk_math_eltwise_unary_datacopy_init(false /*transpose of faces*/, false /*transpose within 16x16 face*/, cbid) )); -} - -ALWI void copy_tile_to_dst_init_short_with_dt(uint32_t old_cbid, uint32_t new_cbid) { - UNPACK(( llk_unpack_A_init() )); - // This reconfig call checks if old operand has different data format to - // new operand idx, otherwise no reconfig call occurs - UNPACK(( llk_unpack_reconfig_data_format_srca(old_cbid, new_cbid) )); - MATH(( llk_math_eltwise_unary_datacopy_init(false /*transpose of faces*/, false /*transpose 
within 16x16 face*/, new_cbid) )); -} - -ALWI void copy_tile_matmul_partials_init_short_with_dt(uint32_t cbid) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_A_init(1) )); - #else - UNPACK(( llk_unpack_A_init() )); - #endif - - UNPACK(( llk_unpack_reconfig_data_format_srca(cbid) )); - MATH(( llk_math_eltwise_unary_datacopy_init(false /*transpose of faces*/, false /*transpose within 16x16 face*/, cbid) )); -} - -ALWI void copy_tile_matmul_partials_init_short_with_dt(uint32_t old_cbid, uint32_t new_cbid) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_A_init(1) )); - #else - UNPACK(( llk_unpack_A_init() )); - #endif - - UNPACK(( llk_unpack_reconfig_data_format_srca(old_cbid, new_cbid) )); - MATH(( llk_math_eltwise_unary_datacopy_init(false /*transpose of faces*/, false /*transpose within 16x16 face*/, new_cbid) )); -} - -/** - * Perform the init short for copy tile. This does not reconfigure the unpacker data types. + * | cbid | The identifier of the input circular buffer (CB) | uint32_t | 0 to 31 | False | + * | transpose | Flag to perform transpose on SrcA | bool | true enables the transpose | False | */ -ALWI void copy_tile_to_dst_init_short() +ALWI void copy_tile_to_dst_init_short(uint32_t cbid = 0, bool transpose = false) { - UNPACK(( llk_unpack_A_init() )); - MATH(( llk_math_eltwise_unary_datacopy_init() )); + UNPACK(( llk_unpack_A_init(transpose) )); + MATH(( llk_math_eltwise_unary_datacopy_init(false /*transpose of faces*/, false /*transpose within 16x16 face*/, cbid) )); } - /** * Perform a init for the copy tile operation. This calls the short init function and initializes packer dst offset registers.
*/ ALWI void copy_tile_init() { copy_tile_to_dst_init_short(); - PACK(( llk_init_packer_dest_offset_registers() )); + PACK(( llk_init_packer_dest_offset_registers() )); } +/** + * Return value: None + * + * | Argument | Description | Type | Valid Range | Required | + * |----------------|-------------------------------------------------------------------|----------|-----------------------------------------------------|----------| + * | old_cbid | The identifier of the previous input circular buffer (CB) to SrcA | uint32_t | 0 to 31 | True | + * | new_cbid | The identifier of the new input circular buffer (CB) to SrcA | uint32_t | 0 to 31 | True | + * | transpose | Flag to perform transpose on SrcA | bool | true enables the transpose | False | + */ +ALWI void copy_tile_to_dst_init_short_with_dt(uint32_t old_cbid, uint32_t new_cbid, bool transpose = false) { + // This reconfig call checks if old operand has different data format to + // new operand idx, otherwise no reconfig call occurs + UNPACK(( llk_unpack_reconfig_data_format_srca(old_cbid, new_cbid) )); + copy_tile_to_dst_init_short(new_cbid, transpose); +} /** * Copies a single tile from the specified input CB and writes the result to @@ -120,25 +72,16 @@ ALWI void copy_tile_init() * | in_tile_index | The index of the tile to copy from the input CB | uint32_t | Must be less than the size of the CB | Yes | * | dst_tile_index | The index of the tile in the DST register | uint32_t | Must be less than the size of the DST register (16) | Yes | * */ -ALWI void copy_tile(uint32_t icb, uint32_t itile, uint32_t idst) +ALWI void copy_tile(uint32_t in_cb_id, uint32_t in_tile_index, uint32_t dst_tile_index) { - UNPACK(( llk_unpack_A(icb, itile) )); - #ifdef ARCH_GRAYSKULL - MATH(( llk_math_eltwise_unary_datacopy(idst) )); - #else - MATH(( llk_math_eltwise_unary_datacopy(idst) )); - #endif + UNPACK(( llk_unpack_A(in_cb_id, in_tile_index) )); + MATH(( llk_math_eltwise_unary_datacopy(dst_tile_index) 
)); } -ALWI void copy_block_matmul_partials(uint32_t icb, uint32_t start_itile, uint32_t start_idst, uint32_t ntiles) +ALWI void copy_block_matmul_partials(uint32_t in_cb_id, uint32_t start_in_tile_index, uint32_t start_dst_tile_index, uint32_t ntiles) { - #ifdef ARCH_GRAYSKULL - UNPACK(( llk_unpack_A_block(icb, start_itile, ntiles, true) )); - #else - UNPACK(( llk_unpack_A_block(icb, start_itile, ntiles, false) )); - #endif - - MATH(( llk_math_eltwise_unary_datacopy_block(start_idst, ntiles, icb) )); + UNPACK(( llk_unpack_A_block(in_cb_id, start_in_tile_index, ntiles, false) )); + MATH(( llk_math_eltwise_unary_datacopy_block(start_dst_tile_index, ntiles, in_cb_id) )); } } diff --git a/tt_metal/include/compute_kernel_api/tilize.h b/tt_metal/include/compute_kernel_api/tilize.h index 44f94b096361..da18166ed0e8 100644 --- a/tt_metal/include/compute_kernel_api/tilize.h +++ b/tt_metal/include/compute_kernel_api/tilize.h @@ -30,7 +30,7 @@ ALWI void tilize_init(uint32_t icb, uint32_t block, uint32_t ocb = 16) PACK(( llk_pack_hw_configure_disaggregated(ocb) )); PACK(( llk_pack_init(ocb) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init(ocb) )); + PACK(( llk_pack_dest_init(ocb) )); UNPACK(( llk_setup_operands() )); UNPACK(( llk_unpack_tilize_hw_configure_disaggregated(icb) )); @@ -53,7 +53,7 @@ ALWI void tilizeA_B_reduce_init(uint32_t icb0, uint32_t icb1_scaler, uint32_t bl PACK(( llk_pack_hw_configure_disaggregated(ocb) )); PACK(( llk_pack_init(ocb) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init(ocb) )); + PACK(( llk_pack_dest_init(ocb) )); } #endif diff --git a/tt_metal/include/compute_kernel_api/transpose_wh.h b/tt_metal/include/compute_kernel_api/transpose_wh.h index fb4c6cf98063..07b1d70a2dd5 100644 --- a/tt_metal/include/compute_kernel_api/transpose_wh.h +++ b/tt_metal/include/compute_kernel_api/transpose_wh.h @@ -32,7 +32,7 @@ ALWI void transpose_wh_init(uint32_t icb, uint32_t ocb = 16) PACK(( llk_pack_hw_configure_disaggregated(ocb) )); 
PACK(( llk_pack_init(ocb) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); UNPACK(( llk_setup_operands() )); UNPACK(( llk_unpack_A_hw_configure_disaggregated<>(0, true) )); diff --git a/tt_metal/include/compute_kernel_api/untilize.h b/tt_metal/include/compute_kernel_api/untilize.h index ee19f53b28fb..ee423451e8ab 100644 --- a/tt_metal/include/compute_kernel_api/untilize.h +++ b/tt_metal/include/compute_kernel_api/untilize.h @@ -26,7 +26,7 @@ ALWI void untilize_init(uint32_t icb, uint32_t ocb = 16) PACK(( llk_pack_hw_configure_disaggregated(ocb) )); PACK(( llk_pack_init(ocb) )); PACK(( llk_setup_outputs() )); - PACK(( llk_pack_dest_init() )); + PACK(( llk_pack_dest_init() )); UNPACK(( llk_setup_operands() )); UNPACK(( llk_unpack_untilize_hw_configure_disaggregated(icb) ));